diff --git a/scripts/paris/average-correlations__rkcorr-to-npy.py b/scripts/paris/average-correlations__rkcorr-to-npy.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeec6c3f34eab10b47b260252ff24c92fd9272c5
--- /dev/null
+++ b/scripts/paris/average-correlations__rkcorr-to-npy.py
@@ -0,0 +1,116 @@
+import itertools
+import numpy
+import sys
+import signatures
+import csv
+
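+# Usage (cf. validation.Snakefile):
+#   python3 average-correlations__rkcorr-to-npy.py <ranks.tsv> <genes-ranks-correlations.npy> <size> <signatures_A> <signatures_B>
+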
+if __name__ == "__main__":
+
+    rankfile = sys.argv[1]
+    fgenescorr = sys.argv[2]
+    size = int(sys.argv[3])
+    signatures_files = sys.argv[4:6]
+
+    print("Load genes correlations...", file=sys.stderr, flush=True)
+    genes_correlations = numpy.load(fgenescorr)
+    ngenes = genes_correlations.shape[0]
+    assert(genes_correlations.shape[1] == ngenes)
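+    # Square |genes|×|genes| matrix of absolute Spearman correlations,
+    # as produced by genes-correlations__sign-to-npy.py.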
+
+    print("Load signatures...", file=sys.stderr, flush=True)
+    genesets_1,genome_1,breaks_1 = signatures.load([signatures_files[0]], filter_size=size)
+    assert(len(genesets_1) > 0)
+    assert(len(genome_1) > 0)
+
+    genesets_2,genome_2,breaks_2 = signatures.load([signatures_files[1]], filter_size=size)
+    assert(len(genesets_2) > 0)
+    assert(len(genome_2) > 0)
+
+    print("Load genes indices...", file=sys.stderr, flush=True)
+    genome = genome_1 | genome_2
+    genes_idx = {}
+    with open(rankfile) as fd:
+        csvreader = csv.reader(fd, delimiter=" ")
+        header = next(csvreader)
+        n=0
+        for row in csvreader:
+            gene = row[0]
+            assert(len(gene) > 0)
+            if gene in genome:
+                genes_idx[gene] = n
+                n += 1
+                print("\r",n,end=" ", file=sys.stderr,flush=True)
+        assert(len(genes_idx) > 0)
+    print("genes", file=sys.stderr, flush=True)
+
+    print("Compute paired signatures correlations...", file=sys.stderr, flush=True)
+    nsign_1 = len(genesets_1)
+    nsign_2 = len(genesets_2)
+    signature_correlations = numpy.zeros((nsign_1, nsign_2))
+    for i1,s1 in enumerate(genesets_1):
+        # s1's indices do not depend on the inner loop, compute them once.
+        g1 = [genes_idx[g] for g in s1 if g in genes_idx]
+        assert(len(g1) > 0)
+        for i2,s2 in enumerate(genesets_2):
+            g2 = [genes_idx[g] for g in s2 if g in genes_idx]
+            assert(len(g2) > 0)
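+            # Note: genes_correlations[numpy.ix_(g1, g2)] would be an equivalent,
+            # vectorized way to extract this block without building index tuples.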
+            subidx = itertools.product(g1,g2)
+            i,j = zip(*subidx)
+            submat = genes_correlations[i,j]
+            assert(submat.size == len(g1)*len(g2))
+            avcorr = numpy.mean(submat)
+            assert(avcorr > 0)
+            signature_correlations[i1,i2] = avcorr
+
+    fname = "signatures-average-correlations.npy"
+    print("Save signatures correlation matrix of shape", signature_correlations.shape, "to `", fname,"`...", file=sys.stderr, flush=True)
+    numpy.save(fname, signature_correlations)
+
+    print("Compute all signatures correlations...", file=sys.stderr, flush=True)
+    genesets = genesets_1 | genesets_2
+    nsign = len(genesets)
+    all_signature_correlations = numpy.zeros((nsign, nsign))
+    signature_origins = {}
+    for i1,s1 in enumerate(genesets):
+        if s1 in genesets_1 and s1 in genesets_2:
+            signature_origins[i1] = "BOTH"
+        elif s1 in genesets_1:
+            signature_origins[i1] = signatures_files[0]
+        elif s1 in genesets_2:
+            signature_origins[i1] = signatures_files[1]
+        else:
+            assert(s1 in genesets_1 or s1 in genesets_2)
+
+        # s1's indices do not depend on the inner loop, compute them once.
+        g1 = [genes_idx[g] for g in s1 if g in genes_idx]
+        assert(len(g1) > 0)
+
+        for i2,s2 in enumerate(genesets):
+            # if i2 < i1:
+            #     continue
+            g2 = [genes_idx[g] for g in s2 if g in genes_idx]
+            assert(len(g2) > 0)
+            # Cross-product of all genes indices.
+            subidx = itertools.product(g1,g2)
+            # Corresponding tsb-matrix indices.
+            i,j = zip(*subidx)
+            submat = genes_correlations[i,j]
+            assert(submat.size == len(g1)*len(g2))
+            # Average correlation over the sub-matrix.
+            # TODO would be faster over the triangular sub-matrix.
+            avcorr = numpy.mean(submat)
+            assert(avcorr > 0)
+            all_signature_correlations[i1,i2] = avcorr
+
+    assert(len(signature_origins) == nsign)
+    fname = "signatures-origins.csv"
+    print("Save unique signatures origin table to `", fname, "`...", file=sys.stderr, flush=True)
+
+    with open(fname, 'w') as fd:
+        # NOTE: each index refers to a signature (a row/column of the
+        # correlation matrix), not to a gene, despite the field name.
+        writer = csv.DictWriter(fd, fieldnames=["gene_index","origin"])
+        writer.writeheader()
+        for i, origin in signature_origins.items():
+            writer.writerow({"gene_index": i, "origin": origin})
+
+    fname = "signatures-average-correlations_full.npy"
+    print("Save full signatures correlation matrix of shape", all_signature_correlations.shape, "to `", fname,"`...", file=sys.stderr, flush=True)
+    numpy.save(fname, all_signature_correlations)
+
+    print("Done", file=sys.stderr, flush=True)
diff --git a/scripts/paris/config.yaml b/scripts/paris/config.yaml
deleted file mode 100644
index 81bde9f8a09369870d715e70a4e0da400ce93eed..0000000000000000000000000000000000000000
--- a/scripts/paris/config.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-snakefile: Snakefile
-
-# executable: "singularity run ../frictionlesser.sif"
-executable: "../../release/app/frictionlesser"
-
-cluster: "sbatch --job-name {resources.job_name} --mem {resources.mem_mb} --cpus-per-task=1 --partition common,dedicated --qos normal --output slurmout/%j.out --error slurmout/%j.err"
-
-sizes:
-    - 10
-    - 20
-    - 30
-
-# Number of runs = number of (possibly duplicated) signatures.
-runs: 5000
-
diff --git a/scripts/paris/config_preproc.yaml b/scripts/paris/config_preproc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..beb2afa5217d90f112a4ac69079b9e3ccd106ca8
--- /dev/null
+++ b/scripts/paris/config_preproc.yaml
@@ -0,0 +1,12 @@
+snakefile: Snakefile
+
+# executable: "singularity run ../../frictionlesser.sif"
+executable: "../../release/app/frictionlesser"
+
+cluster: "sbatch --job-name {resources.job_name} --mem 16G --cpus-per-task=1 --partition common,dedicated --qos fast --output logs/%j.out --error logs/%j.err"
+
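+# Signature sizes (numbers of genes) for which to build the frictionlesser caches.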
+sizes:
+    - 10
+    - 20
+    - 30
+
diff --git a/scripts/paris/counts__hdf5-to-csv.py b/scripts/paris/counts__hdf5-to-csv.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a22e5432e2e8c4761d77113d886f136181d8332
--- /dev/null
+++ b/scripts/paris/counts__hdf5-to-csv.py
@@ -0,0 +1,35 @@
+import sys
+import numpy
+import scanpy
+import pandas as pd
+import anndata as ad
+
+def check(counts, meta, genes):
+    assert(counts.shape[0] > 0)
+    assert(counts.shape[1] > 0)
+    assert(counts.shape[0] == len(meta))
+    assert(counts.shape[1] == len(genes))
+    print("OK", file=sys.stderr, flush=True)
+
+
+if __name__ == "__main__":
+
+    print("Load annotated data...", file=sys.stderr, flush=True)
+    adata = ad.read(sys.argv[1])
+    check(adata.X, adata.obs_names, adata.var_names)
+    print("Loaded",len(adata.var_names),"genes and",len(adata.obs_names),"cells", file=sys.stderr, flush=True)
+
+    print("Output as CSV...", file=sys.stderr, flush=True)
+    counts_full = numpy.asarray(adata.X.todense())
+
+    # Column names are sample IDs, not cell IDs.
+    print("GENE,", ",".join(adata.obs["sample"]), sep="")
+    for i, row in enumerate(counts_full.T):
+        print("\r", i + 1, end="", flush=True, file=sys.stderr)
+        print(adata.var_names[i], ",".join(str(x) for x in row), sep=",")
+    assert(i == len(adata.var_names) - 1)
+
+    print("\nDone", file=sys.stderr, flush=True)
diff --git a/scripts/paris/genes-correlations__sign-to-npy.py b/scripts/paris/genes-correlations__sign-to-npy.py
new file mode 100644
index 0000000000000000000000000000000000000000..98511794accf6f46a06f5ae7543bb66c33faabff
--- /dev/null
+++ b/scripts/paris/genes-correlations__sign-to-npy.py
@@ -0,0 +1,71 @@
+import numpy
+from sortedcollections import OrderedSet
+from scipy.stats import spearmanr
+
+from signatures import *
+
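+# Usage (cf. validation.Snakefile):
+#   python3 genes-correlations__sign-to-npy.py <ranks.tsv> <size> <signatures_A> <signatures_B>
+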
+if __name__=="__main__":
+    import sys
+    import csv
+
+    rankfile = sys.argv[1]
+
+    size = int(sys.argv[2])
+    if size == 0:
+        size = None
+
+    signatures_files = sys.argv[3:5] # Only two files
+
+
+    genome = load_genome(signatures_files, filter_size=size)
+    assert(len(genome) > 0)
+    print("Genome across",len(signatures_files), "files:", len(genome), "genes", file=sys.stderr, flush=True)
+
+    print("Load ranks...", file=sys.stderr, flush=True)
+    genes_idx = {}
+    ranks_l = []
+    with open(rankfile) as fd:
+        csvreader = csv.reader(fd, delimiter=" ")
+        header = next(csvreader)
+        n = 0
+        for row in csvreader:
+            gene = row[0]
+            assert(len(gene) > 0)
+            if gene in genome:
+                genes_idx[gene] = n
+                n += 1
+                print("\r",n,end=" ", file=sys.stderr,flush=True)
+                ranks_l.append(numpy.array([float(i) for i in row[1:]]))
+    assert(len(ranks_l) > 0)
+    assert(len(ranks_l) == len(genes_idx))
+    ngenes = len(genes_idx)
+    ncells = len(ranks_l[0])
+    print("\rLoaded ranks for",ngenes,"genes,",ncells,"cells", file=sys.stderr, flush=True)
+    # Convert to array.
+    ranks = numpy.array(ranks_l)
+    assert(ranks.shape == (ngenes,ncells))
+
+    ranked_genes = OrderedSet(genes_idx.keys())
+    unloaded_genes = genome - ranked_genes
+    if len(unloaded_genes) > 0:
+        print("WARNING: Genes of genome not found in ranks file:",unloaded_genes, file=sys.stderr, flush=True)
+        # sys.exit(1)
+    # common_genome = genome & ranked_genes
+
+    print("Compute genes correlation matrix...", file=sys.stderr, flush=True)
+    genes_correlations = numpy.absolute(spearmanr(ranks, axis=1)[0]) # [0] = statistic, [1] = pvalue
+    assert(genes_correlations.shape == (ngenes,ngenes))
+
+    fname = "genes-ranks-correlations.npy"
+    print("Save genes correlation matrix to `", fname,"`...", file=sys.stderr, flush=True)
+    numpy.save(fname, genes_correlations)
+
+    print("Done", file=sys.stderr, flush=True)
diff --git a/scripts/paris/paris_preproc-seurat_hdf5.py b/scripts/paris/paris_preproc-seurat_hdf5.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c543d06541cb0464ced3ff3267b096139b2443d
--- /dev/null
+++ b/scripts/paris/paris_preproc-seurat_hdf5.py
@@ -0,0 +1,65 @@
+import sys
+from scipy.sparse import load_npz
+import scanpy
+import pandas as pd
+import anndata as ad
+
+def check(counts, meta, genes):
+    assert(counts.shape[0] > 0)
+    assert(counts.shape[1] > 0)
+    assert(counts.shape[0] == len(meta))
+    assert(counts.shape[1] == len(genes))
+    print("OK", file=sys.stderr, flush=True)
+
+if __name__ == "__main__":
+
+    if len(sys.argv) == 4:
+
+        fcounts = sys.argv[1]
+        ffeatures = sys.argv[2]
+        fmeta = sys.argv[3]
+
+
+        print("Load data files...", file=sys.stderr, flush=True)
+        # Load data.
+        # Load the sparse count matrix (cells × genes).
+        counts = load_npz(fcounts)
+
+        # Those are pandas' dataframes.
+        meta  = pd.read_csv(fmeta)
+        genes = pd.read_csv(ffeatures)
+
+        print("Loaded",len(genes["id"]),"genes and",len(meta["cell_types_v2"]),"cells", file=sys.stderr, flush=True)
+        check(counts, meta["cell_types_v2"], genes["id"])
+
+
+        print("Build annotated data...", file=sys.stderr, flush=True)
+        # adata = ad.AnnData(counts, meta, genes, dtype=counts.dtype) # Seurat does not support sparse matrices.
+        adata = ad.AnnData(counts.todense(), meta, genes, dtype=counts.dtype)
+        check(adata.X, adata.obs_names, adata.var_names)
+
+
+        # print("Save raw annotated data...", file=sys.stderr, flush=True)
+        # adata.write(fcounts+".hdf5")
+        # print("OK", file=sys.stderr, flush=True)
+
+    elif len(sys.argv) == 2:
+        fcounts = sys.argv[1]
+        print("Load annotated data...", file=sys.stderr, flush=True)
+        adata = ad.read(fcounts)
+        check(adata.X, adata.obs_names, adata.var_names)
+
+    else:
+        sys.exit("Usage: either <counts.npz> <features.csv> <meta.csv> or <annotated.hdf5>")
+
+    print("Seurat preprocessing...", file=sys.stderr, flush=True)
+    scanpy.pp.recipe_seurat(adata)
+    check(adata.X, adata.obs_names, adata.var_names)
+    adata.write(fcounts+".seurat.hdf5")
+    adata.file.close()
+
+    print("Done", file=sys.stderr, flush=True)
diff --git a/scripts/paris/preproc-mara__npz-to-hdf5.py b/scripts/paris/preproc-mara__npz-to-hdf5.py
new file mode 100644
index 0000000000000000000000000000000000000000..a35560dede4cf571bbb1b141170559a1688bb927
--- /dev/null
+++ b/scripts/paris/preproc-mara__npz-to-hdf5.py
@@ -0,0 +1,106 @@
+import sys
+from scipy.sparse import load_npz
+import scanpy as sc
+import pandas as pd
+import anndata as ad
+import pathlib
+
+def check(counts, cells, genes):
+    assert(counts.shape[0] > 0)
+    assert(counts.shape[1] > 0)
+    assert(counts.shape[0] == len(cells))
+    assert(counts.shape[1] == len(genes))
+    print(len(genes),"genes",len(cells),"cells")
+    print("OK", file=sys.stderr, flush=True)
+
+if __name__ == "__main__":
+
+    if len(sys.argv) == 4:
+
+        fcounts = sys.argv[1]
+        ffeatures = sys.argv[2]
+        fmeta = sys.argv[3]
+
+
+        print("Load data files...", file=sys.stderr, flush=True)
+        # Load data.
+        # Load the sparse count matrix (cells × genes).
+        counts = load_npz(fcounts)
+
+        # Those are pandas' dataframes.
+        cells  = pd.read_csv(fmeta)
+        genes = pd.read_csv(ffeatures)
+
+        print("Loaded",len(genes["id"]),"genes and",len(cells["cell_types_v2"]),"cells", file=sys.stderr, flush=True)
+        check(counts, cells["cell_types_v2"], genes["id"])
+
+        print("Build annotated data...", file=sys.stderr, flush=True)
+        adata = ad.AnnData(counts.tocsr(), cells, genes, dtype=counts.dtype)
+        check(adata.X, adata.obs_names, adata.var_names)
+
+        # print("Save raw annotated data...", file=sys.stderr, flush=True)
+        # adata.write(fcounts+".hdf5")
+        # print("OK", file=sys.stderr, flush=True)
+
+    elif len(sys.argv) == 2:
+        fcounts = sys.argv[1]
+        print("Load annotated data...", file=sys.stderr, flush=True)
+        adata = ad.read(fcounts)
+        check(adata.X, adata.obs_names, adata.var_names)
+
+    else:
+        sys.exit("Usage: either <counts.npz> <features.csv> <meta.csv> or <annotated.hdf5>")
+
+    # https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html
+    print("Mara's preprocessing...", file=sys.stderr, flush=True)
+
+    # sc.pl.highest_expr_genes(adata, n_top=20, ) # TODO checkpoint
+
+    # print("Basic filtering...", file=sys.stderr, flush=True)
+    # sc.pp.filter_cells(adata, min_genes=200)
+    # sc.pp.filter_genes(adata, min_cells=3)
+
+    # adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
+    # sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
+
+    # sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True) # TODO checkpoint
+    # sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt') # TODO checkpoint
+    # sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts') # TODO checkpoint
+
+    # print("Actually do the basic filtering...", file=sys.stderr, flush=True)
+    # adata = adata[adata.obs.n_genes_by_counts < 2500, :]
+    # adata = adata[adata.obs.pct_counts_mt < 5, :]
+
+    print("Filter out non-cancer...", file=sys.stderr, flush=True)
+    adata = adata[adata.obs.cell_types_v2 == "cancer", :]
+    check(adata.X, adata.obs_names, adata.var_names)
+
+    print("Total-count normalize...", file=sys.stderr, flush=True)
+    # Library-size correction: scale every cell to 10,000 total counts so that counts become comparable among cells.
+    sc.pp.normalize_total(adata, target_sum=1e4)
+
+    print("Logarithmize the data...", file=sys.stderr, flush=True)
+    sc.pp.log1p(adata)
+
+    print("Identify highly-variable genes...", file=sys.stderr, flush=True)
+    # sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
+    sc.pp.highly_variable_genes(adata, n_top_genes=10000)
+
+    # sc.pl.highly_variable_genes(adata) # TODO checkpoint
+
+    print("Actually do the highly-variable filtering...", file=sys.stderr, flush=True)
+    adata = adata[:, adata.var.highly_variable]
+
+    check(adata.X, adata.obs_names, adata.var_names)
+
+    fname = str(pathlib.Path(fcounts).with_suffix(".mara.hdf5"))
+    print("Write data to", fname, file=sys.stderr, flush=True)
+    adata.write(fname)
+
+    adata.file.close() # In case it was backed.
+    print("Done", file=sys.stderr, flush=True)
+
diff --git a/scripts/paris/Snakefile b/scripts/paris/preproc.Snakefile
similarity index 51%
rename from scripts/paris/Snakefile
rename to scripts/paris/preproc.Snakefile
index 3ad1eb2858a397da3915815f8af31cd4a7db44d7..23eeafb857841c4b2196ac8db54d25df8cd683e7 100644
--- a/scripts/paris/Snakefile
+++ b/scripts/paris/preproc.Snakefile
@@ -11,28 +11,43 @@ FRICTIONLESSER=config["executable"]
 
 rule all:
     input:
-        expand("signatures_of_{size}-genes/signature_{seed}.txt", size=config["sizes"], seed=SEEDS)
+        "data/inter/ranks.tsv",
+        "cache/trans.cache.dat",
+        expand("cache/size_{size}.cache.dat", size=config["sizes"])
 
+rule preprocessing:
+    input:
+        counts="data/input/2022_02_18_version_2_EOC_counts.npz",
+        features="data/input/2022_02_18_version_2_EOC_features.csv",
+        meta="data/input/2022_02_18_version_2_EOC_meta.csv"
+    output:
+        "data/inter/2022_02_18_version_2_EOC_counts.mara.hdf5"
+    shell:
+        "python3 preproc-mara__npz-to-hdf5.py {input.counts} {input.features} {input.meta}"
 
-# rule rank:
-#     input:
-#         "exprs.csv"
-#     output:
-#         "ranks.tsv"
-#     shell:
-#         "{FRICTIONLESSER}"
-#         "  --exprs={input}"
-#         "  > {output}"
+rule counts:
+    input:
+        "data/inter/2022_02_18_version_2_EOC_counts.mara.hdf5"
+    output:
+        "data/inter/counts.csv"
+    shell:
+        "python3 counts__hdf5-to-csv.py {input} > {output}"
+
+rule ranks:
+    input:
+        "data/inter/counts.csv"
+    output:
+        "data/inter/ranks.tsv"
+    shell:
+        "{FRICTIONLESSER} --exprs={input} > {output}"
 
 rule save_cache_transcriptome:
     input:
-        "ranks.tsv"
+        "data/inter/ranks.tsv"
     output:
         protected("cache/trans.cache.dat")
     log: "logs/save_cache_transcriptome.log"
     benchmark: "logs/save_cache_transcriptome.bench"
-    resources:
-        job_name="cache_trans"
     shell:
         "{FRICTIONLESSER}"
         "  --ranks={input}"
@@ -42,7 +57,7 @@ rule save_cache_transcriptome:
 
 rule save_cache_size:
     input:
-        ranks="ranks.tsv",
+        ranks="data/inter/ranks.tsv",
         transcache="cache/trans.cache.dat"
     output:
         protected("cache/size_{size}.cache.dat")
@@ -61,29 +76,3 @@ rule save_cache_size:
         "  --cache-only"
         "  2> {log}"
 
-rule load_and_run:
-    input:
-        ranks="ranks.tsv",
-        transcache="cache/trans.cache.dat",
-        sizecache="cache/size_{size}.cache.dat"
-    output:
-        "signatures_of_{size}-genes/signature_{seed}.txt"
-    wildcard_constraints:
-        # Wildcards should be numeric.
-        seed="\d+",
-        size="\d+"
-    log: "logs/load_and_run_size-{size}_seed-{seed}.log"
-    benchmark: "logs/load_and_run_size-{size}_seed-{seed}.bench"
-    resources:
-        mem_mb=16000,
-        job_name=lambda wildcards: f"z{wildcards.size}_s{wildcards.seed}"
-    shell:
-        "{FRICTIONLESSER}"
-        "  --ranks={input.ranks}"
-        "  --cache-transcriptome={input.transcache}"
-        "  --cache-size={input.sizecache}"
-        "  --ngenes={wildcards.size}"
-        "  --seed={wildcards.seed}"
-        "  > {output}"
-        "  2> {log}"
-
diff --git a/scripts/paris/signatures.py b/scripts/paris/signatures.py
new file mode 100644
index 0000000000000000000000000000000000000000..22f86e142d1e38097886c901fb8071e2cb8e799d
--- /dev/null
+++ b/scripts/paris/signatures.py
@@ -0,0 +1,217 @@
+import sys
+import scipy.stats
+import numpy
+from sortedcollections import OrderedSet
+from scipy.spatial.distance import squareform, pdist, cdist, jaccard
+from fastcluster import linkage
+
+# geneset = set of genes.
+# e.g. {"A","B"}
+# signature = dictionary of all seen genes ("genome"), with value being true if the gene is in the signature.
+# e.g. {"A":1, "B":1, "C": 0, "D":0}
+
+def load_genome(filenames, filter_size=None):
+    assert(len(filenames) > 0)
+
+    genome = OrderedSet()
+
+    for filename in filenames:
+        with open(filename) as fd:
+            lines = fd.readlines()
+            for line in lines:
+                #print(line)
+                fields = line.strip().split()
+                if len(fields) == 1:
+                    fields = line.strip().split(",")
+                assert(len(fields)>1)
+                i = 0
+                if fields[1].isdigit():
+                    # Johann's format: <score> <ngenes> <genes>…
+                    score = float(fields[0])
+                    n = int(fields[1])
+                    i = 2
+                if filter_size:
+                    if len(fields[i:]) != filter_size:
+                        continue
+                genes = frozenset(fields[i:])
+                genome |= genes # Merge genes into all genes.
+
+    return genome
+
+
+def load(filenames, filter_size=None):
+    if filter_size == 0:
+        filter_size = None
+    assert(len(filenames) > 0)
+
+    genesets = OrderedSet()
+    # current_id = 0
+    genome = OrderedSet()
+    #genesets_scores = {}
+    breaks = []
+
+    for filename in filenames:
+        kept_lines = 0
+        with open(filename) as fd:
+            lines = fd.readlines()
+            file_genesets = OrderedSet()
+            for line in lines:
+                fields = line.strip().split()
+                # print(len(fields),fields)
+                if len(fields) == 1:
+                    fields = line.strip().split(",")
+                assert(len(fields)>1)
+                i = 0
+                if fields[1].isdigit():
+                    # Johann's format: <score> <ngenes> <genes>…
+                    score = float(fields[0])
+                    n = int(fields[1])
+                    i = 2
+                if filter_size:
+                    if len(fields[i:]) != filter_size:
+                        continue
+                kept_lines += 1
+                genes = frozenset(fields[i:])
+                #genesets_scores[genes] = score
+                #assert(len(genes) == n)
+                genome |= genes # Merge genes into all genes.
+                # geneset = {"id":current_id, "score":score, "genes": genes}
+                # geneset = {"score":score, "genes": genes}
+                # if geneset not in geneset:
+                    # geneset.append( geneset )
+                # current_id += 1
+                file_genesets |= OrderedSet([genes]) # Add the gene set if not already present.
+                #print(genes)
+        if filter_size:
+            print("File `",filename,"` had",len(file_genesets),"unique signatures among",kept_lines,"signatures of size",filter_size, file=sys.stderr, flush=True)
+        else:
+            print("File `",filename,"` had",len(file_genesets),"unique signatures among",kept_lines,"signatures of any size", file=sys.stderr, flush=True)
+
+        breaks.append(len(file_genesets))
+
+        genesets |= file_genesets
+
+    return genesets,genome,breaks
+
+
+def genesets_to_signatures(genesets,genome):
+    signatures = numpy.zeros( (len(genesets),len(genome)), bool )
+    for s,geneset in enumerate(genesets):
+        for g,gene in enumerate(genome):
+            # if gene in signature["genome"]:
+            if gene in geneset:
+                signatures[s][g] = 1
+            else:
+                signatures[s][g] = 0
+    return signatures
+
+
+def self_similarity_matrix(signatures):
+    dissim = pdist(signatures, 'jaccard') # jaccard = DISsimilarity
+    return 1-squareform(dissim)
+
+
+def seriation(Z,N,cur_index):
+    '''
+        input:
+            - Z is a hierarchical tree (dendrogram)
+            - N is the number of points given to the clustering process
+            - cur_index is the position in the tree for the recursive traversal
+        output:
+            - order implied by the hierarchical tree Z
+
+        seriation computes the order implied by a hierarchical tree (dendrogram)
+    '''
+    if cur_index < N:
+        return [cur_index]
+    else:
+        left = int(Z[cur_index-N,0])
+        right = int(Z[cur_index-N,1])
+        return (seriation(Z,N,left) + seriation(Z,N,right))
+
+
+def compute_serial_matrix(dist_mat,method="ward"):
+    '''
+        input:
+            - dist_mat is a distance matrix
+            - method = ["ward","single","average","complete"]
+        output:
+            - seriated_dist is the input dist_mat,
+              but with re-ordered rows and columns
+              according to the seriation, i.e. the
+              order implied by the hierarchical tree
+            - res_order is the order implied by
+              the hierarchical tree
+            - res_linkage is the hierarchical tree (dendrogram)
+
+        compute_serial_matrix transforms a distance matrix into
+        a sorted distance matrix according to the order implied
+        by the hierarchical tree (dendrogram)
+    '''
+    N = len(dist_mat)
+    flat_dist_mat = squareform(dist_mat)
+    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)
+    res_order = seriation(res_linkage, N, N + N-2)
+    seriated_dist = numpy.zeros((N,N))
+    a,b = numpy.triu_indices(N,k=1)
+    seriated_dist[a,b] = dist_mat[ [res_order[i] for i in a], [res_order[j] for j in b]]
+    seriated_dist[b,a] = seriated_dist[a,b]
+
+    return seriated_dist, res_order, res_linkage
+
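+# Usage sketch (assuming `sim` is a square similarity matrix with values in [0,1]):
+#   ordered, order, tree = compute_serial_matrix(1 - sim, method="ward")
+# This reorders rows and columns so that similar items end up adjacent.
+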
+def colormesh(mat, ax, cmap, norm):
+    # cmap = cm.get_cmap("Blues")
+    cmap.set_bad("white")
+
+    pcm = ax.pcolormesh(mat, cmap=cmap, norm=norm)#, edgecolors="#eeeeee", linewidth=0.01)
+    #plt.imshow(mat, cmap=cmap)
+
+    # ax.set_aspect("equal")
+
+    xticks=numpy.arange(0,mat.shape[1],max(1,int(mat.shape[1]/40)))
+    yticks=numpy.arange(0,mat.shape[0],max(1,int(mat.shape[0]/30)))
+
+    # Major ticks
+    ax.set_xticks(xticks+0.5)
+    ax.set_yticks(yticks+0.5)
+    # Labels
+    ax.set_xticklabels(xticks, rotation=90)
+    ax.set_yticklabels(yticks, rotation=0)
+    # Minor ticks
+    # ax.set_xticks(xticks-0.5, minor=True)
+    # ax.set_yticks(yticks-0.5, minor=True)
+    # ax.tick_params(which="minor", bottom=False, left=False)
+    # ax.grid(True, which="minor", axis="both", color="white", linestyle="-", linewidth=1)
+    return pcm
+
+
+if __name__=="__main__":
+
+    size = int(sys.argv[1])
+    if size == 0:
+        size = None
+
+    sim_thresh = float(sys.argv[2])
+
+    filenames = sys.argv[3:]
+
+    genesets,genome,breaks = load(filenames, filter_size=size)
+    assert(len(genesets) > 0)
+    assert(len(genome) > 0)
+    #print(genome)
+    #for s in genesets:
+    #    print(s)
+    print(len(filenames), "files,", len(genesets), "unique genesets,", len(genome), "genes", flush=True)
+    sizes = [len(geneset) for geneset in genesets]
+    if not all(s == sizes[0] for s in sizes):
+        print("Statistics of sizes:", scipy.stats.describe(sizes), flush=True)
+
diff --git a/scripts/paris/signatures_jaccard-distances_plot.py b/scripts/paris/signatures_jaccard-distances_plot.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c5e3e6135d3b4ff745586c2dbfc2e1ee5ba7dd5
--- /dev/null
+++ b/scripts/paris/signatures_jaccard-distances_plot.py
@@ -0,0 +1,138 @@
+import scipy.stats
+import numpy
+from scipy.spatial.distance import cdist
+
+from signatures import *
+
+if __name__=="__main__":
+    import sys
+    from matplotlib import gridspec
+    import matplotlib.pyplot as plt
+    from matplotlib import cm
+    from matplotlib.colors import Normalize
+
+    size = int(sys.argv[1])
+    if size == 0:
+        size = None
+
+    sim_thresh = float(sys.argv[2])
+
+    filenames = sys.argv[3:5] # Only two files
+
+    genesets,genome,breaks = load(filenames, filter_size=size)
+    assert(len(genesets) > 0)
+    assert(len(genome) > 0)
+    #print(genome)
+    #for s in genesets:
+    #    print(s)
+    print(len(filenames), "files,", len(genesets), "unique genesets,", len(genome), "genes", flush=True)
+    sizes = [len(geneset) for geneset in genesets]
+    if not all(s == sizes[0] for s in sizes):
+        print("Statistics of sizes:", scipy.stats.describe(sizes), flush=True)
+
+    signatures = genesets_to_signatures(genesets,genome)
+    # with numpy.printoptions(threshold=numpy.inf):
+        # print(signatures)
+    raw_distance_matrix = self_similarity_matrix(signatures)
+
+    # full_distance_matrix, res_order, res_linkage = compute_serial_matrix(raw_distance_matrix, "ward")
+    full_distance_matrix = raw_distance_matrix
+
+    # PLOT
+    fig = plt.figure(figsize=(10,10))
+    fig.tight_layout()
+    gs = gridspec.GridSpec(4, 3, width_ratios=[40,1,1], height_ratios=[20,20,1,1])
+    ax1 = fig.add_subplot(gs[0])
+    ax2 = fig.add_subplot(gs[3])
+    ax2r = fig.add_subplot(gs[4],sharey=ax2)
+    ax2rr = fig.add_subplot(gs[5],sharey=ax2)
+    ax2b = fig.add_subplot(gs[6],sharex=ax2)
+    ax2bb = fig.add_subplot(gs[9],sharex=ax2)
+
+    cmap = cm.get_cmap("Reds")
+    normalizer = Normalize(0,1)
+    im = cm.ScalarMappable(norm=normalizer, cmap=cmap)
+    fig.colorbar(im, ax=[ax1,ax2,ax2r,ax2rr,ax2b,ax2bb])
+
+    # Full distance matrix
+    mask = numpy.tri(full_distance_matrix.shape[0], k=-1).transpose() # k>0 = above
+    mat = numpy.ma.array(full_distance_matrix, mask=mask)
+    pcm = colormesh(mat, ax1, cmap, normalizer)
+    ax1.set_title("Jaccard similarities of all {} signatures of size {} on {} genes".format(len(genesets),len(next(iter(genesets))),len(genome)))
+
+    # Dual distance matrix
+    genesets_1,genome_1,breaks_1 = load([filenames[0]], filter_size=size)
+    assert(len(genesets_1) > 0)
+    assert(len(genome_1) > 0)
+    signatures_1 = genesets_to_signatures(genesets_1, genome) # against common genome
+
+    genesets_2,genome_2,breaks_2 = load([filenames[1]], filter_size=size)
+    assert(len(genesets_2) > 0)
+    assert(len(genome_2) > 0)
+    signatures_2 = genesets_to_signatures(genesets_2, genome) # against common genome
+
+    # print(signatures_1.shape, signatures_2.shape)
+
+    sub_distance_matrix = 1 - cdist(signatures_1, signatures_2, "jaccard")
+
+
+    # Sort columns by descending column sums, then rows by ascending row sums.
+    sorted_distance_matrix = sub_distance_matrix[:, (sub_distance_matrix.sum(axis=0)*-1).argsort()] # columns
+    dual_distance_matrix = sorted_distance_matrix[(sorted_distance_matrix.sum(axis=1)).argsort(), :] # rows
+
+    # argsort the columns, sorting by the last row first, then the second-to-last row, continuing up to the first row
+    # dual_distance_matrix = sub_distance_matrix[numpy.lexsort(sub_distance_matrix)]
+    # dual_distance_matrix = sub_distance_matrix.T[numpy.lexsort(sub_distance_matrix.T)]
+
+    colormesh(dual_distance_matrix, ax2, cmap, normalizer)
+    ax2.set_ylabel(filenames[0])
+    # ax2.xaxis.tick_top()
+
+    # Sum over rows
+    # sum_rows = dual_distance_matrix.mean(1)
+    sum_rows = dual_distance_matrix.sum(1)
+    colormesh(sum_rows.reshape(-1,1), ax2r, cmap, normalizer)
+    ax2r.set_ylabel("Sum of similarities for {} signatures in `{}`".format(len(sum_rows),filenames[0]))
+    ax2r.yaxis.set_label_position("right")
+    ax2r.yaxis.tick_right()
+
+    # Sum over columns
+    # sum_cols = dual_distance_matrix.mean(0)
+    sum_cols = dual_distance_matrix.sum(0)
+    colormesh(sum_cols.reshape(1,-1), ax2b, cmap, normalizer)
+    # ax2b.set_xlabel(filenames[1])
+    ax2b.set_xlabel("Sum of similarities for {} signatures in `{}`".format(len(sum_cols),filenames[1]))
+
+    def thresh(values):
+        return any(s > sim_thresh for s in values)
+
+    # Threshold count over rows
+    match_rows = numpy.apply_along_axis(thresh, axis=1, arr=dual_distance_matrix)
+    colormesh(match_rows.reshape(-1,1), ax2rr, cmap, normalizer)
+    sm = sum(match_rows)
+    n = len(match_rows)
+    ax2rr.set_ylabel("{:.0%} of signatures in `{}` are similar enough (>{}) to at least another one in `{}`".format(sm/n, filenames[0], sim_thresh, filenames[1]))
+    ax2rr.yaxis.set_label_position("right")
+    ax2rr.yaxis.tick_right()
+
+    # Threshold count over columns
+    match_cols = numpy.apply_along_axis(thresh, axis=0, arr=dual_distance_matrix)
+    colormesh(match_cols.reshape(1,-1), ax2bb, cmap, normalizer)
+    sm = sum(match_cols)
+    n = len(match_cols)
+    ax2bb.set_xlabel("{:.0%} of signatures in `{}` are similar enough (>{}) to at least another one in `{}`".format(sm/n, filenames[1], sim_thresh, filenames[0]))
+
+    plt.show()
diff --git a/scripts/paris/validation.Snakefile b/scripts/paris/validation.Snakefile
new file mode 100644
index 0000000000000000000000000000000000000000..fa855ddbca765d6dc5f3478f1d1add9f2d971c7e
--- /dev/null
+++ b/scripts/paris/validation.Snakefile
@@ -0,0 +1,29 @@
+
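+# Signature size (number of genes) used by the validation scripts below.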
+SIZE=10
+
+rule all:
+    input:
+        "signatures-average-correlations.npy",
+        "signatures-average-correlations_full.npy"
+
+rule ranksgenes_correlations_npy:
+    input:
+        ranks="ranks.tsv",
+        A="signatures/emil.csv",
+        B="signatures/johann.tsv"
+    output:
+        "genes-ranks-correlations.npy"
+    shell:
+        "python3 genes-correlations__sign-to-npy.py {input.ranks} {SIZE} {input.A} {input.B}"
+
+rule npycorrsign_signavcorr_npy:
+    input:
+        ranks="ranks.tsv",
+        corrs="genes-ranks-correlations.npy",
+        A="signatures/emil.csv",
+        B="signatures/johann.tsv"
+    output:
+        "signatures-average-correlations.npy",
+        "signatures-average-correlations_full.npy"
+    shell:
+        "python3 average-correlations__rkcorr-to-npy.py {input.ranks} {input.corrs} {SIZE} {input.A} {input.B}"
diff --git a/src/parser.cpp b/src/parser.cpp
index f533c8c4874f26b4900e358a1047c4eac9be5c26..4dec3058f21b42617fc4901d749d5874f6f50e82 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -125,6 +125,11 @@ size_t TranscriptomeParser::load_header(const std::string& line)
     }
     ASSERT(loaded.size() == i);
     ASSERT(_rt._samples.size() == i);
+    // Floating-point division: an integer ratio would truncate to zero
+    // whenever there are fewer samples than cells.
+    if( static_cast<double>(_rt.samples_nb()) / _rt.cells_nb() > 0.0005) {
+        CLUTCHLOG(warning, "Suspiciously many samples (" << _rt.samples_nb()
+            << ") relative to the number of cells (" << _rt.cells_nb() << ")");
+    }
+
     return loaded.size();
 }