From 599ff581ab76dced5be7f1111d35e453931d5ab9 Mon Sep 17 00:00:00 2001
From: Johann Dreo <johann.dreo@pasteur.fr>
Date: Tue, 14 Feb 2023 10:17:33 +0100
Subject: [PATCH] feat(paris): analysis script

---
 scripts/paris/signatures.py | 250 ++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)
 create mode 100644 scripts/paris/signatures.py

diff --git a/scripts/paris/signatures.py b/scripts/paris/signatures.py
new file mode 100644
index 0000000..5369933
--- /dev/null
+++ b/scripts/paris/signatures.py
@@ -0,0 +1,250 @@
+import scipy
+import numpy
+from sortedcollections import OrderedSet
+from scipy.spatial.distance import squareform, pdist, jaccard
+from fastcluster import linkage
+
+# geneset = set of genes.
+# e.g. {"A","B"}
+# signature = dictionary of all seen genes ("genome"), with value being true if the gene is in the signature.
+# e.g. {"A":1, "B":1, "C": 0, "D":0}
+
+def load(filenames, filter_size=None):
+    assert(len(filenames) > 0)
+
+    genesets = OrderedSet()
+    # current_id = 0
+    genome = OrderedSet()
+    #genesets_scores = {}
+    breaks = []
+
+    for filename in filenames:
+        with open(filename) as fd:
+            lines = fd.readlines()
+            kept_lines = 0
+            file_genesets = OrderedSet()
+            for line in lines:
+                #print(line)
+                fields = line.split()
+                if len(fields) == 1:
+                    fields = line.split(",")
+                assert(len(fields)>1)
+                i = 0
+                if fields[1].isdigit():
+                    # Johann’ format: <score> <ngenes> <genes>…
+                    score = float(fields[0])
+                    n = int(fields[1])
+                    i = 2
+                if filter_size:
+                    if len(fields[i:]) != filter_size:
+                        continue
+                kept_lines += 1
+                genes = frozenset(fields[i:])
+                #genesets_scores[genes] = score
+                #assert(len(genes) == n)
+                genome |= genes # Merge genes into all genes.
+                # geneset = {"id":current_id, "score":score, "genes": genes}
+                # geneset = {"score":score, "genes": genes}
+                # if geneset not in geneset:
+                    # geneset.append( geneset )
+                # current_id += 1
+                file_genesets |= OrderedSet([genes]) # Add a genes as a geneset if not already here.
+                #print(genes)
+        if kept_lines > 1:
+            if filter_size:
+                print("File `",filename,"` had",len(file_genesets),"unique signatures among",kept_lines,"signatures of size",filter_size, flush=True)
+            else:
+                print("File `",filename,"` had",len(file_genesets),"unique signatures among",kept_lines,"signatures of all size", flush=True)
+        breaks.append(len(file_genesets))
+
+        genesets |= file_genesets
+
+    return genesets,genome,breaks
+
+
+def genesets_to_signatures(genesets,genome):
+    signatures = numpy.zeros( (len(genesets),len(genome)), bool )
+    for s,geneset in enumerate(genesets):
+        for g,gene in enumerate(genome):
+            # if gene in signature["genome"]:
+            if gene in geneset:
+                signatures[s][g] = 1
+            else:
+                signatures[s][g] = 0
+    return signatures
+
+
+def self_similarity_matrix(signatures):
+    dissim = pdist(signatures, 'jaccard') # jaccard = DISsimilarity
+    return 1-squareform(dissim)
+
+
+def seriation(Z,N,cur_index):
+    '''
+        input:
+            - Z is a hierarchical tree (dendrogram)
+            - N is the number of points given to the clustering process
+            - cur_index is the position in the tree for the recursive traversal
+        output:
+            - order implied by the hierarchical tree Z
+
+        seriation computes the order implied by a hierarchical tree (dendrogram)
+    '''
+    if cur_index < N:
+        return [cur_index]
+    else:
+        left = int(Z[cur_index-N,0])
+        right = int(Z[cur_index-N,1])
+        return (seriation(Z,N,left) + seriation(Z,N,right))
+
+
+def compute_serial_matrix(dist_mat,method="ward"):
+    '''
+        input:
+            - dist_mat is a distance matrix
+            - method = ["ward","single","average","complete"]
+        output:
+            - seriated_dist is the input dist_mat,
+              but with re-ordered rows and columns
+              according to the seriation, i.e. the
+              order implied by the hierarchical tree
+            - res_order is the order implied by
+              the hierarhical tree
+            - res_linkage is the hierarhical tree (dendrogram)
+
+        compute_serial_matrix transforms a distance matrix into
+        a sorted distance matrix according to the order implied
+        by the hierarchical tree (dendrogram)
+    '''
+    N = len(dist_mat)
+    flat_dist_mat = squareform(dist_mat)
+    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)
+    res_order = seriation(res_linkage, N, N + N-2)
+    seriated_dist = numpy.zeros((N,N))
+    a,b = numpy.triu_indices(N,k=1)
+    seriated_dist[a,b] = dist_mat[ [res_order[i] for i in a], [res_order[j] for j in b]]
+    seriated_dist[b,a] = seriated_dist[a,b]
+
+    return seriated_dist, res_order, res_linkage
+
+def colormesh(mat, ax):
+    cmap = cm.get_cmap("Blues")
+    cmap.set_bad("white")
+
+    pcm = ax.pcolormesh(mat, cmap=cmap)#, edgecolors="#eeeeee", linewidth=0.01)
+    #plt.imshow(mat, cmap=cmap)
+
+    # ax.set_aspect("equal")
+
+    xticks=numpy.arange(0,mat.shape[1],max(1,int(mat.shape[1]/100)))
+    yticks=numpy.arange(0,mat.shape[0],max(1,int(mat.shape[0]/50)))
+
+    # Major ticks
+    ax.set_xticks(xticks+0.5)
+    ax.set_yticks(yticks+0.5)
+    # Labels
+    ax.set_xticklabels(xticks, rotation=90)
+    ax.set_yticklabels(yticks, rotation=0)
+    # Minor ticks
+    # ax.set_xticks(xticks-0.5, minor=True)
+    # ax.set_yticks(yticks-0.5, minor=True)
+    # ax.tick_params(which="minor", bottom=False, left=False)
+    # ax.grid(True, which="minor", axis="both", color="white", linestyle="-", linewidth=1)
+    return pcm
+
+
+if __name__=="__main__":
+    import sys
+    from matplotlib import gridspec
+    import matplotlib.pyplot as plt
+    from matplotlib import cm
+
+    size = int(sys.argv[1])
+    if size == 0:
+        size = None
+
+    sim_thresh = float(sys.argv[2])
+
+    filenames = sys.argv[3:]
+
+    genesets,genome,breaks = load(filenames, filter_size=size)
+    assert(len(genesets) > 0)
+    assert(len(genome) > 0)
+    #print(genome)
+    #for s in genesets:
+    #    print(s)
+    print(len(filenames), "files,", len(genesets), "unique genesets,", len(genome), "genes", flush=True)
+    sizes = []
+    for geneset in genesets:
+        s = len(geneset)
+        sizes.append(s)
+    if not all(s == len(genesets[0]) for s in sizes):
+        print("Statistics or sizes:",scipy.stats.describe(sizes), flush=True)
+
+    signatures = genesets_to_signatures(genesets,genome)
+    # with numpy.printoptions(threshold=numpy.inf):
+        # print(signatures)
+    raw_distance_matrix = self_similarity_matrix(signatures)
+
+    # full_distance_matrix, res_order, res_linkage = compute_serial_matrix(raw_distance_matrix, "ward")
+    full_distance_matrix = raw_distance_matrix
+
+    # PLOT
+    fig = plt.figure(figsize=(10,10))
+    fig.tight_layout()
+    gs = gridspec.GridSpec(4, 3, width_ratios=[20,1,1], height_ratios=[10,10,1,1])
+    ax1 = fig.add_subplot(gs[0])
+    ax2 = fig.add_subplot(gs[3])
+    ax2r = fig.add_subplot(gs[4],sharey=ax2)
+    ax2rr = fig.add_subplot(gs[5],sharey=ax2)
+    ax2b = fig.add_subplot(gs[6],sharex=ax2)
+    ax2bb = fig.add_subplot(gs[9],sharex=ax2)
+
+    # Full distance matrx
+    mask = numpy.tri(full_distance_matrix.shape[0], k=-1).transpose() # k>0 = above
+    mat = numpy.ma.array(full_distance_matrix, mask=mask)
+    pcm = colormesh(mat, ax1)
+    fig.colorbar(pcm, ax=ax1)
+    ax1.set_title("Jaccard similarities of all {} signatures of size {} on {} genes".format(len(genesets),len(next(iter(genesets))),len(genome)))
+
+    # Dual distance matrix
+    sub_distance_matrix = raw_distance_matrix[breaks[1]:,:breaks[1]]
+    dual_distance_matrix = sub_distance_matrix[:, (sub_distance_matrix.sum(axis=0)*-1).argsort()]
+
+    colormesh(dual_distance_matrix, ax2)
+    ax2.set_ylabel(filenames[0])
+    # ax2.xaxis.tick_top()
+
+    # Sum over rows
+    sum_rows = dual_distance_matrix.sum(1)
+    colormesh(sum_rows.reshape(-1,1), ax2r)
+    ax2r.set_ylabel("Sum of similarities for {} signatures in `{}`".format(len(sum_rows),filenames[0]))
+    ax2r.yaxis.set_label_position("right")
+    ax2r.yaxis.tick_right()
+
+    # Sum over columns
+    sum_cols = dual_distance_matrix.sum(0)
+    colormesh(sum_cols.reshape(1,-1), ax2b)
+    # ax2b.set_xlabel(filenames[1])
+    ax2b.set_xlabel("Sum of similarities for {} signatures in `{}`".format(len(sum_cols),filenames[1]))
+
+    def thresh(slice):
+        return any([s > sim_thresh for s in slice])
+
+    # Threshold count over rows
+    match_rows = numpy.apply_along_axis(thresh, axis=1, arr=dual_distance_matrix)
+    colormesh(match_rows.reshape(-1,1), ax2rr)
+    sm = sum(match_rows)
+    n = len(match_rows)
+    ax2rr.set_ylabel("{:.0%} of signatures in `{}` are similar enough (>{}) to at least another one in `{}`".format(sm/n, filenames[0], sim_thresh, filenames[1]))
+    ax2rr.yaxis.set_label_position("right")
+    ax2rr.yaxis.tick_right()
+
+    # Threshold count over columns
+    match_cols = numpy.apply_along_axis(thresh, axis=0, arr=dual_distance_matrix)
+    colormesh(match_cols.reshape(1,-1), ax2bb)
+    sm = sum(match_cols)
+    n = len(match_cols)
+    ax2bb.set_xlabel("{:.0%} of signatures in `{}` are similar enough (>{}) to at least another one in `{}`".format(sm/n, filenames[1], sim_thresh, filenames[0]))
+
+    plt.show()
-- 
GitLab