diff --git a/scripts/paris/average-correlations__rkcorr-to-npy.py b/scripts/paris/average-correlations__rkcorr-to-npy.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeec6c3f34eab10b47b260252ff24c92fd9272c5
--- /dev/null
+++ b/scripts/paris/average-correlations__rkcorr-to-npy.py
@@ -0,0 +1,116 @@
+import itertools
+import numpy
+import sys
+import signatures
+import csv
+
+if __name__ == "__main__":
+
+    rankfile = sys.argv[1]
+    fgenescorr = sys.argv[2]
+    size = int(sys.argv[3])
+    signatures_files = sys.argv[4:6]
+
+    print("Load genes correlations...", file=sys.stderr, flush=True)
+    genes_correlations = numpy.load(fgenescorr)
+    ngenes = genes_correlations.shape[0]
+    assert(genes_correlations.shape[1] == ngenes)
+
+    print("Load signatures...", file=sys.stderr, flush=True)
+    genesets_1,genome_1,breaks_1 = signatures.load([signatures_files[0]], filter_size=size)
+    assert(len(genesets_1) > 0)
+    assert(len(genome_1) > 0)
+
+    genesets_2,genome_2,breaks_2 = signatures.load([signatures_files[1]], filter_size=size)
+    assert(len(genesets_2) > 0)
+    assert(len(genome_2) > 0)
+
+    print("Load genes indices...", file=sys.stderr, flush=True)
+    genome = genome_1 | genome_2
+    genes_idx = {}
+    with open(rankfile) as fd:
+        csvreader = csv.reader(fd, delimiter=" ")
+        header = next(csvreader)
+        n=0
+        for row in csvreader:
+            gene = row[0]
+            assert(len(gene) > 0)
+            if gene in genome:
+                genes_idx[gene] = n
+                n += 1
+                print("\r",n,end=" ", file=sys.stderr,flush=True)
+    assert(len(genes_idx) > 0)
+    print("genes", file=sys.stderr, flush=True)
+
+    print("Compute paired signatures correlations...", file=sys.stderr, flush=True)
+    nsign_1 = len(genesets_1)
+    nsign_2 = len(genesets_2)
+    signature_correlations = numpy.zeros((nsign_1, nsign_2))
+    for i1,s1 in enumerate(genesets_1):
+        for i2,s2 in enumerate(genesets_2):
+            g1 = [genes_idx[g] for g in s1 if g in genes_idx]
+            assert(len(g1) > 0)
+            g2 = [genes_idx[g] for g in s2 if g in genes_idx]
+            assert(len(g2) > 0)
+            subidx = itertools.product(g1,g2)
+            i,j = zip(*subidx)
+            submat = genes_correlations[i,j]
+            assert(submat.size == len(g1)*len(g2))
+            avcorr = numpy.mean(submat)
+            assert(avcorr > 0)
+            signature_correlations[i1,i2] = avcorr
+
+    fname = "signatures-average-correlations.npy"
+    print("Save signatures correlation matrix of shape", signature_correlations.shape, "to `", fname,"`...", file=sys.stderr, flush=True)
+    numpy.save(fname, signature_correlations)
+
+    print("Compute all signatures correlations...", file=sys.stderr, flush=True)
+    genesets = genesets_1 | genesets_2
+    nsign = len(genesets)
+    all_signature_correlations = numpy.zeros((nsign, nsign))
+    signature_origins = {}
+    for i1,s1 in enumerate(genesets):
+        if s1 in genesets_1 and s1 in genesets_2:
+            signature_origins[i1] = "BOTH"
+        elif s1 in genesets_1:
+            signature_origins[i1] = signatures_files[0]
+        elif s1 in genesets_2:
+            signature_origins[i1] = signatures_files[1]
+        else:
+            assert(s1 in genesets_1 or s1 in genesets_2)
+
+        for i2,s2 in enumerate(genesets):
+            # if i2 < i1:
+            #     continue
+            g1 = [genes_idx[g] for g in s1 if g in genes_idx]
+            assert(len(g1) > 0)
+            g2 = [genes_idx[g] for g in s2 if g in genes_idx]
+            assert(len(g2) > 0)
+            # Cross-product of all genes indices.
+            subidx = itertools.product(g1,g2)
+            # Corresponding sub-matrix indices.
+            i,j = zip(*subidx)
+            submat = genes_correlations[i,j]
+            assert(submat.size == len(g1)*len(g2))
+            # Average correlation over the sub-matrix.
+            # TODO would be faster over the triangular sub-matrix.
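+            # A possible speed-up, sketched here as an untested assumption:
+            # numpy.ix_ builds the same sub-matrix without materialising the
+            # index tuples:
+            #   submat = genes_correlations[numpy.ix_(g1, g2)].ravel()
+            # For the triangular idea above, note that on diagonal blocks
+            # (s1 == s2) the strict upper triangle averages only the
+            # off-diagonal entries, which differs from the full-block mean,
+            # since the diagonal holds self-correlations equal to 1.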
+            avcorr = numpy.mean(submat)
+            assert(avcorr > 0)
+            all_signature_correlations[i1,i2] = avcorr
+
+    assert(len(signature_origins) == nsign)
+    fname = "signatures-origins.csv"
+    print("Save unique signatures origin table to `", fname, "`...", file=sys.stderr, flush=True)
+
+    with open(fname, 'w') as fd:
+        writer = csv.DictWriter(fd, fieldnames=["signature_index","origin"])
+        writer.writeheader()
+        for i in signature_origins:
+            writer.writerow({"signature_index":i, "origin":signature_origins[i]})
+
+    fname = "signatures-average-correlations_full.npy"
+    print("Save full signatures correlation matrix of shape", all_signature_correlations.shape, "to `", fname,"`...", file=sys.stderr, flush=True)
+    numpy.save(fname, all_signature_correlations)
+
+    print("Done", file=sys.stderr, flush=True)
diff --git a/scripts/paris/config.yaml b/scripts/paris/config.yaml
deleted file mode 100644
index 81bde9f8a09369870d715e70a4e0da400ce93eed..0000000000000000000000000000000000000000
--- a/scripts/paris/config.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-snakefile: Snakefile
-
-# executable: "singularity run ../frictionlesser.sif"
-executable: "../../release/app/frictionlesser"
-
-cluster: "sbatch --job-name {resources.job_name} --mem {resources.mem_mb} --cpus-per-task=1 --partition common,dedicated --qos normal --output slurmout/%j.out --error slurmout/%j.err"
-
-sizes:
-  - 10
-  - 20
-  - 30
-
-# Number of runs = number of (possibly duplicated) signatures.
-runs: 5000
-
diff --git a/scripts/paris/config_preproc.yaml b/scripts/paris/config_preproc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..beb2afa5217d90f112a4ac69079b9e3ccd106ca8
--- /dev/null
+++ b/scripts/paris/config_preproc.yaml
@@ -0,0 +1,12 @@
+snakefile: Snakefile
+
+# executable: "singularity run ../../frictionlesser.sif"
+executable: "../../release/app/frictionlesser"
+
+cluster: "sbatch --job-name {resources.job_name} --mem 16G --cpus-per-task=1 --partition common,dedicated --qos fast --output logs/%j.out --error logs/%j.err"
+
+sizes:
+  - 10
+  - 20
+  - 30
+
diff --git a/scripts/paris/counts__hdf5-to-csv.py b/scripts/paris/counts__hdf5-to-csv.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a22e5432e2e8c4761d77113d886f136181d8332
--- /dev/null
+++ b/scripts/paris/counts__hdf5-to-csv.py
@@ -0,0 +1,35 @@
+import sys
+import numpy
+import scanpy
+import pandas as pd
+import anndata as ad
+
+def check(counts, meta, genes):
+    assert(counts.shape[0] > 0)
+    assert(counts.shape[1] > 0)
+    assert(counts.shape[0] == len(meta))
+    assert(counts.shape[1] == len(genes))
+    print("OK", file=sys.stderr, flush=True)
+
+
+if __name__ == "__main__":
+
+    print("Load annotated data...", file=sys.stderr, flush=True)
+    adata = ad.read(sys.argv[1])
+    check(adata.X, adata.obs_names, adata.var_names)
+    print("Loaded",len(adata.var_names),"genes and",len(adata.obs_names),"cells", file=sys.stderr, flush=True)
+
+    print("Output as CSV...", file=sys.stderr, flush=True)
+    counts_full = numpy.asarray(adata.X.todense())
+
+    # Column names are sample IDs, not cell IDs.
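+    # Output layout sketch (gene name and values illustrative):
+    #   GENE,P1,P1,P2,...
+    #   TP53,0.0,1.3,0.0,...
+    # One row per gene, one column per cell, labelled by its sample ID.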
+ print("GENE,", ",".join(adata.obs["sample"]), sep="") + i = 1 + for row in counts_full.T: + print("\r",i, end="", flush=True, file=sys.stderr) + # assert(len(row) == len(meta_cancer["cell_types_v2"])) + print(adata.var_names[i], ",".join(str(x) for x in row)) + i += 1 + assert( i == len(adata.var_names)) + + print("\nDone", file=sys.stderr, flush=True) diff --git a/scripts/paris/genes-correlations__sign-to-npy.py b/scripts/paris/genes-correlations__sign-to-npy.py new file mode 100644 index 0000000000000000000000000000000000000000..98511794accf6f46a06f5ae7543bb66c33faabff --- /dev/null +++ b/scripts/paris/genes-correlations__sign-to-npy.py @@ -0,0 +1,71 @@ +import scipy +import numpy +from sortedcollections import OrderedSet +from scipy.spatial.distance import squareform, pdist, cdist, jaccard +from fastcluster import linkage +from scipy.stats import spearmanr + +from signatures import * + +if __name__=="__main__": + import sys + from matplotlib import gridspec + import matplotlib.pyplot as plt + from matplotlib import cm + from matplotlib.colors import Normalize + # import pandas + import csv + + rankfile = sys.argv[1] + + size = int(sys.argv[2]) + if size == 0: + size = None + + signatures_files = sys.argv[3:5] # Only two files + + + genome = load_genome(signatures_files, filter_size=size) + assert(len(genome) > 0) + print("Genome across",len(signatures_files), "files:", len(genome), "genes", file=sys.stderr, flush=True) + + print("Load ranks...", file=sys.stderr, flush=True) + genes_idx = {} + ranks_l = [] + with open(rankfile) as fd: + csv = csv.reader(fd, delimiter=" ") + header = next(csv) + n=0 + for row in csv: + gene = row[0] + assert(len(gene) > 0) + if gene in genome: + genes_idx[gene] = n + n += 1 + print("\r",n,end=" ", file=sys.stderr,flush=True) + ranks_l.append(numpy.array([float(i) for i in row[1:]])) + assert(len(ranks_l) > 0) + assert(len(ranks_l) == len(genes_idx)) + ngenes = len(genes_idx) + ncells = len(ranks_l[0]) + print("\rLoaded ranks for",ngenes,"genes,",ncells,"cells", file=sys.stderr, flush=True) + # Convert to array. 
+    ranks = numpy.array(ranks_l)
+    assert(ranks.shape == (ngenes,ncells))
+
+    ranked_genes = OrderedSet(genes_idx.keys())
+    unloaded_genes = genome - ranked_genes
+    if len(unloaded_genes) > 0:
+        print("WARNING: Genes of genome not found in ranks file:",unloaded_genes, file=sys.stderr, flush=True)
+        # sys.exit(1)
+    # common_genome = genome & ranked_genes
+
+    print("Compute genes correlation matrix...", file=sys.stderr, flush=True)
+    genes_correlations = numpy.absolute(spearmanr(ranks, axis=1)[0]) # [0] = statistic, [1] = pvalue
+    assert(genes_correlations.shape == (ngenes,ngenes))
+
+    fname = "genes-ranks-correlations.npy"
+    print("Save genes correlation matrix to `", fname,"`...", file=sys.stderr, flush=True)
+    numpy.save(fname, genes_correlations)
+
+    print("Done", file=sys.stderr, flush=True)
diff --git a/scripts/paris/paris_preproc-seurat_hdf5.py b/scripts/paris/paris_preproc-seurat_hdf5.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c543d06541cb0464ced3ff3267b096139b2443d
--- /dev/null
+++ b/scripts/paris/paris_preproc-seurat_hdf5.py
@@ -0,0 +1,65 @@
+import sys
+from scipy.sparse import load_npz
+import scanpy
+import pandas as pd
+import anndata as ad
+# from sklearn.preprocessing import normalize
+# from itertools import zip_longest
+# import numpy
+# import scipy
+
+def check(counts, meta, genes):
+    assert(counts.shape[0] > 0)
+    assert(counts.shape[1] > 0)
+    assert(counts.shape[0] == len(meta))
+    assert(counts.shape[1] == len(genes))
+    print("OK", file=sys.stderr, flush=True)
+
+if __name__ == "__main__":
+
+    if len(sys.argv) == 4:
+
+        fcounts = sys.argv[1]
+        ffeatures = sys.argv[2]
+        fmeta = sys.argv[3]
+
+
+        print("Load data files...", file=sys.stderr, flush=True)
+        # This is a numpy sparse matrix: cells × genes.
+        counts = load_npz(fcounts)
+
+        # Those are pandas DataFrames.
+        meta = pd.read_csv(fmeta)
+        genes = pd.read_csv(ffeatures)
+
+        print("Loaded",len(genes["id"]),"genes and",len(meta["cell_types_v2"]),"cells", file=sys.stderr, flush=True)
+        check(counts, meta["cell_types_v2"], genes["id"])
+
+
+        print("Build annotated data...", file=sys.stderr, flush=True)
+        # adata = ad.AnnData(counts, meta, genes, dtype=counts.dtype) # Seurat does not support sparse matrices.
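+        # NOTE: .todense() below materialises the full cells × genes matrix in
+        # memory; the sibling script preproc-mara__npz-to-hdf5.py keeps the
+        # matrix sparse with counts.tocsr() instead.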
+        adata = ad.AnnData(counts.todense(), meta, genes, dtype=counts.dtype)
+        check(adata.X, adata.obs_names, adata.var_names)
+
+
+        # print("Save raw annotated data...", file=sys.stderr, flush=True)
+        # adata.write(fcounts+".hdf5")
+        # print("OK", file=sys.stderr, flush=True)
+
+    elif len(sys.argv) == 2:
+        print("Load annotated data...", file=sys.stderr, flush=True)
+        fcounts = sys.argv[1] # Needed below for the output file name.
+        adata = ad.read(fcounts)
+        check(adata.X, adata.obs_names, adata.var_names)
+
+    else:
+        assert(len(sys.argv) == 2 or len(sys.argv) == 4)
+
+    print("Seurat preprocessing...", file=sys.stderr, flush=True)
+    scanpy.pp.recipe_seurat(adata)
+    check(adata.X, adata.obs_names, adata.var_names)
+    adata.write(fcounts+".seurat.hdf5")
+    adata.file.close()
+
+    print("Done", file=sys.stderr, flush=True)
diff --git a/scripts/paris/preproc-mara__npz-to-hdf5.py b/scripts/paris/preproc-mara__npz-to-hdf5.py
new file mode 100644
index 0000000000000000000000000000000000000000..a35560dede4cf571bbb1b141170559a1688bb927
--- /dev/null
+++ b/scripts/paris/preproc-mara__npz-to-hdf5.py
@@ -0,0 +1,106 @@
+import sys
+from scipy.sparse import load_npz
+import scanpy as sc
+import pandas as pd
+import anndata as ad
+import pathlib
+# from sklearn.preprocessing import normalize
+# from itertools import zip_longest
+# import numpy
+# import scipy
+
+def check(counts, cells, genes):
+    assert(counts.shape[0] > 0)
+    assert(counts.shape[1] > 0)
+    assert(counts.shape[0] == len(cells))
+    assert(counts.shape[1] == len(genes))
+    print(len(genes),"genes",len(cells),"cells")
+    print("OK", file=sys.stderr, flush=True)
+
+if __name__ == "__main__":
+
+    if len(sys.argv) == 4:
+
+        fcounts = sys.argv[1]
+        ffeatures = sys.argv[2]
+        fmeta = sys.argv[3]
+
+
+        print("Load data files...", file=sys.stderr, flush=True)
+        # This is a numpy sparse matrix: cells × genes.
+        counts = load_npz(fcounts)
+
+        # Those are pandas DataFrames.
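+        # Expected columns, as used below (dataset-specific assumption):
+        # the meta CSV provides `cell_types_v2`, the features CSV provides `id`.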
+        cells = pd.read_csv(fmeta)
+        genes = pd.read_csv(ffeatures)
+
+        print("Loaded",len(genes["id"]),"genes and",len(cells["cell_types_v2"]),"cells", file=sys.stderr, flush=True)
+        check(counts, cells["cell_types_v2"], genes["id"])
+
+        print("Build annotated data...", file=sys.stderr, flush=True)
+        adata = ad.AnnData(counts.tocsr(), cells, genes, dtype=counts.dtype)
+        check(adata.X, adata.obs_names, adata.var_names)
+
+        # print("Save raw annotated data...", file=sys.stderr, flush=True)
+        # adata.write(fcounts+".hdf5")
+        # print("OK", file=sys.stderr, flush=True)
+
+    elif len(sys.argv) == 2:
+        print("Load annotated data...", file=sys.stderr, flush=True)
+        fcounts = sys.argv[1] # Needed below for the output file name.
+        adata = ad.read(fcounts)
+        check(adata.X, adata.obs_names, adata.var_names)
+
+    else:
+        assert(len(sys.argv) == 2 or len(sys.argv) == 4)
+
+    # https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html
+    print("Mara's preprocessing...", file=sys.stderr, flush=True)
+
+    # sc.pl.highest_expr_genes(adata, n_top=20, ) # TODO checkpoint
+
+    # print("Basic filtering...", file=sys.stderr, flush=True)
+    # sc.pp.filter_cells(adata, min_genes=200)
+    # sc.pp.filter_genes(adata, min_cells=3)
+
+    # adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
+    # sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
+
+    # sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True) # TODO checkpoint
+    # sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt') # TODO checkpoint
+    # sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts') # TODO checkpoint
+
+    # print("Actually do the basic filtering...", file=sys.stderr, flush=True)
+    # adata = adata[adata.obs.n_genes_by_counts < 2500, :]
+    # adata = adata[adata.obs.pct_counts_mt < 5, :]
+
+    print("Filter out non-cancer...", file=sys.stderr, flush=True)
+    adata = adata[adata.obs.cell_types_v2 == "cancer", :]
+    check(adata.X, adata.obs_names, adata.var_names)
+
+    print("Total-count normalize...", file=sys.stderr, flush=True)
+    # (library-size correct) the data matrix X to 10,000 reads per cell, so that counts become comparable among cells.
+    sc.pp.normalize_total(adata, target_sum=1e4)
+
+    print("Logarithmize the data...", file=sys.stderr, flush=True)
+    sc.pp.log1p(adata)
+
+    print("Identify highly-variable genes...", file=sys.stderr, flush=True)
+    # sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
+    sc.pp.highly_variable_genes(adata, n_top_genes=10000)
+
+    # sc.pl.highly_variable_genes(adata) # TODO checkpoint
+
+    print("Actually do the highly-variable filtering...", file=sys.stderr, flush=True)
+    adata = adata[:, adata.var.highly_variable]
+
+    check(adata.X, adata.obs_names, adata.var_names)
+
+    fname = str(pathlib.Path(fcounts).with_suffix(".mara.hdf5"))
+    print("Write data to", fname, file=sys.stderr, flush=True)
+    adata.write(fname)
+
+    adata.file.close() # In case it was backed.
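+    # Usage sketch (file names illustrative):
+    #   python3 preproc-mara__npz-to-hdf5.py counts.npz features.csv meta.csv
+    #   python3 preproc-mara__npz-to-hdf5.py already_built.hdf5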
+ print("Done", file=sys.stderr, flush=True) + diff --git a/scripts/paris/Snakefile b/scripts/paris/preproc.Snakefile similarity index 51% rename from scripts/paris/Snakefile rename to scripts/paris/preproc.Snakefile index 3ad1eb2858a397da3915815f8af31cd4a7db44d7..23eeafb857841c4b2196ac8db54d25df8cd683e7 100644 --- a/scripts/paris/Snakefile +++ b/scripts/paris/preproc.Snakefile @@ -11,28 +11,43 @@ FRICTIONLESSER=config["executable"] rule all: input: - expand("signatures_of_{size}-genes/signature_{seed}.txt", size=config["sizes"], seed=SEEDS) + "data/inter/ranks.tsv", + "cache/trans.cache.dat", + expand("cache/size_{size}.cache.dat", size=config["sizes"]) +rule preprocessing: + input: + counts="data/input/2022_02_18_version_2_EOC_counts.npz", + features="data/input/2022_02_18_version_2_EOC_features.csv", + meta="data/input/2022_02_18_version_2_EOC_meta.csv" + output: + "data/inter/2022_02_18_version_2_EOC_counts.mara.hdf5" + shell: + "python3 preproc-mara__npz-to-hdf5.py {input.counts} {input.features} {input.meta}" -# rule rank: -# input: -# "exprs.csv" -# output: -# "ranks.tsv" -# shell: -# "{FRICTIONLESSER}" -# " --exprs={input}" -# " > {output}" +rule counts: + input: + "data/inter/2022_02_18_version_2_EOC_counts.mara.hdf5" + output: + "data/inter/counts.csv" + shell: + "python3 counts__hdf5-to-csv.py {input} > {output}" + +rule ranks: + input: + "data/inter/counts.csv" + output: + "data/inter/ranks.tsv" + shell: + "{FRICTIONLESSER} --exprs={input} > {output}" rule save_cache_transcriptome: input: - "ranks.tsv" + "data/inter/ranks.tsv" output: protected("cache/trans.cache.dat") log: "logs/save_cache_transcriptome.log" benchmark: "logs/save_cache_transcriptome.bench" - resources: - job_name="cache_trans" shell: "{FRICTIONLESSER}" " --ranks={input}" @@ -42,7 +57,7 @@ rule save_cache_transcriptome: rule save_cache_size: input: - ranks="ranks.tsv", + ranks="data/inter/ranks.tsv", transcache="cache/trans.cache.dat" output: protected("cache/size_{size}.cache.dat") @@ -61,29 +76,3 @@ rule save_cache_size: " --cache-only" " 2> {log}" -rule load_and_run: - input: - ranks="ranks.tsv", - transcache="cache/trans.cache.dat", - sizecache="cache/size_{size}.cache.dat" - output: - "signatures_of_{size}-genes/signature_{seed}.txt" - wildcard_constraints: - # Wildcards should be numeric. - seed="\d+", - size="\d+" - log: "logs/load_and_run_size-{size}_seed-{seed}.log" - benchmark: "logs/load_and_run_size-{size}_seed-{seed}.bench" - resources: - mem_mb=16000, - job_name=lambda wildcards: f"z{wildcards.size}_s{wildcards.seed}" - shell: - "{FRICTIONLESSER}" - " --ranks={input.ranks}" - " --cache-transcriptome={input.transcache}" - " --cache-size={input.sizecache}" - " --ngenes={wildcards.size}" - " --seed={wildcards.seed}" - " > {output}" - " 2> {log}" - diff --git a/scripts/paris/signatures.py b/scripts/paris/signatures.py new file mode 100644 index 0000000000000000000000000000000000000000..22f86e142d1e38097886c901fb8071e2cb8e799d --- /dev/null +++ b/scripts/paris/signatures.py @@ -0,0 +1,217 @@ +import sys +import scipy +import numpy +from sortedcollections import OrderedSet +from scipy.spatial.distance import squareform, pdist, cdist, jaccard +from fastcluster import linkage + +# geneset = set of genes. +# e.g. {"A","B"} +# signature = dictionary of all seen genes ("genome"), with value being true if the gene is in the signature. +# e.g. 
{"A":1, "B":1, "C": 0, "D":0} + +def load_genome(filenames, filter_size=None): + assert(len(filenames) > 0) + + genome = OrderedSet() + + for filename in filenames: + with open(filename) as fd: + lines = fd.readlines() + for line in lines: + #print(line) + fields = line.strip().split() + if len(fields) == 1: + fields = line.strip().split(",") + assert(len(fields)>1) + i = 0 + if fields[1].isdigit(): + # Johann’ format: <score> <ngenes> <genes>… + score = float(fields[0]) + n = int(fields[1]) + i = 2 + if filter_size: + if len(fields[i:]) != filter_size: + continue + genes = frozenset(fields[i:]) + genome |= genes # Merge genes into all genes. + + return genome + + +def load(filenames, filter_size=None): + if filter_size == 0: + filter_size = None + assert(len(filenames) > 0) + + genesets = OrderedSet() + # current_id = 0 + genome = OrderedSet() + #genesets_scores = {} + breaks = [] + + for filename in filenames: + kept_lines = 0 + with open(filename) as fd: + lines = fd.readlines() + file_genesets = OrderedSet() + for line in lines: + fields = line.strip().split() + # print(len(fields),fields) + if len(fields) == 1: + fields = line.strip().split(",") + assert(len(fields)>1) + i = 0 + if fields[1].isdigit(): + # Johann’ format: <score> <ngenes> <genes>… + score = float(fields[0]) + n = int(fields[1]) + i = 2 + if filter_size: + if len(fields[i:]) != filter_size: + continue + kept_lines += 1 + genes = frozenset(fields[i:]) + #genesets_scores[genes] = score + #assert(len(genes) == n) + genome |= genes # Merge genes into all genes. + # geneset = {"id":current_id, "score":score, "genes": genes} + # geneset = {"score":score, "genes": genes} + # if geneset not in geneset: + # geneset.append( geneset ) + # current_id += 1 + file_genesets |= OrderedSet([genes]) # Add a genes as a geneset if not already here. 
+            #print(genes)
+        # if kept_lines > 1:
+        if filter_size:
+            print("File `",filename,"` had",len(file_genesets),"unique signatures among",kept_lines,"signatures of size",filter_size, file=sys.stderr, flush=True)
+        else:
+            print("File `",filename,"` had",len(file_genesets),"unique signatures among",kept_lines,"signatures of all sizes", file=sys.stderr, flush=True)
+
+        breaks.append(len(file_genesets))
+
+        genesets |= file_genesets
+
+    return genesets,genome,breaks
+
+
+def genesets_to_signatures(genesets,genome):
+    signatures = numpy.zeros( (len(genesets),len(genome)), bool )
+    for s,geneset in enumerate(genesets):
+        for g,gene in enumerate(genome):
+            # if gene in signature["genome"]:
+            if gene in geneset:
+                signatures[s][g] = 1
+            else:
+                signatures[s][g] = 0
+    return signatures
+
+
+def self_similarity_matrix(signatures):
+    dissim = pdist(signatures, 'jaccard') # jaccard = DISsimilarity
+    return 1-squareform(dissim)
+
+
+def seriation(Z,N,cur_index):
+    '''
+        input:
+            - Z is a hierarchical tree (dendrogram)
+            - N is the number of points given to the clustering process
+            - cur_index is the position in the tree for the recursive traversal
+        output:
+            - order implied by the hierarchical tree Z
+
+        seriation computes the order implied by a hierarchical tree (dendrogram)
+    '''
+    if cur_index < N:
+        return [cur_index]
+    else:
+        left = int(Z[cur_index-N,0])
+        right = int(Z[cur_index-N,1])
+        return (seriation(Z,N,left) + seriation(Z,N,right))
+
+
+def compute_serial_matrix(dist_mat,method="ward"):
+    '''
+        input:
+            - dist_mat is a distance matrix
+            - method = ["ward","single","average","complete"]
+        output:
+            - seriated_dist is the input dist_mat,
+              but with re-ordered rows and columns
+              according to the seriation, i.e. the
+              order implied by the hierarchical tree
+            - res_order is the order implied by
+              the hierarchical tree
+            - res_linkage is the hierarchical tree (dendrogram)
+
+        compute_serial_matrix transforms a distance matrix into
+        a sorted distance matrix according to the order implied
+        by the hierarchical tree (dendrogram)
+    '''
+    N = len(dist_mat)
+    flat_dist_mat = squareform(dist_mat)
+    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)
+    res_order = seriation(res_linkage, N, N + N-2)
+    seriated_dist = numpy.zeros((N,N))
+    a,b = numpy.triu_indices(N,k=1)
+    seriated_dist[a,b] = dist_mat[ [res_order[i] for i in a], [res_order[j] for j in b]]
+    seriated_dist[b,a] = seriated_dist[a,b]
+
+    return seriated_dist, res_order, res_linkage
+
+def colormesh(mat, ax, cmap, norm):
+    # cmap = cm.get_cmap("Blues")
+    cmap.set_bad("white")
+
+    pcm = ax.pcolormesh(mat, cmap=cmap, norm=norm)#, edgecolors="#eeeeee", linewidth=0.01)
+    #plt.imshow(mat, cmap=cmap)
+
+    # ax.set_aspect("equal")
+
+    xticks=numpy.arange(0,mat.shape[1],max(1,int(mat.shape[1]/40)))
+    yticks=numpy.arange(0,mat.shape[0],max(1,int(mat.shape[0]/30)))
+
+    # Major ticks
+    ax.set_xticks(xticks+0.5)
+    ax.set_yticks(yticks+0.5)
+    # Labels
+    ax.set_xticklabels(xticks, rotation=90)
+    ax.set_yticklabels(yticks, rotation=0)
+    # Minor ticks
+    # ax.set_xticks(xticks-0.5, minor=True)
+    # ax.set_yticks(yticks-0.5, minor=True)
+    # ax.tick_params(which="minor", bottom=False, left=False)
+    # ax.grid(True, which="minor", axis="both", color="white", linestyle="-", linewidth=1)
+    return pcm
+
+
+if __name__=="__main__":
+    import sys
+    from matplotlib import gridspec
+    import matplotlib.pyplot as plt
+    from matplotlib import cm
+    from matplotlib.colors import Normalize
+
+    size = int(sys.argv[1])
+    if size == 0:
+        size = None
+
+    sim_thresh = float(sys.argv[2]) # Unused here; kept so the command line matches the plot script.
+
+    filenames = sys.argv[3:]
+
+    genesets,genome,breaks = load(filenames, filter_size=size)
+    assert(len(genesets) > 0)
+    assert(len(genome) > 0)
+    #print(genome)
+    #for s in genesets:
+    #    print(s)
+    print(len(filenames), "files,", len(genesets), "unique genesets,", len(genome), "genes", flush=True)
+    sizes = []
+    for geneset in genesets:
+        s = len(geneset)
+        sizes.append(s)
+    if not all(s == len(genesets[0]) for s in sizes):
+        print("Statistics of sizes:",scipy.stats.describe(sizes), flush=True)
diff --git a/scripts/paris/signatures_jaccard-distances_plot.py b/scripts/paris/signatures_jaccard-distances_plot.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c5e3e6135d3b4ff745586c2dbfc2e1ee5ba7dd5
--- /dev/null
+++ b/scripts/paris/signatures_jaccard-distances_plot.py
@@ -0,0 +1,138 @@
+import scipy
+import numpy
+from sortedcollections import OrderedSet
+from scipy.spatial.distance import squareform, pdist, cdist, jaccard
+from fastcluster import linkage
+
+# geneset = set of genes.
+# e.g. {"A","B"}
+# signature = dictionary of all seen genes ("genome"), with value being true if the gene is in the signature.
+# e.g. {"A":1, "B":1, "C": 0, "D":0}
+
+from signatures import *
+
+if __name__=="__main__":
+    import sys
+    from matplotlib import gridspec
+    import matplotlib.pyplot as plt
+    from matplotlib import cm
+    from matplotlib.colors import Normalize
+
+    size = int(sys.argv[1])
+    if size == 0:
+        size = None
+
+    sim_thresh = float(sys.argv[2])
+
+    filenames = sys.argv[3:5] # Only two files
+
+    genesets,genome,breaks = load(filenames, filter_size=size)
+    assert(len(genesets) > 0)
+    assert(len(genome) > 0)
+    #print(genome)
+    #for s in genesets:
+    #    print(s)
+    print(len(filenames), "files,", len(genesets), "unique genesets,", len(genome), "genes", flush=True)
+    sizes = []
+    for geneset in genesets:
+        s = len(geneset)
+        sizes.append(s)
+    if not all(s == len(genesets[0]) for s in sizes):
+        print("Statistics of sizes:",scipy.stats.describe(sizes), flush=True)
+
+    signatures = genesets_to_signatures(genesets,genome)
+    # with numpy.printoptions(threshold=numpy.inf):
+    #     print(signatures)
+    raw_distance_matrix = self_similarity_matrix(signatures)
+
+    # full_distance_matrix, res_order, res_linkage = compute_serial_matrix(raw_distance_matrix, "ward")
+    full_distance_matrix = raw_distance_matrix
+
+    # PLOT
+    fig = plt.figure(figsize=(10,10))
+    fig.tight_layout()
+    gs = gridspec.GridSpec(4, 3, width_ratios=[40,1,1], height_ratios=[20,20,1,1])
+    ax1 = fig.add_subplot(gs[0])
+    ax2 = fig.add_subplot(gs[3])
+    ax2r = fig.add_subplot(gs[4],sharey=ax2)
+    ax2rr = fig.add_subplot(gs[5],sharey=ax2)
+    ax2b = fig.add_subplot(gs[6],sharex=ax2)
+    ax2bb = fig.add_subplot(gs[9],sharex=ax2)
+
+    cmap = cm.get_cmap("Reds")
+    normalizer = Normalize(0,1)
+    im = cm.ScalarMappable(norm=normalizer, cmap=cmap)
+    fig.colorbar(im, ax=[ax1,ax2,ax2r,ax2rr,ax2b,ax2bb])
+
+    # Full distance matrix
+    mask = numpy.tri(full_distance_matrix.shape[0], k=-1).transpose() # k>0 = above
+    mat = numpy.ma.array(full_distance_matrix, mask=mask)
+    pcm = colormesh(mat, ax1, cmap, normalizer)
+    ax1.set_title("Jaccard similarities of all {} signatures of size {} on {} genes".format(len(genesets),len(next(iter(genesets))),len(genome)))
+
+    # Dual distance matrix
+    genesets_1,genome_1,breaks_1 = load([filenames[0]], filter_size=size)
+    assert(len(genesets_1) > 0)
+    assert(len(genome_1) > 0)
+    signatures_1 = genesets_to_signatures(genesets_1, genome) # against common genome
+
+    genesets_2,genome_2,breaks_2 = load([filenames[1]], filter_size=size)
+    assert(len(genesets_2) > 0)
+    assert(len(genome_2) > 0)
+    signatures_2 = genesets_to_signatures(genesets_2, genome) # against common genome
+
+    # print(signatures_1.shape, signatures_2.shape)
+
+    sub_distance_matrix = 1 - cdist(signatures_1, signatures_2, "jaccard")
+
+    # Sort columns (descending) then rows (ascending) on their summed similarities.
+    # dual_distance_matrix = sub_distance_matrix[:, (sub_distance_matrix.sum(axis=0)*-1).argsort()] # columns
+    # dual_distance_matrix = sub_distance_matrix[(sub_distance_matrix.sum(axis=1)).argsort(), :] # rows
+    sorted_distance_matrix = sub_distance_matrix[:, (sub_distance_matrix.sum(axis=0)*-1).argsort()] # columns
+    dual_distance_matrix = sorted_distance_matrix[(sorted_distance_matrix.sum(axis=1)).argsort(), :] # rows
+
+    # argsort the columns, sorting by the last row first, then the second-to-last row, continuing up to the first row
+    # dual_distance_matrix = sub_distance_matrix[numpy.lexsort(sub_distance_matrix)]
+    # dual_distance_matrix = sub_distance_matrix.T[numpy.lexsort(sub_distance_matrix.T)]
+
+    colormesh(dual_distance_matrix, ax2, cmap, normalizer)
+    ax2.set_ylabel(filenames[0])
+    # ax2.xaxis.tick_top()
+
+    # Sum over rows
+    # sum_rows = dual_distance_matrix.mean(1)
+    sum_rows = dual_distance_matrix.sum(1)
+    colormesh(sum_rows.reshape(-1,1), ax2r, cmap, normalizer)
+    ax2r.set_ylabel("Sum of similarities for {} signatures in `{}`".format(len(sum_rows),filenames[0]))
+    ax2r.yaxis.set_label_position("right")
+    ax2r.yaxis.tick_right()
+
+    # Sum over columns
+    # sum_cols = dual_distance_matrix.mean(0)
+    sum_cols = dual_distance_matrix.sum(0)
+    colormesh(sum_cols.reshape(1,-1), ax2b, cmap, normalizer)
+    # ax2b.set_xlabel(filenames[1])
+    ax2b.set_xlabel("Sum of similarities for {} signatures in `{}`".format(len(sum_cols),filenames[1]))
+
+    def thresh(values):
+        return any([v > sim_thresh for v in values])
+
+    # Threshold count over rows
+    match_rows = numpy.apply_along_axis(thresh, axis=1, arr=dual_distance_matrix)
+    colormesh(match_rows.reshape(-1,1), ax2rr, cmap, normalizer)
+    sm = sum(match_rows)
+    n = len(match_rows)
+    ax2rr.set_ylabel("{:.0%} of signatures in `{}` are similar enough (>{}) to at least one signature in `{}`".format(sm/n, filenames[0], sim_thresh, filenames[1]))
+    ax2rr.yaxis.set_label_position("right")
+    ax2rr.yaxis.tick_right()
+
+    # Threshold count over columns
+    match_cols = numpy.apply_along_axis(thresh, axis=0, arr=dual_distance_matrix)
+    colormesh(match_cols.reshape(1,-1), ax2bb, cmap, normalizer)
+    sm = sum(match_cols)
+    n = len(match_cols)
+    ax2bb.set_xlabel("{:.0%} of signatures in `{}` are similar enough (>{}) to at least one signature in `{}`".format(sm/n, filenames[1], sim_thresh, filenames[0]))
+
+    plt.show()
diff --git a/scripts/paris/validation.Snakefile b/scripts/paris/validation.Snakefile
new file mode 100644
index 0000000000000000000000000000000000000000..fa855ddbca765d6dc5f3478f1d1add9f2d971c7e
--- /dev/null
+++ b/scripts/paris/validation.Snakefile
@@ -0,0 +1,29 @@
+
+SIZE=10
+
+rule all:
+    input:
+        "signatures-average-correlations.npy",
+        "signatures-average-correlations_full.npy"
+
+rule ranksgenes_correlations_npy:
+    input:
+        ranks="ranks.tsv",
+        A="signatures/emil.csv",
+        B="signatures/johann.tsv"
+    output:
+        "genes-ranks-correlations.npy"
+    shell:
+        "python3 genes-correlations__sign-to-npy.py {input.ranks} {SIZE} {input.A} {input.B}"
+
+rule npycorrsign_signavcorr_npy:
+    input:
+        ranks="ranks.tsv",
+        corrs="genes-ranks-correlations.npy",
+        A="signatures/emil.csv",
+        B="signatures/johann.tsv"
+    output:
"signatures-average-correlations.npy", + "signatures-average-correlations_full.npy" + shell: + "python3 average-correlations__rkcorr-to-npy.py {input.ranks} {input.corrs} {SIZE} {input.A} {input.B}" diff --git a/src/parser.cpp b/src/parser.cpp index f533c8c4874f26b4900e358a1047c4eac9be5c26..4dec3058f21b42617fc4901d749d5874f6f50e82 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -125,6 +125,11 @@ size_t TranscriptomeParser::load_header(const std::string& line) } ASSERT(loaded.size() == i); ASSERT(_rt._samples.size() == i); + if( _rt.samples_nb() / _rt.cells_nb() > 0.0005) { + CLUTCHLOG(warning, "Suspiciously many samples (" << _rt.samples_nb() + << ") regarding the number of cells (" << _rt.cells_nb() << ")"); + } + return loaded.size(); }