From 5ff10a8620a697d3c36d17a8ff4970d59b07f6db Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Wed, 9 May 2018 12:43:55 +0200 Subject: [PATCH] More checks before DESeq2, mean TPM. --- PRO-seq/PRO-seq.snakefile | 119 ++++++++++---------- RNA_Seq_Cecere/RNA-seq.snakefile | 150 +++++++++++++++----------- small_RNA-seq/small_RNA-seq.snakefile | 125 +++++++++++---------- 3 files changed, 212 insertions(+), 182 deletions(-) diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index 580b396..ff7321b 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -1067,66 +1067,73 @@ rule differential_expression: ################# formula = Formula("~ lib") (cond, ref) = CONTRAST2PAIR[wildcards.contrast] - contrast = StrVector(["lib", cond, ref]) - try: - res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) - except RRuntimeError as e: + if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS): warnings.warn( - "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( - str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype)) + "Reference data is all zero.\nSkipping %s_%s_%s" % ( + wildcards.contrast, wildcards.orientation, wildcards.biotype)) for outfile in output: shell(f"echo 'NA' > {outfile}") else: - # Determining fold-change category - ################################### - set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") - #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") - res = res.assign(status=res.apply(set_de_status, axis=1)) - # Converting gene IDs - ###################### - with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: - res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) - # Just to see if column_converter works also with named column, and not just index: - # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: - # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) - ########################################## - # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") - res.to_csv(output.deseq_results, sep="\t", na_rep="NA") - # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status - counts_and_res = counts_data - for normalizer in SIZE_FACTORS: - if normalizer == "median_ratio_to_pseudo_ref": - ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but - ## add pseudo-count to compute the geometric mean, then remove it - #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 - #def median_ratio_to_pseudo_ref(col): - # return (col / pseudo_ref).median() - #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) - size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) - else: - #raise NotImplementedError(f"{normalizer} normalization not implemented") - size_factors = summaries.loc[normalizer] - by_norm = counts_data / size_factors - by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) - counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) - #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") - counts_and_res = pd.concat((counts_and_res, res), axis=1) - counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") - # Saving lists of genes gaining or loosing siRNAs - up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) - down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) - with open(output.up_genes, "w") as up_file: - if up_genes: - up_file.write("%s\n" % "\n".join(up_genes)) - else: - up_file.truncate(0) - with open(output.down_genes, "w") as down_file: - if down_genes: - down_file.write("%s\n" % "\n".join(down_genes)) - else: - down_file.truncate(0) + contrast = StrVector(["lib", cond, ref]) + try: + res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) + except RRuntimeError as e: + warnings.warn( + "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( + str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype)) + for outfile in output: + shell(f"echo 'NA' > {outfile}") + else: + # Determining fold-change category + ################################### + set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") + #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") + res = res.assign(status=res.apply(set_de_status, axis=1)) + # Converting gene IDs + ###################### + with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: + res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) + with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + # Just to see if column_converter works also with named column, and not just index: + # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: + # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) + ########################################## + # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") + res.to_csv(output.deseq_results, sep="\t", na_rep="NA") + # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status + counts_and_res = counts_data + for normalizer in SIZE_FACTORS: + if normalizer == "median_ratio_to_pseudo_ref": + ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but + ## add pseudo-count to compute the geometric mean, then remove it + #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 + #def median_ratio_to_pseudo_ref(col): + # return (col / pseudo_ref).median() + #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) + size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) + else: + #raise NotImplementedError(f"{normalizer} normalization not implemented") + size_factors = summaries.loc[normalizer] + by_norm = counts_data / size_factors + by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) + counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) + #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") + counts_and_res = pd.concat((counts_and_res, res), axis=1) + counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") + # Saving lists of genes gaining or loosing siRNAs + up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) + down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) + with open(output.up_genes, "w") as up_file: + if up_genes: + up_file.write("%s\n" % "\n".join(up_genes)) + else: + up_file.truncate(0) + with open(output.down_genes, "w") as down_file: + if down_genes: + down_file.write("%s\n" % "\n".join(down_genes)) + else: + down_file.truncate(0) rule make_lfc_distrib_plot: diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 638f770..76760fc 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -270,6 +270,10 @@ def specific_input(aligner): return files counts_files = [ + expand( + OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + "{lib}_mean_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"), + lib=LIBS, counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS), expand( OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"), @@ -964,6 +968,23 @@ rule compute_TPM: tpm.to_csv(output.tpm_file, sep="\t") +# Useful to compute translation efficiency in the Ribo-seq pipeline +rule compute_mean_TPM: + input: + all_tmp_file = rules.compute_TPM.output.tpm_file + output: + tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + "{lib}_mean_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"), + run: + # TODO + tpm = pd.read_table( + input.all_tmp_file, index_col="gene", + usecols=["gene", *[f"{wildcards.lib}_{rep}" for rep in REPS]]) + tpm_mean = tpm.mean(axis=1) + tpm_mean.columns = [wildcards.lib] + tpm_mean.to_csv(output.tpm_file, sep="\t") + + rule compute_RPM: input: counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", @@ -1525,71 +1546,78 @@ rule differential_expression: ################# formula = Formula("~ lib") (cond, ref) = CONTRAST2PAIR[wildcards.contrast] - contrast = StrVector(["lib", cond, ref]) - try: - res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) - except RRuntimeError as e: + if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS): warnings.warn( - "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( - str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype)) + "Reference data is all zero.\nSkipping %s_%s_%s" % ( + wildcards.contrast, wildcards.orientation, wildcards.biotype)) for outfile in output: shell(f"echo 'NA' > {outfile}") else: - # Determining fold-change category - ################################### - set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") - #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") - res = res.assign(status=res.apply(set_de_status, axis=1)) - # Converting gene IDs - ###################### - with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: - res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) - # Just to see if column_converter works also with named column, and not just index: - # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: - # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) - ########################################## - # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") - res.to_csv(output.deseq_results, sep="\t", na_rep="NA") - # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status - counts_and_res = counts_data - for normalizer in SIZE_FACTORS: - if normalizer == "median_ratio_to_pseudo_ref": - ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but - ## add pseudo-count to compute the geometric mean, then remove it - #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 - #def median_ratio_to_pseudo_ref(col): - # return (col / pseudo_ref).median() - #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) - size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) - else: - #raise NotImplementedError(f"{normalizer} normalization not implemented") - size_factors = summaries.loc[normalizer] - by_norm = counts_data / size_factors - by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) - counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) - #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") - counts_and_res = pd.concat((counts_and_res, res), axis=1) - counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") - # Saving lists of genes gaining or loosing siRNAs - # diff_expressed = res.query("padj < 0.05") - # up_genes = list(diff_expressed.query("log2FoldChange >= 1").index) - # down_genes = list(diff_expressed.query("log2FoldChange <= -1").index) - up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) - down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) - #up_genes = list(counts_and_res.query("status == 'up'").index) - #down_genes = list(counts_and_res.query("status == 'down'").index) - with open(output.up_genes, "w") as up_file: - if up_genes: - up_file.write("%s\n" % "\n".join(up_genes)) - else: - up_file.truncate(0) - with open(output.down_genes, "w") as down_file: - if down_genes: - down_file.write("%s\n" % "\n".join(down_genes)) - else: - down_file.truncate(0) + contrast = StrVector(["lib", cond, ref]) + try: + res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) + except RRuntimeError as e: + warnings.warn( + "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( + str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype)) + for outfile in output: + shell(f"echo 'NA' > {outfile}") + else: + # Determining fold-change category + ################################### + set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") + #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") + res = res.assign(status=res.apply(set_de_status, axis=1)) + # Converting gene IDs + ###################### + with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: + res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) + with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + # Just to see if column_converter works also with named column, and not just index: + # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: + # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) + ########################################## + # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") + res.to_csv(output.deseq_results, sep="\t", na_rep="NA") + # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status + counts_and_res = counts_data + for normalizer in SIZE_FACTORS: + if normalizer == "median_ratio_to_pseudo_ref": + ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but + ## add pseudo-count to compute the geometric mean, then remove it + #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 + #def median_ratio_to_pseudo_ref(col): + # return (col / pseudo_ref).median() + #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) + size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) + else: + #raise NotImplementedError(f"{normalizer} normalization not implemented") + size_factors = summaries.loc[normalizer] + by_norm = counts_data / size_factors + by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) + counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) + #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") + counts_and_res = pd.concat((counts_and_res, res), axis=1) + counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") + # Saving lists of genes gaining or loosing siRNAs + # diff_expressed = res.query("padj < 0.05") + # up_genes = list(diff_expressed.query("log2FoldChange >= 1").index) + # down_genes = list(diff_expressed.query("log2FoldChange <= -1").index) + up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) + down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) + #up_genes = list(counts_and_res.query("status == 'up'").index) + #down_genes = list(counts_and_res.query("status == 'down'").index) + with open(output.up_genes, "w") as up_file: + if up_genes: + up_file.write("%s\n" % "\n".join(up_genes)) + else: + up_file.truncate(0) + with open(output.down_genes, "w") as down_file: + if down_genes: + down_file.write("%s\n" % "\n".join(down_genes)) + else: + down_file.truncate(0) rule make_lfc_distrib_plot: diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index 6bf16cf..768c487 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -3078,74 +3078,69 @@ rule small_RNA_differential_expression: ################# formula = Formula("~ lib") (cond, ref) = CONTRAST2PAIR[wildcards.contrast] - contrast = StrVector(["lib", cond, ref]) - #if wildcards.contrast == "%s_vs_%s" % (MUT, REF): - # contrast = StrVector(["lib", MUT, REF]) - #elif wildcards.contrast in ["%s_vs_%s" % cond_pair for cond_pair in combinations(reversed(TREATS), 2)]: - # (cond, ref) = CONTRAST2PAIR[wildcards.contrast] - # contrast = StrVector(["treat", cond, ref]) - #elif wildcards.contrast in ["%s_%s_vs_%s_%s" % (lib2, treat2, lib1, treat1) for ( - # (lib1, treat1), (lib2, treat2)) in combinations(product(LIBS, TREATS), 2)]: - # formula = Formula("~ lib_treat") - # (cond, ref) = CONTRAST2PAIR[wildcards.contrast] - # contrast = StrVector(["lib_treat", cond, ref]) - #else: - # raise NotImplementedError("Unknown contrast: %s" % wildcards.contrast) - try: - res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) - except RRuntimeError as e: - warn( - "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( - str(e), wildcards.contrast, wildcards.orientation, wildcards.small_type)) + if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS): + warnings.warn( + "Reference data is all zero.\nSkipping %s_%s_%s" % ( + wildcards.contrast, wildcards.orientation, wildcards.small_type)) for outfile in output: shell(f"echo 'NA' > {outfile}") else: - # Determining fold-change category - ################################### - set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") - res = res.assign(status=res.apply(set_de_status, axis=1)) - # Converting gene IDs - ###################### - with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: - res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) - ########################################## - # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") - res.to_csv(output.deseq_results, sep="\t", na_rep="NA") - # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status - counts_and_res = counts_data - for normalizer in DE_SIZE_FACTORS: - if normalizer == "median_ratio_to_pseudo_ref": - ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but - ## add pseudo-count to compute the geometric mean, then remove it - #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 - #def median_ratio_to_pseudo_ref(col): - # return (col / pseudo_ref).median() - #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) - size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) - else: - size_factors = summaries.loc[normalizer] - by_norm = counts_data / size_factors - by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) - counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) - counts_and_res = add_tags_column( - pd.concat((counts_and_res, res), axis=1), - input.tags_table, "small_type") - counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") - # Saving lists of genes gaining or loosing siRNAs - up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) - down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) - with open(output.up_genes, "w") as up_file: - if up_genes: - up_file.write("%s\n" % "\n".join(up_genes)) - else: - up_file.truncate(0) - with open(output.down_genes, "w") as down_file: - if down_genes: - down_file.write("%s\n" % "\n".join(down_genes)) - else: - down_file.truncate(0) + contrast = StrVector(["lib", cond, ref]) + try: + res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) + except RRuntimeError as e: + warn( + "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( + str(e), wildcards.contrast, wildcards.orientation, wildcards.small_type)) + for outfile in output: + shell(f"echo 'NA' > {outfile}") + else: + # Determining fold-change category + ################################### + set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") + res = res.assign(status=res.apply(set_de_status, axis=1)) + # Converting gene IDs + ###################### + with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: + res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) + with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + ########################################## + # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") + res.to_csv(output.deseq_results, sep="\t", na_rep="NA") + # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status + counts_and_res = counts_data + for normalizer in DE_SIZE_FACTORS: + if normalizer == "median_ratio_to_pseudo_ref": + ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but + ## add pseudo-count to compute the geometric mean, then remove it + #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 + #def median_ratio_to_pseudo_ref(col): + # return (col / pseudo_ref).median() + #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) + size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) + else: + size_factors = summaries.loc[normalizer] + by_norm = counts_data / size_factors + by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) + counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) + counts_and_res = add_tags_column( + pd.concat((counts_and_res, res), axis=1), + input.tags_table, "small_type") + counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") + # Saving lists of genes gaining or loosing siRNAs + up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) + down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) + with open(output.up_genes, "w") as up_file: + if up_genes: + up_file.write("%s\n" % "\n".join(up_genes)) + else: + up_file.truncate(0) + with open(output.down_genes, "w") as down_file: + if down_genes: + down_file.write("%s\n" % "\n".join(down_genes)) + else: + down_file.truncate(0) rule gather_DE_folds: -- GitLab