From 5ff10a8620a697d3c36d17a8ff4970d59b07f6db Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Wed, 9 May 2018 12:43:55 +0200
Subject: [PATCH] More checks before DESeq2, mean TPM.

---
 PRO-seq/PRO-seq.snakefile             | 119 ++++++++++----------
 RNA_Seq_Cecere/RNA-seq.snakefile      | 150 +++++++++++++++-----------
 small_RNA-seq/small_RNA-seq.snakefile | 125 +++++++++++----------
 3 files changed, 212 insertions(+), 182 deletions(-)

diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile
index 580b396..ff7321b 100644
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -1067,66 +1067,73 @@ rule differential_expression:
         #################
         formula = Formula("~ lib")
         (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
-        contrast = StrVector(["lib", cond, ref])
-        try:
-            res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
-        except RRuntimeError as e:
+        if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS):
             warnings.warn(
-                "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
-                    str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
+                "Reference data is all zero.\nSkipping %s_%s_%s" % (
+                    wildcards.contrast, wildcards.orientation, wildcards.biotype))
             for outfile in output:
                 shell(f"echo 'NA' > {outfile}")
         else:
-            # Determining fold-change category
-            ###################################
-            set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
-            #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
-            res = res.assign(status=res.apply(set_de_status, axis=1))
-            # Converting gene IDs
-            ######################
-            with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
-                res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
-            with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
-            # Just to see if column_converter works also with named column, and not just index:
-            # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
-            #     res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
-            ##########################################
-            # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
-            res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
-            # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
-            counts_and_res = counts_data
-            for normalizer in SIZE_FACTORS:
-                if normalizer == "median_ratio_to_pseudo_ref":
-                    ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
-                    ## add pseudo-count to compute the geometric mean, then remove it
-                    #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
-                    #def median_ratio_to_pseudo_ref(col):
-                    #    return (col / pseudo_ref).median()
-                    #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
-                    size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
-                else:
-                    #raise NotImplementedError(f"{normalizer} normalization not implemented")
-                    size_factors = summaries.loc[normalizer]
-                by_norm = counts_data / size_factors
-                by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
-                counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
-            #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
-            counts_and_res = pd.concat((counts_and_res, res), axis=1)
-            counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
-            # Saving lists of genes gaining or loosing siRNAs
-            up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
-            down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
-            with open(output.up_genes, "w") as up_file:
-                if up_genes:
-                    up_file.write("%s\n" % "\n".join(up_genes))
-                else:
-                    up_file.truncate(0)
-            with open(output.down_genes, "w") as down_file:
-                if down_genes:
-                    down_file.write("%s\n" % "\n".join(down_genes))
-                else:
-                    down_file.truncate(0)
+            contrast = StrVector(["lib", cond, ref])
+            try:
+                res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
+            except RRuntimeError as e:
+                warnings.warn(
+                    "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
+                        str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
+                for outfile in output:
+                    shell(f"echo 'NA' > {outfile}")
+            else:
+                # Determining fold-change category
+                ###################################
+                set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
+                #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
+                res = res.assign(status=res.apply(set_de_status, axis=1))
+                # Converting gene IDs
+                ######################
+                with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
+                    res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
+                with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+                    res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                # Just to see if column_converter works also with named column, and not just index:
+                # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
+                #     res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
+                ##########################################
+                # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
+                res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
+                # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
+                counts_and_res = counts_data
+                for normalizer in SIZE_FACTORS:
+                    if normalizer == "median_ratio_to_pseudo_ref":
+                        ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
+                        ## add pseudo-count to compute the geometric mean, then remove it
+                        #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
+                        #def median_ratio_to_pseudo_ref(col):
+                        #    return (col / pseudo_ref).median()
+                        #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
+                        size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
+                    else:
+                        #raise NotImplementedError(f"{normalizer} normalization not implemented")
+                        size_factors = summaries.loc[normalizer]
+                    by_norm = counts_data / size_factors
+                    by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
+                    counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
+                #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
+                counts_and_res = pd.concat((counts_and_res, res), axis=1)
+                counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
+                # Saving lists of genes gaining or loosing siRNAs
+                up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
+                down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
+                with open(output.up_genes, "w") as up_file:
+                    if up_genes:
+                        up_file.write("%s\n" % "\n".join(up_genes))
+                    else:
+                        up_file.truncate(0)
+                with open(output.down_genes, "w") as down_file:
+                    if down_genes:
+                        down_file.write("%s\n" % "\n".join(down_genes))
+                    else:
+                        down_file.truncate(0)
 
 
 rule make_lfc_distrib_plot:
diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile
index 638f770..76760fc 100644
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -270,6 +270,10 @@ def specific_input(aligner):
     return files
 
 counts_files = [
+    expand(
+        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
+            "{lib}_mean_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
+        lib=LIBS, counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS),
     expand(
         OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
             f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"),
@@ -964,6 +968,23 @@ rule compute_TPM:
         tpm.to_csv(output.tpm_file, sep="\t")
 
 
+# Useful to compute translation efficiency in the Ribo-seq pipeline
+rule compute_mean_TPM:
+    input:
+        all_tmp_file = rules.compute_TPM.output.tpm_file
+    output:
+        tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
+            "{lib}_mean_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
+    run:
+        # TODO
+        tpm = pd.read_table(
+            input.all_tmp_file, index_col="gene",
+            usecols=["gene", *[f"{wildcards.lib}_{rep}" for rep in REPS]])
+        tpm_mean = tpm.mean(axis=1)
+        tpm_mean.columns = [wildcards.lib]
+        tpm_mean.to_csv(output.tpm_file, sep="\t")
+
+
 rule compute_RPM:
     input:
         counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
@@ -1525,71 +1546,78 @@ rule differential_expression:
         #################
         formula = Formula("~ lib")
         (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
-        contrast = StrVector(["lib", cond, ref])
-        try:
-            res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
-        except RRuntimeError as e:
+        if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS):
             warnings.warn(
-                "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
-                    str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
+                "Reference data is all zero.\nSkipping %s_%s_%s" % (
+                    wildcards.contrast, wildcards.orientation, wildcards.biotype))
             for outfile in output:
                 shell(f"echo 'NA' > {outfile}")
         else:
-            # Determining fold-change category
-            ###################################
-            set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
-            #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
-            res = res.assign(status=res.apply(set_de_status, axis=1))
-            # Converting gene IDs
-            ######################
-            with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
-                res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
-            with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
-            # Just to see if column_converter works also with named column, and not just index:
-            # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
-            #     res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
-            ##########################################
-            # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
-            res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
-            # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
-            counts_and_res = counts_data
-            for normalizer in SIZE_FACTORS:
-                if normalizer == "median_ratio_to_pseudo_ref":
-                    ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
-                    ## add pseudo-count to compute the geometric mean, then remove it
-                    #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
-                    #def median_ratio_to_pseudo_ref(col):
-                    #    return (col / pseudo_ref).median()
-                    #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
-                    size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
-                else:
-                    #raise NotImplementedError(f"{normalizer} normalization not implemented")
-                    size_factors = summaries.loc[normalizer]
-                by_norm = counts_data / size_factors
-                by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
-                counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
-            #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
-            counts_and_res = pd.concat((counts_and_res, res), axis=1)
-            counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
-            # Saving lists of genes gaining or loosing siRNAs
-            # diff_expressed = res.query("padj < 0.05")
-            # up_genes = list(diff_expressed.query("log2FoldChange >= 1").index)
-            # down_genes = list(diff_expressed.query("log2FoldChange <= -1").index)
-            up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
-            down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
-            #up_genes = list(counts_and_res.query("status == 'up'").index)
-            #down_genes = list(counts_and_res.query("status == 'down'").index)
-            with open(output.up_genes, "w") as up_file:
-                if up_genes:
-                    up_file.write("%s\n" % "\n".join(up_genes))
-                else:
-                    up_file.truncate(0)
-            with open(output.down_genes, "w") as down_file:
-                if down_genes:
-                    down_file.write("%s\n" % "\n".join(down_genes))
-                else:
-                    down_file.truncate(0)
+            contrast = StrVector(["lib", cond, ref])
+            try:
+                res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
+            except RRuntimeError as e:
+                warnings.warn(
+                    "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
+                        str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
+                for outfile in output:
+                    shell(f"echo 'NA' > {outfile}")
+            else:
+                # Determining fold-change category
+                ###################################
+                set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
+                #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
+                res = res.assign(status=res.apply(set_de_status, axis=1))
+                # Converting gene IDs
+                ######################
+                with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
+                    res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
+                with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+                    res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                # Just to see if column_converter works also with named column, and not just index:
+                # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
+                #     res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
+                ##########################################
+                # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
+                res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
+                # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
+                counts_and_res = counts_data
+                for normalizer in SIZE_FACTORS:
+                    if normalizer == "median_ratio_to_pseudo_ref":
+                        ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
+                        ## add pseudo-count to compute the geometric mean, then remove it
+                        #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
+                        #def median_ratio_to_pseudo_ref(col):
+                        #    return (col / pseudo_ref).median()
+                        #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
+                        size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
+                    else:
+                        #raise NotImplementedError(f"{normalizer} normalization not implemented")
+                        size_factors = summaries.loc[normalizer]
+                    by_norm = counts_data / size_factors
+                    by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
+                    counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
+                #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
+                counts_and_res = pd.concat((counts_and_res, res), axis=1)
+                counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
+                # Saving lists of genes gaining or loosing siRNAs
+                # diff_expressed = res.query("padj < 0.05")
+                # up_genes = list(diff_expressed.query("log2FoldChange >= 1").index)
+                # down_genes = list(diff_expressed.query("log2FoldChange <= -1").index)
+                up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
+                down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
+                #up_genes = list(counts_and_res.query("status == 'up'").index)
+                #down_genes = list(counts_and_res.query("status == 'down'").index)
+                with open(output.up_genes, "w") as up_file:
+                    if up_genes:
+                        up_file.write("%s\n" % "\n".join(up_genes))
+                    else:
+                        up_file.truncate(0)
+                with open(output.down_genes, "w") as down_file:
+                    if down_genes:
+                        down_file.write("%s\n" % "\n".join(down_genes))
+                    else:
+                        down_file.truncate(0)
 
 
 rule make_lfc_distrib_plot:
diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile
index 6bf16cf..768c487 100644
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -3078,74 +3078,69 @@ rule small_RNA_differential_expression:
             #################
             formula = Formula("~ lib")
             (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
-            contrast = StrVector(["lib", cond, ref])
-            #if wildcards.contrast == "%s_vs_%s" % (MUT, REF):
-            #    contrast = StrVector(["lib", MUT, REF])
-            #elif wildcards.contrast in ["%s_vs_%s" % cond_pair for cond_pair in combinations(reversed(TREATS), 2)]:
-            #    (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
-            #    contrast = StrVector(["treat", cond, ref])
-            #elif wildcards.contrast in ["%s_%s_vs_%s_%s" % (lib2, treat2, lib1, treat1) for (
-            #        (lib1, treat1), (lib2, treat2)) in combinations(product(LIBS, TREATS), 2)]:
-            #    formula = Formula("~ lib_treat")
-            #    (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
-            #    contrast = StrVector(["lib_treat", cond, ref])
-            #else:
-            #    raise NotImplementedError("Unknown contrast: %s" % wildcards.contrast)
-            try:
-                res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
-            except RRuntimeError as e:
-                warn(
-                    "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
-                        str(e), wildcards.contrast, wildcards.orientation, wildcards.small_type))
+            if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS):
+                warnings.warn(
+                    "Reference data is all zero.\nSkipping %s_%s_%s" % (
+                        wildcards.contrast, wildcards.orientation, wildcards.small_type))
                 for outfile in output:
                     shell(f"echo 'NA' > {outfile}")
             else:
-                # Determining fold-change category
-                ###################################
-                set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
-                res = res.assign(status=res.apply(set_de_status, axis=1))
-                # Converting gene IDs
-                ######################
-                with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
-                    res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
-                with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                    res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
-                ##########################################
-                # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
-                res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
-                # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
-                counts_and_res = counts_data
-                for normalizer in DE_SIZE_FACTORS:
-                    if normalizer == "median_ratio_to_pseudo_ref":
-                        ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
-                        ## add pseudo-count to compute the geometric mean, then remove it
-                        #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
-                        #def median_ratio_to_pseudo_ref(col):
-                        #    return (col / pseudo_ref).median()
-                        #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
-                        size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
-                    else:
-                        size_factors = summaries.loc[normalizer]
-                    by_norm = counts_data / size_factors
-                    by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
-                    counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
-                counts_and_res = add_tags_column(
-                    pd.concat((counts_and_res, res), axis=1),
-                    input.tags_table, "small_type")
-                counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
-                # Saving lists of genes gaining or loosing siRNAs
-                up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
-                down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
-                with open(output.up_genes, "w") as up_file:
-                    if up_genes:
-                        up_file.write("%s\n" % "\n".join(up_genes))
-                    else:
-                        up_file.truncate(0)
-                with open(output.down_genes, "w") as down_file:
-                    if down_genes:
-                        down_file.write("%s\n" % "\n".join(down_genes))
-                    else:
-                        down_file.truncate(0)
+                contrast = StrVector(["lib", cond, ref])
+                try:
+                    res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
+                except RRuntimeError as e:
+                    warn(
+                        "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
+                            str(e), wildcards.contrast, wildcards.orientation, wildcards.small_type))
+                    for outfile in output:
+                        shell(f"echo 'NA' > {outfile}")
+                else:
+                    # Determining fold-change category
+                    ###################################
+                    set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
+                    res = res.assign(status=res.apply(set_de_status, axis=1))
+                    # Converting gene IDs
+                    ######################
+                    with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
+                        res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
+                    with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+                        res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                    ##########################################
+                    # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
+                    res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
+                    # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
+                    counts_and_res = counts_data
+                    for normalizer in DE_SIZE_FACTORS:
+                        if normalizer == "median_ratio_to_pseudo_ref":
+                            ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
+                            ## add pseudo-count to compute the geometric mean, then remove it
+                            #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
+                            #def median_ratio_to_pseudo_ref(col):
+                            #    return (col / pseudo_ref).median()
+                            #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
+                            size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
+                        else:
+                            size_factors = summaries.loc[normalizer]
+                        by_norm = counts_data / size_factors
+                        by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
+                        counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
+                    counts_and_res = add_tags_column(
+                        pd.concat((counts_and_res, res), axis=1),
+                        input.tags_table, "small_type")
+                    counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
+                    # Saving lists of genes gaining or loosing siRNAs
+                    up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
+                    down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
+                    with open(output.up_genes, "w") as up_file:
+                        if up_genes:
+                            up_file.write("%s\n" % "\n".join(up_genes))
+                        else:
+                            up_file.truncate(0)
+                    with open(output.down_genes, "w") as down_file:
+                        if down_genes:
+                            down_file.write("%s\n" % "\n".join(down_genes))
+                        else:
+                            down_file.truncate(0)
 
 
 rule gather_DE_folds:
-- 
GitLab