Add new sRNA category "pimi22GtRF".

40d0e5e1 · Blaise Li · d817a7ef · 40d0e5e1 · 40d0e5e1
Commit 40d0e5e1 authored Feb 14, 2022 by Blaise Li
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ libdeseq @ git+https://gitlab.pasteur.fr/bli/libdeseq.git@196ee3d15a125fad0d8212
 libhts @ git+https://gitlab.pasteur.fr/bli/libhts.git@5ac2b62e23a1b28f54d1ce2a483c8df082dfbebc
 libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git@91db379cd379f8f12fccdd3840d4369b7f09d444
 libriboseq @ git+https://gitlab.pasteur.fr/bli/libriboseq.git@95a0837ea703054a98f2f3b098818000828c6f8f
-libsmallrna @ git+https://gitlab.pasteur.fr/bli/libsmallrna.git@6438ba5a2489db60401160099c2ea358718c9b80
+libsmallrna @ git+https://gitlab.pasteur.fr/bli/libsmallrna.git@f09cb493959dd8c944b5bea3ec14ccb9baf7bd4c
 libworkflows @ git+https://gitlab.pasteur.fr/bli/libworkflows.git@b29b854ff1db6c87386007808286207b8af11b9d
 mappy==2.17
 matplotlib==3.3.2


--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -251,7 +251,7 @@ assert set(DE_TYPES) <= set(SMALL_TYPES + JOINED_SMALL_TYPES), "%s\n%s" % (", ".
 # TODO: update cross_HTS with pimi22G
 # TODO: what kind of pimi22G ? -> pi, mi and si_22G, but not siu_22G
 # TODO: add tRFs either in pimi22G or alone? If alone, check that label_order in plot_fold_heatmap is correct
-IP_TYPES = [f"pimi{SI_MIN}G", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
+IP_TYPES = [f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
            f"siu_{SI_MAX}G", f"prot_si_{SI_MAX}G", f"ri_si_{SI_MAX}G"]
 assert set(IP_TYPES) <= set(
    SMALL_TYPES + [f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G"] + JOINED_SMALL_TYPES), ", ".join(IP_TYPES)
@@ -842,6 +842,7 @@ else:
 if IP_CONTRASTS:
    all_contrasts_folds = [
        OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}G_mean_log2_RPM_fold.txt"),
+        OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}GtRF_mean_log2_RPM_fold.txt"),
        # To have RPM (folds) for transgenes (which are not in prot_si category)
        OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"all_si_{SI_MIN}G_mean_log2_RPM_fold.txt")]
    ip_fold_boxplots_by_contrast = expand(
@@ -923,17 +924,18 @@ rule all:
        #     feature_counts_dir,
        #     "all_{small_type}_{mapping_type}_{biotype}_{orientation}_transcript_counts.txt"),
        #     small_type=READ_TYPES_FOR_MAPPING, mapping_type=[f"on_{genome}"], biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
+        # Note (14/02/2022): "pi" was missing from this list, for no known reason; add it here along with the missing "tRF".
        expand(
            OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
-            small_type=["mi", *SI_TYPES, *SIU_TYPES,  f"pimi{SI_MIN}G"]),
+            small_type=["pi", "mi", "tRF", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
        # piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting
        # simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot
        expand(
            OPJ("figures", "{small_type}_norm_correlations.pdf"),
-            small_type=["mi", *SI_TYPES, *SIU_TYPES,  f"pimi{SI_MIN}G"]),
+            small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
        expand(
            OPJ("figures", "{small_type}_norm_counts_distrib.pdf"),
-            small_type=["mi", *SI_TYPES, *SIU_TYPES,  f"pimi{SI_MIN}G"]),
+            small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),

 #absolute = "/pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/includes/link_raw_data.rules"
 #relative_include_path = "../snakemake_wrappers/includes/link_raw_data.snakefile"
@@ -1932,8 +1934,8 @@ rule join_all_sisiu_counts:


 #TODO: add tRF, then change category name
-rule join_pimi22G_counts:
-    f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G"""
+rule join_pimi22GtRF_counts:
+    f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G (a.k.a pisimi) and then with tRF into pimi{SI_MIN}GtRF (a.k.a pisimitRF)"""
    input:
        pi_counts_table = OPJ(
            annot_counts_dir,
@@ -1944,10 +1946,17 @@ rule join_pimi22G_counts:
        mi_counts_table = OPJ(
            annot_counts_dir,
            f"all_{size_selected}_on_{genome}", "mi_counts.txt"),
+        tRF_counts_table = OPJ(
+            annot_counts_dir,
+            f"all_{size_selected}_on_{genome}", "tRF_counts.txt"),
    output:
        counts_table = OPJ(
            annot_counts_dir,
            f"all_{size_selected}_on_{genome}", "pisimi_counts.txt"),
+        # Also generate pisimitRF_counts.txt
+        counts_table_plus_tRF = OPJ(
+            annot_counts_dir,
+            f"all_{size_selected}_on_{genome}", "pisimitRF_counts.txt"),
    run:
        pi_counts_data = pd.read_table(input.pi_counts_table, index_col="gene")
        si_22G_counts_data = pd.read_table(input.si_22G_counts_table, index_col="gene")
@@ -1955,6 +1964,10 @@ rule join_pimi22G_counts:
        counts_data = pd.concat([pi_counts_data, si_22G_counts_data, mi_counts_data])
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")
+        tRF_counts_data = pd.read_table(input.tRF_counts_table, index_col="gene")
+        pisimitRF_counts_data = pd.concat([counts_data, tRF_counts_data])
+        pisimitRF_counts_data.index.names = ["gene"]
+        pisimitRF_counts_data.to_csv(output.counts_table_plus_tRF, sep="\t")


 @wc_applied
@@ -1974,7 +1987,10 @@ def source_small_RNA_counts(wildcards):
    #     return rules.join_pisimi_counts.output.counts_table
    if wildcards.small_type == f"pimi{SI_MIN}G":
        # si_22G and also pi and mi
-        return rules.join_pimi22G_counts.output.counts_table
+        return rules.join_pimi22GtRF_counts.output.counts_table
+    elif wildcards.small_type == f"pimi{SI_MIN}GtRF":
+        # si_22G and also pi and mi and tRF
+        return rules.join_pimi22GtRF_counts.output.counts_table_plus_tRF
    elif wildcards.small_type in {f"sisiu_{suffix}" for suffix in SI_SUFFIXES}:
        # si and siu
        return rules.join_sisiu_counts.output.counts_table
@@ -2361,21 +2377,19 @@ rule compute_RPM_folds:
            assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
            logfile.write(f"Adding small read type info from {input.tags_table}\n")
            #pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
-            tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
-            logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
-            logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
+            lfc_with_tags = add_tags_column(lfc, input.tags_table, "small_type", logfile)
+            logfile.write(f"Columns in lfc_with_tags are: {lfc_with_tags.columns}\n")
+            logfile.write(f"Index in lfc_with_tags is: {lfc_with_tags.index.name}\n")
            lfc_idx = lfc.index
            RPM_idx = RPM.index
-            tags_table_idx = tags_table.index
+            lfc_with_tags_idx = lfc_with_tags.index
            lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
-            lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
-            RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
-            logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
-            # with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
-            with_tags = pd.concat((RPM, tags_table), axis=1)
+            lfc_xor_lfc_with_tags = lfc_idx.symmetric_difference(lfc_with_tags_idx)
+            RPM_xor_lfc_with_tags = RPM_idx.symmetric_difference(lfc_with_tags_idx)
+            logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_lfc_with_tags: {lfc_xor_lfc_with_tags}\nRPM_xor_lfc_with_tags: {RPM_xor_lfc_with_tags}\n")
+            with_tags = pd.concat((RPM, lfc_with_tags), axis=1)
            logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
            logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
-            # pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
            logfile.write(f"Then writing to {output.fold_results}\n")
            with_tags.to_csv(output.fold_results, sep="\t")