From 54f1e09c16b01310e0691c452296fab200da5725 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 21 Apr 2023 11:10:30 +0200
Subject: [PATCH] Gene id conversion fix in all pipelines.

---
 CLIP/iCLIP.snakefile                  | 17 ++++++++-
 Degradome-seq/Degradome-seq.snakefile | 32 ++++++++++++----
 PRO-seq/PRO-seq.snakefile             | 53 ++++++++++++++-------------
 RNA_Seq_Cecere/RNA-seq.snakefile      | 26 ++++++++++---
 Ribo-seq/Ribo-seq.snakefile           | 37 ++++++++++++++-----
 small_RNA-seq/small_RNA-seq.snakefile |  2 +-
 6 files changed, 114 insertions(+), 53 deletions(-)

diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile
index 621a47d..40257c0 100644
--- a/CLIP/iCLIP.snakefile
+++ b/CLIP/iCLIP.snakefile
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Blaise Li
+# Copyright (C) 2020-2023 Blaise Li
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -76,8 +76,21 @@ genome_db = genome_dict["db"][aligner]
 # bed file binning the genome in 10nt bins
 genome_binned = genome_dict["binned"]
 annot_dir = genome_dict["annot_dir"]
-# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
+# What are the difference between
+# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]?
+# /!\ gene_ids_data_dir contains more conversion dicts,
+# but is not influenced by genome preparation customization,
+# like splitting of miRNAs into 3p and 5p.
+# Currently not used
 convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
+# For wormid2name, load in priority the one
+# that might contain custom gene names, like for splitted miRNAs
+with open(
+        genome_dict.get(
+            "converter",
+            OPJ(convert_dir, "wormid2name.pickle")),
+        "rb") as dict_file:
+    wormid2name = load(dict_file)
 gene_lists_dir = genome_dict["gene_lists_dir"]
 avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
 
diff --git a/Degradome-seq/Degradome-seq.snakefile b/Degradome-seq/Degradome-seq.snakefile
index 36a4316..58889de 100644
--- a/Degradome-seq/Degradome-seq.snakefile
+++ b/Degradome-seq/Degradome-seq.snakefile
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Blaise Li
+# Copyright (C) 2020-2023 Blaise Li
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -107,8 +107,20 @@ genome_db = genome_dict["db"][aligner]
 # bed file binning the genome in 10nt bins
 genome_binned = genome_dict["binned"]
 annot_dir = genome_dict["annot_dir"]
-# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
+# What are the difference between
+# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]?
+# /!\ gene_ids_data_dir contains more conversion dicts,
+# but is not influenced by genome preparation customization,
+# like splitting of miRNAs into 3p and 5p.
 convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
+# For wormid2name, load in priority the one
+# that might contain custom gene names, like for splitted miRNAs
+with open(
+        genome_dict.get(
+            "converter",
+            OPJ(convert_dir, "wormid2name.pickle")),
+        "rb") as dict_file:
+    wormid2name = load(dict_file)
 gene_lists_dir = genome_dict["gene_lists_dir"]
 avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
 
@@ -509,9 +521,11 @@ rule compute_efficiency:
         with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
             tpm_and_eff = tpm_and_eff.assign(cosmid=tpm_and_eff.apply(
                 column_converter(load(dict_file)), axis=1))
-        with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-            tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
-                column_converter(load(dict_file)), axis=1))
+        #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+        #    tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
+        #        column_converter(load(dict_file)), axis=1))
+        tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
+            column_converter(wormid2name), axis=1))
         tpm_and_eff = add_tags_column(tpm_and_eff, input.tags_table, "biotype")
         tpm_and_eff.to_csv(output.eff_file, sep="\t", na_rep="NA")
 
@@ -538,9 +552,11 @@ rule gather_efficiency:
         with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
             eff_data = eff_data.assign(cosmid=eff_data.apply(
                 column_converter(load(dict_file)), axis=1))
-        with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-            eff_data = eff_data.assign(name=eff_data.apply(
-                column_converter(load(dict_file)), axis=1))
+        #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+        #    eff_data = eff_data.assign(name=eff_data.apply(
+        #        column_converter(load(dict_file)), axis=1))
+        eff_data = eff_data.assign(name=eff_data.apply(
+            column_converter(wormid2name), axis=1))
         #eff_data = add_tags_column(eff_data, input.tags_table, "biotype")
         eff_data.to_csv(output.eff_file, sep="\t", na_rep="NA")
 
diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile
index 0afa4e9..92bcc69 100644
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Blaise Li
+# Copyright (C) 2020-2023 Blaise Li
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -205,29 +205,29 @@ if isinstance(config["genome_dict"], (str, bytes)):
         genome_dict = yload(fh)
 else:
     genome_dict = config["genome_dict"]
-# genome_dict = config.get("genome_dict", None)
-# if genome_dict is not None:
-if True:
-    genome = genome_dict["name"]
-    chrom_sizes = get_chrom_sizes(genome_dict["size"])
-    chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {})))
-    genomelen = sum(chrom_sizes.values())
-    genome_db = genome_dict["db"][aligner]
-    # bed file binning the genome in 10nt bins
-    genome_binned = genome_dict["binned"]
-    annot_dir = genome_dict["annot_dir"]
-    # TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
-    convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
-    gene_lists_dir = genome_dict["gene_lists_dir"]
-else:
-    genome = "C_elegans"
-    chrom_sizes = get_chrom_sizes(config["genome_size"])
-    genomelen = sum(chrom_sizes.values())
-    genome_db = config["index"]
-    genome_binned = f"/pasteur/entites/Mhe/Genomes/{genome}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed"
-    annot_dir = config["annot_dir"]
-    convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
-    gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists"
+genome = genome_dict["name"]
+chrom_sizes = get_chrom_sizes(genome_dict["size"])
+chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {})))
+genomelen = sum(chrom_sizes.values())
+genome_db = genome_dict["db"][aligner]
+# bed file binning the genome in 10nt bins
+genome_binned = genome_dict["binned"]
+annot_dir = genome_dict["annot_dir"]
+convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
+# What are the difference between
+# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]?
+# /!\ gene_ids_data_dir contains more conversion dicts,
+# but is not influenced by genome preparation customization,
+# like splitting of miRNAs into 3p and 5p.
+# For wormid2name, load in priority the one
+# that might contain custom gene names, like for splitted miRNAs
+with open(
+        genome_dict.get(
+            "converter",
+            OPJ(convert_dir, "wormid2name.pickle")),
+        "rb") as dict_file:
+    wormid2name = load(dict_file)
+gene_lists_dir = genome_dict["gene_lists_dir"]
 avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
 #gene_lists_dir = config["gene_lists_dir"]
 #local_annot_dir = config["local_annot_dir"]
@@ -1260,8 +1260,9 @@ rule differential_expression:
                     ######################
                     with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
                         res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
-                    with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                        res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                    #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+                    #    res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                    res = res.assign(name=res.apply(column_converter(wormid2name), axis=1))
                     # Just to see if column_converter works also with named column, and not just index:
                     # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
                     #     res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile
index 0ee820e..9723776 100644
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Blaise Li
+# Copyright (C) 2020-2023 Blaise Li
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -286,8 +286,20 @@ genome_db = genome_dict["db"][aligner]
 # bed file binning the genome in 10nt bins
 genome_binned = genome_dict["binned"]
 annot_dir = genome_dict["annot_dir"]
-# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
+# What are the difference between
+# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]?
+# /!\ gene_ids_data_dir contains more conversion dicts,
+# but is not influenced by genome preparation customization,
+# like splitting of miRNAs into 3p and 5p.
 convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
+# For wormid2name, load in priority the one
+# that might contain custom gene names, like for splitted miRNAs
+with open(
+        genome_dict.get(
+            "converter",
+            OPJ(convert_dir, "wormid2name.pickle")),
+        "rb") as dict_file:
+    wormid2name = load(dict_file)
 gene_lists_dir = genome_dict["gene_lists_dir"]
 avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
 #COUNTERS = config["counters"]
@@ -1809,8 +1821,9 @@ rule compute_RPM_folds:
         ######################
         with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
             lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
-        with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-            lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
+        #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+            #lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
+        lfc = lfc.assign(name=lfc.apply(column_converter(wormid2name), axis=1))
         pd.concat((RPM, lfc), axis=1).to_csv(output.fold_results, sep="\t")
 
 
@@ -1871,8 +1884,9 @@ rule differential_expression:
                     ######################
                     with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
                         res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
-                    with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                        res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                    #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+                        #res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                    res = res.assign(name=res.apply(column_converter(wormid2name), axis=1))
                     # Just to see if column_converter works also with named column, and not just index:
                     # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
                     #     res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile
index 0a26b8e..b1c0b2d 100644
--- a/Ribo-seq/Ribo-seq.snakefile
+++ b/Ribo-seq/Ribo-seq.snakefile
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Blaise Li
+# Copyright (C) 2020-2023 Blaise Li
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -345,8 +345,20 @@ genome_db = genome_dict["db"][aligner]
 genome_binned = genome_dict["binned"]
 annot_dir = genome_dict["annot_dir"]
 exon_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
-# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
+# What are the difference between
+# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]?
+# /!\ gene_ids_data_dir contains more conversion dicts,
+# but is not influenced by genome preparation customization,
+# like splitting of miRNAs into 3p and 5p.
 convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
+# For wormid2name, load in priority the one
+# that might contain custom gene names, like for splitted miRNAs
+with open(
+        genome_dict.get(
+            "converter",
+            OPJ(convert_dir, "wormid2name.pickle")),
+        "rb") as dict_file:
+    wormid2name = load(dict_file)
 gene_lists_dir = genome_dict["gene_lists_dir"]
 avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
 index = genome_db
@@ -1305,8 +1317,9 @@ rule differential_expression:
                 ######################
                 with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
                     res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
-                with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                    res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+                #    res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                res = res.assign(name=res.apply(column_converter(wormid2name), axis=1))
                 # Just to see if column_converter works also with named column, and not just index:
                 # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
                 #     res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
@@ -1536,9 +1549,11 @@ rule compute_efficiency:
         with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
             tpm_and_eff = tpm_and_eff.assign(cosmid=tpm_and_eff.apply(
                 column_converter(load(dict_file)), axis=1))
-        with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-            tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
-                column_converter(load(dict_file)), axis=1))
+        #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+        #    tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
+        #        column_converter(load(dict_file)), axis=1))
+        tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply(
+            column_converter(wormid2name), axis=1))
         tpm_and_eff = add_tags_column(tpm_and_eff, input.tags_table, "biotype")
         tpm_and_eff.to_csv(output.eff_file, sep="\t", na_rep="NA")
 
@@ -1565,9 +1580,11 @@ rule gather_efficiency:
         with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
             eff_data = eff_data.assign(cosmid=eff_data.apply(
                 column_converter(load(dict_file)), axis=1))
-        with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-            eff_data = eff_data.assign(name=eff_data.apply(
-                column_converter(load(dict_file)), axis=1))
+        #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+        #    eff_data = eff_data.assign(name=eff_data.apply(
+        #        column_converter(load(dict_file)), axis=1))
+        eff_data = eff_data.assign(name=eff_data.apply(
+            column_converter(wormid2name), axis=1))
         #eff_data = add_tags_column(eff_data, input.tags_table, "biotype")
         eff_data.to_csv(output.eff_file, sep="\t", na_rep="NA")
 
diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile
index 2371186..8d65830 100644
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2022 Blaise Li
+# Copyright (C) 2020-2023 Blaise Li
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
-- 
GitLab