From 54f1e09c16b01310e0691c452296fab200da5725 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Fri, 21 Apr 2023 11:10:30 +0200 Subject: [PATCH] Gene id conversion fix in all pipelines. --- CLIP/iCLIP.snakefile | 17 ++++++++- Degradome-seq/Degradome-seq.snakefile | 32 ++++++++++++---- PRO-seq/PRO-seq.snakefile | 53 ++++++++++++++------------- RNA_Seq_Cecere/RNA-seq.snakefile | 26 ++++++++++--- Ribo-seq/Ribo-seq.snakefile | 37 ++++++++++++++----- small_RNA-seq/small_RNA-seq.snakefile | 2 +- 6 files changed, 114 insertions(+), 53 deletions(-) diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile index 621a47d..40257c0 100644 --- a/CLIP/iCLIP.snakefile +++ b/CLIP/iCLIP.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -76,8 +76,21 @@ genome_db = genome_dict["db"][aligner] # bed file binning the genome in 10nt bins genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] -# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. +# Currently not used convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) diff --git a/Degradome-seq/Degradome-seq.snakefile b/Degradome-seq/Degradome-seq.snakefile index 36a4316..58889de 100644 --- a/Degradome-seq/Degradome-seq.snakefile +++ b/Degradome-seq/Degradome-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -107,8 +107,20 @@ genome_db = genome_dict["db"][aligner] # bed file binning the genome in 10nt bins genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] -# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) @@ -509,9 +521,11 @@ rule compute_efficiency: with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: tpm_and_eff = tpm_and_eff.assign(cosmid=tpm_and_eff.apply( column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( - column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( + # column_converter(load(dict_file)), axis=1)) + tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( + column_converter(wormid2name), axis=1)) tpm_and_eff = add_tags_column(tpm_and_eff, input.tags_table, "biotype") tpm_and_eff.to_csv(output.eff_file, sep="\t", na_rep="NA") @@ -538,9 +552,11 @@ rule gather_efficiency: with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: eff_data = eff_data.assign(cosmid=eff_data.apply( column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - eff_data = eff_data.assign(name=eff_data.apply( - column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # eff_data = eff_data.assign(name=eff_data.apply( + # column_converter(load(dict_file)), axis=1)) + eff_data = eff_data.assign(name=eff_data.apply( + column_converter(wormid2name), axis=1)) #eff_data = add_tags_column(eff_data, input.tags_table, "biotype") eff_data.to_csv(output.eff_file, sep="\t", na_rep="NA") diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index 0afa4e9..92bcc69 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -205,29 +205,29 @@ if isinstance(config["genome_dict"], (str, bytes)): genome_dict = yload(fh) else: genome_dict = config["genome_dict"] -# genome_dict = config.get("genome_dict", None) -# if genome_dict is not None: -if True: - genome = genome_dict["name"] - chrom_sizes = get_chrom_sizes(genome_dict["size"]) - chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {}))) - genomelen = sum(chrom_sizes.values()) - genome_db = genome_dict["db"][aligner] - # bed file binning the genome in 10nt bins - genome_binned = genome_dict["binned"] - annot_dir = genome_dict["annot_dir"] - # TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] - convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) - gene_lists_dir = genome_dict["gene_lists_dir"] -else: - genome = "C_elegans" - chrom_sizes = get_chrom_sizes(config["genome_size"]) - genomelen = sum(chrom_sizes.values()) - genome_db = config["index"] - genome_binned = f"/pasteur/entites/Mhe/Genomes/{genome}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed" - annot_dir = config["annot_dir"] - convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) - gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists" +genome = genome_dict["name"] +chrom_sizes = get_chrom_sizes(genome_dict["size"]) +chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {}))) +genomelen = sum(chrom_sizes.values()) +genome_db = genome_dict["db"][aligner] +# bed file binning the genome in 10nt bins +genome_binned = genome_dict["binned"] +annot_dir = genome_dict["annot_dir"] +convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) +gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) #gene_lists_dir = config["gene_lists_dir"] #local_annot_dir = config["local_annot_dir"] @@ -1260,8 +1260,9 @@ rule differential_expression: ###################### with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + res = res.assign(name=res.apply(column_converter(wormid2name), axis=1)) # Just to see if column_converter works also with named column, and not just index: # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 0ee820e..9723776 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -286,8 +286,20 @@ genome_db = genome_dict["db"][aligner] # bed file binning the genome in 10nt bins genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] -# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) #COUNTERS = config["counters"] @@ -1809,8 +1821,9 @@ rule compute_RPM_folds: ###################### with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + #lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1)) + lfc = lfc.assign(name=lfc.apply(column_converter(wormid2name), axis=1)) pd.concat((RPM, lfc), axis=1).to_csv(output.fold_results, sep="\t") @@ -1871,8 +1884,9 @@ rule differential_expression: ###################### with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + #res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + res = res.assign(name=res.apply(column_converter(wormid2name), axis=1)) # Just to see if column_converter works also with named column, and not just index: # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile index 0a26b8e..b1c0b2d 100644 --- a/Ribo-seq/Ribo-seq.snakefile +++ b/Ribo-seq/Ribo-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -345,8 +345,20 @@ genome_db = genome_dict["db"][aligner] genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] exon_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"), -# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) index = genome_db @@ -1305,8 +1317,9 @@ rule differential_expression: ###################### with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + res = res.assign(name=res.apply(column_converter(wormid2name), axis=1)) # Just to see if column_converter works also with named column, and not just index: # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) @@ -1536,9 +1549,11 @@ rule compute_efficiency: with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: tpm_and_eff = tpm_and_eff.assign(cosmid=tpm_and_eff.apply( column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( - column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( + # column_converter(load(dict_file)), axis=1)) + tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( + column_converter(wormid2name), axis=1)) tpm_and_eff = add_tags_column(tpm_and_eff, input.tags_table, "biotype") tpm_and_eff.to_csv(output.eff_file, sep="\t", na_rep="NA") @@ -1565,9 +1580,11 @@ rule gather_efficiency: with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: eff_data = eff_data.assign(cosmid=eff_data.apply( column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - eff_data = eff_data.assign(name=eff_data.apply( - column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # eff_data = eff_data.assign(name=eff_data.apply( + # column_converter(load(dict_file)), axis=1)) + eff_data = eff_data.assign(name=eff_data.apply( + column_converter(wormid2name), axis=1)) #eff_data = add_tags_column(eff_data, input.tags_table, "biotype") eff_data.to_csv(output.eff_file, sep="\t", na_rep="NA") diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index 2371186..8d65830 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by -- GitLab