diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile index 621a47def797d0f5613cbf26cafacc245b2ece13..40257c080969a945e308bb2a9a72104ae1187ab8 100644 --- a/CLIP/iCLIP.snakefile +++ b/CLIP/iCLIP.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -76,8 +76,21 @@ genome_db = genome_dict["db"][aligner] # bed file binning the genome in 10nt bins genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] -# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. +# Currently not used convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) diff --git a/Degradome-seq/Degradome-seq.snakefile b/Degradome-seq/Degradome-seq.snakefile index 36a43167047acb8b9d4c52d09848aec39e7feffa..58889de398258b24ec158659abe226010595ed63 100644 --- a/Degradome-seq/Degradome-seq.snakefile +++ b/Degradome-seq/Degradome-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -107,8 +107,20 @@ genome_db = genome_dict["db"][aligner] # bed file binning the genome in 10nt bins genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] -# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) @@ -509,9 +521,11 @@ rule compute_efficiency: with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: tpm_and_eff = tpm_and_eff.assign(cosmid=tpm_and_eff.apply( column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( - column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( + # column_converter(load(dict_file)), axis=1)) + tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( + column_converter(wormid2name), axis=1)) tpm_and_eff = add_tags_column(tpm_and_eff, input.tags_table, "biotype") tpm_and_eff.to_csv(output.eff_file, sep="\t", na_rep="NA") @@ -538,9 +552,11 @@ rule gather_efficiency: with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: eff_data = eff_data.assign(cosmid=eff_data.apply( column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - eff_data = eff_data.assign(name=eff_data.apply( - column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # eff_data = eff_data.assign(name=eff_data.apply( + # column_converter(load(dict_file)), axis=1)) + eff_data = eff_data.assign(name=eff_data.apply( + column_converter(wormid2name), axis=1)) #eff_data = add_tags_column(eff_data, input.tags_table, "biotype") eff_data.to_csv(output.eff_file, sep="\t", na_rep="NA") diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index 0afa4e96fc2f51824deaedba196ba007970fe634..92bcc696ccbf507a941e7a04fe85bc13437e21f3 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -205,29 +205,29 @@ if isinstance(config["genome_dict"], (str, bytes)): genome_dict = yload(fh) else: genome_dict = config["genome_dict"] -# genome_dict = config.get("genome_dict", None) -# if genome_dict is not None: -if True: - genome = genome_dict["name"] - chrom_sizes = get_chrom_sizes(genome_dict["size"]) - chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {}))) - genomelen = sum(chrom_sizes.values()) - genome_db = genome_dict["db"][aligner] - # bed file binning the genome in 10nt bins - genome_binned = genome_dict["binned"] - annot_dir = genome_dict["annot_dir"] - # TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] - convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) - gene_lists_dir = genome_dict["gene_lists_dir"] -else: - genome = "C_elegans" - chrom_sizes = get_chrom_sizes(config["genome_size"]) - genomelen = sum(chrom_sizes.values()) - genome_db = config["index"] - genome_binned = f"/pasteur/entites/Mhe/Genomes/{genome}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed" - annot_dir = config["annot_dir"] - convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) - gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists" +genome = genome_dict["name"] +chrom_sizes = get_chrom_sizes(genome_dict["size"]) +chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {}))) +genomelen = sum(chrom_sizes.values()) +genome_db = genome_dict["db"][aligner] +# bed file binning the genome in 10nt bins +genome_binned = genome_dict["binned"] +annot_dir = genome_dict["annot_dir"] +convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) +gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) #gene_lists_dir = config["gene_lists_dir"] #local_annot_dir = config["local_annot_dir"] @@ -1260,8 +1260,9 @@ rule differential_expression: ###################### with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + res = res.assign(name=res.apply(column_converter(wormid2name), axis=1)) # Just to see if column_converter works also with named column, and not just index: # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 0ee820e0265319ce50b2971ead25243e39b7149b..9723776207cebe1ee2c5aedec68f03f7386362b2 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -286,8 +286,20 @@ genome_db = genome_dict["db"][aligner] # bed file binning the genome in 10nt bins genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] -# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) #COUNTERS = config["counters"] @@ -1809,8 +1821,9 @@ rule compute_RPM_folds: ###################### with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + #lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1)) + lfc = lfc.assign(name=lfc.apply(column_converter(wormid2name), axis=1)) pd.concat((RPM, lfc), axis=1).to_csv(output.fold_results, sep="\t") @@ -1871,8 +1884,9 @@ rule differential_expression: ###################### with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + #res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + res = res.assign(name=res.apply(column_converter(wormid2name), axis=1)) # Just to see if column_converter works also with named column, and not just index: # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile index 0a26b8e3e02d319bb271940cc238420ec0fb2d42..b1c0b2d11d6a9286da97c9f29a5051f042d18a4b 100644 --- a/Ribo-seq/Ribo-seq.snakefile +++ b/Ribo-seq/Ribo-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -345,8 +345,20 @@ genome_db = genome_dict["db"][aligner] genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] exon_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"), -# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] +# What are the difference between +# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]? +# /!\ gene_ids_data_dir contains more conversion dicts, +# but is not influenced by genome preparation customization, +# like splitting of miRNAs into 3p and 5p. convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir) +# For wormid2name, load in priority the one +# that might contain custom gene names, like for splitted miRNAs +with open( + genome_dict.get( + "converter", + OPJ(convert_dir, "wormid2name.pickle")), + "rb") as dict_file: + wormid2name = load(dict_file) gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) index = genome_db @@ -1305,8 +1317,9 @@ rule differential_expression: ###################### with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + res = res.assign(name=res.apply(column_converter(wormid2name), axis=1)) # Just to see if column_converter works also with named column, and not just index: # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) @@ -1536,9 +1549,11 @@ rule compute_efficiency: with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: tpm_and_eff = tpm_and_eff.assign(cosmid=tpm_and_eff.apply( column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( - column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( + # column_converter(load(dict_file)), axis=1)) + tpm_and_eff = tpm_and_eff.assign(name=tpm_and_eff.apply( + column_converter(wormid2name), axis=1)) tpm_and_eff = add_tags_column(tpm_and_eff, input.tags_table, "biotype") tpm_and_eff.to_csv(output.eff_file, sep="\t", na_rep="NA") @@ -1565,9 +1580,11 @@ rule gather_efficiency: with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: eff_data = eff_data.assign(cosmid=eff_data.apply( column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - eff_data = eff_data.assign(name=eff_data.apply( - column_converter(load(dict_file)), axis=1)) + #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + # eff_data = eff_data.assign(name=eff_data.apply( + # column_converter(load(dict_file)), axis=1)) + eff_data = eff_data.assign(name=eff_data.apply( + column_converter(wormid2name), axis=1)) #eff_data = add_tags_column(eff_data, input.tags_table, "biotype") eff_data.to_csv(output.eff_file, sep="\t", na_rep="NA") diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index 237118666af56162b1126a4a10781c6711f7d351..8d65830c1b571c87ef0f09e881d2d2f095a2a2f3 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 Blaise Li +# Copyright (C) 2020-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by