diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile
index 20a77e3cee0d20f49db5a5cfaff68079b06d52eb..572550550c6cf04590d268fb204e59c0d465437a 100644
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -1705,57 +1705,66 @@ rule gather_small_RNA_counts:
         counts_table = OPJ(
             annot_counts_dir,
             "all_{mapped_type}_on_%s" % genome,
             "{small_type}_counts.txt"),
+    log:
+        log = OPJ(log_dir, "gather_small_RNA_counts", "{mapped_type}_{small_type}.log"),
     run:
-        # Gathering the counts data
-        ############################
-        counts_files = (OPJ(
-            annot_counts_dir,
-            f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
-            f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
-        if wildcards.small_type == "pi":
-            drop = ["piRNA"]
-        elif wildcards.small_type == "mi":
-            drop = ["miRNA"]
-        elif wildcards.small_type in {
-                *{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
-                *{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
-            drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
-        elif wildcards.small_type in {
-                *{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
-                *{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
-            drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
-        elif wildcards.small_type in {
-                *{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
-                *{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
-            drop = ["pseudogene"]
-        elif wildcards.small_type in {
-                *{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
-                *{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
-            drop = ["satellites_rmsk"]
-        elif wildcards.small_type in {
-                *{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
-                *{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
-            drop = ["simple_repeats_rmsk"]
-        elif wildcards.small_type in {"all_si", "all_siu"}:
-            drop = ["all_siRNA"]
-        else:
-            raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
-        counts_data = pd.concat(
-            (pd.read_table(
-                counts_file,
-                index_col=0,
-                na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
-            axis=1).fillna(0).astype(int)
-        counts_data.columns = COND_NAMES
-        # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
-        # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
-        # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
-        # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
-        # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
-        if wildcards.small_type in RMSK_SISIU_TYPES:
-            counts_data = sum_by_family(counts_data)
-        counts_data.index.names = ["gene"]
-        counts_data.to_csv(output.counts_table, sep="\t")
+        with open(log.log, "w") as logfile:
+            # Gathering the counts data
+            ############################
+            counts_files = (OPJ(
+                annot_counts_dir,
+                f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
+                f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
+            if wildcards.small_type == "pi":
+                drop = ["piRNA"]
+            elif wildcards.small_type == "mi":
+                drop = ["miRNA"]
+            elif wildcards.small_type in {
+                    *{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
+                    *{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
+                drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
+            elif wildcards.small_type in {
+                    *{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
+                    *{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
+                drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
+            elif wildcards.small_type in {
+                    *{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
+                    *{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
+                drop = ["pseudogene"]
+            elif wildcards.small_type in {
+                    *{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
+                    *{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
+                drop = ["satellites_rmsk"]
+            elif wildcards.small_type in {
+                    *{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
+                    *{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
+                drop = ["simple_repeats_rmsk"]
+            elif wildcards.small_type in {
+                    *{f"all_si_{suffix}" for suffix in SI_SUFFIXES},
+                    *{f"all_siu_{suffix}" for suffix in SI_SUFFIXES}}:
+                drop = [f"all_si_{suffix}RNA" for suffix in SI_SUFFIXES]
+            elif wildcards.small_type in {"all_si", "all_siu"}:
+                drop = ["all_siRNA"]
+            else:
+                # Unknown read type: all_si_22G
+                raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
+            logfile.write(f"Will drop {drop} for {wildcards.small_type}.\n")
+            counts_data = pd.concat(
+                (pd.read_table(
+                    counts_file,
+                    index_col=0,
+                    na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
+                axis=1).fillna(0).astype(int)
+            counts_data.columns = COND_NAMES
+            # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
+            # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
+            # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
+            # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
+            # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
+            if wildcards.small_type in RMSK_SISIU_TYPES:
+                counts_data = sum_by_family(counts_data)
+            counts_data.index.names = ["gene"]
+            counts_data.to_csv(output.counts_table, sep="\t")
 
 def counts_to_join(wildcards):
@@ -1960,11 +1969,15 @@ rule associate_small_type:
         tags_table.to_csv(output.tags_table, sep="\t")
 
 
-def add_tags_column(data, tags_table, tag_name):
+# TODO: check what this does with index column name (can it lose "gene"?)
+def add_tags_column(data, tags_table, tag_name, logfile=None):
     """Adds a column *tag_name* to *data* based on the DataFrame *tag_table*
     associating tags to row names."""
     # Why "inner" ?
     df = pd.concat((data, pd.read_table(tags_table, index_col=0)), join="inner", axis=1)
+    if logfile is not None:
+        logfile.write(f"Index in data is: {data.index.name}\n")
+        logfile.write(f"Index in df is: {df.index.name}\n")
     df.columns = (*data.columns, tag_name)
     return df
 
@@ -2180,6 +2193,9 @@ rule compute_RPM_folds:
                 input.counts_table,
                 usecols=["gene", *column_list],
                 index_col="gene")
+            assert counts_data.index.name == "gene", f"Wrong index: {counts_data.index.name}"
+            # logfile.write(f"Columns in counts_data are: {counts_data.columns}")
+            # logfile.write(f"Index in counts_data is: {counts_data.index.name}")
             logfile.write(f"Found {len(counts_data.index)} genes\n")
             #logfile.write(f"Reading exon lengths from {exon_lengths_file}\n")
             #exon_lengths = pd.read_table(
@@ -2196,6 +2212,9 @@ rule compute_RPM_folds:
             logfile.write(str(norm))
             logfile.write("Computing counts per million non-structural mappers\n")
             RPM = 1000000 * counts_data / norm
+            # logfile.write(f"Columns in RPM are: {RPM.columns}\n")
+            # logfile.write(f"Index in RPM is: {RPM.index.name}\n")
+            assert RPM.index.name == "gene", f"Wrong index: {RPM.index.name}"
             logfile.write("Computing RPM log2 fold changes\n")
             # Maybe not a very sensible way to compute the mean folds: it pairs replicates, but the pairing is arbitrary
             # TODO: Should we compute the fold of the means instead?
@@ -2212,9 +2231,28 @@ rule compute_RPM_folds:
                 lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
             with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
                 lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
-            logfile.write("Adding small read type info\n")
+            # logfile.write(f"Columns in lfc are: {lfc.columns}\n")
+            # logfile.write(f"Index in lfc is: {lfc.index.name}\n")
+            assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
+            logfile.write(f"Adding small read type info from {input.tags_table}\n")
             #pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
-            pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
+            tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
+            logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
+            logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
+            lfc_idx = lfc.index
+            RPM_idx = RPM.index
+            tags_table_idx = tags_table.index
+            lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
+            lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
+            RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
+            logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
+            # with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
+            with_tags = pd.concat((RPM, tags_table), axis=1)
+            logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
+            logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
+            # pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
+            logfile.write(f"Then writing to {output.fold_results}\n")
+            with_tags.to_csv(output.fold_results, sep="\t")
 
 
 rule gather_RPM_folds:
@@ -2233,12 +2271,17 @@ rule gather_RPM_folds:
     benchmark:
        OPJ(log_dir, "gather_RPM_folds", "{small_type}_benchmark.txt"),
     run:
-        pd.DataFrame({contrast : pd.read_table(
-            OPJ(mapping_dir, f"RPM_folds_{size_selected}",
-                f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
-            index_col=["gene", "cosmid", "name", "small_type"],
-            usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
-            output.all_folds, sep="\t")
+        with open(log.log, "w") as logfile:
+            actual_input = [
+                OPJ(mapping_dir, f"RPM_folds_{size_selected}",
+                    f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt") for contrast in IP_CONTRASTS]
+            logfile.write(f"Gathering RPM folds from:\n{actual_input}\nShould be from:\n{input.fold_results}\n")
+            pd.DataFrame({contrast : pd.read_table(
+                OPJ(mapping_dir, f"RPM_folds_{size_selected}",
+                    f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
+                index_col=["gene", "cosmid", "name", "small_type"],
+                usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
+                output.all_folds, sep="\t")
 
 
 # TODO: update consumers with mapping_type
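Note on the RepeatMasker collapsing step (not part of the patch): sum_by_family is defined elsewhere in the Snakefile, but the comment kept in the new gather_small_RNA_counts body describes the intended effect: per-instance rows such as Simple_repeat|Simple_repeat|(TTTTTTG)n:1 through :4 collapse into a single Simple_repeat|Simple_repeat|(TTTTTTG)n row. A minimal, hypothetical sketch consistent with that comment (the function name and the exact handling of the ":<instance>" suffix are assumptions, not the pipeline's actual code):

import pandas as pd

def sum_by_family_sketch(counts_data):
    # Strip a trailing ":<instance number>" from each row name, e.g.
    # "Simple_repeat|Simple_repeat|(TTTTTTG)n:1" -> "Simple_repeat|Simple_repeat|(TTTTTTG)n",
    # then sum the counts of all instances belonging to the same family.
    families = counts_data.index.str.replace(r":\d+$", "", regex=True)
    summed = counts_data.groupby(families).sum()
    summed.index.name = counts_data.index.name
    return summed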
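Note on the index checks added in compute_RPM_folds (not part of the patch): the new assertions and log lines probe two pandas behaviours around add_tags_column's pd.concat(..., join="inner", axis=1): the inner join silently drops genes that are missing from the tags table, and the combined index loses its "gene" name whenever the two indexes do not carry the same name, which is what the new TODO asks about. A small self-contained illustration (gene IDs and values are made up, not pipeline data):

import pandas as pd

lfc = pd.DataFrame(
    {"mean_log2_RPM_fold": [1.5, -0.3, 2.1]},
    index=pd.Index(["WBGene00000001", "WBGene00000002", "WBGene00000003"], name="gene"))

# Tags table with an unnamed index and one gene missing.
tags = pd.DataFrame(
    {"small_type": ["siu", "si"]},
    index=pd.Index(["WBGene00000001", "WBGene00000003"]))

joined = pd.concat((lfc, tags), join="inner", axis=1)
print(list(joined.index))  # ['WBGene00000001', 'WBGene00000003']: the second gene is silently dropped
print(joined.index.name)   # None: the "gene" name is lost because the two index names differ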