Commit 40d0e5e1 authored by Blaise Li
Browse files

Add new sRNA category "pimi22GtRF".

parent d817a7ef
......@@ -251,7 +251,7 @@ assert set(DE_TYPES) <= set(SMALL_TYPES + JOINED_SMALL_TYPES), "%s\n%s" % (", ".
# TODO: update cross_HTS with pimi22G
# TODO: what kind of pimi22G ? -> pi, mi and si_22G, but not siu_22G
# TODO: add tRFs either in pimi22G or alone? If alone, check that label_order in plot_fold_heatmap is correct
IP_TYPES = [f"pimi{SI_MIN}G", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
IP_TYPES = [f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
f"siu_{SI_MAX}G", f"prot_si_{SI_MAX}G", f"ri_si_{SI_MAX}G"]
assert set(IP_TYPES) <= set(
SMALL_TYPES + [f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G"] + JOINED_SMALL_TYPES), ", ".join(IP_TYPES)
......@@ -842,6 +842,7 @@ else:
if IP_CONTRASTS:
all_contrasts_folds = [
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}G_mean_log2_RPM_fold.txt"),
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}GtRF_mean_log2_RPM_fold.txt"),
# To have RPM (folds) for transgenes (which are not in prot_si category)
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"all_si_{SI_MIN}G_mean_log2_RPM_fold.txt")]
ip_fold_boxplots_by_contrast = expand(
......@@ -923,17 +924,18 @@ rule all:
# feature_counts_dir,
# "all_{small_type}_{mapping_type}_{biotype}_{orientation}_transcript_counts.txt"),
# small_type=READ_TYPES_FOR_MAPPING, mapping_type=[f"on_{genome}"], biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
# Note (14/02/2022): there was no pi here. Why? -> Let's add pi while we're adding the missing tRF
expand(
OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["pi", "mi", "tRF", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
# piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting
# simrep_siu raises TypeError: Empty 'DataFrame': no numeric data to plot
expand(
OPJ("figures", "{small_type}_norm_correlations.pdf"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
expand(
OPJ("figures", "{small_type}_norm_counts_distrib.pdf"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
#absolute = "/pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/includes/link_raw_data.rules"
#relative_include_path = "../snakemake_wrappers/includes/link_raw_data.snakefile"
......@@ -1932,8 +1934,8 @@ rule join_all_sisiu_counts:
#TODO: add tRF, then change category name
rule join_pimi22G_counts:
f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G"""
rule join_pimi22GtRF_counts:
f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G (a.k.a pisimi) and then with tRF into pimi{SI_MIN}GtRF (a.k.a pisimitRF)"""
input:
pi_counts_table = OPJ(
annot_counts_dir,
......@@ -1944,10 +1946,17 @@ rule join_pimi22G_counts:
mi_counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "mi_counts.txt"),
tRF_counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "tRF_counts.txt"),
output:
counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "pisimi_counts.txt"),
# Also generate pisimitRF_counts.txt
counts_table_plus_tRF = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "pisimitRF_counts.txt"),
run:
pi_counts_data = pd.read_table(input.pi_counts_table, index_col="gene")
si_22G_counts_data = pd.read_table(input.si_22G_counts_table, index_col="gene")
......@@ -1955,6 +1964,10 @@ rule join_pimi22G_counts:
counts_data = pd.concat([pi_counts_data, si_22G_counts_data, mi_counts_data])
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
tRF_counts_data = pd.read_table(input.tRF_counts_table, index_col="gene")
pisimitRF_counts_data = pd.concat([counts_data, tRF_counts_data])
pisimitRF_counts_data.index.names = ["gene"]
pisimitRF_counts_data.to_csv(output.counts_table_plus_tRF, sep="\t")
@wc_applied
......@@ -1974,7 +1987,10 @@ def source_small_RNA_counts(wildcards):
# return rules.join_pisimi_counts.output.counts_table
if wildcards.small_type == f"pimi{SI_MIN}G":
# si_22G and also pi and mi
return rules.join_pimi22G_counts.output.counts_table
return rules.join_pimi22GtRF_counts.output.counts_table
elif wildcards.small_type == f"pimi{SI_MIN}GtRF":
# si_22G and also pi and mi and tRF
return rules.join_pimi22GtRF_counts.output.counts_table_plus_tRF
elif wildcards.small_type in {f"sisiu_{suffix}" for suffix in SI_SUFFIXES}:
# si and siu
return rules.join_sisiu_counts.output.counts_table
......@@ -2361,21 +2377,19 @@ rule compute_RPM_folds:
assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
logfile.write(f"Adding small read type info from {input.tags_table}\n")
#pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
lfc_with_tags = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in lfc_with_tags are: {lfc_with_tags.columns}\n")
logfile.write(f"Index in lfc_with_tags is: {lfc_with_tags.index.name}\n")
lfc_idx = lfc.index
RPM_idx = RPM.index
tags_table_idx = tags_table.index
lfc_with_tags_idx = lfc_with_tags.index
lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
# with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
with_tags = pd.concat((RPM, tags_table), axis=1)
lfc_xor_lfc_with_tags = lfc_idx.symmetric_difference(lfc_with_tags_idx)
RPM_xor_lfc_with_tags = RPM_idx.symmetric_difference(lfc_with_tags_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_lfc_with_tags: {lfc_xor_lfc_with_tags}\nRPM_xor_lfc_with_tags: {RPM_xor_lfc_with_tags}\n")
with_tags = pd.concat((RPM, lfc_with_tags), axis=1)
logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
# pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
logfile.write(f"Then writing to {output.fold_results}\n")
with_tags.to_csv(output.fold_results, sep="\t")
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment