Skip to content
Snippets Groups Projects
Commit 40d0e5e1 authored by Blaise Li's avatar Blaise Li
Browse files

Add new sRNA category "pimi22GtRF".

parent d817a7ef
Branches
No related tags found
No related merge requests found
......@@ -251,7 +251,7 @@ assert set(DE_TYPES) <= set(SMALL_TYPES + JOINED_SMALL_TYPES), "%s\n%s" % (", ".
# TODO: update cross_HTS with pimi22G
# TODO: what kind of pimi22G ? -> pi, mi and si_22G, but not siu_22G
# TODO: add tRFs either in pimi22G or alone? If alone, check that label_order in plot_fold_heatmap is correct
IP_TYPES = [f"pimi{SI_MIN}G", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
IP_TYPES = [f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
f"siu_{SI_MAX}G", f"prot_si_{SI_MAX}G", f"ri_si_{SI_MAX}G"]
assert set(IP_TYPES) <= set(
SMALL_TYPES + [f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G"] + JOINED_SMALL_TYPES), ", ".join(IP_TYPES)
......@@ -842,6 +842,7 @@ else:
if IP_CONTRASTS:
all_contrasts_folds = [
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}G_mean_log2_RPM_fold.txt"),
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}GtRF_mean_log2_RPM_fold.txt"),
# To have RPM (folds) for transgenes (which are not in prot_si category)
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"all_si_{SI_MIN}G_mean_log2_RPM_fold.txt")]
ip_fold_boxplots_by_contrast = expand(
......@@ -923,17 +924,18 @@ rule all:
# feature_counts_dir,
# "all_{small_type}_{mapping_type}_{biotype}_{orientation}_transcript_counts.txt"),
# small_type=READ_TYPES_FOR_MAPPING, mapping_type=[f"on_{genome}"], biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
# Note (14/02/2022): pi was missing here for no known reason -> add pi along with the missing tRF
expand(
OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["pi", "mi", "tRF", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
# piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting
# simrep_siu raises TypeError: Empty 'DataFrame': no numeric data to plot
expand(
OPJ("figures", "{small_type}_norm_correlations.pdf"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
expand(
OPJ("figures", "{small_type}_norm_counts_distrib.pdf"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
#absolute = "/pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/includes/link_raw_data.rules"
#relative_include_path = "../snakemake_wrappers/includes/link_raw_data.snakefile"
......@@ -1932,8 +1934,8 @@ rule join_all_sisiu_counts:
#TODO: add tRF, then change category name
rule join_pimi22G_counts:
f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G"""
rule join_pimi22GtRF_counts:
f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G (a.k.a pisimi) and then with tRF into pimi{SI_MIN}GtRF (a.k.a pisimitRF)"""
input:
pi_counts_table = OPJ(
annot_counts_dir,
......@@ -1944,10 +1946,17 @@ rule join_pimi22G_counts:
mi_counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "mi_counts.txt"),
tRF_counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "tRF_counts.txt"),
output:
counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "pisimi_counts.txt"),
# Also generate pisimitRF_counts.txt
counts_table_plus_tRF = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "pisimitRF_counts.txt"),
run:
pi_counts_data = pd.read_table(input.pi_counts_table, index_col="gene")
si_22G_counts_data = pd.read_table(input.si_22G_counts_table, index_col="gene")
......@@ -1955,6 +1964,10 @@ rule join_pimi22G_counts:
counts_data = pd.concat([pi_counts_data, si_22G_counts_data, mi_counts_data])
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
tRF_counts_data = pd.read_table(input.tRF_counts_table, index_col="gene")
pisimitRF_counts_data = pd.concat([counts_data, tRF_counts_data])
pisimitRF_counts_data.index.names = ["gene"]
pisimitRF_counts_data.to_csv(output.counts_table_plus_tRF, sep="\t")
@wc_applied
......@@ -1974,7 +1987,10 @@ def source_small_RNA_counts(wildcards):
# return rules.join_pisimi_counts.output.counts_table
if wildcards.small_type == f"pimi{SI_MIN}G":
# si_22G and also pi and mi
return rules.join_pimi22G_counts.output.counts_table
return rules.join_pimi22GtRF_counts.output.counts_table
elif wildcards.small_type == f"pimi{SI_MIN}GtRF":
# si_22G and also pi and mi and tRF
return rules.join_pimi22GtRF_counts.output.counts_table_plus_tRF
elif wildcards.small_type in {f"sisiu_{suffix}" for suffix in SI_SUFFIXES}:
# si and siu
return rules.join_sisiu_counts.output.counts_table
......@@ -2361,21 +2377,19 @@ rule compute_RPM_folds:
assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
logfile.write(f"Adding small read type info from {input.tags_table}\n")
#pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
lfc_with_tags = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in lfc_with_tags are: {lfc_with_tags.columns}\n")
logfile.write(f"Index in lfc_with_tags is: {lfc_with_tags.index.name}\n")
lfc_idx = lfc.index
RPM_idx = RPM.index
tags_table_idx = tags_table.index
lfc_with_tags_idx = lfc_with_tags.index
lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
# with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
with_tags = pd.concat((RPM, tags_table), axis=1)
lfc_xor_lfc_with_tags = lfc_idx.symmetric_difference(lfc_with_tags_idx)
RPM_xor_lfc_with_tags = RPM_idx.symmetric_difference(lfc_with_tags_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_lfc_with_tags: {lfc_xor_lfc_with_tags}\nRPM_xor_lfc_with_tags: {RPM_xor_lfc_with_tags}\n")
with_tags = pd.concat((RPM, lfc_with_tags), axis=1)
logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
# pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
logfile.write(f"Then writing to {output.fold_results}\n")
with_tags.to_csv(output.fold_results, sep="\t")
......
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment