Commit ba59d920 authored by Blaise Li's avatar Blaise Li
Browse files

Try to add tRF sRNA category.

parent 392f4239
libsmallrna @ 43628e46
Subproject commit 2daa5621b96e3e4d77978be4eb2f676b53dfe14a
Subproject commit 43628e460050c82f200ba0b5157c84cc488714f9
......@@ -44,7 +44,7 @@ if major < 3 or (major == 3 and minor < 6):
# TODO (27/10/2021): isolate tRFs: sense to tRNA, then look at abundance distribution across sites on their remapping: Sites with narrow size distribution and high enough count are potentially interesting. Is it recurrent across libraries?
# TODO: add tRFs in pimi22G -> pimitRF22G
# TODO (27/10/2021): add ri_si[u]_{22G|26G}: antisense to rRNA genes
# Done (27/10/2021): add ri_si[u]_{22G|26G}: antisense to rRNA genes
# TODO: Try to find what the proportion of small reads map at unique position within repetitive elements.
# What is the distribution of unique reads among repetitive elements of a given family: is it evenly spread, or biased?
......@@ -250,7 +250,7 @@ assert set(DE_TYPES) <= set(SMALL_TYPES + JOINED_SMALL_TYPES), "%s\n%s" % (", ".
#IP_TYPES = ["pisimi", "siu", "prot_si"]
# TODO: update cross_HTS with pimi22G
# TODO: what kind of pimi22G ? -> pi, mi and si_22G, but not siu_22G
# TODO: add tRFs?
# TODO: add tRFs either in pimi22G or alone? If alone, check that label_order in plot_fold_heatmap is correct
IP_TYPES = [f"pimi{SI_MIN}G", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
f"siu_{SI_MAX}G", f"prot_si_{SI_MAX}G", f"ri_si_{SI_MAX}G"]
assert set(IP_TYPES) <= set(
......@@ -348,7 +348,8 @@ read_type_max_len = {
size_selected: int(MAX_LEN),
"pi": min(PI_MAX, int(MAX_LEN)),
"si": min(SI_MAX, int(MAX_LEN)),
"mi": min(MI_MAX, int(MAX_LEN))}
"mi": min(MI_MAX, int(MAX_LEN)),
"tRF": int(MAX_LEN)}
READ_TYPES_FOR_COMPOSITION = [
size_selected, "nomap_siRNA", "all_siRNA",
f"all_si_{SI_MIN}GRNA", f"all_si_{SI_MAX}GRNA",
......@@ -452,9 +453,9 @@ mapping_dir = OPJ(aligner, f"mapped_{genome}")
reads_dir = OPJ(mapping_dir, "reads")
annot_counts_dir = OPJ(mapping_dir, "annotation")
# TODO: add tRFs?
# The order after pi and mi must match: this is used in make_read_counts_summary
# The order after pi, mi and tRF must match: this is used in make_read_counts_summary
ANNOT_COUNTS_TYPES = [
"pi", "mi", "all_si",
"pi", "mi", "tRF", "all_si",
f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G",
*SI_TYPES]
ANNOT_COUNTS_TYPES_U = [
......@@ -518,7 +519,7 @@ READ_TYPES_FOR_METAPROFILES = [
f"all_si_{SI_MIN}GRNA", f"all_si_{SI_MAX}GRNA",
f"si_{SI_MIN}GRNA", f"si_{SI_MAX}GRNA",
f"siu_{SI_MIN}GRNA", f"siu_{SI_MAX}GRNA",
"miRNA", "piRNA", "all_siRNA"]
"miRNA", "piRNA", "tRFRNA", "all_siRNA"]
# split = str.split
#
......@@ -1052,6 +1053,7 @@ def source_fastq(wildcards):
return rules.map_on_genome.output.nomap_fastq
elif read_type == "nomap_siRNA":
return rules.extract_nomap_siRNAs.output.nomap_si
# [:-3]: remove "RNA" from "{small_type}RNA"
elif read_type[:-3] in annotate_read_output:
return annotate_read_output[read_type[:-3]]
elif read_type == f"si_{SI_MIN}GRNA":
......@@ -1808,6 +1810,8 @@ rule gather_small_RNA_counts:
drop = ["piRNA"]
elif wildcards.small_type == "mi":
drop = ["miRNA"]
elif wildcards.small_type == "tRF":
drop = ["tRF"]
elif wildcards.small_type in {
*{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
......@@ -1927,6 +1931,7 @@ rule join_all_sisiu_counts:
counts_data.to_csv(output.counts_table, sep="\t")
#TODO: add tRF, then change category name
rule join_pimi22G_counts:
f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G"""
input:
......@@ -2027,6 +2032,8 @@ rule associate_small_type:
annot_counts_dir, f"all_{size_selected}_on_{genome}", "pi_counts.txt"),
mi_counts_table = OPJ(
annot_counts_dir, f"all_{size_selected}_on_{genome}", "mi_counts.txt"),
tRF_counts_table = OPJ(
annot_counts_dir, f"all_{size_selected}_on_{genome}", "tRF_counts.txt"),
all_sisiu_counts_table = rules.join_all_sisiu_counts.output.counts_table,
output:
tags_table = OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "id2tags.txt"),
......@@ -2048,10 +2055,14 @@ rule associate_small_type:
(pd.read_table(input.pi_counts_table, index_col="gene"),), "pi")
mi_tags_table = make_tag_association(
(pd.read_table(input.mi_counts_table, index_col="gene"),), "mi")
tRF_tags_table = make_tag_association(
(pd.read_table(input.tRF_counts_table, index_col="gene"),), "tRF")
# TODO: add check for intersection of pi, mi, and tRF?
assert len(pi_tags_table.index.intersection(mi_tags_table.index)) == 0, "Some genes have both pi and mi."
assert len(pi_tags_table.index.intersection(sisiu_tags_table.index)) == 0, "Some genes have both pi and sisiu."
assert len(mi_tags_table.index.intersection(sisiu_tags_table.index)) == 0, "Some genes have both mi and sisiu."
tags_table = pd.concat((sisiu_tags_table, pi_tags_table, mi_tags_table))
assert len(tRF_tags_table.index.intersection(sisiu_tags_table.index)) == 0, "Some genes have both tRF and sisiu."
tags_table = pd.concat((sisiu_tags_table, pi_tags_table, mi_tags_table, tRF_tags_table))
# Add the "all_sisiu" tag to those not already tagged
all_sisiu_tags_table = make_tag_association(
(pd.read_table(input.all_sisiu_counts_table, index_col="gene"),), "all_sisiu")
......@@ -2083,8 +2094,8 @@ def add_tags_column(data, tags_table, tag_name, logfile=None):
# - size-selected
# - mapped
# - ambiguous
# - si, pi, mi
# - all_si ((22G-26G)(T*) that are not mi or pi)
# - si, pi, mi, tRF
# - all_si ((22G-26G)(T*) that are not mi or tRF or pi)
rule make_read_counts_summary:
input:
nb_raw = rules.trim_and_dedup.output.nb_raw,
......@@ -2140,7 +2151,7 @@ rule make_read_counts_summary:
"raw", "trimmed", "deduped", "%s" % size_selected, "nomap_siRNA",
"mapped", "non_structural", "ambig_type",
*type2RNA(SISIU_TYPES),
"all_sisiuRNA", "piRNA", "miRNA"]))
"all_sisiuRNA", "piRNA", "miRNA", "tRFRNA"]))
summary_file.write("%d\t" % read_int_from_file(input.nb_raw))
summary_file.write("%d\t" % read_int_from_file(input.nb_trimmed))
summary_file.write("%d\t" % read_int_from_file(input.nb_deduped))
......@@ -2191,8 +2202,9 @@ rule make_read_counts_summary:
summary_file.write(str(sum_counts(annot_counts_files["pi"])))
summary_file.write("\t")
summary_file.write(str(sum_counts(annot_counts_files["mi"])))
summary_file.write("\t")
summary_file.write(str(sum_counts(annot_counts_files["tRF"])))
summary_file.write("\n")
# TODO: add tRF?
rule compute_median_ratio_to_pseudo_ref_size_factors:
......@@ -2537,6 +2549,7 @@ def plot_fold_heatmap(data, gene_colours=None, labels_dict=None):
# How to automaticaly partition list(chain(*small_type_series.values()))
# in groups of max(map(len, small_type_series.values())), padding with ""s ?
# TODO: Update this with new small types
# TODO (09/11/2021): Use third placeholder to add tRF?
label_order = ["pi", "mi", "", "prot_sisiu", "pseu_sisiu", "", "te_sisiu", "satel_sisiu", "simrep_sisiu"]
handles, display_labels = g.ax_col_dendrogram.get_legend_handles_labels()
label2handle = dict(zip([dislab2label[dislab] for dislab in display_labels], handles))
......@@ -4220,7 +4233,7 @@ def get_type_max_len(wildcards):
except KeyError:
# ex: nomap_siRNA
general_type = read_type.split("_")[1]
return read_type_max_len.get(general_type[:2], 0)
return read_type_max_len.get(general_type[:2], int(MAX_LEN))
rule compute_base_composition_along:
......@@ -4551,7 +4564,7 @@ rule plot_summary_barchart:
run:
name = f"{wildcards.lib}_{wildcards.rep}"
# summary = pd.read_table(input[0]).T.astype(int).loc[["nomap_siRNA", "ambig_type", "siRNA", "piRNA", "miRNA"]]
summary = pd.read_table(input[0]).T.astype(int).loc[["nomap_siRNA", "ambig_type", "prot_sisiu_22GRNA", "piRNA", "miRNA"]]
summary = pd.read_table(input[0]).T.astype(int).loc[["nomap_siRNA", "ambig_type", "prot_sisiu_22GRNA", "piRNA", "miRNA", "tRFRNA"]]
summary.columns = [name]
save_plot(output.barchart, plot_barchart, summary, title="%s small RNAs" % name)
......@@ -4590,6 +4603,8 @@ def make_lib_colour_setter(colour_series_dict):
small_type_series = {
"pi" : ["pi"],
"mi" : ["mi"],
# TODO: We don't know whether this works or not:
"tRF": ["tRF"],
"gene" : ["prot_sisiu", "pseu_sisiu", "ri_sisiu"],
"repeat" : ["te_sisiu", "satel_sisiu", "simrep_sisiu"]}
# small_type_series = {
......
......@@ -165,6 +165,14 @@ def make_annotation_processor(getter_type):
# TODO: add match_tRF:
# * start and end should fall inside a tRNA annotation
# * the annotation should be same strand as the read
def match_tRF(annot):
"""Returns True if the annotation tuple *annot* matches a tRNA and
is compatible with the AlignedSegment mapping coordinates."""
return (annot[0] == "tRNA"
and annot[1] == strand
and annot[2] <= start
and annot[3] >= end)
# sense_only = cfilter(same_strand)
# fastq = FQ_TEMPLATE % (name, seq, qual)
# Simplify and collapse in a set, group by biotype
......@@ -215,6 +223,22 @@ def make_annotation_processor(getter_type):
signature = "NA"
annotations = set(filter(
match_mi, annot_set))
elif ("tRNA" in biotypes
and any(map(match_tRF, annot_set))):
# If piRNA annotations are actually all antisense to the read,
# or if the read is not a 21U, consider it an actual tRF
if (("piRNA" in biotypes)
and has_pi_signature(seq, read_len)
and any(map(match_pi, annot_set))):
annot_name = "piRNA"
signature = f"{PI_MIN}-{PI_MAX}U"
annotations = set(filter(
match_pi, annot_set))
else:
annot_name = "tRNA"
signature = "NA"
annotations = set(filter(
match_tRF, annot_set))
elif has_pi_signature(seq, read_len):
if ("piRNA" in biotypes
and any(map(match_pi, annot_set))):
......@@ -492,12 +516,8 @@ def main():
stream_processor = FuncApplier([
count_annots, count_small,
count_first_bases])
# TODO (21/10/2021): Add tRNA-derived sRNAs (tRF):
# small_types = [
# "pi", "mi", "tRF", "all_si"
# f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES]
small_types = [
"pi", "mi", "all_si",
"pi", "mi", "tRF", "all_si",
f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES]
# The same classification logic is applied to reads
# that had a polyU tail, remapped after polyU removal,
......@@ -508,11 +528,8 @@ def main():
# One reason is that elements in small_types_u
# should be in the same order as small_types
# in order to be able to build the u2nonu dict.
# small_types_u = [
# "piu", "miu", "tRFu", "all_siu",
# f"all_siu_{SI_MIN}G", f"all_siu_{SI_MAX}G", *SIU_TYPES]
small_types_u = [
"piu", "miu", "all_siu",
"piu", "miu", "tRFu", "all_siu",
f"all_siu_{SI_MIN}G", f"all_siu_{SI_MAX}G", *SIU_TYPES]
# !! relies on the fact that dicts have stable order
# via the construction of SIU_TYPES and SI_TYPES
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment