Commit a304a2af authored by Blaise Li
Browse files

Further preparatory comments.

parent 27e84e63
......@@ -187,6 +187,9 @@ def make_annotation_processor(getter_type):
# For miRNA, we also want the coordinates to match exactly.
# For piRNA, we also want the coordinates to match exactly
# and the read to be 21U (actually, we want to accept longer reads).
# TODO (21/10/2021): Add a tRNA category
# with no constraints on the coordinates of read extremities.
# Where in the priority order? After pi and mi, before si?
#
# If not a miRNA or piRNA, then it might be an RdRP-derived endosiRNA.
# We want to keep the annotations for further analyses.
......@@ -259,7 +262,6 @@ def make_annotation_processor(getter_type):
# It is faster using at:
# compositions[annot].at[read_len, seq[0]] += 1
return (annot_name, signature), (annotations, (name, seq, qual, read_len, strand))
# return annot_name, (annotations, get_read_info(ali))
return process_annotations
# First: Count bona fide piRNA and miRNA
......@@ -477,7 +479,7 @@ def main():
# The functions in the list passed to the FuncApplier object
# process chunks of annotations.
# The FuncApplier object is a callable that replicates
# the chunks for these functions to process, and put the
# the chunks for these functions to process, and puts the
# corresponding results in a tuple.
# stream_processor = FuncApplier([
# count_annots, count_pimis, count_sis, count_all_sis,
......@@ -485,10 +487,22 @@ def main():
stream_processor = FuncApplier([
count_annots, count_small,
count_first_bases])
# TODO (21/10/2021): Add tRNA-derived sRNAs:
# small_types = [
# "pi", "mi", "ti", "all_si",
# f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES]
small_types = [
"pi", "mi", "all_si",
f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES]
# The same classification logic is applied to reads
# that had a polyU tail, remapped after polyU removal,
# but read and output file names are different.
if args.remapped_polyU:
# We don't really care about piu and miu,
# but they are here for code simplicity.
# One reason is that elements in small_types_u
# should be in the same order as small_types
# in order to be able to build the u2nonu dict.
small_types_u = [
"piu", "miu", "all_siu",
f"all_siu_{SI_MIN}G", f"all_siu_{SI_MAX}G", *SIU_TYPES]
......@@ -496,6 +510,9 @@ def main():
# via the construction of SIU_TYPES and SI_TYPES
# from SMALL_TYPES_TREE
u2nonu = dict(zip(small_types_u, small_types)).get
# File paths dictionaries use u2nonu to have the base
# small type names as keys, so that further code can
# ignore the U vs. non-U distinction (see fq_writer).
fq_paths = {u2nonu(small_type): OPJ(
args.reads_dir, f"{small_type}RNA.fastq.gz") for small_type in small_types_u}
ambig_type_path = OPJ(args.out_dir, "ambig_typeU.txt")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment