Commit 16b41b3f authored by Blaise Li's avatar Blaise Li
Browse files

Increase siRNA size range, reduce memory issues.

22G should actually be 21-23 nt long.
parent a7b7156a
......@@ -8,7 +8,7 @@ TODO: New criteria for small RNA identification:
- piRNA
- miRNA
- all_si (no signature criteria):
- 22G antisense (including on such things as rRNA):
- 22G antisense (actually 21-23, including on such things as rRNA):
- te_22G
- prot_22G
- pseu_22G
......@@ -216,7 +216,7 @@ def has_pi_signature(str seq, int read_len):
cdef bint chas_endosi_signature(str seq, size_t read_len):
return read_len == SI_MIN and seq[0] == "G"
return (SI_MIN - 1) <= read_len <= (SI_MIN + 1) and seq[0] == "G"
def has_endosi_signature(str seq, int read_len):
......@@ -251,7 +251,7 @@ def has_csr1_endosi_signature(str seq, int read_len):
cdef bint chas_allsi_signature(str seq):
cdef unsigned int read_len = len(seq)
return (seq[0] == "G" and (SI_MIN <= read_len <= SI_MAX)) or chas_csr1_endosi_signature(seq, read_len)
return (seq[0] == "G" and ((SI_MIN - 1) <= read_len <= SI_MAX)) or chas_csr1_endosi_signature(seq, read_len)
def count_annots(annot_infos, queue):
......
This diff is collapsed.
......@@ -3,10 +3,6 @@
"""This script parses a sorted bam file and uses annotations from a gtf file to
identify small RNA categories."""
# TODO:
# * fix 21 leak in siRNA (strangely: not in all_si, but in prot_si)
# * split all_si and other downstream in 26 and others
# Doesn't work
# import pyximport
# pyximport.install(pyimport = True)
......@@ -99,111 +95,6 @@ ALI2POS_INFO = attrgetter(
"reference_end")
# Doesn't seem to work like that
# class MyParser(pysam.ctabix.asGTF):
# def parse(self, buffer, length):
# return(GTF2ATTRS(super().parse(self, buffer, length)))
# def get_read_info(ali):
# """Extracts information from AlignedSegment *ali*."""
# if ali.is_reverse:
# name, seq, qual, read_len = ALI2READ_INFO(ali)
# return (name, reverse_complement(seq), qual[::-1], read_len)
# else:
# return ALI2READ_INFO(ali)
# Slightly faster ?
# def get_read_info(ali):
# """Extracts information from AlignedSegment *ali*."""
# if ali.is_reverse:
# return (
# ali.query_name,
# reverse_complement(ali.query_sequence),
# # ali.query_qualities[::-1],
# ali.qual[::-1],
# ali.query_length,
# ali.reference_start,
# ali.reference_end,
# "-")
# else:
# return (
# ali.query_name,
# ali.query_sequence,
# # ali.query_qualities,
# ali.qual,
# ali.query_length,
# ali.reference_start,
# ali.reference_end,
# "+")
# def make_read_composition_table(min_len=21, max_len=26):
# """Initializes a table to count read first (or other) base composition
# structured by read length.
#
# To initialize:
#
# >>> table = make_read_composition_table(21, 26)
# >>> table
# A C G T
# 21 0 0 0 0
# 22 0 0 0 0
# 23 0 0 0 0
# 24 0 0 0 0
# 25 0 0 0 0
# 26 0 0 0 0
#
# To increment:
#
# >>> table.loc[23]["G"] += 1
# >>> table
# A C G T
# 21 0 0 0 0
# 22 0 0 0 0
# 23 0 0 1 0
# 24 0 0 0 0FST = itemgetter(0)
# # extract info from AlignedSegment
# ALI2READ_INFO = attrgetter(
# "query_name",
# "query_sequence
# 25 0 0 0 0
# 26 0 0 0 0
# """
# return pd.DataFrame(
# np.zeros(
# 1 + max_len - min_len,
# dtype={
# "names" : ["A", "C", "G", "T"],
# "formats" : ["uint", "uint", "uint", "uint"]}),
# index=np.arange(min_len, max_len + 1))
# def has_pi_signature(seq, read_len):
# """Returns True if this is a 21U."""
# return read_len == 21 and seq[0] == "T"
# def has_endosi_signature(seq, read_len):
# """Returns True if this is a 22G."""
# return read_len == 22 and seq[0] == "G"
# def has_26G_signature(seq, read_len):
# """Returns True if this is a 26G."""
# return read_len == 26 and seq[0] == "G"
# These should have problems mapping because the T-tail is not
# in the genome sequence
# def has_csr1_endosi_signature(seq, read_len):
# """Returns True if this is a 22G with extra Ts."""
# # if read_len <= 22, t_tail will be the empty string
# t_tail = (read_len - 22) * "T"
# if t_tail:
# return (seq[0] == "G") and (seq[22:] == t_tail)
# else:
# return False
def make_gtf_info_getter(getter_type):
"""Generates a function that extracts gtf information."""
if getter_type == "tabix":
......@@ -403,7 +294,7 @@ def make_annotation_processor(getter_type):
# annotation names that seem to be frequent and that are not protein-coding
# COMMON_NO_TARGET = {"piRNA", "miRNA", "no_signature", "22G"}
# COMMON_NO_TARGET = {"piRNA", "miRNA", "no_signature", f"{SI_MIN}G"}
def fq_writer(queue,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment