diff --git a/Degradome-seq/degradome_metaprofile.py b/Degradome-seq/degradome_metaprofile.py index ec5a3e3bb3c47ed234caf0bf77d4e1989bd40053..20a11541bdf6d448c9c44c7639ac0a5995de649f 100755 --- a/Degradome-seq/degradome_metaprofile.py +++ b/Degradome-seq/degradome_metaprofile.py @@ -217,7 +217,10 @@ def dists_to_ali(ali, neighbours): msg = f"{chrom1}, {pos1}, {strand1}\n" + ", ".join([str(ali2) for ali2 in neighbours]) assert all(compatible(ali2) for ali2 in neighbours), msg - return Counter(pos2 - pos1 for (_, pos2, _) in neighbours) + if strand1 == "+": + return Counter(pos2 - pos1 for (_, pos2, _) in neighbours) + else: + return Counter(pos1 - pos2 for (_, pos2, _) in neighbours) # TODO: implement filtering from a bed or gtf file (for instance, to indicate the coordinates of genes or gens portions of interest) diff --git a/Degradome-seq/test_data/Simulate_degradome_and_small_reads.py b/Degradome-seq/test_data/Simulate_degradome_and_small_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..c1f9fb87fa64a38a7ab727a39ca56db129c55fe4 --- /dev/null +++ b/Degradome-seq/test_data/Simulate_degradome_and_small_reads.py @@ -0,0 +1,148 @@ + +# coding: utf-8 + +# In[1]: + +from Bio import SeqIO +from gzip import open as gzopen +genome="/pasteur/homes/bli/Documents/Cecere_team/Genome_image/genomes/Yersinia_pestis.fna.gz" +chrom="NC_003143.1" +with gzopen(genome, "rt") as handle: + records = dict((rec.id, rec) for rec in SeqIO.parse(handle, "fasta")) + + +# In[2]: + +get_ipython().run_line_magic('matplotlib', 'inline') +import numpy as np +from seaborn import distplot +import matplotlib.pyplot as plt +fig, axes = plt.subplots(nrows=2, ncols=3) +points = 0.5 * np.random.gamma(4, 1.6, 1000) +distplot(points, ax=axes[0, 0]) + +points = (10 * np.random.poisson(0.3, 1000)) - 4 +distplot(points, ax=axes[0, 1]) + +points = np.random.beta(0.3, 8, 1000) +distplot(points, ax=axes[0, 2]) + +points = np.random.laplace(8, 5, 1000) +distplot(points, ax=axes[1, 0]) + +points = np.random.laplace(8, 2, 1000) +distplot(points, ax=axes[1, 1]) + +points = np.floor(np.random.standard_cauchy(20)) +distplot(points, ax=axes[1, 2]) + +fig.set_figwidth(17) +sorted(points) + + +# In[3]: + +offsets = np.concatenate([5 * np.random.poisson(0.3, 20) - 5, np.floor(np.random.standard_cauchy(20))]) +distplot(offsets) +sorted(offsets) + + +# In[4]: + +# forward small read +sf_start = 100 +sf_stop = 121 +sf = "\t".join((chrom, f"{sf_start}", f"{sf_stop}", "sf", ".", "+")) +# degradome reverse reads +dr_start_offsets = offsets + +drs = list(map( + "\t".join, + [(chrom, f"{int(sf_start + dr_start_offset + 1 - dr_len)}", f"{int(sf_start + dr_start_offset + 1)}", f"dr{i}", ".", "-") for ( + i, + (dr_start_offset, dr_len)) in enumerate(zip(dr_start_offsets, np.random.randint(24, 36, len(dr_start_offsets))))])) +#print(sf, *drs, sep="\n") + + +# In[5]: + +# reverse small read +sr_start = 117 +sr_stop = 139 +sr = "\t".join((chrom, f"{sr_start}", f"{sr_stop}", "sr", ".", "-")) +# degradome forward reads +df_start_offsets = [0 - offset for offset in offsets] + +dfs = list(map( + "\t".join, + [(chrom, f"{int(sr_start + df_start_offset)}", f"{int(sr_start + df_start_offset + df_len)}", f"df{i}", ".", "+") for ( + i, + (df_start_offset, df_len)) in enumerate(zip(df_start_offsets, np.random.randint(24, 36, len(df_start_offsets))))])) +#print(sr, *dfs, sep="\n") + + +# In[6]: + +from pybedtools import BedTool +import tempfile +with tempfile.NamedTemporaryFile(mode="w") as handle: + SeqIO.write(records.values(), handle, "fasta") + drs_fasta = BedTool("\n".join(drs), from_string=True).sequence(fi=handle.name) + degradome_rev_revcompl = [] + with open(drs_fasta.seqfn) as in_fasta_handle: + for record in SeqIO.parse(in_fasta_handle, "fasta"): + record.seq = record.seq.reverse_complement() + degradome_rev_revcompl.append(record) + with open("degradome_rev.fa", "w") as out_fasta_handle: + SeqIO.write(degradome_rev_revcompl, out_fasta_handle, "fasta") + dfs_fasta = BedTool("\n".join(dfs), from_string=True).sequence(fi=handle.name) + dfs_fasta.save_seqs("degradome_fwd.fa") + #with open("degradome_fwd.fa", "w") as out_fasta_handle: + # SeqIO.write(degradome_rev_revcompl, out_fasta_handle, "fasta") + #TODO: reverse-complement the antisense reads, write to file, map on genome, get bam file to test + sf_fasta = BedTool(sf, from_string=True).sequence(fi=handle.name) + sf_fasta.save_seqs("small_fwd.fa") + sr_fasta = BedTool(sr, from_string=True).sequence(fi=handle.name) + with open(sr_fasta.seqfn) as in_fasta_handle: + record = SeqIO.read(in_fasta_handle, "fasta") + record.seq = record.seq.reverse_complement() + with open("small_rev.fa", "w") as out_fasta_handle: + SeqIO.write(record, out_fasta_handle, "fasta") + + +# In[7]: + +get_ipython().system('bowtie2-build --seed 123 /pasteur/homes/bli/Documents/Cecere_team/Genome_image/genomes/Yersinia_pestis.fna.gz Yersinia_pestis') + + +# In[8]: + +get_ipython().system('cat small_fwd.fa small_rev.fa > to_map_small.fa') +get_ipython().system('cat degradome_rev.fa degradome_fwd.fa > to_map_degradome.fa') + + +# In[9]: + +get_ipython().system('bowtie2 --seed 123 -N 0 -i S,1,0.8 -f -x Yersinia_pestis -U to_map_small.fa -S mapped_small.sam') + + +# In[10]: + +get_ipython().system('bowtie2 --seed 123 -N 0 -i S,1,0.8 -f -x Yersinia_pestis -U to_map_degradome.fa -S mapped_degradome.sam') + + +# In[11]: + +get_ipython().system('sam2indexedbam.sh mapped_small.sam') + + +# In[12]: + +get_ipython().system('sam2indexedbam.sh mapped_degradome.sam') + + +# In[13]: + +from collections import Counter +Counter(offsets) +