Skip to content
Snippets Groups Projects
Commit 00f001de authored by Blaise Li's avatar Blaise Li
Browse files

Remapping unmapped reads for iCLIP.

The remapping is done with one mismatch allowed in the seed. It happened
that very little mapped reads had mismatches, although read-through
reads can have some.
parent 0e3dab89
No related branches found
No related tags found
No related merge requests found
......@@ -38,6 +38,12 @@ from smincludes import rules as irules
alignment_settings = {
"bowtie2": "--local -L 6 -i S,1,0.8 -N 0",
"crac": "-k 20 --stranded --use-x-in-cigar"}
# Lower stringency settings, to remap the unmapped
realignment_settings = {
# Allow more mismatches in the seed
"bowtie2": "--local -L 6 -i S,1,0.8 -N 1",
# TODO: Find how to be less stringent with crac
"crac": "-k 20 --stranded --use-x-in-cigar"}
# Define functions to be used in shell portions
shell.prefix(SHELL_FUNCTIONS)
......@@ -120,7 +126,10 @@ mapping = [
## Will be pulled in as dependencies of other needed results:
# expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
##
expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
expand(
OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome),
trimmer=TRIMMERS, lib=LIBS, rep=REPS,
read_type=POST_TRIMMING + SIZE_SELECTED + [f"{to_map}_unmapped" for to_map in POST_TRIMMING + SIZE_SELECTED]),
]
counting = [
......@@ -133,9 +142,14 @@ counting = [
]
#TODO:
# - Plot histogram of read type counts at successive processing steps
# - Remap unmapped with less stringency to check if we are too stringent
# (- remove deduplication step ?)
# - map and featureCount rev/fwd: fwd -> mRNA, rev -> smallRNA
# - map with CRAC, detect chimera and crosslink-induced sequencing erros
# - map with CRAC, detect chimera and crosslink-induced sequencing errors
# - find cross-link sites on genes: should be 5' of antisense reads
# (otherwise, we expect mismatches at the cross-link sites: distribution of mismatch positions in the reads)
# see also https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1130-x
rule all:
"""This top rule is used to drive the whole workflow by taking as input its final products."""
input:
......@@ -242,6 +256,8 @@ def source_fastq(wildcards):
return rules.trim_and_dedup.output.noadapt
elif read_type == "adapt_nodedup":
return rules.trim_and_dedup.output.adapt_nodedup
elif read_type.endswith("unmapped"):
return rules.map_on_genome.output.nomap_fastq
else:
raise NotImplementedError("Unknown read type: %s" % read_type)
......@@ -353,10 +369,16 @@ rule plot_size_distribution:
title = f"read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}"
plot_histo(output[0], data, title)
def set_alignment_settings(wildcards):
if wildcards.read_type.endswith("unmapped"):
return realignment_settings[aligner]
else:
return alignment_settings[aligner]
###########
# Mapping #
###########
# TODO: replace settings by function of read_type
rule map_on_genome:
input:
# fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
......@@ -365,10 +387,13 @@ rule map_on_genome:
# sam files take a lot of space
sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome)),
nomap_fastq = OPJ(output_dir, "{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome),
wildcard_constraints:
read_type = "|".join(POST_TRIMMING + SIZE_SELECTED)
params:
aligner = aligner,
index = genome_db,
settings = alignment_settings[aligner],
# settings = set_alignment_settings,
message:
"Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on C. elegans genome."
log:
......@@ -379,6 +404,34 @@ rule map_on_genome:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"
rule remap_on_genome:
input:
# fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
#fastq = rules.map_on_genome.output.nomap_fastq,
fastq = OPJ(output_dir, "{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome),
output:
# sam files take a lot of space
sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.sam" % genome)),
nomap_fastq = OPJ(output_dir, "{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_unmapped_on_%s.fastq.gz" % genome),
wildcard_constraints:
read_type = "|".join(POST_TRIMMING + SIZE_SELECTED)
#wildcard_constraints:
# read_type = "|".join([f"{to_map}_unmapped" for to_map in POST_TRIMMING + SIZE_SELECTED])
params:
aligner = aligner,
index = genome_db,
settings = realignment_settings[aligner],
# settings = set_alignment_settings,
message:
"Re-mapping unmapped {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on C. elegans genome."
log:
log = OPJ(log_dir, "{trimmer}", aligner, "remap_{read_type}_unmapped_on_genome", "{lib}_{rep}.log"),
err = OPJ(log_dir, "{trimmer}", aligner, "remap_{read_type}_unmapped_on_genome", "{lib}_{rep}.err"),
threads: 12
wrapper:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"
rule sam2indexedbam:
input:
sam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment