Skip to content
Snippets Groups Projects
Commit 38c641d1 authored by Blaise Li's avatar Blaise Li
Browse files

Merging size classes in iCLIP data.

These reads are then merged with "noadapt".
Also added mapping parameters for "noadapt" reads.
parent 9dd243ee
Branches
Tags
No related merge requests found
......@@ -118,6 +118,7 @@ ORIENTATIONS = ["fwd", "rev", "all"]
WITH_ADAPT = ["adapt_deduped", "adapt_nodedup"]
POST_TRIMMING = ["noadapt_deduped"] + WITH_ADAPT
SIZE_RANGES = ["12-18", "21-24", "26-40", "48-52"]
# Note: size is only knowable where the adaptor was found
SIZE_SELECTED = [f"{read_type}_{size_range}" for (read_type, size_range) in product(WITH_ADAPT, SIZE_RANGES)]
# TODO: Have different settings for different size ranges
......@@ -131,6 +132,8 @@ alignment_settings = {
"21-24": "--local -L 14 -i S,1,0.5 -N 1",
"26-40": "--local -L 15 -i S,1,0.5 -N 1",
"48-52": "--local -L 16 -i S,1,0.5 -N 1",
"noadapt": "--local -L 17 -i S,1,0.5 -N 1",
# What is used for sRNA-seq
"default": "-L 6 -i S,1,0.8 -N 0",
},
# Small RNA-seq parameters may not be compatible with --local
......@@ -140,6 +143,7 @@ alignment_settings = {
"21-24": "-k 20 --stranded --use-x-in-cigar",
"26-40": "-k 20 --stranded --use-x-in-cigar",
"48-52": "-k 20 --stranded --use-x-in-cigar",
"noadapt": "-k 20 --stranded --use-x-in-cigar",
"default": "-k 20 --stranded --use-x-in-cigar",
}
}
......@@ -204,9 +208,9 @@ counting = [
## Will be pulled in as dependencies of other needed results:
# expand(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
##
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, norm=NORM_TYPES, orientation=["all"]),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], norm=NORM_TYPES, orientation=["all"]),
]
#TODO:
......@@ -450,6 +454,9 @@ def set_alignment_settings(wildcards):
m = size_range_re.match(wildcards.read_type)
if m:
(size_range,) = m.groups()
else:
if wildcards.read_type.startswith("noadapt_deduped"):
size_range = "noadapt"
else:
size_range = "default"
return alignment_settings[aligner][size_range]
......@@ -521,6 +528,8 @@ rule sam2indexedbam:
output:
sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome),
index = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome),
wildcard_constraints:
read_type = "|".join(SIZE_SELECTED)
message:
"Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
log:
......@@ -543,20 +552,53 @@ rule compute_mapping_stats:
"""samtools stats {input.sorted_bam} > {output.stats}"""
rule merge_size_ranges_bams:
"""This rule fuses the sorted bam files corresponding to the mapping
of the reads belonging to different size ranges."""
input:
sorted_bams = expand(
OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "{{lib}}_{{rep}}_{{read_type}}_{size_range}_on_%s_sorted.bam" % genome),
size_range=SIZE_RANGES),
output:
sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome),
bai = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome),
wildcard_constraints:
read_type = "|".join(WITH_ADAPT)
message:
"Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}"
log:
log = OPJ(log_dir, "{trimmer}", "merge_size_ranges_bams", "{lib}_{rep}_{read_type}.log"),
err = OPJ(log_dir, "{trimmer}", "merge_size_ranges_bams", "{lib}_{rep}_{read_type}.err"),
shell:
"""
samtools merge -c {output.sorted_bam} {input.sorted_bams} 1> {log.log} 2> {log.err}
indexed=""
while [ ! ${{indexed}} ]
do
samtools index {output.sorted_bam} && indexed="OK"
if [ ! ${{indexed}} ]
then
rm -f {output.bai}
echo "Indexing failed. Retrying" 1>&2
fi
done 1>> {log.log} 2>> {log.err}
"""
rule fuse_bams:
"""This rule fuses the two sorted bam files corresponding to the mapping
of the reads containing the adaptor or not."""
input:
noadapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_noadapt_on_%s_sorted.bam" % genome),
adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_adapt_on_%s_sorted.bam" % genome),
noadapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_noadapt_deduped_on_%s_sorted.bam" % genome),
adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_adapt_deduped_on_%s_sorted.bam" % genome),
output:
sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s_sorted.bam" % genome),
bai = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_C_%s_sorted.bam.bai" % genome),
sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_deduped_on_%s_sorted.bam" % genome),
bai = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_deduped_on_%s_sorted.bam.bai" % genome),
message:
"Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}"
"Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}_deduped"
log:
log = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.log"),
err = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.err"),
log = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}_deduped.log"),
err = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}_deduped.err"),
shell:
"""
samtools merge -c {output.sorted_bam} {input.noadapt_sorted_bam} {input.adapt_sorted_bam} 1> {log.log} 2> {log.err}
......@@ -587,7 +629,7 @@ rule feature_count_reads:
bai = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome),
output:
counts = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
counts_converted = OPJ("{trimmer}", aligner, "mapped_C_elegans", "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts_gene_names.txt"),
counts_converted = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts_gene_names.txt"),
params:
stranded = feature_orientation2stranded(LIB_TYPE),
annot = biotype2annot,
......@@ -598,7 +640,6 @@ rule feature_count_reads:
err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}.err")
shell:
"""
# converter="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes_id2name.pickle"
tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
featureCounts -v 2> {log.log}
......@@ -728,7 +769,7 @@ rule make_normalized_bigwig:
#size_factor_file = rules.compute_coverage.output.coverage
#median_ratios_file = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
# TODO: compute this
#scale_factor_file = OPJ(aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"),
#scale_factor_file = OPJ(aligner, "mapped_%s" % genome, "annotation", "all_%s_on_%s" % (size_selected, genome), "pisimi_median_ratios_to_pseudo_ref.txt"),
output:
bigwig_norm = OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome),
#params:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment