diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile index 1e03860beff7f0e9d963ace0b4ebff630980b74a..c7955cf2d2639714c125ad35d1ebc8cfdcc9b0c8 100644 --- a/CLIP/iCLIP.snakefile +++ b/CLIP/iCLIP.snakefile @@ -32,26 +32,6 @@ from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_cont from libworkflows import feature_orientation2stranded, read_feature_counts, sum_feature_counts from smincludes import rules as irules -# TODO: Have different settings for different size ranges -# recommended k-mer length for D. melanogaster is 20 -# However, reads shorter thant the k-mer length will be ignored. -# http://crac.gforge.inria.fr/documentation/crac/#sec-2 -alignment_settings = { - #"bowtie2": "-L 6 -i S,1,0.8 -N 0", - "bowtie2": "-L 6 -i S,1,0.8 -N 1", - # Small RNA-seq parameters may not be compatible with --local - #"bowtie2": "--local -L 6 -i S,1,0.8 -N 0", - "crac": "-k 20 --stranded --use-x-in-cigar"} -# Lower stringency settings, to remap the unmapped -realignment_settings = { - # Try with almost-default settings - "bowtie2": "-N 1", - # Allow more mismatches in the seed - # Reduce minimal mismatch and gap open penalties - #"bowtie2": "--local -L 6 -i S,1,0.8 -N 1 --mp 6,1 --rdg 4,3", - # TODO: Find how to be less stringent with crac - "crac": "-k 20 --stranded --use-x-in-cigar"} - # Define functions to be used in shell portions shell.prefix(SHELL_FUNCTIONS) @@ -103,6 +83,37 @@ WITH_ADAPT = ["adapt_deduped", "adapt_nodedup"] POST_TRIMMING = ["noadapt_deduped"] + WITH_ADAPT SIZE_RANGES = ["12-18", "21-24", "26-40", "48-52"] SIZE_SELECTED = [f"{read_type}_{size_range}" for (read_type, size_range) in product(WITH_ADAPT, SIZE_RANGES)] + +# TODO: Have different settings for different size ranges +# recommended k-mer length for D. melanogaster is 20 +# However, reads shorter thant the k-mer length will be ignored. +# http://crac.gforge.inria.fr/documentation/crac/#sec-2 +alignment_settings = { + #"bowtie2": "-L 6 -i S,1,0.8 -N 0", + "bowtie2": "-L 6 -i S,1,0.8 -N 1", + # Small RNA-seq parameters may not be compatible with --local + #"bowtie2": "--local -L 6 -i S,1,0.8 -N 0", + "crac": "-k 20 --stranded --use-x-in-cigar"} +# Lower stringency settings, to remap the unmapped +realignment_settings = { + # Try with almost-default settings + "bowtie2": "-N 1", + # Allow more mismatches in the seed + # Reduce minimal mismatch and gap open penalties + #"bowtie2": "--local -L 6 -i S,1,0.8 -N 1 --mp 6,1 --rdg 4,3", + # TODO: Find how to be less stringent with crac + "crac": "-k 20 --stranded --use-x-in-cigar"} +#test_alignment_settings = { +# "bowtie2": { +# "": "-L 6 -i S,1,0.8 -N 1", +# "": "-L 6 -i S,1,0.8 -N 1", +# "": "-L 6 -i S,1,0.8 -N 1", +# "": "-L 6 -i S,1,0.8 -N 1", +# "": "-L 6 -i S,1,0.8 -N 1", +# "": "-L 6 -i S,1,0.8 -N 1"} +# } + + # For compatibility with trim_and_dedup as used in PRO-seq pipeline lib2adapt = defaultdict(lambda: config["adapter"]) MAX_ADAPT_ERROR_RATE = config["max_adapt_error_rate"] diff --git a/libhts/libhts/__init__.py b/libhts/libhts/__init__.py index 5a45ee3d20d74e0d790407fe56946f55080bea73..b1aab224f06b5b3d591665a04587c727ed0e5ca8 100644 --- a/libhts/libhts/__init__.py +++ b/libhts/libhts/__init__.py @@ -1,6 +1,7 @@ from .libhts import ( do_deseq2, gtf_2_genes_exon_lengths, make_empty_bigwig, + make_seeding_function, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_histo, plot_lfc_distribution, plot_MA, diff --git a/libhts/libhts/libhts.py b/libhts/libhts/libhts.py index b3775766fea5c565be3b82cc8a08cbb27e24f24f..b087970d62aad6d064fda60139f873864dc2ff51 100644 --- a/libhts/libhts/libhts.py +++ b/libhts/libhts/libhts.py @@ -1,4 +1,4 @@ -from math import floor, ceil +from math import floor, ceil, sqrt, log from functools import reduce from re import sub import warnings @@ -177,13 +177,67 @@ def spikein_gtf_2_lengths(spikein_gtf): def make_empty_bigwig(filename, chrom_sizes): + """Writes *filename* so that it is an empty bigwig file. + *chrom_sizes is a dictionary giving chromosome sizes* given + chomosome names. + """ bw_out = pyBigWig.open(filename, "w") bw_out.addHeader(list(chrom_sizes.items())) for (chrom, chrom_len) in bw_out.chroms().items(): - bw_out.addEntries(chrom, 0, values=np.nan_to_num(np.zeros(chrom_len)[0::10]), span=10, step=10) + bw_out.addEntries( + chrom, 0, + values=np.nan_to_num(np.zeros(chrom_len)[0::10]), + span=10, step=10) bw_out.close() +def zero(value): + return 0 + + +def identity(value): + return value + + +bowtie2_function_selector = { + "C": zero, + "L": identity, + "S": sqrt, + "G": log} + + +def make_seeding_function(seeding_string): + """Generates a function that computes the seeding pattern given a + string representing bowtie2 seeding settings (-L and -i options). + + >>> make_seeding_function("-L 6 -i S,1,0.8")(18) + [[0, 6], [4, 10], [8, 14], [12, 18]] + """ + [opt1, val1, opt2, val2] = seeding_string.split() + if opt1 == "-L": + assert opt2 == "-i" + seed_len = int(val1) + interval_string = val2 + else: + assert opt2 == "-L" + seed_len = int(val2) + interval_string = val1 + [func_type, constant, coeff] = interval_string.split(",") + constant = float(constant) + coeff = float(coeff) + func_type = bowtie2_function_selector[func_type] + def seeding_function(read_len): + interval = floor(constant + (coeff * func_type(read_len))) + seeds = [] + seed = [0, seed_len] + while seed[1] <= read_len: + seeds.append(seed) + next_seed_start = seed[0] + interval + seed = [next_seed_start, next_seed_start + seed_len] + return seeds + return seeding_function + + def do_deseq2(cond_names, conditions, counts_data, formula=None, contrast=None, deseq2_args=None): """Runs a DESeq2 differential expression analysis."""