diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile
index 44505bda0eae26aea9d10717fdb56e9031de68ff..385431bc3c9d5488608dd16a95d671dd72cfc0e6 100644
--- a/CLIP/iCLIP.snakefile
+++ b/CLIP/iCLIP.snakefile
@@ -10,8 +10,10 @@ if major < 3 or (major == 3 and minor < 6):
 import os
 OPJ = os.path.join
 from glob import glob
+from subprocess import CalledProcessError
 from collections import defaultdict
+from itertools import product
 
 import matplotlib as mpl
 # To be able to run the script without a defined $DISPLAY
@@ -24,8 +26,10 @@ mpl.rcParams["font.family"] = "sans-serif"
 import pandas as pd
 import matplotlib.pyplot as plt
 
+from libhts import median_ratio_to_pseudo_ref_size_factors
 from libworkflows import get_chrom_sizes
-from libworkflows import ensure_relative, SHELL_FUNCTIONS, warn_context
+from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_context
+from libworkflows import feature_orientation2stranded, read_feature_counts, sum_feature_counts
 from smincludes import rules as irules
 
 # recommended k-mer length for D. melanogaster is 20
@@ -67,32 +71,69 @@ for (barcode, lib_info) in barcode_dict.items():
     lib2raw[lib_info["lib"]][lib_info["rep"]] = OPJ(demux_dir, f"{barcode}.fastq.gz")
 LIBS = list(lib2raw.keys())
 REPS = sorted(REPS)
-
-TRIMMERS = ["fastx_clipper", "cutadapt"]
-READ_TYPES = ["noadapt_deduped", "adapt_deduped", "adapt_nodedup"]
+CONDITIONS = [{
+    "lib" : lib,
+    "rep" : rep} for rep in REPS for lib in LIBS]
+# We use this in various places so that the library order is always the same:
+COND_NAMES = ["_".join((
+    cond["lib"],
+    cond["rep"])) for cond in CONDITIONS]
+COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
+    cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
+
+LIB_TYPE = config["lib_type"]
 #TRIMMERS = ["fastx_clipper"]
+TRIMMERS = ["cutadapt"]
+COUNTERS = ["feature_count"]
+ORIENTATIONS = ["fwd", "rev", "all"]
+WITH_ADAPT = ["adapt_deduped", "adapt_nodedup"]
+POST_TRIMMING = ["noadapt_deduped"] + WITH_ADAPT
+SIZE_RANGES = ["12-18", "21-24", "26-40", "48-52"]
+SIZE_SELECTED = [f"{read_type}_{size_range}" for (read_type, size_range) in product(WITH_ADAPT, SIZE_RANGES)]
 # For compatibility with trim_and_dedup as used in PRO-seq pipeline
 lib2adapt = defaultdict(lambda: config["adapter"])
 MAX_ADAPT_ERROR_RATE = config["max_adapt_error_rate"]
+COUNT_BIOTYPES = ["protein_coding", "DNA_transposons_rmsk_families", "RNA_transposons_rmsk_families"]
+SIZE_FACTORS = ["protein_coding", "median_ratio_to_pseudo_ref"]
+assert set(SIZE_FACTORS).issubset(set(COUNT_BIOTYPES) | {"median_ratio_to_pseudo_ref"})
+NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"]
+assert set(NORM_TYPES).issubset(set(SIZE_FACTORS))
+
 wildcard_constraints:
     lib="|".join(LIBS),
-    rep="\d+"
+    rep="\d+",
+    orientation="|".join(ORIENTATIONS),
+    #size_range="\d+-\d+"
 
 preprocessing = [
-    expand(OPJ(demux_dir, "{barcode}.fastq.gz"), barcode=BARCODES),
-    expand(OPJ(data_dir, "{lib}_{rep}.fastq.gz"), lib=LIBS, rep=REPS),
-    expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
-    expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
-    expand(OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.pdf"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+    ## Will be pulled in as dependencies of other needed results:
+    # expand(OPJ(demux_dir, "{barcode}.fastq.gz"), barcode=BARCODES),
+    # expand(OPJ(data_dir, "{lib}_{rep}.fastq.gz"), lib=LIBS, rep=REPS),
+    # expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
+    ##
+    expand(OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
+    expand(OPJ(data_dir, "trimmed_{trimmer}", "read_stats", "{lib}_{rep}", "{read_type}_size_distribution.pdf"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING),
 ]
 
 mapping = [
-    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
-    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=READ_TYPES),
+    ## Will be pulled in as dependencies of other needed results:
+    # expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
+    ##
+    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED),
+]
+
+counting = [
+    ## Will be pulled in as dependencies of other needed results:
+    # expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
+    ##
+    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, orientation=ORIENTATIONS),
+    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
+    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_by_{norm_type}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, norm_type=NORM_TYPES, orientation=["all"]),
 ]
 
 #TODO:
+# - map and featureCount rev/fwd: fwd -> mRNA, rev -> smallRNA
 # - map with CRAC, detect chimera and crosslink-induced sequencing errors
 # - find cross-link sites on genes: should be 5' of antisense reads
 
 rule all:
@@ -100,6 +141,7 @@ rule all:
     input:
         preprocessing,
         mapping,
+        counting,
 
 
 #################
@@ -113,7 +155,7 @@ rule demultiplex:
     params:
        demux_dir = demux_dir,
        bc_start = config["bc_start"],
-        barcodes = " ".join(BARCODES),
+        barcodes = " -b ".join(BARCODES),
        max_diff = MAX_DIFF
     log:
        err = OPJ(log_dir, "demultiplex.err")
@@ -121,7 +163,8 @@ rule demultiplex:
        OPJ(log_dir, "demultiplex_benchmark.txt")
     shell:
        """
-       qaf-demux -i {input.fq_in} -o {params.demux_dir} \\
+       /pasteur/homes/bli/src/bioinfo_utils/fastq-demultiplexing/Nim/qaf_demux \\
+           -i {input.fq_in} -g -o {params.demux_dir} \\
            -s {params.bc_start} -b {params.barcodes} -m {params.max_diff} \\
            2> {log.err} || error_exit "qaf_demux failed"
        """
@@ -172,11 +215,27 @@ MAX_ERROR_RATE="{params.max_adapt_error_rate}" THREADS="{threads}" {params.proce
         shell(shell_commands)
 
 
+def source_trimmed_fastq(wildcards):
+    """Determine the trimmed fastq file from which a size-selected read type derives."""
+    # remove size range
+    read_type = "_".join(wildcards.read_type.split("_")[:-1])
+    if read_type == "adapt_deduped":
+        return rules.trim_and_dedup.output.adapt
+    elif read_type == "noadapt_deduped":
+        return rules.trim_and_dedup.output.noadapt
+    elif read_type == "adapt_nodedup":
+        return rules.trim_and_dedup.output.adapt_nodedup
+    else:
+        raise NotImplementedError("Unknown read type: %s" % read_type)
+
+
 def source_fastq(wildcards):
     """Determine the fastq file corresponding to a given read type."""
     read_type = wildcards.read_type
     if read_type == "raw":
         return rules.link_raw_data.output
+    elif read_type in SIZE_SELECTED:
+        return rules.select_size_range.output.selected
     elif read_type == "adapt_deduped":
         return rules.trim_and_dedup.output.adapt
     elif read_type == "noadapt_deduped":
@@ -187,15 +246,86 @@ def source_fastq(wildcards):
         raise NotImplementedError("Unknown read type: %s" % read_type)
 
 
+def awk_size_filter(wildcards):
+    """Returns the bioawk filter to select reads of type *wildcards.read_type*."""
+    size_range = wildcards.read_type.split("_")[-1]
+    (min_len, max_len) = size_range.split("-")
+    return f"{min_len} <= length($seq) && length($seq) <= {max_len}"
+
+
+rule select_size_range:
+    """Select (and count) reads in the correct size range."""
+    input:
+        source_trimmed_fastq,
+    output:
+        selected = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
+        nb_selected = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_{read_type}.txt"),
+    wildcard_constraints:
+        read_type = "|".join(SIZE_SELECTED)
+    params:
+        awk_filter = awk_size_filter,
+    message:
+        "Selecting {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}."
+    shell:
+        """
+        bioawk -c fastx '{params.awk_filter} {{print "@"$name" "$4"\\n"$seq"\\n+\\n"$qual}}' {input} \\
+            | tee >(count_fastq_reads {output.nb_selected}) \\
+            | gzip > {output.selected}
+        """
+
+
+# for with_size_selection in [True, False]:
+#     if with_size_selection:
+#         rule do_fastqc_on_size_selected:
+#             input:
+#                 fastq = source_fastq
+#             output:
+#                 fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_{size_range}_fastqc.html")
+#             shell:
+#                 """
+#                 fastqc {input.fastq}
+#                 """
+#     else:
+#         rule do_fastqc:
+#             input:
+#                 fastq = source_fastq
+#             output:
+#                 fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html")
+#             shell:
+#                 """
+#                 fastqc {input.fastq}
+#                 """
+
 rule do_fastqc:
     input:
-        fastq = source_fastq
+        fastq = source_fastq,
     output:
         fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html")
     shell:
        """
        fastqc {input.fastq}
        """
+
+# rule do_fastqc_on_trimmed:
+#     input:
+#         fastq = source_fastq,
+#     output:
+#         fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_fastqc.html")
+#     shell:
+#        """
+#        fastqc {input.fastq}
+#        """
#
#
+# rule do_fastqc_on_size_selected:
+#     input:
+#         fastq = source_fastq,
+#         #fastq = rules.select_size_range.output.selected,
+#     output:
+#         fastqc_out = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}_{size_range}_fastqc.html")
+#     shell:
+#        """
+#        fastqc {input.fastq}
+#        """
 
 
 rule compute_size_distribution:
@@ -302,7 +432,7 @@ rule compute_mapping_stats:
 
 
 rule fuse_bams:
-    """This rules fuses the two sorted bam files corresponding to the mapping
+    """This rule fuses the two sorted bam files corresponding to the mapping
     of the reads containing the adaptor or not."""
     input:
         noadapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_noadapt_on_%s_sorted.bam" % genome),
@@ -331,6 +461,193 @@ rule fuse_bams:
         """
 
 
+def biotype2annot(wildcards):
+    if wildcards.biotype.endswith("_rmsk_families"):
+        biotype = wildcards.biotype[:-9]
+    else:
+        biotype = wildcards.biotype
+    return OPJ(annot_dir, f"{biotype}.gtf")
+
+
+rule feature_count_reads:
+    input:
+        sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome),
+        bai = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome),
+    output:
+        counts = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
+        counts_converted = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts_gene_names.txt"),
+    params:
+        stranded = feature_orientation2stranded(LIB_TYPE),
+        annot = biotype2annot,
+    message:
+        "Counting {wildcards.orientation} {wildcards.biotype} {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
+    log:
+        log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}.log"),
+        err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}.err")
+    shell:
+        """
+        converter="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes_id2name.pickle"
+        tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
+        cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g gene_id -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
+        featureCounts -v 2> {log.log}
+        echo ${{cmd}} 1>> {log.log}
+        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
+        rm -rf ${{tmpdir}}
+        cat {output.counts} | id2name.py ${{converter}} > {output.counts_converted}
+        """
+
+
+rule summarize_feature_counts:
+    """For a given library, compute the total counts for each biotype and write them in a summary table."""
+    input:
+        biotype_counts_files = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{{lib}}_{{rep}}_{{read_type}}_on_%s" % genome, "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES),
+    output:
+        summary = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{read_type}_on_%s_{orientation}_counts.txt" % genome),
+    run:
+        sum_counter = sum_feature_counts
+        with open(output.summary, "w") as summary_file:
+            header = "\t".join(COUNT_BIOTYPES)
+            summary_file.write("%s\n" % header)
+            sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
+            summary_file.write("%s\n" % sums)
+
+
+rule gather_read_counts_summaries:
+    input:
+        summary_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}_counts.txt" % genome), lib=LIBS, rep=REPS),
+    output:
+        summary_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome),
+    run:
+        summary_files = (OPJ(
+            output_dir,
+            wildcards.trimmer,
+            aligner,
+            "mapped_%s" % genome,
+            "feature_count",
+            "summaries",
+            f"{cond_name}_{wildcards.read_type}_on_%s_{wildcards.orientation}_counts.txt" % genome) for cond_name in COND_NAMES)
+        summaries = pd.concat(
+            (pd.read_table(summary_file).T.astype(int) for summary_file in summary_files),
+            axis=1)
+        summaries.columns = COND_NAMES
+        summaries.to_csv(output.summary_table, sep="\t")
+
+
+rule gather_counts:
+    """For a given biotype, gather counts from all libraries in one table."""
+    input:
+        counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{{read_type}}_on_%s" % genome, "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
+    output:
+        counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
+    # wildcard_constraints:
+    #     # Avoid ambiguity with join_all_counts
+    #     biotype = "|".join(COUNT_BIOTYPES)
+    run:
+        # Gathering the counts data
+        ############################
+        counts_files = (OPJ(
+            output_dir,
+            wildcards.trimmer,
+            aligner,
+            "mapped_%s" % genome,
+            "feature_count",
+            f"{cond_name}_{wildcards.read_type}_on_%s" % genome,
+            f"{wildcards.biotype}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES)
+        # if wildcards.counter == "htseq_count":
+        #     counts_data = pd.concat(
+        #         map(read_htseq_counts, counts_files),
+        #         axis=1).fillna(0).astype(int)
+        # elif wildcards.counter == "intersect_count":
+        #     counts_data = pd.concat(
+        #         map(read_intersect_counts, counts_files),
+        #         axis=1).fillna(0).astype(int)
+        # elif wildcards.counter == "feature_count":
+        #     counts_data = pd.concat(
+        #         map(read_feature_counts, counts_files),
+        #         axis=1).fillna(0).astype(int)
+        # else:
+        #     raise NotImplementedError(f"{wildcards.counter} not handled (yet?)")
+        counts_data = pd.concat(
+            map(read_feature_counts, counts_files),
+            axis=1).fillna(0).astype(int)
+        counts_data.columns = COND_NAMES
+        # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
+        # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
+        # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
+        # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
+        # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
+        if wildcards.biotype.endswith("_rmsk_families"):
+            repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
+            # Sum the counts for a given repeat family
+            counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
+        counts_data.index.names = ["gene"]
+        counts_data.to_csv(output.counts_table, sep="\t")
+
+
+rule compute_median_ratio_to_pseudo_ref_size_factors:
+    input:
+        counts_table = rules.gather_counts.output.counts_table,
+    output:
+        median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"),
+    run:
+        counts_data = pd.read_table(
+            input.counts_table,
+            index_col=0,
+            na_filter=False)
+        # http://stackoverflow.com/a/21320592/1878788
+        #median_ratios = pd.DataFrame(median_ratio_to_pseudo_ref_size_factors(counts_data)).T
+        #median_ratios.index.names = ["median_ratio_to_pseudo_ref"]
+        # Easier to grep when not transposed, actually:
+        median_ratios = median_ratio_to_pseudo_ref_size_factors(counts_data)
+        print(median_ratios)
+        median_ratios.to_csv(output.median_ratios_file, sep="\t")
+
+
+rule make_normalized_bigwig:
+    input:
+        bam = rules.sam2indexedbam.output.sorted_bam,
+        #bam = rules.fuse_bams.output.sorted_bam,
+        # TODO: use sourcing function based on norm_type
+        #size_factor_file = rules.compute_coverage.output.coverage
+        median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
+        # TODO: compute this
+        #scale_factor_file = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"),
+    output:
+        bigwig_norm = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_by_{norm_type}_{orientation}.bw" % genome),
+    #params:
+    #    orient_filter = bamcoverage_filter,
+    threads: 12 # to limit memory usage, actually
+    benchmark:
+        OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm_type}_{orientation}_benchmark.txt")
+    params:
+        genome_binned = genome_binned,
+    log:
+        log = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm_type}_{orientation}.log"),
+        err = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm_type}_{orientation}.err"),
+    run:
+        # TODO: make this a function of deeptools version
+        no_reads = """Error: The generated bedGraphFile was empty. Please adjust
+your deepTools settings and check your input files.
+"""
+        zero_bytes = """needLargeMem: trying to allocate 0 bytes (limit: 100000000000)
+bam2bigwig.sh: bedGraphToBigWig failed
+"""
+        try:
+            shell("""
+                bam2bigwig.sh {input.bam} {params.genome_binned} \\
+                    {wildcards.lib}_{wildcards.rep} {wildcards.orientation} %s \\
+                    {input.median_ratios_file} {output.bigwig_norm} \\
+                    > {log.log} 2> {log.err} \\
+                    || error_exit "bam2bigwig.sh failed"
+                """ % LIB_TYPE[-1])
+        except CalledProcessError as e:
+            if last_lines(log.err, 2) in {no_reads, zero_bytes}:
+                with open(output.bigwig_norm, "w") as bwfile:
+                    bwfile.write("")
+            else:
+                raise
+
+
 onsuccess:
     print("iCLIP data pre-processing finished.")
diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile
index a3aef8791812295c447b7cfe57314799a1af44bc..9a18c5aacad928724d4c991e9c273bcd46ace6a9 100644
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -51,11 +51,12 @@ from libworkflows import ensure_relative
 from libworkflows import get_chrom_sizes, column_converter
 from libworkflows import strip_split, last_lines, save_plot, test_na_file
 from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
+from libworkflows import feature_orientation2stranded
 from libworkflows import read_htseq_counts, sum_htseq_counts
 from libworkflows import read_feature_counts, sum_feature_counts
 from smincludes import rules as irules
 
-alignment_settings = {"bowtie2": ""},
+alignment_settings = {"bowtie2": ""}
 
 #TRIMMERS = ["cutadapt", "fastx_clipper"]
 TRIMMERS = ["cutadapt"]
@@ -357,7 +358,7 @@ rule sam2indexedbam:
 
 
 rule fuse_bams:
-    """This rules fuses the two sorted bam files corresponding to the mapping
+    """This rule fuses the two sorted bam files corresponding to the mapping
     of the reads containing the adaptor or not."""
     input:
         noadapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_noadapt_on_C_elegans_sorted.bam"),
@@ -508,31 +509,6 @@ def parse_htseq_counts(counts_filename):
         yield (gene, int(count))
 
 
-# NOTE: It is not clear that this really does what I think it does:
-# https://www.biostars.org/p/161534/
-# https://www.biostars.org/p/170406/
-def feature_orientation2stranded(wildcards):
-    orientation = wildcards.orientation
-    if orientation == "fwd":
-        if LIB_TYPE[-2:] == "SF":
-            return 1
-        elif LIB_TYPE[-2:] == "SR":
-            return 2
-        else:
-            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
-    elif orientation == "rev":
-        if LIB_TYPE[-2:] == "SF":
-            return 2
-        elif LIB_TYPE[-2:] == "SR":
-            return 1
-        else:
-            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
-    elif orientation == "all":
-        return 0
-    else:
-        exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")
-
-
 rule feature_count_reads:
     input:
         sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"),
@@ -544,7 +520,7 @@ rule feature_count_reads:
         counts = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "feature_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
         counts_converted = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "feature_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts_gene_names.txt"),
     params:
-        stranded = feature_orientation2stranded,
+        stranded = feature_orientation2stranded(LIB_TYPE),
         annot = biotype2annot,
     message:
         "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
@@ -805,12 +781,12 @@ rule make_normalized_bigwig:
     #    orient_filter = bamcoverage_filter,
     #threads: 12 # to limit memory usage, actually
     benchmark:
-        OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{orientation}_benchmark.txt")
+        OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}_benchmark.txt")
     params:
         genome_binned = genome_binned,
     log:
-        log = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{orientation}.log"),
-        err = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{orientation}.err"),
+        log = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.log"),
+        err = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.err"),
     shell:
         """
         bam2bigwig.sh {input.bam} {params.genome_binned} \\
diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile
index 0f6ff18bff87d9ecf1dbc3a6ecb235158bcdda7e..4ed5131307576f9f7fa059fbe9c55d02bb67eeec 100644
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -8,6 +8,18 @@ major, minor = sys.version_info[:2]
 if major < 3 or (major == 3 and minor < 6):
     sys.exit("Need at least python 3.6\n")
 
+
+# TODO: scatterplots log(IP_RPM) vs log(input_RPM) (or mutant vs WT), colouring with gene list
+# TODO: heatmaps with rows (genes) ranked by expression in WT (mean reference level (wild types or inputs)), columns representing fold in mut vs WT
+# Possibly remove the lowly expressed ones
+# TODO: MA plot: fold vs "mean" count (to be defined)?
+
+# TODO: extract most abundant piRNA reads (WT_48hph)
+#bioawk -c fastx '{print $seq}' results_prg1_HS/bowtie2/mapped_C_elegans/reads/WT_RT_1_18-26_on_C_elegans/piRNA.fastq.gz | sort | uniq -c | sort -n | mawk '$1 >= 25 {print}' | wc -l
+# -> 6857
+# map them on the genome (without the piRNA loci), or on pseudo-genomes with specific features (histone genes...), tolerating a certain amount of mismatches (using bowtie1)
+# Or: scan the genome (or "pseudo-genome") to find where there are matches with 0 to 3 mismatches at specific positions (13-21) (and perfect matches in the 5' zone)
+
 # TODO: meta-profiles with IP and input on the same meta profile for Ortiz_oogenic and Ortiz_spermatogenic
 # Make external script to generate meta-profiles on a custom list of libraries, and a given list of genes.
@@ -129,7 +141,7 @@ from libworkflows import filter_combinator, sum_feature_counts, sum_htseq_counts
 from smincludes import rules as irules
 
 strip = str.strip
-alignment_settings = {"bowtie2": "-L 6 -i S,1,0.8 -N 0"},
+alignment_settings = {"bowtie2": "-L 6 -i S,1,0.8 -N 0"}
 
 # Positions in small RNA sequences for which to analyse nucleotide distribution
 #POSITIONS = ["first", "last"]
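
Note on the shared helper used above: libworkflows.feature_orientation2stranded is imported by two of these snakefiles, but its definition is not part of this diff. Because the call sites change from "stranded = feature_orientation2stranded" to "stranded = feature_orientation2stranded(LIB_TYPE)", it is presumably now a factory that captures the library type and returns a params function of the rule's wildcards, mirroring the per-wildcards helper deleted from PRO-seq.snakefile. Below is a minimal sketch of that assumed implementation, reconstructed from the deleted code; the inner function name and the ValueError in the final branch (replacing the deleted helper's bare exit()) are illustrative choices, not confirmed by this diff.

def feature_orientation2stranded(lib_type):
    """Return a params function giving featureCounts' -s value for an orientation.

    Sketch reconstructed from the helper removed from PRO-seq.snakefile;
    the actual libworkflows implementation may differ.
    """
    def stranded(wildcards):
        # featureCounts: -s 0 = unstranded, 1 = stranded, 2 = reversely stranded
        orientation = wildcards.orientation
        if orientation == "fwd":
            if lib_type[-2:] == "SF":
                return 1
            elif lib_type[-2:] == "SR":
                return 2
            else:
                raise ValueError(
                    f"{lib_type} library type not compatible with "
                    "strand-aware read counting.")
        elif orientation == "rev":
            if lib_type[-2:] == "SF":
                return 2
            elif lib_type[-2:] == "SR":
                return 1
            else:
                raise ValueError(
                    f"{lib_type} library type not compatible with "
                    "strand-aware read counting.")
        elif orientation == "all":
            return 0
        else:
            raise ValueError(
                'Orientation is to be among "fwd", "rev" and "all".')
    return stranded

With this shape, Snakemake evaluates params.stranded per job, so one import serves every {orientation} wildcard in both the iCLIP and PRO-seq feature_count_reads rules, instead of each snakefile keeping its own copy bound to a module-level LIB_TYPE.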