Ribo-seq.snakefile 164 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
"""
Snakefile to analyse small_RNA-seq data.

TODO: Some figures and summaries may be overridden when changing the mapper. The mapper name should be added to their path.
"""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")


Blaise Li's avatar
Blaise Li committed
12
13
14
15
16
# Bias removal: only consider after 15-th codon and before -5-th codon of CDS
# TODO: compute TPM of RPF and "normalize" by TPM of transcripts from RNA-seq data (translation efficiency (TE))
# graph: x: RNA-seq TPM, y: log2(Ribo_TPM/RNA_TPM)
# or y: log2folds of deseq2 normalized values
# Other option: use scikit-ribo? (more detailed analysis)
17
# Done: define RPF as size-selected (likely 28-30), that map in sense orientation on protein-coding genes
18

19
# Done: extract RPF reads
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# bedtools intersect \
#     -a results_Ribo-seq_test_2/bowtie2/mapped_C_elegans/WT_1/28-30_on_C_elegans_sorted.bam \
#     -b /pasteur/homes/bli/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/protein_coding_merged.bed \
#     -wa -u -f 1.0 \
#     | samtools view | bioawk -c sam '{print "@"$qname"\n"$seq"\n+\n"$qual}' > /tmp/bioawk.fq
# 
# mkfifo /tmp/test.bam
# bedtools intersect \
#     -a results_Ribo-seq_test_2/bowtie2/mapped_C_elegans/WT_1/28-30_on_C_elegans_sorted.bam \
#     -b /pasteur/homes/bli/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/protein_coding_merged.bed \
#     -wa -u -f 1.0  > /tmp/test.bam &
# bedtools bamtofastq -i /tmp/test.bam -fq /tmp/bamtofastq.fq
# rm /tmp/test.bam

# Total number of "non-structural" (mapped - (fwd-(t,sn,sno,r-RNA))) to compute RPKM
# Quick-and-dirty: use gene span (TES - TSS) -> done for repeats
# Better: use length of union of exons for each gene -> done for genes
# Ideal: use transcriptome-based per-isoform computation
# Actually, we don't want a normalization by length. We deal with small RNAs, not transcripts


# Possibly filter out on RPKM
# Then, compute folds of RP(K)M IP/input (for a given experiment, i.-e. REP)
# and give list of genes sorted by such folds -> see /Gene_lists/csr1_prot_si_supertargets*

# Exploratory:
# Heatmap (fold)
# genes | experiment

# Then either:
# - take the genes (above log-fold threshold) common across replicates
# - look at irreproducible discovery rate (http://dx.doi.org/doi:10.1214%2F11-AOAS466)
# -> define CSR-1-loaded

# For metagene: see --metagene option of computeMatrix
# Retrieve gtf info after filtering out interfering genes based on merged bed


import os
OPJ = os.path.join
from glob import glob
from re import sub
from pickle import load
from fileinput import input as finput
from sys import stderr
from subprocess import Popen, PIPE, CalledProcessError

# Useful for functional style
from itertools import chain, combinations, product, repeat, starmap
from functools import reduce
from operator import or_ as union
from cytoolz import concat, merge_with, take_nth

def concat_lists(lists):
    return list(concat(lists))

# Useful data structures
from collections import OrderedDict as od
from collections import defaultdict, Counter


import warnings


def formatwarning(message, category, filename, lineno, line):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning


# from gatb import Bank
from mappy import fastx_read
# To parse SAM format
import pysam
import pyBigWig

# To compute correlation coefficient
from scipy.stats.stats import pearsonr
# To catch errors when plotting KDE
from scipy.linalg import LinAlgError
# For data processing and displaying
from sklearn import preprocessing
from sklearn.decomposition import PCA
import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
# https://github.com/mwaskom/seaborn/issues/1262
#mpl.use("agg")
mpl.use("PDF")
#mpl.rcParams["figure.figsize"] = 2, 4
mpl.rcParams["font.sans-serif"] = [
    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams["font.family"] = "sans-serif"
#mpl.rcParams["figure.figsize"] = [16, 30]

from matplotlib import cm
from matplotlib.colors import Normalize

from matplotlib import numpy as np
from math import ceil, floor
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# import seaborn.apionly as sns
# import husl
# predefined seaborn graphical parameters
sns.set_context("talk")

from libsmallrna import PI_MIN, PI_MAX, SI_MIN, SI_MAX
# Do this outside the workflow
#from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths
133
134
from libdeseq import do_deseq2
from libhts import status_setter
135
136
137
138
139
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_distribution, plot_boxplots, plot_histo
from libworkflows import texscape, ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import read_int_from_file, strip_split, file_len, last_lines, save_plot, SHELL_FUNCTIONS
140
from libworkflows import read_feature_counts, sum_feature_counts, sum_htseq_counts, warn_context
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from smincludes import rules as irules
strip = str.strip

alignment_settings = {"bowtie2": "-L 6 -i S,1,0.8 -N 0"}

# Positions in small RNA sequences for which to analyse nucleotide distribution
#POSITIONS = ["first", "last"]
POSITIONS = config["positions"]
# Orientations of small RNAs with respect to an annotated feature orientation.
# "fwd" and "rev" restrict feature quantification to sense or antisense reads.
ORIENTATIONS = config["orientations"]
# small RNA types on which to run DESeq2
DE_TYPES = config["de_types"]
# small RNA types on which to compute IP/input RPM folds
IP_TYPES = ["pisimi", "siu", "prot_si"]
#IP_TYPES = config["ip_types"]
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
#STANDARDS = ["zscore", "robust", "minmax", "unit"]
# hexbin jointplot for principal components crashes on MemoryError for PCA without standardization
#STANDARDS = ["robust", "identity"]
STANDARDS = ["robust"]

COMPL = {"A" : "T", "C" : "G", "G" : "C", "T" : "A", "N" : "N"}

# Possible feature ID conversions
ID_TYPES = ["name", "cosmid"]

#########################
# Reading configuration #
#########################
# key: library name
# value: 3' adapter sequence
lib2adapt = config["lib2adapt"]
trim5 = config["trim5"]
trim3 = config["trim3"]
# key: library name
# value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
183
184
# Libraries for which we have matching RNA-seq data
# so that translation efficiency can be computed
185
186
187
EFF_LIBS = list(config["transcriptome_TPM"].keys())
# What type of "efficency" are we computing? Translation efficiency.
[this_TPM, ref_TPM, eff_name] = ["Ribo_TPM", "RNA_TPM", "TE"]
188
189
190
191
192
193
194
195
196
#REF=config["WT"]
#MUT=config["mutant"]
# Used to associate colours to libraries
# Or to make separate plots per series
# key: series name
# value: list of libraries
series_dict = config["series"]
SERIES_TYPES = list(series_dict.keys())
genotype_series = series_dict["genotype_series"]
197
dt_series = series_dict["dt_series"]
198
199
200
201
202
203
204
205
206
207
merged_series = merge_with(concat_lists, *series_dict.values())
ALL_SERIES = list(merged_series.keys())
#all_libs_in series = concat_lists(merge_with(concat_lists, *series_dict.values()).values())
REPS = config["replicates"]
DE_COND_PAIRS = config["de_cond_pairs"]
msg = "\n".join([
    "Some contrats do not use known library names.",
    "Contrasts:"
    ", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg
208
DT_COND_PAIRS = config["dt_cond_pairs"]
209
210
211
msg = "\n".join([
    "Some contrats do not use known library names.",
    "Contrasts:"
212
213
214
    ", ".join([f"({cond}, {ref})" for (cond, ref) in DT_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DT_COND_PAIRS]), ""
COND_PAIRS = DE_COND_PAIRS + DT_COND_PAIRS
215
DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS]
216
217
218
DT_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DT_COND_PAIRS]
contrasts_dict = {"de" : DE_CONTRASTS, "dt" : DT_CONTRASTS}
CONTRASTS = DE_CONTRASTS + DT_CONTRASTS
219
220
221
222
223
224
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
MIN_LEN = config["min_len"]
MAX_LEN = config["max_len"]
size_selected = "%s-%s" % (MIN_LEN, MAX_LEN)
read_type_max_len = {
    size_selected: int(MAX_LEN),
Blaise Li's avatar
Blaise Li committed
225
    "RPF": int(MAX_LEN)}
226
227
228
229
230
231
READ_TYPES_FOR_COMPOSITION=[size_selected, "RPF"]
READ_TYPES_FOR_MAPPING=[size_selected, "RPF"]
# Types of annotation features, as defined in the "gene_biotype"
# GTF attribute sections of the annotation files.
COUNT_BIOTYPES = config["count_biotypes"]
ANNOT_BIOTYPES = config["annot_biotypes"]
232
233
234
235
236
237
238
239
240
DE_BIOTYPES = [
    "protein_coding",
    "protein_coding_CDS",
    "protein_coding_UTR",
    "protein_coding_5UTR",
    "protein_coding_3UTR",
    "pseudogene",
    "DNA_transposons_rmsk",
    "RNA_transposons_rmsk"]
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# cf Gu et al. (2009), supplement:
# -----
# To compare 22G-RNAs derived from a gene, transposon, or pseudogene between two
# samples, each sample was normalized using the total number of reads less structural RNAs, i.e.
# sense small RNA reads likely derived from degraded ncRNAs, tRNAs, snoRNAs, rRNAs,
# snRNAs, and scRNAs. Degradation products of structural RNAs map to the sense strand, with a
# poorly defined size profile and 1 st nucleotide distribution. At least 25 22G-RNA reads per million,
# nonstructural reads in one of the two samples was arbitrarily chosen as a cutoff for comparison
# analyses. A change of 2-fold or more between samples was chosen as an enrichment threshold.
# Because some 21U-RNAs or miRNAs overlap with protein coding genes, reads derived from
# miRNA loci within a window of ± 4 nt and all the known 21U-RNAs were filtered out prior to
# comparison analysis.
# -----
# And Germano Cecere, about scRNA:
# -----
# Its an old nomenclature and in anycase there is only one of this annotated scRNAs
# (small cytoplasmic RNA genes).
# https://www.ncbi.nlm.nih.gov/books/NBK19701/table/genestructure_table2/?report=objectonly
# Don't even pay attention to this
# -----
STRUCTURAL_BIOTYPES = ["tRNA", "snRNA", "snoRNA", "rRNA", "ncRNA"]
262
BIOTYPES = COUNT_BIOTYPES
263
GENE_LISTS = config["gene_lists"]
264
265
266
267
268
269
270
BOXPLOT_GENE_LISTS = config["boxplot_gene_lists"]
#BOXPLOT_GENE_LISTS = [
#    "all_genes",
#    "replication_dependent_octamer_histone",
#    "piRNA_dependent_prot_si_down4",
#    "csr1_prot_si_supertargets_48hph",
#    "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"]
Blaise Li's avatar
Blaise Li committed
271
272
273
274
275
276
277
278
279
280
281
282
aligner = config["aligner"]
########################
# Genome configuration #
########################
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# bed file binning the genome in 10nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
283
exon_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
Blaise Li's avatar
Blaise Li committed
284
285
286
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"]
gene_lists_dir = genome_dict["gene_lists_dir"]
287
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
Blaise Li's avatar
Blaise Li committed
288
289
index = genome_db

290
291
292
293
output_dir = config["output_dir"]
local_annot_dir = config.get("local_annot_dir", OPJ(output_dir, "annotations"))
log_dir = config.get("log_dir", OPJ(output_dir, "logs"))
data_dir = config.get("data_dir", OPJ(output_dir, "data"))
Blaise Li's avatar
Blaise Li committed
294

295
296
counter = "feature_count"
counts_dir = OPJ(output_dir, aligner, f"mapped_{genome}", counter)
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# Used to skip some genotype x treatment x replicate number combinations
# when some of them were not sequenced
forbidden = {frozenset(wc_comb.items()) for wc_comb in config["missing"]}
CONDITIONS = [{
    "lib" : lib,
    "rep" : rep} for rep in REPS for lib in LIBS]
# We use this for various things in order to have always the same library order:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
    cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
#SIZE_FACTORS = ["raw", "deduped", size_selected, "mapped", "siRNA", "miRNA"]
#SIZE_FACTORS = [size_selected, "mapped", "miRNA"]
#TESTED_SIZE_FACTORS = ["mapped", "non_structural", "siRNA", "miRNA", "median_ratio_to_pseudo_ref"]
312
TESTED_SIZE_FACTORS = ["mapped", "non_structural", "siRNA", "miRNA", "RPF"]
313
314
315
316
317
318
319
320
#SIZE_FACTORS = ["mapped", "miRNA", "median_ratio_to_pseudo_ref"]
# "median_ratio_to_pseudo_ref" is a size factor adapted from
# the method described in the DESeq paper, but with addition
# and then substraction of a pseudocount, in order to deal with zero counts.
# This seems to perform well (see "test_size_factor" results).
#DE_SIZE_FACTORS = ["non_structural", "median_ratio_to_pseudo_ref"]
DE_SIZE_FACTORS = ["non_structural"]
#SIZE_FACTORS = ["non_structural", "median_ratio_to_pseudo_ref"]
321
SIZE_FACTORS = ["non_structural", "RPF"]
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
NORMALIZER = "median_ratio_to_pseudo_ref"

# For metagene analyses
#META_MARGIN = 300
META_MARGIN = 0
META_SCALE = 500
#UNSCALED_INSIDE = 500
UNSCALED_INSIDE = 0
#META_MIN_LEN = 1000
META_MIN_LEN = 2 * UNSCALED_INSIDE
MIN_DIST = 2 * META_MARGIN


def add_dataframes(df1, df2):
    return df1.add(df2, fill_value=0)


def sum_dataframes(dfs):
    return reduce(add_dataframes, dfs)


def sum_counts(fname):
    p = Popen(
        ['awk', '$1 ~ /^piRNA$|^miRNA$|^pseudogene$|^satellites_rmsk$|^simple_repeats_rmsk$|^protein_coding_|^.NA_transposons_rmsk$/ {sum += $2} END {print sum}', fname],
        stdout=PIPE,
        stderr=PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    try:
        return int(result.strip().split()[0])
    except IndexError:
        warnings.warn(f"No counts in {fname}\n")
        return 0

def sum_te_counts(fname):
    p = Popen(
        ['awk', '$1 !~ /WBGene/ {sum += $2} END {print sum}', fname],
        stdout=PIPE,
        stderr=PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])


# Limit risks of ambiguity by imposing replicates to be numbers
# and restricting possible forms of some other wildcards
wildcard_constraints:
    lib="|".join(LIBS),
    #treat="|".join(TREATS),
    rep="\d+",
    min_dist="\d+",
    min_len="\d+",
    #max_len="\d+",
377
    biotype="|".join(set(["alltypes"] + COUNT_BIOTYPES + ANNOT_BIOTYPES)),
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
    id_list="|".join(GENE_LISTS),
    type_set="|".join(["all", "protein_coding", "protein_coding_TE"]),
    read_type="|".join([
        "raw", "trimmed", "deduped", f"{size_selected}",
        "mapped", "RPF"]),
    standard="zscore|robust|minmax|unit|identity",
    orientation="all|fwd|rev",
    contrast="|".join(CONTRASTS),
    norm="|".join(TESTED_SIZE_FACTORS),
    series="|".join(ALL_SERIES),
    series_type="|".join(SERIES_TYPES),
    fold_type="|".join(["mean_log2_RPM_fold", "log2FoldChange", "lfcMLE"]),

# Define functions to be used in shell portions
shell.prefix(SHELL_FUNCTIONS)


###################################
# Preparing the input of rule all #
###################################

bigwig_files = [
    # individual libraries
    expand(
402
403
404
        OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}",
            "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome),
        lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]),
405
406
    # means of replicates
    expand(
407
408
409
        OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean",
            "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome),
        lib=LIBS, read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]),
410
411
    ]

412
413
counts_files = [
    expand(
414
        OPJ(counts_dir, "summaries",
415
416
            "{lib}_{rep}_{read_type}_on_%s_{orientation}_counts.txt" % genome),
        lib=LIBS, rep=REPS, read_type=["RPF"], orientation=ORIENTATIONS),
417
418
419
    # TPM is computed on all biotypes simultaneously
    expand(
        OPJ(counts_dir,
Blaise Li's avatar
Blaise Li committed
420
            "{lib}_mean_{read_type}_on_%s" % genome, "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
421
422
423
424
425
        lib=LIBS, read_type=["RPF"], biotype=["alltypes"], orientation=ORIENTATIONS),
    # expand(
    #     OPJ(counts_dir,
    #         "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
    #     read_type=["RPF"], biotype=["alltypes"], orientation=ORIENTATIONS),
426
    expand(
427
        OPJ(counts_dir,
428
429
430
            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_RPK.txt"),
        read_type=["RPF"], biotype=BIOTYPES, orientation=ORIENTATIONS),
    expand(
431
        OPJ(counts_dir,
432
433
434
            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
        read_type=["RPF"], biotype=BIOTYPES, orientation=ORIENTATIONS),
    # expand(
435
    #     OPJ(counts_dir,
436
437
438
    #         "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_RPK.txt"),
    #     read_type=["RPF"], biotype=BIOTYPES + ["alltypes"], orientation=ORIENTATIONS),
    # expand(
439
    #     OPJ(counts_dir,
440
441
442
443
    #         "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
    #     read_type=["RPF"], biotype=BIOTYPES + ["alltypes"], orientation=ORIENTATIONS),
    ]

444
445
446
meta_profiles = [
        #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split()),
        #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split(), biotype=["protein_coding"], min_len=[str(META_MIN_LEN)]),
447
        # TODO: check how to adapt to dt_series instead of ip_series
448
449
        expand(
            OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
450
                "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.pdf"),
451
452
453
            meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"],
            norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
            biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
454
            series_type=["dt_series"], series=list(dt_series.keys())),
455
456
        expand(
            OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
457
                "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.pdf"),
458
459
460
            meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"],
            norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding"], min_dist=[str(MIN_DIST)],
            biotype=["protein_coding"], min_len=[str(META_MIN_LEN)],
461
            series_type=["dt_series"], series=list(dt_series.keys())),
462
463
        expand(
            OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
464
                "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{series_type}_{series}_meta_profile.pdf"),
465
466
            meta_scale=[str(META_SCALE)], read_type=[size_selected, "RPF"],
            norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=["0"], id_list=GENE_LISTS,
467
            series_type=["dt_series"], series=list(dt_series.keys())),
468
469
470
        ## TODO: Resolve issue with bedtools
        # expand(
        #     OPJ(output_dir, "figures", "{lib}_{rep}",
471
        #         "{read_type}_by_{norm}_{orientation}_pi_targets_in_{biotype}_profile.pdf"),
472
        #     lib=LIBS, rep=REPS, read_type=[size_selected, "RPF"],
473
        #     norm=SIZE_FACTORS, orientation=["all"], biotype=["protein_coding"]),
474
475
476
477
    ]

read_graphs = [
    expand(
478
479
        OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_composition_from_{position}.pdf"),
        lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"]),
480
    expand(
481
482
        OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_logo_from_{position}.pdf"),
        lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"]),
483
    expand(
484
485
        OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_composition.pdf"),
        lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS),
486
    expand(
487
488
        OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_logo.pdf"),
        lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS),
489
    expand(
490
491
        OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_size_distribution.pdf"),
        lib=LIBS, rep=REPS, read_type=["trimmed"]),
492
493
    # Not relevant in Ribo-seq?
    #expand(
494
495
    #    OPJ(output_dir, "figures", "{lib}_{rep}", f"{size_selected}_smallRNA_barchart.pdf"),
    #    lib=LIBS, rep=REPS),
496
    expand(
497
498
        OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.pdf"),
        lib=LIBS, rep=REPS),
499
500
    ]

501
502
503
# TODO: check what can be done with "dt" (differential translation, a.k.a. translation efficiency difference)
# What should be done with small_type, fold_type, etc.
if contrasts_dict["dt"]:
504
    fold_heatmaps = expand(
505
506
        OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"),
        small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold", "log2FoldChange"])
507
508
    ip_fold_boxplots = expand(
        OPJ(output_dir, "figures", "all_{contrast_type}",
509
            "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"),
510
        contrast_type=["ip"], small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"],
511
        gene_list=BOXPLOT_GENE_LISTS)
512
513
else:
    fold_heatmaps = expand(
514
515
        OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"),
        small_type=["pisimi", "prot_si"], fold_type=["log2FoldChange"])
516
517
518
519
520
    ip_fold_boxplots = []

exploratory_graphs = [
    fold_heatmaps,
    # Large figures, not very readable
521
    #expand(
522
523
    #    OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.pdf"),
    #    contrast=DE_CONTRASTS, small_type=DE_TYPES),
524
525
    ## TODO: debug PCA
    #expand(
526
527
    #    OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.pdf"),
    #    small_type=["pisimi"], standard=STANDARDS),
528
    #expand(
529
530
    #    OPJ(output_dir, "figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"),
    #    small_type=["pisimi"], standard=STANDARDS),
531
532
    ##
    #expand(
533
534
    #    OPJ(output_dir, "figures", "{small_type}_clustermap.pdf"),
    #    small_type=SMALL_TYPES),
535
    #expand(
536
537
    #    OPJ(output_dir, "figures", "{small_type}_zscore_clustermap.pdf"),
    #    small_type=SMALL_TYPES),
538
    #expand(
539
540
    #    OPJ(output_dir, "figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"),
    #    contrast=CONTRASTS, small_type=DE_TYPES),
541
542
543
544
    ]

de_fold_boxplots = expand(
    OPJ(output_dir, "figures", "{contrast}",
545
        "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"),
546
    contrast=DE_CONTRASTS, small_type=DE_TYPES, fold_type=["log2FoldChange", "mean_log2_RPM_fold"],
547
    gene_list=["all_gene_lists"])
548
549
ip_fold_boxplots_by_contrast = expand(
    OPJ(output_dir, "figures", "{contrast}",
550
        "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"),
551
    contrast=DT_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"],
552
    gene_list=["all_gene_lists"])
553
554
555
556
557
558

fold_boxplots = [de_fold_boxplots, ip_fold_boxplots_by_contrast, ip_fold_boxplots]

rule all:
    input:
        expand(
559
            OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome),
560
            lib=LIBS, rep=REPS, read_type=[size_selected]),
561
        expand(
562
            OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome),
563
            lib=LIBS, rep=REPS, read_type=[size_selected]),
564
565
        # In read_graphs
        #expand(
566
567
        #    OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.pdf"),
        #    lib=LIBS, rep=REPS),
568
        read_graphs,
569
        counts_files,
570
        bigwig_files,
571
572
        # translation_efficiency
        expand(
573
574
            OPJ(counts_dir, "{lib}_mean_{read_type}_on_%s" % genome, "{lib}_{biotype}_{orientation}_%s.txt" % eff_name),
            lib=EFF_LIBS, read_type=["RPF"], biotype=["alltypes"], orientation=["fwd"]),
575
576
577
        # DESeq2 results
        expand(
            OPJ(
578
                counts_dir,
579
                "deseq2_{read_type}",
Blaise Li's avatar
Blaise Li committed
580
                "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"),
581
582
583
584
            read_type=[size_selected], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES),
        # We are not supposed to have data outside protein-coding genes for RPF
        expand(
            OPJ(
585
                counts_dir,
586
                "deseq2_{read_type}",
Blaise Li's avatar
Blaise Li committed
587
                "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"),
588
            read_type=["RPF"], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=["protein_coding"]),
589
590
591
        expand(
            OPJ(
                counts_dir,
592
593
                "diff_%s_{read_type}" % eff_name,
                "{contrast}", "{orientation}_{biotype}", "{contrast}_diff_%s.txt" % eff_name),
594
            read_type=["RPF"], contrast=DT_CONTRASTS, orientation=["fwd"], biotype=["alltypes"]),
595
596
597
598

# rule future_all:
#         meta_profiles,
#         read_graphs,
599
600
#         OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"),
#         expand(OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
601
#         #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=DT_CONTRASTS, small_type=DE_TYPES),
602
603
#         exploratory_graphs,
#         fold_boxplots,
604
#         #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.pdf"), small_type=SMALL_TYPES),
605
606
607
#         # piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting
#         # simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot
#         expand(
608
#             OPJ(counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
609
610
#             small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu",  "pisimi"]),
#         expand(
611
612
#             OPJ(output_dir, "figures", "{small_type}_norm_correlations.pdf"),
#             small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu",  "pisimi"]),
613
#         expand(
614
615
#             OPJ(output_dir, "figures", "{small_type}_norm_counts_distrib.pdf"),
#             small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]),
616
617
618
619

include: ensure_relative(irules["link_raw_data"], workflow.basedir)


620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
# rule trim_and_dedup:
#     input:
#         rules.link_raw_data.output,
#         #OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
#     params:
#         adapter = lambda wildcards : lib2adapt[wildcards.lib],
#         trim5 = trim5,
#         trim3 = trim3,
#     output:
#         trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_trimmed.fastq.gz"),
#         nb_raw =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_raw.txt"),
#         nb_trimmed =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"),
#         nb_deduped =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_deduped.txt"),
#     #threads: 2
#     message:
#         "Trimming adaptor from raw data, deduplicating reads, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
#     benchmark:
#         OPJ(log_dir, "trim_and_dedup", "{lib}_{rep}_benchmark.txt")
#     log:
#         cutadapt = OPJ(log_dir, "cutadapt", "{lib}_{rep}.log"),
#         trim_and_dedup = OPJ(log_dir, "trim_and_dedup", "{lib}_{rep}.log"),
#     shell:
#         """
#         zcat {input} \\
#             | tee >(count_fastq_reads {output.nb_raw}) \\
#             | cutadapt -a {params.adapter} --discard-untrimmed - 2> {log.cutadapt} \\
#             | tee >(count_fastq_reads {output.nb_trimmed}) \\
#             | dedup \\
#             | tee >(count_fastq_reads {output.nb_deduped}) \\
#             | trim_random_nt {params.trim5} {params.trim3}  2>> {log.cutadapt} \\
#             | gzip > {output.trimmed} \\
#             2> {log.trim_and_dedup}
#         """
653
654


655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
rule trim:
    input:
        rules.link_raw_data.output,
        #OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
    params:
        adapter = lambda wildcards : lib2adapt[wildcards.lib],
        trim5 = trim5,
        trim3 = trim3,
    output:
        trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_trimmed.fastq.gz"),
        nb_raw =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_raw.txt"),
        nb_trimmed =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"),
    #threads: 2
    message:
        "Trimming adaptor from raw data, deduplicating reads, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
    benchmark:
        OPJ(log_dir, "trim", "{lib}_{rep}_benchmark.txt")
    log:
        cutadapt = OPJ(log_dir, "cutadapt", "{lib}_{rep}.log"),
        trim = OPJ(log_dir, "trim", "{lib}_{rep}.log"),
    shell:
        """
        zcat {input} \\
            | tee >(count_fastq_reads {output.nb_raw}) \\
            | cutadapt -a {params.adapter} --discard-untrimmed - 2> {log.cutadapt} \\
            | tee >(count_fastq_reads {output.nb_trimmed}) \\
            | trim_random_nt {params.trim5} {params.trim3}  2>> {log.cutadapt} \\
            | gzip > {output.trimmed} \\
683
            2> {log.trim}
684
685
686
        """


687
688
689
690
691
692
693
694
def awk_size_filter(wildcards):
    """Returns the bioawk filter to select reads of size from MIN_LEN to MAX_LEN."""
    return "%s <= length($seq) && length($seq) <= %s" % (MIN_LEN, MAX_LEN)


rule select_size_range:
    """Select (and count) reads in the correct size range."""
    input:
695
696
        # rules.trim_and_dedup.output.trimmed
        rules.trim.output.trimmed
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
    output:
        selected = OPJ(data_dir, "trimmed", "{lib}_{rep}_%s.fastq.gz" % size_selected),
        nb_selected = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_%s.txt" % size_selected),
    params:
        awk_filter = awk_size_filter,
    message:
        "Selecting reads size %s for {wildcards.lib}_{wildcards.rep}." % size_selected
    shell:
        """
        bioawk -c fastx '{params.awk_filter} {{print "@"$name" "$4"\\n"$seq"\\n+\\n"$qual}}' {input} \\
            | tee >(count_fastq_reads {output.nb_selected}) \\
            | gzip > {output.selected}
        """


def source_fastq(wildcards):
    """Determine the fastq file corresponding to a given read type."""
    read_type = wildcards.read_type
    if read_type == "raw":
        return rules.link_raw_data.output
    elif read_type == "trimmed":
718
719
        # return rules.trim_and_dedup.output.trimmed
        return rules.trim.output.trimmed
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
    elif read_type == size_selected:
        return rules.select_size_range.output.selected
    elif read_type == "nomap":
        return rules.map_on_genome.output.nomap_fastq
    elif read_type == "RPF":
        return rules.extract_RPF.output.rpf
    else:
        raise NotImplementedError("Unknown read type: %s" % read_type)


rule map_on_genome:
    input:
        # fastq = rules.select_size_range.output.selected,
        fastq = source_fastq,
    output:
735
736
        sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s.sam" % genome)),
        nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome),
737
738
739
740
741
    params:
        aligner = aligner,
        index = index,
        settings = alignment_settings[aligner],
    message:
742
        "Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on C. elegans genome."
743
    benchmark:
744
        OPJ(log_dir, "map_on_genome", "{lib}_{rep}_{read_type}_benchmark.txt")
745
    log:
746
747
        log = OPJ(log_dir, "map_on_genome", "{lib}_{rep}_{read_type}.log"),
        err = OPJ(log_dir, "map_on_genome", "{lib}_{rep}_{read_type}.err")
748
749
750
751
752
753
754
755
756
    threads:
        4
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"


def source_sam(wildcards):
    if hasattr(wildcards, "read_type"):
        return OPJ(
757
            output_dir, aligner, f"mapped_{genome}",
758
            f"{wildcards.lib}_{wildcards.rep}",
759
            f"{wildcards.read_type}_on_{genome}.sam")
760
761
    else:
        return OPJ(
762
            output_dir, aligner, f"mapped_{genome}",
763
            f"{wildcards.lib}_{wildcards.rep}",
764
            f"{size_selected}_on_{genome}.sam")
765
766
767
768
769
770


rule sam2indexedbam:
    input:
        sam = source_sam,
    output:
771
772
        sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_sorted.bam" % genome),
        index = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_sorted.bam.bai" % genome),
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
    message:
        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}."
    benchmark:
        OPJ(log_dir, "sam2indexedbam", "{lib}_{rep}_{read_type}_benchmark.txt"),
    log:
        log = OPJ(log_dir, "sam2indexedbam", "{lib}_{rep}_{read_type}.log"),
        err = OPJ(log_dir, "sam2indexedbam", "{lib}_{rep}_{read_type}.err"),
    threads:
        4
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"


rule compute_mapping_stats:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
    output:
790
        stats = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_samtools_stats.txt" % genome),
791
792
793
794
795
796
797
798
799
    shell:
        """samtools stats {input.sorted_bam} > {output.stats}"""


rule get_nb_mapped:
    """Extracts the number of mapped reads from samtools stats results."""
    input:
        stats = rules.compute_mapping_stats.output.stats,
    output:
800
        nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome),
801
802
803
804
805
806
807
808
    shell:
        """samstats2mapped {input.stats} {output.nb_mapped}"""


rule compute_coverage:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
    output:
809
        coverage = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome),
810
811
812
813
814
815
816
817
818
819
    params:
        genomelen = genomelen,
    shell:
        """
        bases=$(samtools depth {input.sorted_bam} | awk '{{sum += $3}} END {{print sum}}') || error_exit "samtools depth failed"
        python3 -c "print(${{bases}} / {params.genomelen})" > {output.coverage}
        """


rule extract_RPF:
820
821
822
    """We only count as RPF (ribosome protected fragments) those reads that map
    on protein coding genes, in sense direction. We therefore miss translation
    on "non-coding" regions."""
823
    input:
Blaise Li's avatar
Blaise Li committed
824
        sorted_bam = OPJ(
825
826
            output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}",
            f"{size_selected}_on_{genome}_sorted.bam"),
827
828
        protein_gtf = OPJ(annot_dir, "protein_coding.gtf")
    output:
829
        rpf = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_protein_coding.fastq.gz")
830
    params:
831
        tmp_filtered_bam = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_protein_coding.bam"),
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
    shell:
        """
        mkfifo {params.tmp_filtered_bam}
        bedtools intersect -a {input.sorted_bam} -b {input.protein_gtf} -wa -u -f 1.0 -s \\
            > {params.tmp_filtered_bam} &
        samtools fastq {params.tmp_filtered_bam} > {output.rpf}
        rm -f {params.tmp_filtered_bam}
        """

def feature_orientation2stranded(wildcards):
    orientation = wildcards.orientation
    if orientation == "fwd":
        return 1
    elif orientation == "rev":
        return 2
    elif orientation == "all":
        return 0
    else:
        exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")


def source_bams(wildcards):
    """We can count from several bam files with feature_count."""
855
    read_types = wildcards.read_type.split("_and_")
856
    sorted_bams = [OPJ(
857
858
        output_dir, aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}",
        f"{read_type}_on_{genome}_sorted.bam") for read_type in read_types]
859
860
861
862
863
864
865
866
    return sorted_bams


rule feature_count_reads:
    input:
        source_bams,
    output:
        counts = OPJ(
867
            counts_dir,
868
            "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
869
870
    params:
        stranded = feature_orientation2stranded,
871
        annot = lambda wildcards: OPJ(annot_dir, f"{wildcards.biotype}.gtf")
872
    message:
873
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} with featureCounts."
874
    benchmark:
875
        OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}_benchmark.txt"),
876
    log:
877
878
        log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}.log"),
        err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}.err"),
879
880
    shell:
        """
881
882
        tmpdir=$(TMPDIR=/var/tmp mktemp --tmpdir -d "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
        cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -s {params.stranded} --fracOverlap 1 --tmpDir ${{tmpdir}} {input}"
883
884
885
886
887
888
889
890
891
892
893
        featureCounts -v 2> {log.log}
        echo ${{cmd}} 1>> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        """


rule summarize_feature_counts:
    """For a given library, compute the total counts for each biotype and write this in a summary table."""
    input:
        expand(
894
            OPJ(counts_dir,
895
                "{{lib}}_{{rep}}_{{read_type}}_on_%s" % genome, "{biotype}_{{orientation}}_counts.txt"),
896
897
            biotype=COUNT_BIOTYPES),
    output:
898
        summary = OPJ(counts_dir, "summaries",
899
            "{lib}_{rep}_{read_type}_on_%s_{orientation}_counts.txt" % genome)
900
901
902
903
    run:
        with open(output.summary, "w") as summary_file:
            header = "\t".join(COUNT_BIOTYPES)
            summary_file.write("#biotypes\t%s\n" % header)
904
            sums = "\t".join((str(sum_feature_counts(counts_file, nb_bams=len(wildcards.read_type.split("_and_")))) for counts_file in input))
905
906
907
908
909
910
911
            summary_file.write(f"%s_%s_%s\t%s\n" % (wildcards.lib, wildcards.rep, wildcards.orientation, sums))


rule count_non_structural_mappers:
    input:
        # nb_mapped is the total number of size-selected that mapped
        #nb_mapped = rules.get_nb_mapped.output.nb_mapped,
912
        nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_nb_mapped.txt"),
913
914
        # structural are fwd to STRUCTURAL_BIOTYPES
        summary = OPJ(
915
            counts_dir, "summaries",
916
            "{lib}_{rep}_%s_on_%s_fwd_counts.txt" % (size_selected, genome)),
917
918
    output:
        nb_non_structural = OPJ(
919
            counts_dir, "summaries",
920
921
922
923
924
925
926
927
928
929
            "{lib}_{rep}_nb_non_structural.txt"),
    run:
        nb_mapped = read_int_from_file(input.nb_mapped)
        structural = pd.read_table(input.summary, index_col=0).loc[:, STRUCTURAL_BIOTYPES].sum(axis=1)[0]
        with open(output.nb_non_structural, "w") as out_file:
            out_file.write("%d\n" % (nb_mapped - structural))


rule count_RPF:
    input:
930
        summary_table = OPJ(counts_dir, "summaries",
931
            "{lib}_{rep}_%s_on_%s_fwd_counts.txt" % (size_selected, genome))
932
    output:
933
        nb_RPF = OPJ(counts_dir, "summaries",
934
            "{lib}_{rep}_%s_on_%s_fwd_nb_RPF.txt" % (size_selected, genome))
935
936
937
938
939
940
941
    run:
        nb_RPF = pd.read_table(input.summary_table, index_col=0)["protein_coding"][f"{wildcards.lib}_{wildcards.rep}_fwd"]
        with open(output.nb_RPF, "w") as nb_RPF_file:
            nb_RPF_file.write(f"{nb_RPF}\n")
        

def source_counts(wildcards):
942
943
944
    """Determines from which rule the gathered counts should be sourced."""
    if wildcards.biotype == "alltypes":
        return rules.join_all_counts.output.counts_table
945
    else:
946
947
        # "Directly" from the counts gathered across libraries
        return rules.gather_counts.output.counts_table
948
949


950
951
rule plot_read_numbers:
    input:
952
953
954
955
956
        # nb_raw = rules.trim_and_dedup.output.nb_raw,
        nb_raw = rules.trim.output.nb_raw,
        # nb_deduped = rules.trim_and_dedup.output.nb_deduped,
        # nb_trimmed = rules.trim_and_dedup.output.nb_trimmed,
        nb_trimmed = rules.trim.output.nb_trimmed,
957
958
959
960
        nb_selected = rules.select_size_range.output.nb_selected,
        nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_nb_mapped.txt"),
        nb_RPF = rules.count_RPF.output.nb_RPF,
    output:
961
        plot = OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.pdf"),
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
    message:
        "Plotting read numbers for {wildcards.lib}_{wildcards.rep}."
    # Currently not used
    #log:
    #    log = OPJ(log_dir, "plot_read_numbers", "{lib}_{rep}.log"),
    #    err = OPJ(log_dir, "plot_read_numbers", "{lib}_{rep}.err")
    run:
        name = f"{wildcards.lib}_{wildcards.rep}"
        summary = pd.Series({
            "nb_raw" : read_int_from_file(input.nb_raw),
            # "nb_deduped" : read_int_from_file(input.nb_deduped),
            "nb_trimmed" : read_int_from_file(input.nb_trimmed),
            "nb_selected" : read_int_from_file(input.nb_selected),
            "nb_mapped" : read_int_from_file(input.nb_mapped),
            "nb_RPF" : read_int_from_file(input.nb_RPF)})
        # Chose column order:
        #summary = summary[["nb_raw", "nb_deduped", "nb_trimmed", "nb_selected", "nb_mapped", "nb_RPF"]]
        summary = summary[["nb_raw", "nb_trimmed", "nb_selected", "nb_mapped", "nb_RPF"]]
        save_plot(output.plot, plot_barchart, summary, title="%s reads" % name)


983
984
985
986
987
988
989
990
991
992
993
# Generate report with
# - raw
# - trimmed
# - deduplicated
# - size-selected
# - mapped
# - ambiguous
# - si, pi, mi
# - all_si ((22G-26G)(T*) that are not mi or pi)
rule make_read_counts_summary:
    input:
994
995
        # nb_raw = rules.trim_and_dedup.output.nb_raw,
        nb_raw = rules.trim.output.nb_raw,
996
        # nb_deduped = rules.trim_and_dedup.output.nb_deduped,
997
998
        # nb_trimmed = rules.trim_and_dedup.output.nb_trimmed,
        nb_trimmed = rules.trim.output.nb_trimmed,
999
1000
        nb_size_selected = rules.select_size_range.output.nb_selected,
        nb_mapped = rules.get_nb_mapped.output.nb_mapped,
For faster browsing, not all history is shown. View entire blame