"""
Snakefile to analyse RNA-seq data.
"""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")

# TODO: plot spike-in vs spike-in between libraries to detect abnormal spike-ins: the points should fall on a straight line (see the sketch below)

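# A minimal sketch for the TODO above (not wired into any rule): scatter the
# spike-in counts of one library against another. The argument names are
# illustrative assumptions: *counts* would be a spike-ins-by-libraries table
# (e.g. a pandas DataFrame with one column per library).
def plot_spikein_pair(counts, lib_x, lib_y, pdf_path):
    """Plot spike-in counts of *lib_x* against *lib_y* on log-log axes."""
    # Imported lazily so that pyplot is not loaded before the PDF backend
    # is selected further down in this file.
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    ax.loglog(counts[lib_x], counts[lib_y], linestyle="", marker=".")
    ax.set(xlabel=lib_x, ylabel=lib_y, title="spike-in counts")
    fig.savefig(pdf_path)
    plt.close(fig)
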
# TODO: Add rules to take into account spike-ins.
# The size factor should be determined within the dynamic range of the spike-ins
# Use 1 RPKM for LDD
# See /Genomes/spike-ins/ERCC_RNA_Spike-In_Control_Mixes.pdf pp. 13-14
# Then use size factor to compute normalized RPKM and then fold changes like for small RNA IP

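# A minimal sketch of the spike-in-based size factor evoked in the TODO above
# (not wired into the workflow). It assumes *spikein_counts* is a
# spike-ins-by-libraries pandas DataFrame already restricted to spike-ins
# falling within their dynamic range.
def spikein_size_factors(spikein_counts):
    """Return one scaling factor per library from total spike-in signal.

    Dividing a library's counts by its factor should make spike-in totals
    comparable across libraries.
    """
    totals = spikein_counts.sum(axis=0)
    # Scale the factors so that they average to 1 across libraries.
    return totals / totals.mean()
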
#TODO: same boxplots as small_RNA-seq
#TODO: for each contrast, scatterplots with highlight of differentially expressed genes (with p-value colour code); see the sketch below
# and same scale for different biotypes
# boxplots of categories of genes (only protein-coding)
# Prepare scripts to compare with small RNA (fold vs fold)
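
# A minimal sketch for the two TODOs above (not used by any rule): scatter
# mean expression in one condition against the other, colouring genes by
# adjusted p-value. *res* and its "padj" column are assumptions modelled on
# a DESeq2-like results table holding one mean-expression column per library.
def plot_de_scatter(res, cond, ref, pdf_path):
    """Scatter *cond* vs *ref* mean expression, coloured by -log10(padj)."""
    # Lazy imports keep this sketch self-contained.
    import matplotlib.pyplot as plt
    import numpy as np
    fig, ax = plt.subplots()
    points = ax.scatter(
        res[ref], res[cond], s=4,
        c=-np.log10(res["padj"]), cmap="viridis")
    ax.set(xscale="log", yscale="log", xlabel=ref, ylabel=cond)
    fig.colorbar(points, ax=ax, label="-log10(padj)")
    fig.savefig(pdf_path)
    plt.close(fig)
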
#TODO: try to use bedtools intersect to do the counting (only reads strictly included in feature)
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ mkfifo /tmp/gene_id
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ mkfifo /tmp/count
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ head -6 /tmp/WT_RT_1_protein_coding_counts_F1S.txt | tee >(cut -f9 | gtf2attr gene_id > /tmp/gene_id) | cut -f7,10 > /tmp/count & paste /tmp/gene_id /tmp/count
# "WBGene00022277"	-	170
# "WBGene00022276"	+	551
# "WBGene00022276"	+	522
# "WBGene00022276"	+	550
# "WBGene00022276"	+	550
# "WBGene00022276"	+	550


import os
OPJ = os.path.join
from glob import glob
from pickle import load
from collections import defaultdict
from itertools import combinations
from functools import partial
from gzip import open as gzopen

import warnings


def formatwarning(message, category, filename, lineno, line):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning


def debug_wildcards(wildcards):
    """Input function that just prints the wildcards, for debugging."""
    print(wildcards)
    return []


import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
mpl.use("PDF")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# https://stackoverflow.com/a/42768093/1878788
# from matplotlib.backends.backend_pgf import FigureCanvasPgf
# mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)
# TEX_PARAMS = {
#     "text.usetex": True,            # use LaTeX to write all text
#     "pgf.rcfonts": False,           # Ignore Matplotlibrc
#     "pgf.texsystem": "lualatex",  # hoping to avoid memory issues
#     "pgf.preamble": [
#         r'\usepackage{color}'     # xcolor for colours
#     ]
# }
# mpl.rcParams.update(TEX_PARAMS)
mpl.rcParams["font.sans-serif"] = [
    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams["font.family"] = "sans-serif"

import numpy as np
import pandas as pd
# To catch errors when plotting KDE
from scipy.linalg import LinAlgError
from scipy.stats import pearsonr, shapiro
from sklearn.linear_model import LinearRegression
import seaborn as sns
# To merge bigwig files
import pyBigWig
from mappy import fastx_read

from rpy2.robjects import Formula, StrVector
from libdeseq import do_deseq2
from libhts import aligner2min_mapq
from libhts import status_setter, plot_lfc_distribution, plot_MA
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import (
    plot_paired_scatters,
    plot_norm_correlations,
    plot_counts_distribution,
    plot_boxplots)
from libworkflows import wc_applied, ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_intersect_counts, sum_intersect_counts
from libworkflows import read_feature_counts, sum_feature_counts
from smincludes import rules as irules

alignment_settings = {"bowtie2": "", "hisat2": "", "crac": "-k 20 --stranded --use-x-in-cigar"}

# Possible feature ID conversions
ID_TYPES = ["name", "cosmid"]

# TODO: make this part of the configuration
LFC_RANGE = {
    "spike_ins" : (-10, 10),
    "protein_coding" : (-10, 10),
    "DNA_transposons_rmsk" : (-10, 10),
    "RNA_transposons_rmsk" : (-10, 10),
    "satellites_rmsk" : (-10, 10),
    "simple_repeats_rmsk" : (-10, 10),
    "DNA_transposons_rmsk_families" : (-10, 10),
    "RNA_transposons_rmsk_families" : (-10, 10),
    "satellites_rmsk_families" : (-10, 10),
    "simple_repeats_rmsk_families" : (-10, 10),
    "pseudogene" : (-10, 10),
    "all_rmsk" : (-10, 10),
    "all_rmsk_families" : (-10, 10),
    "alltypes" : (-10, 10)}
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
#status2colour = make_status2colour(DOWN_STATUSES, UP_STATUSES)
#STATUSES = list(reversed(DOWN_STATUSES)) + ["down", "NS", "up"] + UP_STATUSES
#STATUS2COLOUR = dict(zip(STATUSES, sns.color_palette("coolwarm", len(STATUSES))))

# Or use --configfile instead
#configfile:
#    "RNA-seq.config.json"


spikein_dict = config["spikein_dict"]
spikein_microliters = spikein_dict["microliters"]
spikein_dilution_factor = spikein_dict["dilution_factor"]
# To get the expected number of attomoles of a given spike-in,
# multiply this by the value found in column 4 of Genomes/spike-ins/ERCC92_concentrations.txt
spikein_microliter_equivalent = spikein_microliters / spikein_dilution_factor
if spikein_microliter_equivalent:
    spikein_concentrations = pd.read_table(
        spikein_dict["concentrations"],
        index_col="ERCC ID",
        usecols=["ERCC ID", "concentration in Mix %s (attomoles/ul)" % spikein_dict["mix"]])
    from scipy.constants import Avogadro
    # attomole: 10^{-18} mole
    molecules_per_attomole = Avogadro / 10**18
    spikein_expected_counts = spikein_concentrations * spikein_microliter_equivalent * molecules_per_attomole
    spikein_expected_counts.columns = ["expected_counts"]

# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
ORIENTATIONS = config["orientations"]
# key: library name
# value: dictionary
#   key: replicate number
#   value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
REPS = config["replicates"]
#COND_PAIRS = config["cond_pairs"]
#CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in COND_PAIRS]
DE_COND_PAIRS = config["de_cond_pairs"]
msg = "\n".join([
    "Some contrasts do not use known library names.",
    "Contrasts: " + ", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg
IP_COND_PAIRS = config["ip_cond_pairs"]
msg = "\n".join([
    "Some contrasts do not use known library names.",
    "Contrasts: " + ", ".join([f"({cond}, {ref})" for (cond, ref) in IP_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in IP_COND_PAIRS]), msg
COND_PAIRS = DE_COND_PAIRS + IP_COND_PAIRS
DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS]
IP_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in IP_COND_PAIRS]
contrasts_dict = {"de" : DE_CONTRASTS, "ip" : IP_CONTRASTS}
CONTRASTS = DE_CONTRASTS + IP_CONTRASTS
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
CONDITIONS = [{
    "lib" : lib,
    "rep" : rep} for rep in REPS for lib in LIBS]
# We use this in various places, to always keep the same library order:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
# COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
#     cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")


BIOTYPES = config["biotypes"]

RMSK_BIOTYPES = [
    "DNA_transposons_rmsk",
    "RNA_transposons_rmsk",
    "satellites_rmsk",
    "simple_repeats_rmsk"]
RMSK_FAMILIES_BIOTYPES = [
    "DNA_transposons_rmsk_families",
    "RNA_transposons_rmsk_families",
    "satellites_rmsk_families",
    "simple_repeats_rmsk_families"]

BIOTYPES_TO_JOIN = {
    "all_rmsk": [biotype for biotype in BIOTYPES if biotype in RMSK_BIOTYPES],
    "all_rmsk_families": [biotype for biotype in BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES],
    # We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
    "alltypes": [biotype for biotype in BIOTYPES if not biotype.startswith("protein_coding_")]}
JOINED_BIOTYPES = list(BIOTYPES_TO_JOIN.keys())
DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES + JOINED_BIOTYPES]
if "spike_ins" in BIOTYPES:
    PAIR_SCATTER_BIOTYPES = ["spike_ins"]
else:
    PAIR_SCATTER_BIOTYPES = []
ID_LISTS = config["boxplot_gene_lists"]
#ID_LISTS = [
#    "germline_specific", "histone",
#    "csr1_prot_si_supertargets_48hph",
#    "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"]
aligner = config["aligner"]
########################
# Genome configuration #
########################
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# bed file binning the genome in 10nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"]
gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
#COUNTERS = config["counters"]
# htseq_count is very slow
# bedtools intersect eats all the RAM when there are a lot of rRNAs:
# https://github.com/arq5x/bedtools2/issues/565
COUNTERS = ["feature_count"]
output_dir = config["output_dir"]
transcriptomes_dir = OPJ(output_dir, "merged_transcriptomes")
log_dir = OPJ(output_dir, f"logs_{genome}")
data_dir = OPJ(output_dir, "data")

#NORM_TYPES = config["norm_types"]
if spikein_microliter_equivalent:
    SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref", "spike_ins"]
    NORM_TYPES = ["spike_ins", "protein_coding", "median_ratio_to_pseudo_ref"]
else:
    SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref"]
    NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"]
    assert "spike_ins" not in BIOTYPES
assert set(SIZE_FACTORS).issubset(set(BIOTYPES) | {"median_ratio_to_pseudo_ref"})
assert set(NORM_TYPES).issubset(set(SIZE_FACTORS))

# PolyU tail min and max length
minU, maxU = 1, 5

# Seems to interfere with temp files deletion
wildcard_constraints:
    rep=r"\d+",
    orientation="|".join(ORIENTATIONS),
    biotype="|".join(BIOTYPES + JOINED_BIOTYPES),
    fold_type="|".join(["mean_log2_RPM_fold", "log2FoldChange", "lfcMLE"]),

#ruleorder: join_all_counts > gather_counts

# Define functions to be used in shell portions
shell.prefix(SHELL_FUNCTIONS)


def specific_input(aligner):
    if aligner == "hisat2":
        files = []
        # files = [
        #     expand(
        #         OPJ(output_dir, aligner, f"mapped_{genome}", "ballgown",
        #             "{lib}_{rep}_on_%s.gtf" % genome),
        #         lib=LIBS, rep=REPS),
        #     OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare.stats"),
        #     OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),]
    elif aligner == "bowtie2":
        files = []
    elif aligner == "crac":
        files = []
    else:
        raise NotImplementedError("%s is not yet handled." % aligner)
    return files

counts_files = [
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "{lib}_mean_{mapped_type}", "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
        counter=COUNTERS, lib=LIBS, mapped_type=[f"on_{genome}"],
        biotype=["alltypes"], orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "all_{mapped_type}", "{biotype}_{orientation}_TPM.txt"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        biotype=["alltypes"], orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "all_{mapped_type}", "{biotype}_{orientation}_RPK.txt"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         "summaries", f"{{lib}}_{{rep}}_on_{genome}_{{orientation}}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, biotype=BIOTYPES, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, biotype=BIOTYPES, orientation=ORIENTATIONS),]
    # Just a test:
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], orientation=ORIENTATIONS),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),]
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"unique_on_{genome}"], orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),]

# Actually not necessarily IP data
# TODO: generalize contrast types
if contrasts_dict["ip"]:
    ip_fold_boxplots = [
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}",
                "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"),
            #counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
            counter=COUNTERS, mapped_type=[f"on_{genome}"],
            contrast_type=["ip"],
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
            fold_type=["mean_log2_RPM_fold"], id_list=ID_LISTS),]
else:
    ip_fold_boxplots = []

if contrasts_dict["de"]:
    de_fold_boxplots = [
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}",
                "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"),
            counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
            contrast_type=["de"],
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
            fold_type=["log2FoldChange"], id_list=ID_LISTS),]
else:
    de_fold_boxplots = []

de_files = [
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_{fold_type}_distribution.pdf"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
        fold_type=["log2FoldChange"]),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_MA_with_{id_list}.pdf"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
        id_list=ID_LISTS + ["lfc_statuses"]),
    de_fold_boxplots]

bigwig_files = [
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}",
    #         "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"),
    #     lib=LIBS, rep=REPS, orientation=ORIENTATIONS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], norm_type=NORM_TYPES),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean",
    #         "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"),
    #     lib=LIBS, orientation=ORIENTATIONS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], norm_type=NORM_TYPES),]
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}",
            "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"),
        lib=LIBS, rep=REPS, orientation=ORIENTATIONS,
        #mapped_type=[f"on_{genome}", f"unique_on_{genome}"], norm_type=NORM_TYPES),
        mapped_type=[f"on_{genome}"], norm_type=NORM_TYPES),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean",
            "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"),
        lib=LIBS, orientation=ORIENTATIONS,
        #mapped_type=[f"on_{genome}", f"unique_on_{genome}"], norm_type=NORM_TYPES),]
        mapped_type=[f"on_{genome}"], norm_type=NORM_TYPES),]

if spikein_microliter_equivalent:
    spikein_files = [
        expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{lib}_{rep}_spike_ins_{orientation}_TPM_response.pdf"),
            counter=COUNTERS, orientation=ORIENTATIONS, lib=LIBS, rep=REPS),]
else:
    spikein_files = []


rule all:
    input:
        specific_input(aligner),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
            lib=LIBS, rep=REPS),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"),
            lib=LIBS, rep=REPS),
        bigwig_files,
        spikein_files,
        # Uses too much memory when there are many libraries
        # expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
        #     f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
        #     counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS, quantif_type=["TPM"]),
        # expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
        #     f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
        #     counter=COUNTERS, biotype=PAIR_SCATTER_BIOTYPES, orientation=ORIENTATIONS, quantif_type=["counts", "RPK"]),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
                "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_correlations.pdf"),
            counter=COUNTERS, mapped_type=[f"on_{genome}"], orientation=ORIENTATIONS, biotype=BIOTYPES + ["alltypes"]),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
                "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_counts_distrib.pdf"),
            counter=COUNTERS, mapped_type=[f"on_{genome}"], orientation=ORIENTATIONS, biotype=BIOTYPES + ["alltypes"]),
        de_files,
        ip_fold_boxplots,
        counts_files,
        ## TODO: debug:
        # Something to do with partial expand on quantification type?
        # Missing input files for rule compare_replicates:
        # results_44hph/hisat2/mapped_C_elegans/{counter}/all_{mapped_type}/{biotype}_{orientation}_RPM.txt
        # MissingInputException in line 1646 of /home/bli/src/bioinfo_utils/RNA-seq/RNA-seq.snakefile:
        # Missing input files for rule differential_expression:
        # results_44hph_vs_38hph/hisat2/mapped_C_elegans/{counter}/all_{mapped_type}/{biotype}_{orientation}_counts.txt
        # see https://bitbucket.org/snakemake/snakemake/issues/956/placeholders-not-properly-matched-with
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
                "all_{mapped_type}", "replicates_comparison",
                "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"),
            counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
            lib=LIBS, biotype=["alltypes"],
            orientation=ORIENTATIONS, quantif_type=["TPM"]),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
                "all_{mapped_type}", "replicates_comparison",
                "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"),
            counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
            lib=LIBS, biotype=BIOTYPES + ["alltypes"],
            orientation=ORIENTATIONS, quantif_type=["counts", "RPM"]),
        ##


include: ensure_relative(irules["link_raw_data"], workflow.basedir)
#include: "../snakemake_wrappers/includes/link_raw_data.rules"


# def mapping_command(aligner):
#     """This function returns the shell commands to run given the *aligner*."""
#     if aligner == "hisat2":
#         shell_commands = """
# cmd="hisat2 -p {threads} --dta --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
# echo ${{cmd}} 1> {log.log}
# eval ${{cmd}} 1>> {log.log} 2> {log.err}
# """ % genome_db
#     elif aligner == "bowtie2":
#         shell_commands = """
# cmd="bowtie2 -p {threads} --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
# echo ${{cmd}} 1> {log.log}
# eval ${{cmd}} 1>> {log.log} 2> {log.err}
# """ % genome_db
#     else:
#         raise NotImplementedError("%s is not yet handled." % aligner)
#     return shell_commands

# Could this be useful?
# https://stackoverflow.com/a/5328139/1878788
#def debug_output():
#    exc_type, exc_value, tb = sys.exc_info()
#    if tb is not None:
#        print(tb.tb_frame.f_locals)
#    return ""

rule map_on_genome:
    input:
        fastq = OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
    output:
        # temp because it uses a lot of space
        sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)),
        #sam = temp(OPJ(output_dir, aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s.sam" % genome)),
        nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
    params:
        aligner = aligner,
        index = genome_db,
        settings = alignment_settings[aligner],
    message:
        "Mapping {wildcards.lib}_{wildcards.rep} on %s." % genome
    log:
        log = OPJ(log_dir, "map_on_genome_{lib}_{rep}.log"),
        err = OPJ(log_dir, "map_on_genome_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    #shell:
    #    mapping_command(aligner)
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"


rule extract_nomap_polyU:
    f"""
    Extract from the non-mappers those that end with a polyU tail
    of length from {minU} to {maxU}."""
    input:
        nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
    output:
        nomap_polyU = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s_polyU.fastq.gz" % genome),
        nomap_polyU_stats = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s_polyU_stats.txt" % genome),
    run:
        stats = defaultdict(int)
        with gzopen(output.nomap_polyU, "wb") as fq_out:
            for (name, seq, qual) in fastx_read(input.nomap_fastq):
                trimmed = seq.rstrip("T")
                nb_T = len(seq) - len(trimmed)
                stats[nb_T] += 1
                if minU <= nb_T <= maxU:
                    # Write the trimmed read, truncating the quality string to match
                    fq_out.write(f"@{name}\n{trimmed}\n+\n{qual:.{len(trimmed)}}\n".encode("utf-8"))
        with open(output.nomap_polyU_stats, "w") as stats_file:
            tot = 0
            for (nb_T, stat) in sorted(stats.items()):
                stats_file.write(f"{nb_T}\t{stat}\n")
                tot += stat
            stats_file.write(f"tot\t{tot}\n")


rule remap_on_genome:
    input:
        fastq = rules.extract_nomap_polyU.output.nomap_polyU,
    output:
        # temp because it might use a lot of space
        sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_polyU_on_%s.sam" % genome)),
        nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_polyU_unmapped_on_%s.fastq.gz" % genome),
    params:
        aligner = aligner,
        index = genome_db,
        settings = alignment_settings[aligner],
    message:
        "Remapping {wildcards.lib}_{wildcards.rep} polyU-trimmed unmapped reads on %s." % genome
    log:
        log = OPJ(log_dir, "remap_on_genome_{lib}_{rep}.log"),
        err = OPJ(log_dir, "remap_on_genome_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    #shell:
    #    mapping_command(aligner)
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"


def source_sam(wildcards):
    mapped_type = wildcards.mapped_type
    if mapped_type == f"on_{genome}":
        ## debug
        # Why rules.map_on_genome.output.sam doesn't get properly expanded?
        #return rules.map_on_genome.output.sam
        return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}_on_%s.sam" % genome)
        ##
    elif mapped_type == f"polyU_on_{genome}":
        return rules.remap_on_genome.output.sam
    elif mapped_type == f"unique_on_{genome}":
        raise NotImplementedError(f"{mapped_type} only handled from count level on.")
    else:
        raise NotImplementedError(f"{mapped_type} not handled (yet?)")


rule sam2indexedbam:
    input:
        #debug_wildcards,
        # sam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}.sam"),
        sam = source_sam,
    output:
        # sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam")),
        # index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai")),
        sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam")),
        index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam.bai")),
    message:
        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.mapped_type}."
    log:
        log = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}_{mapped_type}.log"),
        err = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}_{mapped_type}.err"),
    resources:
        io=45
    threads:
        8
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"


rule compute_mapping_stats:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        stats = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_samtools_stats.txt"),
    shell:
        """samtools stats {input.sorted_bam} > {output.stats}"""


rule get_nb_mapped:
    """Extracts the number of mapped reads from samtools stats results."""
    input:
        stats = rules.compute_mapping_stats.output.stats,
        #stats = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
    output:
        nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_nb_mapped.txt"),
    shell:
        """samstats2mapped {input.stats} {output.nb_mapped}"""


rule compute_coverage:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        coverage = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_coverage.txt"),
    params:
        genomelen = genomelen,
    shell:
        """
        bases=$(samtools depth {input.sorted_bam} | awk '{{sum += $3}} END {{print sum}}') || error_exit "samtools depth failed"
        python3 -c "print(${{bases}} / {params.genomelen})" > {output.coverage}
        """


rule assemble_transcripts:
    input:
        sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        gtf = OPJ(output_dir, aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"),
    params:
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Assembling transcripts for {wildcards.lib}_{wildcards.rep} with stringtie."
    log:
        log = OPJ(log_dir, "assemble_transcripts_{lib}_{rep}.log"),
        err = OPJ(log_dir, "assemble_transcripts_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    shell:
        """
        cmd="stringtie -p {threads} -G {params.annot} -o {output} -l {wildcards.lib}_{wildcards.rep} {input}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """


rule merge_transcripts:
    input:
        expand(OPJ(output_dir, aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"), lib=LIBS, rep=REPS),
    output:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
    params:
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Merging transcripts with stringtie."
    log:
        log = OPJ(log_dir, "merge_transcripts.log"),
        err = OPJ(log_dir, "merge_transcripts.err")
    resources:
        io=5
    shell:
        """
        cmd="stringtie --merge -G {params.annot} -o {output} {input}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """


rule gffcompare:
    input:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
    output:
        stats = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare.stats"),
    params:
        compare = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare"),
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Merging transcripts with stringtie."
    log:
        log = OPJ(log_dir, "gffcompare.log"),
        err = OPJ(log_dir, "gffcompare.err")
    resources:
        io=5
    shell:
        """
        cmd="gffcompare -r {params.annot} -o {params.compare} {input.merged}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """


rule quantify_transcripts:
    input:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
        sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        gtf = OPJ(output_dir, aligner, f"mapped_{genome}", "ballgown", f"{{lib}}_{{rep}}_on_{genome}.gtf"),
    message:
        "Quantifying transcripts for {wildcards.lib}_{wildcards.rep} with stringtie."
    log:
        OPJ(log_dir, "quantify_transcripts_{lib}_{rep}.log")
    threads:
        4
    resources:
        io=5
    shell:
        """
        cmd="stringtie -e -B -p {threads} -G {input.merged} -o {output} {input.sorted_bam}"
        echo ${{cmd}} 1> {log}
        eval ${{cmd}} 1>> {log}
        """


# def htseq_orientation2stranded(wildcards):
#     orientation = wildcards.orientation
#     if orientation == "fwd":
#         if LIB_TYPE[-2:] == "SF":
#             return "yes"
#         elif LIB_TYPE[-2:] == "SR":
#             return "reverse"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "rev":
#         if LIB_TYPE[-2:] == "SF":
#             return "reverse"
#         elif LIB_TYPE[-2:] == "SR":
#             return "yes"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "all":
#         return "no"
#     else:
#         exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")


def biotype2annot(wildcards):
    if wildcards.biotype.endswith("_rmsk_families"):
        biotype = wildcards.biotype[:-9]
    else:
        biotype = wildcards.biotype
    return OPJ(annot_dir, f"{biotype}.gtf")


# rule htseq_count_reads:
#     input:
#         sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
#         bai = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"),
#     output:
#         counts = OPJ(output_dir, aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
#         counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
#     params:
#         stranded = htseq_orientation2stranded,
#         mode = "union",
#         annot = biotype2annot,
#     message:
#         "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with htseq-count."
#     benchmark:
#         OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
#     log:
#         log = OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
#         err = OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
#     wrapper:
#         "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/htseq_count_reads"


def source_sorted_bam(wildcards):
    mapped_type = wildcards.mapped_type
    if mapped_type.startswith("unique_"):
        real_mapped_type = mapped_type[7:]
    else:
        real_mapped_type = mapped_type
    return OPJ(
        output_dir, aligner, f"mapped_{genome}",
        f"{wildcards.lib}_{wildcards.rep}_{real_mapped_type}_sorted.bam")


def source_index(wildcards):
    mapped_type = wildcards.mapped_type
    if mapped_type.startswith("unique_"):
        real_mapped_type = mapped_type[7:]
    else:
        real_mapped_type = mapped_type
    return OPJ(
        output_dir, aligner, f"mapped_{genome}",
        f"{wildcards.lib}_{wildcards.rep}_{real_mapped_type}_sorted.bam.bai")


rule feature_count_reads:
    input:
        sorted_bam = source_sorted_bam,
        index = source_index,
        #sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        #index = rules.sam2indexedbam.output.index,
    output:
        counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
    #wildcard_constraints:
    #    mapped_type = "|".join([f"on_{genome}", f"polyU_on_{genome}"])
    params:
        min_mapq = partial(aligner2min_mapq, aligner),
        stranded = feature_orientation2stranded(LIB_TYPE),
        annot = biotype2annot,
        # pickled dictionary that associates gene ids to gene names
        converter = genome_dict["converter"]
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} {wildcards.mapped_type} with featureCounts."
    benchmark:
        OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}_benchmark.txt")
    log:
        log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.log"),
        err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.err"),
    shell:
        """
        tmpdir=$(TMPDIR=/var/tmp mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.mapped_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
        cmd="featureCounts {params.min_mapq} -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
        featureCounts -v 2> {log.log}
        echo ${{cmd}} 1>> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
        """


# rule feature_count_unique_reads:
#     input:
#         sorted_bam = source_sorted_bam,
#         index = source_index,
#         #sorted_bam = rules.sam2indexedbam.output.sorted_bam,
#         #index = rules.sam2indexedbam.output.index,
#     output:
#         counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
#         counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
#     #wildcard_constraints:
#     #    mapped_type = f"unique_on_{genome}"
#     params:
#         min_mapq = partial(aligner2min_mapq, aligner),
#         stranded = feature_orientation2stranded(LIB_TYPE),
#         annot = biotype2annot,
#         # pickled dictionary that associates gene ids to gene names
#         converter = genome_dict["converter"]
#     message:
#         "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} {wildcards.mapped_type} with featureCounts."
#     benchmark:
#         OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}_benchmark.txt")
#     log:
#         log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.log"),
#         err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.err"),
#     shell:
#         """
#         tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.mapped_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
#         cmd="featureCounts {params.min_mapq} -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
#         featureCounts -v 2> {log.log}
#         echo ${{cmd}} 1>> {log.log}
#         eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
#         rm -rf ${{tmpdir}}
#         cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
#         """


# def biotype2merged(wildcards):
#     if wildcards.biotype.endswith("_rmsk_families"):
#         biotype = wildcards.biotype[:-9]
#     else:
#         biotype = wildcards.biotype
#     return OPJ(annot_dir, f"{biotype}_merged.bed")
#
#
# def intersect_orientation2stranded(wildcards):
#     orientation = wildcards.orientation
#     if orientation == "fwd":
#         if LIB_TYPE[-2:] == "SF":
#             return "-s"
#         elif LIB_TYPE[-2:] == "SR":
#             return "-S"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "rev":
#         if LIB_TYPE[-2:] == "SF":
#             return "-S"
#         elif LIB_TYPE[-2:] == "SR":
#             return "-s"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "all":
#         return ""
#     else:
#         exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")
#
#
# rule intersect_count_reads:
#     input:
#         sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
#         bai = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"),
#     output:
#         counts = OPJ(output_dir, aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
#         counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
#     params:
#         stranded = intersect_orientation2stranded,
#         annot = biotype2merged,
#     message:
#         "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with bedtools intersect."
#     benchmark:
#         OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
#     log:
#         log = OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
#         err = OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
#     threads: 4  # to limit memory usage, actually
#     wrapper:
#         "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/intersect_count_reads"


rule summarize_counts:
    """For a given library, write a summary of the read counts for the various biotypes."""
    input:
        biotype_counts_files = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", "{{lib}}_{{rep}}_{{mapped_type}}", "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES),
    output:
        summary = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt")
    run:
        if wildcards.counter == "htseq_count":
            sum_counter = sum_htseq_counts
        elif wildcards.counter == "intersect_count":
            sum_counter = sum_intersect_counts
        elif wildcards.counter == "feature_count":
            sum_counter = sum_feature_counts
        else:
            raise NotImplementedError(f"{wildcards.counter} not handled (yet?)")
        with open(output.summary, "w") as summary_file:
            header = "\t".join(BIOTYPES)
            summary_file.write("%s\n" % header)
            sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
            summary_file.write("%s\n" % sums)


rule gather_read_counts_summaries:
    """Gather read count summaries across libraries: lib_rep -> all."""
    input:
        summary_tables = expand(OPJ(