"""
Snakefile to analyse RNA-seq data.
"""

import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")

# TODO: plot spike-in vs spike-in between libraries to detect abnormal
# spike-ins: this should give a straight line.

# TODO: Add rules to take into account spike-ins.
# The size factor should be determined within the dynamic range of the spike-ins.
# Use 1 RPKM for LDD.
# See /Genomes/spike-ins/ERCC_RNA_Spike-In_Control_Mixes.pdf pp. 13-14.
# Then use the size factor to compute normalized RPKM, and then fold changes,
# as for the small RNA IP analyses.

# TODO: same boxplots as small_RNA-seq

# TODO: for each contrast, scatterplots highlighting differentially expressed
# genes (with a p-value colour code), using the same scale for the different
# biotypes; boxplots of gene categories (protein-coding only).
# Prepare scripts to compare with small RNA data (fold vs fold).

# TODO: try to use bedtools intersect to do the counting
# (only counting reads strictly included in a feature):
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ mkfifo /tmp/gene_id
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ mkfifo /tmp/count
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ head -6 /tmp/WT_RT_1_protein_coding_counts_F1S.txt | tee >(cut -f9 | gtf2attr gene_id > /tmp/gene_id) | cut -f7,10 > /tmp/count & paste /tmp/gene_id /tmp/count
# "WBGene00022277"	-	170
# "WBGene00022276"	+	551
# "WBGene00022276"	+	522
# "WBGene00022276"	+	550
# "WBGene00022276"	+	550
# "WBGene00022276"	+	550

import os
OPJ = os.path.join
from glob import glob
from pickle import load
from collections import defaultdict
from itertools import combinations
from functools import partial
from gzip import open as gzopen

import warnings


def formatwarning(message, category, filename, lineno, line=None):
    """Format warning messages on a single line (overrides warnings.formatwarning)."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning
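# Warnings are then rendered as one "file:lineno: Category: message" line,
# e.g. (hypothetical location):
# RNA-seq.snakefile:42: UserWarning: not enough replicates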


def debug_wildcards(wildcards):
    """Input function that just prints the wildcards, for debugging purposes."""
    print(wildcards)
    return []

import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
mpl.use("PDF")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# https://stackoverflow.com/a/42768093/1878788
# from matplotlib.backends.backend_pgf import FigureCanvasPgf
# mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)
# TEX_PARAMS = {
#     "text.usetex": True,            # use LaTeX to write all text
#     "pgf.rcfonts": False,           # Ignore Matplotlibrc
#     "pgf.texsystem": "lualatex",  # hoping to avoid memory issues
#     "pgf.preamble": [
#         r'\usepackage{color}'     # xcolor for colours
#     ]
# }
# mpl.rcParams.update(TEX_PARAMS)
mpl.rcParams["font.sans-serif"] = [
    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams["font.family"] = "sans-serif"

import numpy as np
import pandas as pd
# To catch errors when plotting KDE
from scipy.linalg import LinAlgError
from scipy.stats import pearsonr, shapiro
from sklearn.linear_model import LinearRegression
import seaborn as sns

# To merge bigwig files
import pyBigWig
from mappy import fastx_read

from rpy2.robjects import Formula, StrVector
from libdeseq import do_deseq2
from libhts import aligner2min_mapq
from libhts import status_setter, plot_lfc_distribution, plot_MA
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import (
    plot_paired_scatters,
    plot_norm_correlations,
    plot_counts_distribution,
    plot_boxplots)
from libworkflows import wc_applied, ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import sum_by_family
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_intersect_counts, sum_intersect_counts
from libworkflows import read_feature_counts, sum_feature_counts
from smincludes import rules as irules
from smwrappers import wrappers_dir

alignment_settings = {"bowtie2": "", "hisat2": "", "crac": "-k 20 --stranded --use-x-in-cigar"}

# Possible feature ID conversions
ID_TYPES = ["name", "cosmid"]

# TODO: make this part of the configuration
LFC_RANGE = {
    "spike_ins": (-10, 10),
    "protein_coding": (-10, 10),
    "DNA_transposons_rmsk": (-10, 10),
    "RNA_transposons_rmsk": (-10, 10),
    "satellites_rmsk": (-10, 10),
    "simple_repeats_rmsk": (-10, 10),
    "DNA_transposons_rmsk_families": (-10, 10),
    "RNA_transposons_rmsk_families": (-10, 10),
    "satellites_rmsk_families": (-10, 10),
    "simple_repeats_rmsk_families": (-10, 10),
    "pseudogene": (-10, 10),
    "all_rmsk": (-10, 10),
    "all_rmsk_families": (-10, 10),
    "alltypes": (-10, 10)}

# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
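# With the cutoffs above: UP_STATUSES == ["up0.5", "up1", "up2"]
# and DOWN_STATUSES == ["down0.5", "down1", "down2"].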

#status2colour = make_status2colour(DOWN_STATUSES, UP_STATUSES)
#STATUSES = list(reversed(DOWN_STATUSES)) + ["down", "NS", "up"] + UP_STATUSES
#STATUS2COLOUR = dict(zip(STATUSES, sns.color_palette("coolwarm", len(STATUSES))))

# Or use --configfile instead
#configfile:
#    "RNA-seq.config.json"

spikein_dict = config["spikein_dict"]
spikein_microliters = spikein_dict["microliters"]
spikein_dilution_factor = spikein_dict["dilution_factor"]
# To get the expected number of attomoles of a given spike-in,
# multiply this by the value found in column 4 of Genomes/spike-ins/ERCC92_concentrations.txt
spikein_microliter_equivalent = spikein_microliters / spikein_dilution_factor

if spikein_microliter_equivalent:
    spikein_concentrations = pd.read_table(
        spikein_dict["concentrations"],
        index_col="ERCC ID",
        usecols=["ERCC ID", "concentration in Mix %s (attomoles/ul)" % spikein_dict["mix"]])
    from scipy.constants import Avogadro
    # attomole: 10^{-18} mole
    molecules_per_attomole = Avogadro / 10**18
    spikein_expected_counts = spikein_concentrations * spikein_microliter_equivalent * molecules_per_attomole
    spikein_expected_counts.columns = ["expected_counts"]
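
# Worked example with hypothetical numbers: for 2 ul of a 1:1000 dilution,
# spikein_microliter_equivalent == 0.002, so a spike-in at 100 attomoles/ul
# is expected to yield 100 * 0.002 * (6.022e23 / 1e18) ~= 120443 molecules.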

# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
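# e.g. "ISF" or "ISR"; only the last two letters (SF: read on the forward
# strand, SR: read on the reverse strand) are used by the
# *_orientation2stranded helpers below, which test LIB_TYPE[-2:].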
ORIENTATIONS = config["orientations"]
# key: library name
# value: dictionary
#   key: replicate number
#   value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
REPS = config["replicates"]

#COND_PAIRS = config["cond_pairs"]
#CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in COND_PAIRS]
DE_COND_PAIRS = config["de_cond_pairs"]
msg = "\n".join([
    "Some contrasts do not use known library names.",
    "Contrasts: " + ", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg
IP_COND_PAIRS = config["ip_cond_pairs"]
msg = "\n".join([
    "Some contrasts do not use known library names.",
    "Contrasts: " + ", ".join([f"({cond}, {ref})" for (cond, ref) in IP_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in IP_COND_PAIRS]), msg
COND_PAIRS = DE_COND_PAIRS + IP_COND_PAIRS
DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS]
IP_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in IP_COND_PAIRS]
contrasts_dict = {"de": DE_CONTRASTS, "ip": IP_CONTRASTS}
CONTRASTS = DE_CONTRASTS + IP_CONTRASTS
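# For instance (hypothetical configuration), de_cond_pairs [["mut", "WT"]]
# yields DE_CONTRASTS == ["mut_vs_WT"]: "mut" compared to the "WT" reference.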
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
CONDITIONS = [{
    "lib": lib,
    "rep": rep} for rep in REPS for lib in LIBS]
# We use this in various places, so as to always have the same library order:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
# COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
#     cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
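# For instance, with LIBS == ["WT", "mut"] and REPS == ["1", "2"]
# (hypothetical), COND_NAMES == ["WT_1", "mut_1", "WT_2", "mut_2"].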


BIOTYPES = config["biotypes"]

RMSK_BIOTYPES = [
    "DNA_transposons_rmsk",
    "RNA_transposons_rmsk",
    "satellites_rmsk",
    "simple_repeats_rmsk"]
RMSK_FAMILIES_BIOTYPES = [
    "DNA_transposons_rmsk_families",
    "RNA_transposons_rmsk_families",
    "satellites_rmsk_families",
    "simple_repeats_rmsk_families"]

BIOTYPES_TO_JOIN = {
    "all_rmsk": [biotype for biotype in BIOTYPES if biotype in RMSK_BIOTYPES],
    "all_rmsk_families": [biotype for biotype in BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES],
    # We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
    "alltypes": [biotype for biotype in BIOTYPES if not biotype.startswith("protein_coding_")]}
JOINED_BIOTYPES = list(BIOTYPES_TO_JOIN.keys())
DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES + JOINED_BIOTYPES]
if "spike_ins" in BIOTYPES:
    PAIR_SCATTER_BIOTYPES = ["spike_ins"]
else:
    PAIR_SCATTER_BIOTYPES = []
ID_LISTS = config.get("boxplot_gene_lists", [])
#ID_LISTS = [
#    "germline_specific", "histone",
#    "csr1_prot_si_supertargets_48hph",
#    "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"]
aligner = config["aligner"]

########################
# Genome configuration #
########################
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# bed file binning the genome in 10nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"]
gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
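# chrom_sizes maps chromosome names to their lengths; genomelen (the summed
# total) is used below by the compute_coverage rule.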

#COUNTERS = config["counters"]
# htseq_count is very slow
# bedtools intersect eats all the RAM when there are a lot of rRNAs:
# https://github.com/arq5x/bedtools2/issues/565
COUNTERS = ["feature_count"]

#output_dir = config["output_dir"]
#workdir: config["output_dir"]
output_dir = os.path.abspath(".")
transcriptomes_dir = OPJ("merged_transcriptomes")
log_dir = OPJ(f"logs_{genome}")
data_dir = OPJ("data")

#NORM_TYPES = config["norm_types"]
if spikein_microliter_equivalent:
    SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref", "spike_ins"]
    NORM_TYPES = ["spike_ins", "protein_coding", "median_ratio_to_pseudo_ref"]
else:
    SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref"]
    NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"]
    assert "spike_ins" not in BIOTYPES
assert set(SIZE_FACTORS).issubset(set(BIOTYPES) | {"median_ratio_to_pseudo_ref"})
assert set(NORM_TYPES).issubset(set(SIZE_FACTORS))
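
# A minimal sketch of the median-of-ratios size factors (DESeq-style); the
# actual implementation used here is
# libhts.median_ratio_to_pseudo_ref_size_factors. *counts_data* is assumed
# to be a genes x libraries DataFrame:
# def median_ratio_size_factors(counts_data):
#     # Pseudo-reference: per-gene geometric mean across libraries,
#     # computed on the genes detected in every library.
#     log_counts = np.log(counts_data[(counts_data > 0).all(axis=1)])
#     pseudo_ref = log_counts.mean(axis=1)
#     # Size factor of a library: median, across genes, of the ratios
#     # to the pseudo-reference.
#     return np.exp(log_counts.sub(pseudo_ref, axis=0)).median(axis=0)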

# PolyU tail min and max length
minU, maxU = 1, 5

# Seems to interfere with temp files deletion
wildcard_constraints:
    rep=r"\d+",
    orientation="|".join(ORIENTATIONS),
    biotype="|".join(BIOTYPES + JOINED_BIOTYPES),
    fold_type="|".join(["mean_log2_RPM_fold", "log2FoldChange", "lfcMLE"]),

#ruleorder: join_all_counts > gather_counts

# Define functions to be used in shell portions
shell.prefix(SHELL_FUNCTIONS)


def specific_input(aligner):
    """Return the targets that depend on the chosen *aligner*."""
    if aligner == "hisat2":
        files = []
        # files = [
        #     expand(
        #         OPJ(aligner, f"mapped_{genome}", "ballgown",
        #             "{lib}_{rep}_on_%s.gtf" % genome),
        #         lib=LIBS, rep=REPS),
        #     OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare.stats"),
        #     OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),]
    elif aligner == "bowtie2":
        files = []
    elif aligner == "crac":
        files = []
    else:
        raise NotImplementedError("%s is not yet handled." % aligner)
    return files

counts_files = [
    expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "{lib}_mean_{mapped_type}", "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
        counter=COUNTERS, lib=LIBS, mapped_type=[f"on_{genome}"],
        biotype=["alltypes"], orientation=ORIENTATIONS),
    expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "all_{mapped_type}", "{biotype}_{orientation}_TPM.txt"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        biotype=["alltypes"], orientation=ORIENTATIONS),
    expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "all_{mapped_type}", "{biotype}_{orientation}_RPK.txt"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS),
    expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(aligner, f"mapped_{genome}", "{counter}",
    #         "summaries", f"{{lib}}_{{rep}}_on_{genome}_{{orientation}}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(aligner, f"mapped_{genome}", "{counter}",
    #         f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, biotype=BIOTYPES, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(aligner, f"mapped_{genome}", "{counter}",
    #         f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, biotype=BIOTYPES, orientation=ORIENTATIONS),]
    # Just a test:
    # expand(
    #     OPJ(aligner, f"mapped_{genome}", "{counter}",
    #         "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], orientation=ORIENTATIONS),
    # expand(
    #     OPJ(aligner, f"mapped_{genome}", "{counter}",
    #         "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(aligner, f"mapped_{genome}", "{counter}",
    #         "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),]
    expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"unique_on_{genome}"], orientation=ORIENTATIONS),
    expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),
    expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),]

# Actually not necessarily IP data
# TODO: generalize contrast types
if contrasts_dict["ip"]:
    ip_fold_boxplots = [
        expand(
            OPJ(aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}",
                "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"),
            #counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
            counter=COUNTERS, mapped_type=[f"on_{genome}"],
            contrast_type=["ip"],
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
            fold_type=["mean_log2_RPM_fold"], id_list=ID_LISTS),]
else:
    ip_fold_boxplots = []

if contrasts_dict["de"]:
    de_fold_boxplots = [
        expand(
            OPJ(aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}",
                "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"),
            counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
            contrast_type=["de"],
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
            fold_type=["log2FoldChange"], id_list=ID_LISTS),]
else:
    de_fold_boxplots = []

de_files = []
if len(REPS) > 1:
    de_files += expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES)
    de_files += expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_{fold_type}_distribution.pdf"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
        fold_type=["log2FoldChange"])
    de_files += expand(
        OPJ(aligner, f"mapped_{genome}", "{counter}",
            "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_MA_with_{id_list}.pdf"),
        counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
        contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
        id_list=ID_LISTS + ["lfc_statuses"])
    de_files += de_fold_boxplots
else:
    warnings.warn("DESeq2 analyses will not be run because there are not enough replicates.\n")

bigwig_files = [
    # expand(
    #     OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}",
    #         "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"),
    #     lib=LIBS, rep=REPS, orientation=ORIENTATIONS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], norm_type=NORM_TYPES),
    # expand(
    #     OPJ(aligner, f"mapped_{genome}", "{lib}_mean",
    #         "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"),
    #     lib=LIBS, orientation=ORIENTATIONS,
    #     mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], norm_type=NORM_TYPES),]
    expand(
        OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}",
            "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"),
        lib=LIBS, rep=REPS, orientation=ORIENTATIONS,
        #mapped_type=[f"on_{genome}", f"unique_on_{genome}"], norm_type=NORM_TYPES),
        mapped_type=[f"on_{genome}"], norm_type=NORM_TYPES),
    expand(
        OPJ(aligner, f"mapped_{genome}", "{lib}_mean",
            "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"),
        lib=LIBS, orientation=ORIENTATIONS,
        #mapped_type=[f"on_{genome}", f"unique_on_{genome}"], norm_type=NORM_TYPES),]
        mapped_type=[f"on_{genome}"], norm_type=NORM_TYPES),]

if spikein_microliter_equivalent:
    spikein_files = [
        expand(
            OPJ(aligner, f"mapped_{genome}", "{counter}",
                f"all_on_{genome}", "{lib}_{rep}_spike_ins_{orientation}_TPM_response.pdf"),
            counter=COUNTERS, orientation=ORIENTATIONS, lib=LIBS, rep=REPS),]
else:
    spikein_files = []

replicates_comparisons = []
if len(REPS) > 1:
    ## TODO: debug:
    # Something to do with partial expand on quantification type?
    # Missing input files for rule compare_replicates:
    # results_44hph/hisat2/mapped_C_elegans/{counter}/all_{mapped_type}/{biotype}_{orientation}_RPM.txt
    # MissingInputException in line 1646 of /home/bli/src/bioinfo_utils/RNA-seq/RNA-seq.snakefile:
    # Missing input files for rule differential_expression:
    # results_44hph_vs_38hph/hisat2/mapped_C_elegans/{counter}/all_{mapped_type}/{biotype}_{orientation}_counts.txt
    # see https://bitbucket.org/snakemake/snakemake/issues/956/placeholders-not-properly-matched-with
    replicates_comparisons.extend(
        expand(
            OPJ(aligner, f"mapped_{genome}", "{counter}",
                "all_{mapped_type}", "replicates_comparison",
                "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"),
            counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
            lib=LIBS, biotype=["alltypes"],
            orientation=ORIENTATIONS, quantif_type=["TPM"]))
    replicates_comparisons.extend(
        expand(
            OPJ(aligner, f"mapped_{genome}", "{counter}",
                "all_{mapped_type}", "replicates_comparison",
                "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"),
            counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"],
            lib=LIBS, biotype=BIOTYPES + ["alltypes"],
            orientation=ORIENTATIONS, quantif_type=["counts", "RPM"]))
else:
    warnings.warn("Replicates comparisons will not be run because there are not enough replicates.\n")


rule all:
    input:
        specific_input(aligner),
        expand(
            OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
            lib=LIBS, rep=REPS),
        expand(
            OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"),
            lib=LIBS, rep=REPS),
        bigwig_files,
        spikein_files,
        # Uses too much memory when there are many libraries
        # expand(OPJ(aligner, f"mapped_{genome}", "{counter}",
        #     f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
        #     counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS, quantif_type=["TPM"]),
        # expand(OPJ(aligner, f"mapped_{genome}", "{counter}",
        #     f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
        #     counter=COUNTERS, biotype=PAIR_SCATTER_BIOTYPES, orientation=ORIENTATIONS, quantif_type=["counts", "RPK"]),
        expand(
            OPJ(aligner, f"mapped_{genome}", "{counter}",
                "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_correlations.pdf"),
            counter=COUNTERS, mapped_type=[f"on_{genome}"], orientation=ORIENTATIONS, biotype=BIOTYPES + ["alltypes"]),
        expand(
            OPJ(aligner, f"mapped_{genome}", "{counter}",
                "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_counts_distrib.pdf"),
            counter=COUNTERS, mapped_type=[f"on_{genome}"], orientation=ORIENTATIONS, biotype=BIOTYPES + ["alltypes"]),
        de_files,
        ip_fold_boxplots,
        counts_files,
        replicates_comparisons,


include: ensure_relative(irules["link_raw_data"], workflow.basedir)
#include: "../snakemake_wrappers/includes/link_raw_data.rules"


# def mapping_command(aligner):
#     """This function returns the shell commands to run given the *aligner*."""
#     if aligner == "hisat2":
#         shell_commands = """
# cmd="hisat2 -p {threads} --dta --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
# echo ${{cmd}} 1> {log.log}
# eval ${{cmd}} 1>> {log.log} 2> {log.err}
# """ % genome_db
#     elif aligner == "bowtie2":
#         shell_commands = """
# cmd="bowtie2 -p {threads} --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
# echo ${{cmd}} 1> {log.log}
# eval ${{cmd}} 1>> {log.log} 2> {log.err}
# """ % genome_db
#     else:
#         raise NotImplementedError("%s is not yet handled." % aligner)
#     return shell_commands

# Could this be useful?
# https://stackoverflow.com/a/5328139/1878788
#def debug_output():
#    exc_type, exc_value, tb = sys.exc_info()
#    if tb is not None:
#        print(tb.tb_frame.f_locals)
#    return ""

rule map_on_genome:
    input:
        fastq = OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
    output:
        # temp because it uses a lot of space
        sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)),
        #sam = temp(OPJ(aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s.sam" % genome)),
        nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
    params:
        aligner = aligner,
        index = genome_db,
        settings = alignment_settings[aligner],
    message:
        "Mapping {wildcards.lib}_{wildcards.rep} on %s." % genome
    log:
        log = OPJ(log_dir, "map_on_genome_{lib}_{rep}.log"),
        err = OPJ(log_dir, "map_on_genome_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    #shell:
    #    mapping_command(aligner)
    wrapper:
        f"file://{wrappers_dir[0]}/map_on_genome"


rule extract_nomap_polyU:
    f"""
    Extract from the non-mappers those that end with a polyU tail
    of length from {minU} to {maxU}."""
    input:
        nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
    output:
        nomap_polyU = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s_polyU.fastq.gz" % genome),
        nomap_polyU_stats = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s_polyU_stats.txt" % genome),
    run:
        stats = defaultdict(int)
        with gzopen(output.nomap_polyU, "wb") as fq_out:
            for (name, seq, qual) in fastx_read(input.nomap_fastq):
                # Find the number of terminal Ts
                trimmed = seq.rstrip("T")
                nb_T = len(seq) - len(trimmed)
                stats[nb_T] += 1
                if minU <= nb_T <= maxU:
                    # Write the trimmed read, truncating the quality string
                    # to the length of the trimmed sequence
                    fq_out.write(f"@{name}\n{trimmed}\n+\n{qual:.{len(trimmed)}}\n".encode("utf-8"))
        with open(output.nomap_polyU_stats, "w") as stats_file:
            tot = 0
            for (nb_T, stat) in sorted(stats.items()):
                stats_file.write(f"{nb_T}\t{stat}\n")
                tot += stat
            stats_file.write(f"tot\t{tot}\n")
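
# For instance (hypothetical read), seq "ACGTTT" with qual "IIIIII" has
# nb_T == 3; with minU, maxU == 1, 5 it is written out as "ACG" / "III".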


rule remap_on_genome:
    input:
        fastq = rules.extract_nomap_polyU.output.nomap_polyU,
    output:
        # temp because it might use a lot of space
        sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_polyU_on_%s.sam" % genome)),
        nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_polyU_unmapped_on_%s.fastq.gz" % genome),
    params:
        aligner = aligner,
        index = genome_db,
        settings = alignment_settings[aligner],
    message:
        "Remapping {wildcards.lib}_{wildcards.rep} polyU-trimmed unmapped reads on %s." % genome
    log:
        log = OPJ(log_dir, "remap_on_genome_{lib}_{rep}.log"),
        err = OPJ(log_dir, "remap_on_genome_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    #shell:
    #    mapping_command(aligner)
    wrapper:
        f"file://{wrappers_dir[0]}/map_on_genome"


def source_sam(wildcards):
    mapped_type = wildcards.mapped_type
    if mapped_type == f"on_{genome}":
        ## debug
        # Why doesn't rules.map_on_genome.output.sam get properly expanded?
        #return rules.map_on_genome.output.sam
        return OPJ(aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}_on_%s.sam" % genome)
        ##
    elif mapped_type == f"polyU_on_{genome}":
        return rules.remap_on_genome.output.sam
    elif mapped_type == f"unique_on_{genome}":
        raise NotImplementedError(f"{mapped_type} is only handled from the count level on.")
    else:
        raise NotImplementedError(f"{mapped_type} not handled (yet?)")


rule sam2indexedbam:
    input:
        #debug_wildcards,
        # sam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}.sam"),
        sam = source_sam,
    output:
        # sorted_bam = protected(OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam")),
        # index = protected(OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai")),
        sorted_bam = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam")),
        index = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam.bai")),
    message:
        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.mapped_type}."
    log:
        log = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}_{mapped_type}.log"),
        err = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}_{mapped_type}.err"),
    resources:
        mem_mb=4100,
        io=45
    threads:
        8
    wrapper:
        f"file://{wrappers_dir[0]}/sam2indexedbam"


rule compute_mapping_stats:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        #sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        stats = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_samtools_stats.txt"),
    shell:
        """samtools stats {input.sorted_bam} > {output.stats}"""


rule get_nb_mapped:
    """Extracts the number of mapped reads from samtools stats results."""
    input:
        stats = rules.compute_mapping_stats.output.stats,
        #stats = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
    output:
        nb_mapped = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_nb_mapped.txt"),
    shell:
        """samstats2mapped {input.stats} {output.nb_mapped}"""


rule compute_coverage:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        #sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        coverage = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_coverage.txt"),
    params:
        genomelen = genomelen,
    shell:
        """
        # Mean coverage: total number of aligned bases / genome length
        bases=$(samtools depth {input.sorted_bam} | awk '{{sum += $3}} END {{print sum}}') || error_exit "samtools depth failed"
        python3 -c "print(${{bases}} / {params.genomelen})" > {output.coverage}
        """


rule assemble_transcripts:
    input:
        sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        gtf = OPJ(aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"),
    params:
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Assembling transcripts for {wildcards.lib}_{wildcards.rep} with stringtie."
    log:
        log = OPJ(log_dir, "assemble_transcripts_{lib}_{rep}.log"),
        err = OPJ(log_dir, "assemble_transcripts_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    shell:
        """
        cmd="stringtie -p {threads} -G {params.annot} -o {output} -l {wildcards.lib}_{wildcards.rep} {input}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """



rule merge_transcripts:
    input:
        expand(OPJ(aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"), lib=LIBS, rep=REPS),
    output:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
    params:
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Merging transcripts with stringtie."
    log:
        log = OPJ(log_dir, "merge_transcripts.log"),
        err = OPJ(log_dir, "merge_transcripts.err")
    resources:
        io=5
    shell:
        """
        cmd="stringtie --merge -G {params.annot} -o {output} {input}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """


rule gffcompare:
    input:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
    output:
        stats = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare.stats"),
    params:
        compare = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare"),
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Comparing merged transcripts to the reference annotation with gffcompare."
    log:
        log = OPJ(log_dir, "gffcompare.log"),
        err = OPJ(log_dir, "gffcompare.err")
    resources:
        io=5
    shell:
        """
        cmd="gffcompare -r {params.annot} -o {params.compare} {input.merged}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """


rule quantify_transcripts:
    input:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
        sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        gtf = OPJ(aligner, f"mapped_{genome}", "ballgown", f"{{lib}}_{{rep}}_on_{genome}.gtf"),
    message:
        "Quantifying transcripts for {wildcards.lib}_{wildcards.rep} with stringtie."
    log:
        OPJ(log_dir, "quantify_transcripts_{lib}_{rep}.log")
    threads:
        4
    resources:
        io=5
    shell:
        """
        cmd="stringtie -e -B -p {threads} -G {input.merged} -o {output} {input.sorted_bam}"
        echo ${{cmd}} 1> {log}
        eval ${{cmd}} 1>> {log}
        """


# def htseq_orientation2stranded(wildcards):
#     orientation = wildcards.orientation
#     if orientation == "fwd":
#         if LIB_TYPE[-2:] == "SF":
#             return "yes"
#         elif LIB_TYPE[-2:] == "SR":
#             return "reverse"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "rev":
#         if LIB_TYPE[-2:] == "SF":
#             return "reverse"
#         elif LIB_TYPE[-2:] == "SR":
#             return "yes"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "all":
#         return "no"
#     else:
#         exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")


def biotype2annot(wildcards):
    # "<biotype>_rmsk_families" uses the same annotation file as
    # "<biotype>_rmsk": strip the "_families" suffix (9 characters)
    if wildcards.biotype.endswith("_rmsk_families"):
        biotype = wildcards.biotype[:-9]
    else:
        biotype = wildcards.biotype
    return OPJ(annot_dir, f"{biotype}.gtf")
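
# For instance, biotype "DNA_transposons_rmsk_families" uses the annotation
# file of "DNA_transposons_rmsk": OPJ(annot_dir, "DNA_transposons_rmsk.gtf").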


# rule htseq_count_reads:
#     input:
#         sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
#         bai = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"),
#     output:
#         counts = OPJ(aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
#         counts_converted = OPJ(aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
#     params:
#         stranded = htseq_orientation2stranded,
#         mode = "union",
#         annot = biotype2annot,
#     message:
#         "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with htseq-count."
#     benchmark:
#         OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
#     log:
#         log = OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
#         err = OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
#     wrapper:
#         f"file://{wrappers_dir[0]}/htseq_count_reads"


def source_sorted_bam(wildcards):
    # "unique_on_<genome>" is only handled at the counting step, so it uses
    # the same sorted bam as "on_<genome>": strip the "unique_" prefix
    # (7 characters)
    mapped_type = wildcards.mapped_type
    if mapped_type.startswith("unique_"):
        real_mapped_type = mapped_type[7:]
    else:
        real_mapped_type = mapped_type
    return OPJ(
        aligner, f"mapped_{genome}",
        f"{wildcards.lib}_{wildcards.rep}_{real_mapped_type}_sorted.bam")


def source_index(wildcards):
    mapped_type = wildcards.mapped_type
    if mapped_type.startswith("unique_"):
        real_mapped_type = mapped_type[7:]
    else:
        real_mapped_type = mapped_type
    return OPJ(
        aligner, f"mapped_{genome}",
        f"{wildcards.lib}_{wildcards.rep}_{real_mapped_type}_sorted.bam.bai")
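
# For instance (hypothetical wildcards lib="WT", rep="1"), both
# mapped_type f"unique_on_{genome}" and f"on_{genome}" resolve to
# <aligner>/mapped_<genome>/WT_1_on_<genome>_sorted.bam (and its .bai index).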


rule feature_count_reads:
    input:
        sorted_bam = source_sorted_bam,
        index = source_index,
        #sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        #index = rules.sam2indexedbam.output.index,
    output:
        counts = OPJ(aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ(aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
    #wildcard_constraints:
    #    mapped_type = "|".join([f"on_{genome}", f"polyU_on_{genome}"])
    params:
        min_mapq = partial(aligner2min_mapq, aligner),
        stranded = feature_orientation2stranded(LIB_TYPE),
        annot = biotype2annot,
        # pickled dictionary that associates gene ids to gene names
        converter = genome_dict["converter"]
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} {wildcards.mapped_type} with featureCounts."
    benchmark:
        OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}_benchmark.txt")
    log:
        log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.log"),
        err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.err"),
    shell:
        """
        tmpdir=$(TMPDIR=/var/tmp mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.mapped_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
        cmd="featureCounts {params.min_mapq} -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
        featureCounts -v 2> {log.log}
        echo ${{cmd}} 1>> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
        """


# rule feature_count_unique_reads:
#     input:
#         sorted_bam = source_sorted_bam,
#         index = source_index,
#         #sorted_bam = rules.sam2indexedbam.output.sorted_bam,
#         #index = rules.sam2indexedbam.output.index,
#     output:
#         counts = OPJ(aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
#         counts_converted = OPJ(aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
#     #wildcard_constraints:
#     #    mapped_type = f"unique_on_{genome}"
#     params:
#         min_mapq = partial(aligner2min_mapq, aligner),
#         stranded = feature_orientation2stranded(LIB_TYPE),
#         annot = biotype2annot,
#         # pickled dictionary that associates gene ids to gene names
#         converter = genome_dict["converter"]
#     message:
#         "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} {wildcards.mapped_type} with featureCounts."
#     benchmark:
#         OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}_benchmark.txt")
#     log:
#         log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.log"),
#         err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.err"),
#     shell:
#         """
#         tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.mapped_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
#         cmd="featureCounts {params.min_mapq} -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
#         featureCounts -v 2> {log.log}
#         echo ${{cmd}} 1>> {log.log}
#         eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
#         rm -rf ${{tmpdir}}
#         cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
#         """


# def biotype2merged(wildcards):
#     if wildcards.biotype.endswith("_rmsk_families"):
#         biotype = wildcards.biotype[:-9]
#     else:
#         biotype = wildcards.biotype
#     return OPJ(annot_dir, f"{biotype}_merged.bed")
#
#
# def intersect_orientation2stranded(wildcards):
#     orientation = wildcards.orientation
#     if orientation == "fwd":
#         if LIB_TYPE[-2:] == "SF":
#             return "-s"
#         elif LIB_TYPE[-2:] == "SR":
#             return "-S"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "rev":
#         if LIB_TYPE[-2:] == "SF":
#             return "-S"
#         elif LIB_TYPE[-2:] == "SR":
#             return "-s"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "all":
#         return ""
#     else:
#         exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")
#
#
# rule intersect_count_reads:
#     input:
#         sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
#         bai = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"),
#     output:
#         counts = OPJ(aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
#         counts_converted = OPJ(aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
#     params:
#         stranded = intersect_orientation2stranded,
#         annot = biotype2merged,
#     message:
#         "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with bedtools intersect."
#     benchmark:
#         OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
#     log:
#         log = OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
#         err = OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
#     threads: 4  # to limit memory usage, actually
#     wrapper:
#         f"file://{wrappers_dir[0]}/intersect_count_reads"


rule summarize_counts:
    """For a given library, write a summary of the read counts for the various biotypes."""
    input:
        biotype_counts_files = expand(OPJ(aligner, f"mapped_{genome}", "{{counter}}", "{{lib}}_{{rep}}_{{mapped_type}}", "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES),
    output:
        summary = OPJ(aligner, f"mapped_{genome}", "{counter}", "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt")
    run:
        if wildcards.counter == "htseq_count":
            sum_counter = sum_htseq_counts
        elif wildcards.counter == "intersect_count":
            sum_counter = sum_intersect_counts