"""
Snakefile to analyse RNA-seq data.
"""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")

# TODO: Extract reads with a poly-T tail (up to 5 nt) from the unmapped reads, trim the tail, and remap them (like siu RNA)

# TODO: plot spike-in vs spike-in between libraries to detect abnormal spike-ins: this should give a straight line

# TODO: Add rules to take into account spike-ins.
# The size factor should be determined within the dynamic range of the spike-ins
# Use 1 RPKM for LDD
# See /Genomes/spike-ins/ERCC_RNA_Spike-In_Control_Mixes.pdf pp. 13-14
# Then use the size factor to compute normalized RPKM, and then fold changes, as for small RNA IP

# TODO: same boxplots as in small_RNA-seq
# TODO: for each contrast, scatterplots with highlighting of differentially expressed genes (with a p-value colour code)
# and the same scale for different biotypes
# boxplots of categories of genes (only protein-coding)
# Prepare scripts to compare with small RNA (fold vs fold)
# TODO: try to use bedtools intersect to do the counting (only reads strictly included in the feature)
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ mkfifo /tmp/gene_id
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ mkfifo /tmp/count
# bli@naples:/extra2/Documents/Mhe/bli/RNA_Seq_analyses$ head -6 /tmp/WT_RT_1_protein_coding_counts_F1S.txt | tee >(cut -f9 | gtf2attr gene_id > /tmp/gene_id) | cut -f7,10 > /tmp/count & paste /tmp/gene_id /tmp/count
# "WBGene00022277"	-	170
# "WBGene00022276"	+	551
# "WBGene00022276"	+	522
# "WBGene00022276"	+	550
# "WBGene00022276"	+	550
# "WBGene00022276"	+	550


import os
OPJ = os.path.join
from glob import glob
from pickle import load
from collections import defaultdict
from itertools import combinations
from gzip import open as gzopen

import warnings


def formatwarning(message, category, filename, lineno, line):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning

import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
mpl.use("PDF")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# https://stackoverflow.com/a/42768093/1878788
# from matplotlib.backends.backend_pgf import FigureCanvasPgf
# mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)
# TEX_PARAMS = {
#     "text.usetex": True,            # use LaTeX to write all text
#     "pgf.rcfonts": False,           # Ignore Matplotlibrc
#     "pgf.texsystem": "lualatex",  # hoping to avoid memory issues
#     "pgf.preamble": [
#         r'\usepackage{color}'     # xcolor for colours
#     ]
# }
# mpl.rcParams.update(TEX_PARAMS)
mpl.rcParams["font.sans-serif"] = [
    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams["font.family"] = "sans-serif"

import numpy as np
import pandas as pd
# To catch errors when plotting KDE
from scipy.linalg import LinAlgError
from scipy.stats import pearsonr, shapiro
from sklearn.linear_model import LinearRegression
import seaborn as sns

# To merge bigwig files
import pyBigWig
from mappy import fastx_read

from rpy2.robjects import Formula, StrVector
from libhts import do_deseq2, status_setter, plot_lfc_distribution, plot_MA
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import (
    plot_paired_scatters,
    plot_norm_correlations,
    plot_counts_distribution,
    plot_boxplots)
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_intersect_counts, sum_intersect_counts
from libworkflows import read_feature_counts, sum_feature_counts
from smincludes import rules as irules

alignment_settings = {"bowtie2": "", "hisat2": "", "crac": "-k 20 --stranded --use-x-in-cigar"}

# Possible feature ID conversions
ID_TYPES = ["name", "cosmid"]

# TODO: make this part of the configuration
LFC_RANGE = {
    "spike_ins" : (-10, 10),
    "protein_coding" : (-10, 10),
    "DNA_transposons_rmsk" : (-10, 10),
    "RNA_transposons_rmsk" : (-10, 10),
    "satellites_rmsk" : (-10, 10),
    "simple_repeats_rmsk" : (-10, 10),
    "DNA_transposons_rmsk_families" : (-10, 10),
    "RNA_transposons_rmsk_families" : (-10, 10),
    "simple_repeats_rmsk_families" : (-10, 10),
    "pseudogene" : (-10, 10),
    "alltypes" : (-10, 10)}

121
122
123
124
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
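# A sketch of the resulting status lists (derived directly from the two
# comprehensions above, nothing here is configurable):
# UP_STATUSES == ["up0.5", "up1", "up2"]
# DOWN_STATUSES == ["down0.5", "down1", "down2"]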
#status2colour = make_status2colour(DOWN_STATUSES, UP_STATUSES)
#STATUSES = list(reversed(DOWN_STATUSES)) + ["down", "NS", "up"] + UP_STATUSES
#STATUS2COLOUR = dict(zip(STATUSES, sns.color_palette("coolwarm", len(STATUSES))))

# Or use --configfile instead
#configfile:
#    "RNA-seq.config.json"


spikein_dict = config["spikein_dict"]
spikein_microliters = spikein_dict["microliters"]
spikein_dilution_factor = spikein_dict["dilution_factor"]
# To get the expected number of attomoles of a given spike-in,
# multiply this by the value found in column 4 of Genomes/spike-ins/ERCC92_concentrations.txt
spikein_microliter_equivalent = spikein_microliters / spikein_dilution_factor
if spikein_microliter_equivalent:
    spikein_concentrations = pd.read_table(
        spikein_dict["concentrations"],
        index_col="ERCC ID",
        usecols=["ERCC ID", "concentration in Mix %s (attomoles/ul)" % spikein_dict["mix"]])
    from scipy.constants import Avogadro
    # attomole: 10^{-18} mole
    molecules_per_attomole = Avogadro / 10**18
    spikein_expected_counts = spikein_concentrations * spikein_microliter_equivalent * molecules_per_attomole
    spikein_expected_counts.columns = ["expected_counts"]
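# Worked example of the expected-count computation above (hypothetical
# config values, not taken from an actual run): with microliters=1 and
# dilution_factor=100, spikein_microliter_equivalent is 0.01. A spike-in
# listed at 1000 attomoles/ul in the chosen mix is then expected to yield
#     1000 * 0.01 * (6.022e23 / 1e18) ~= 6.0e6 molecules.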

# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
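# Only the last two characters of LIB_TYPE are used downstream (see the
# *_orientation2stranded functions): "SF" means reads come from the same
# strand as the feature, "SR" from the opposite strand. For example, a
# dUTP-based protocol would typically be declared "ISR" (an assumption
# based on the sailfish documentation linked above, not on this config).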
ORIENTATIONS = config["orientations"]
# key: library name
# value: dictionary
#   key: replicate number
#   value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
REPS = config["replicates"]
#COND_PAIRS = config["cond_pairs"]
#CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in COND_PAIRS]
DE_COND_PAIRS = config["de_cond_pairs"]
msg = "\n".join([
    "Some contrasts do not use known library names.",
    "Contrasts: " + ", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg
IP_COND_PAIRS = config["ip_cond_pairs"]
msg = "\n".join([
    "Some contrasts do not use known library names.",
    "Contrasts: " + ", ".join([f"({cond}, {ref})" for (cond, ref) in IP_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in IP_COND_PAIRS]), msg

COND_PAIRS = DE_COND_PAIRS + IP_COND_PAIRS
DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS]
IP_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in IP_COND_PAIRS]
contrasts_dict = {"de" : DE_CONTRASTS, "ip" : IP_CONTRASTS}
CONTRASTS = DE_CONTRASTS + IP_CONTRASTS
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
CONDITIONS = [{
    "lib" : lib,
    "rep" : rep} for rep in REPS for lib in LIBS]

# We use this in various places so that the library order is always the same:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
    cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
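# Sketch of the resulting structures, with hypothetical libraries ["WT",
# "mut"] and replicates ["1", "2"] (rep varies in the outer loop):
# COND_NAMES == ["WT_1", "mut_1", "WT_2", "mut_2"]
# COND_COLUMNS has one row per condition name, with "lib" and "rep" columns.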


BIOTYPES = config["biotypes"]
DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in BIOTYPES]
if "spike_ins" in BIOTYPES:
    PAIR_SCATTER_BIOTYPES = ["spike_ins"]
else:
    PAIR_SCATTER_BIOTYPES = []
# TODO: make this configurable
ID_LISTS = [
    "germline_specific", "histone",
    "csr1_prot_si_supertargets_48hph",
    "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"]
aligner = config["aligner"]
########################
# Genome configuration #
########################
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# bed file binning the genome in 10nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"]
gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
#COUNTERS = config["counters"]
# htseq_count is very slow
# bedtools intersect eats all the RAM when there are a lot of rRNAs:
# https://github.com/arq5x/bedtools2/issues/565
COUNTERS = ["feature_count"]
output_dir = config["output_dir"]
transcriptomes_dir = OPJ(output_dir, "merged_transcriptomes")
log_dir = OPJ(output_dir, f"logs_{genome}")
data_dir = OPJ(output_dir, "data")

#NORM_TYPES = config["norm_types"]
if spikein_microliter_equivalent:
    SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref", "spike_ins"]
    NORM_TYPES = ["spike_ins", "protein_coding", "median_ratio_to_pseudo_ref"]
else:
    SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref"]
    NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"]
    assert "spike_ins" not in BIOTYPES
assert set(SIZE_FACTORS).issubset(set(BIOTYPES) | {"median_ratio_to_pseudo_ref"})
assert set(NORM_TYPES).issubset(set(SIZE_FACTORS))

# PolyU tail min and max length
minU, maxU = 1, 5

# Seems to interfere with temp files deletion
wildcard_constraints:
    rep=r"\d+",
    orientation="|".join(ORIENTATIONS),
    biotype="|".join(BIOTYPES + ["alltypes"]),
    fold_type="|".join(["mean_log2_RPM_fold", "log2FoldChange", "lfcMLE"]),

#ruleorder: join_all_counts > gather_counts

# Define functions to be used in shell portions
shell.prefix(SHELL_FUNCTIONS)


def specific_input(aligner):
    if aligner == "hisat2":
        files = [
            expand(OPJ(
                output_dir, aligner, f"mapped_{genome}",
                "ballgown", "{lib}_{rep}_on_%s.gtf" % genome), lib=LIBS, rep=REPS),
            OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare.stats"),
            OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),]
    elif aligner == "bowtie2":
        files = []
    elif aligner == "crac":
        files = []
    else:
        raise NotImplementedError("%s is not yet handled." % aligner)
    return files

counts_files = [
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "{lib}_mean_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
        lib=LIBS, counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"),
        counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_RPK.txt"),
        counter=COUNTERS, biotype=BIOTYPES + ["alltypes"], orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
        counter=COUNTERS, biotype=BIOTYPES, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         "summaries", f"{{lib}}_{{rep}}_on_{genome}_{{orientation}}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, biotype=BIOTYPES, orientation=ORIENTATIONS),
    # expand(
    #     OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
    #         f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
    #     counter=COUNTERS, lib=LIBS, rep=REPS, biotype=BIOTYPES, orientation=ORIENTATIONS),]
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "{lib}_{rep}_{mapped_type}", "{biotype}_{orientation}_counts.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "{lib}_{rep}_{mapped_type}", "{biotype}_{orientation}_counts_gene_names.txt"),
        counter=COUNTERS, lib=LIBS, rep=REPS,
        mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),]

# Actually not necessarily IP data
# TODO: generalize contrast types
if contrasts_dict["ip"]:
    ip_fold_boxplots = [
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "fold_boxplots",
                "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"),
            counter=COUNTERS, contrast_type=["ip"], orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
            fold_type=["mean_log2_RPM_fold"], id_list=ID_LISTS),]
else:
    ip_fold_boxplots = []

if contrasts_dict["de"]:
    de_fold_boxplots = [
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "fold_boxplots",
                "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"),
            counter=COUNTERS, contrast_type=["de"], orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
            fold_type=["log2FoldChange"], id_list=ID_LISTS),]
else:
    de_fold_boxplots = []

de_files = [
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"),
        counter=COUNTERS, contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "deseq2", "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"),
        counter=COUNTERS, contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
        fold_type=["log2FoldChange"]),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "deseq2", "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"),
        counter=COUNTERS, contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES,
        id_list=ID_LISTS + ["lfc_statuses"]),
    de_fold_boxplots]

bigwig_files = [
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}",
            "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"),
        lib=LIBS, rep=REPS, orientation=ORIENTATIONS,
        mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], norm_type=NORM_TYPES),
    expand(
        OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean",
            "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"),
        lib=LIBS, orientation=ORIENTATIONS,
        mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], norm_type=NORM_TYPES),]

if spikein_microliter_equivalent:
    spikein_files = [
        expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{lib}_{rep}_spike_ins_{orientation}_TPM_response.pdf"),
            counter=COUNTERS, orientation=ORIENTATIONS, lib=LIBS, rep=REPS),]
else:
    spikein_files = []


rule all:
    input:
        specific_input(aligner),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
            lib=LIBS, rep=REPS),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"),
            lib=LIBS, rep=REPS),
        bigwig_files,
        spikein_files,
        # Uses too much memory when there are many libraries
        # expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
        #     f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
        #     counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS, quantif_type=["TPM"]),
        # expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
        #     f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"),
        #     counter=COUNTERS, biotype=PAIR_SCATTER_BIOTYPES, orientation=ORIENTATIONS, quantif_type=["counts", "RPK"]),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
                "summaries", f"all_on_{genome}_{{orientation}}_{{biotype}}_norm_correlations.pdf"),
            counter=COUNTERS, orientation=ORIENTATIONS, biotype=BIOTYPES + ["alltypes"]),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
                "summaries", f"all_on_{genome}_{{orientation}}_{{biotype}}_norm_counts_distrib.pdf"),
            counter=COUNTERS, orientation=ORIENTATIONS, biotype=BIOTYPES + ["alltypes"]),
        de_files,
        ip_fold_boxplots,
        counts_files,
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
                f"all_on_{genome}", "replicates_comparison",
                "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"),
            counter=COUNTERS, lib=LIBS, biotype=["alltypes"],
            orientation=ORIENTATIONS, quantif_type=["TPM"]),
        expand(
            OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
                f"all_on_{genome}", "replicates_comparison",
                "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"),
            counter=COUNTERS, lib=LIBS, biotype=BIOTYPES + ["alltypes"],
            orientation=ORIENTATIONS, quantif_type=["counts", "RPM"]),


include: ensure_relative(irules["link_raw_data"], workflow.basedir)
#include: "../snakemake_wrappers/includes/link_raw_data.rules"


# def mapping_command(aligner):
#     """This function returns the shell commands to run given the *aligner*."""
#     if aligner == "hisat2":
#         shell_commands = """
# cmd="hisat2 -p {threads} --dta --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
# echo ${{cmd}} 1> {log.log}
# eval ${{cmd}} 1>> {log.log} 2> {log.err}
# """ % genome_db
#     elif aligner == "bowtie2":
#         shell_commands = """
# cmd="bowtie2 -p {threads} --seed 123 -t --mm -x %s -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
# echo ${{cmd}} 1> {log.log}
# eval ${{cmd}} 1>> {log.log} 2> {log.err}
# """ % genome_db
#     else:
#         raise NotImplementedError("%s is not yet handled." % aligner)
#     return shell_commands


rule map_on_genome:
    input:
        fastq = OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
    output:
        # temp because it uses a lot of space
        sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}.sam")),
        nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", f"{{lib}}_{{rep}}_unmapped_on_{genome}.fastq.gz"),
    params:
        aligner = aligner,
        index = genome_db,
        settings = alignment_settings[aligner],
    message:
        f"Mapping {{wildcards.lib}}_{{wildcards.rep}} on {genome}."
    log:
        log = OPJ(log_dir, "map_on_genome_{lib}_{rep}.log"),
        err = OPJ(log_dir, "map_on_genome_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    #shell:
    #    mapping_command(aligner)
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"


rule extract_nomap_polyU:
    f"""
    Extract from the non-mappers those that end with a polyU tail
    of length from {minU} to {maxU}."""
    input:
        nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", f"{{lib}}_{{rep}}_unmapped_on_{genome}.fastq.gz"),
    output:
        nomap_polyU = OPJ(output_dir, aligner, f"not_mapped_{genome}", f"{{lib}}_{{rep}}_unmapped_on_{genome}_polyU.fastq.gz"),
        nomap_polyU_stats = OPJ(output_dir, aligner, f"not_mapped_{genome}", f"{{lib}}_{{rep}}_unmapped_on_{genome}_polyU_stats.txt"),
    run:
        stats = defaultdict(int)
        with gzopen(output.nomap_polyU, "wb") as fq_out:
            for (name, seq, qual) in fastx_read(input.nomap_fastq):
                # Find the number of trailing Ts
                trimmed = seq.rstrip("T")
                nb_T = len(seq) - len(trimmed)
                stats[nb_T] += 1
                if minU <= nb_T <= maxU:
                    # Truncate the quality string to the trimmed length
                    fq_out.write(f"@{name}\n{trimmed}\n+\n{qual:.{len(trimmed)}}\n".encode("utf-8"))
        with open(output.nomap_polyU_stats, "w") as stats_file:
            tot = 0
            for (nb_T, stat) in sorted(stats.items()):
                stats_file.write(f"{nb_T}\t{stat}\n")
                tot += stat
            stats_file.write(f"tot\t{tot}\n")


rule remap_on_genome:
    input:
        fastq = rules.extract_nomap_polyU.output.nomap_polyU,
    output:
        # temp because it might use a lot of space
        sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_polyU_on_{genome}.sam")),
        nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", f"{{lib}}_{{rep}}_polyU_unmapped_on_{genome}.fastq.gz"),
    params:
        aligner = aligner,
        index = genome_db,
        settings = alignment_settings[aligner],
    message:
        f"Remapping {{wildcards.lib}}_{{wildcards.rep}} polyU-trimmed unmapped reads on {genome}."
    log:
        log = OPJ(log_dir, "remap_on_genome_{lib}_{rep}.log"),
        err = OPJ(log_dir, "remap_on_genome_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    #shell:
    #    mapping_command(aligner)
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"


def source_sam(wildcards):
    mapped_type = wildcards.mapped_type
    if mapped_type == f"on_{genome}":
        return rules.map_on_genome.output.sam
    elif mapped_type == f"polyU_on_{genome}":
        return rules.remap_on_genome.output.sam
    else:
        raise NotImplementedError(f"{mapped_type} not handled (yet?)")


rule sam2indexedbam:
    input:
        # sam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}.sam"),
        sam = source_sam,
    output:
        # sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam")),
        # index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai")),
        sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam")),
        index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam.bai")),
    message:
        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}."
    log:
        log = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.log"),
        err = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.err"),
    resources:
        io=45
    threads:
        8
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"


rule compute_mapping_stats:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        stats = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_samtools_stats.txt"),
    shell:
        """samtools stats {input.sorted_bam} > {output.stats}"""


rule get_nb_mapped:
    """Extracts the number of mapped reads from samtools stats results."""
    input:
        stats = rules.compute_mapping_stats.output.stats,
        #stats = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"),
    output:
        nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_nb_mapped.txt"),
    shell:
        """samstats2mapped {input.stats} {output.nb_mapped}"""


rule compute_coverage:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        coverage = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_coverage.txt"),
    params:
        genomelen = genomelen,
    shell:
        """
        bases=$(samtools depth {input.sorted_bam} | awk '{{sum += $3}} END {{print sum}}') || error_exit "samtools depth failed"
        python3 -c "print(${{bases}} / {params.genomelen})" > {output.coverage}
        """


rule assemble_transcripts:
    input:
        sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        gtf = OPJ(output_dir, aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"),
    params:
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Assembling transcripts for {wildcards.lib}_{wildcards.rep} with stringtie."
    log:
        log = OPJ(log_dir, "assemble_transcripts_{lib}_{rep}.log"),
        err = OPJ(log_dir, "assemble_transcripts_{lib}_{rep}.err")
    resources:
        io=5
    threads:
        4
    shell:
        """
        cmd="stringtie -p {threads} -G {params.annot} -o {output} -l {wildcards.lib}_{wildcards.rep} {input}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """


rule merge_transcripts:
    input:
        expand(OPJ(output_dir, aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"), lib=LIBS, rep=REPS),
    output:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
    params:
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Merging transcripts with stringtie."
    log:
        log = OPJ(log_dir, "merge_transcripts.log"),
        err = OPJ(log_dir, "merge_transcripts.err")
    resources:
        io=5
    shell:
        """
        cmd="stringtie --merge -G {params.annot} -o {output} {input}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """


rule gffcompare:
    input:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
    output:
        stats = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare.stats"),
    params:
        compare = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare"),
        annot = OPJ(annot_dir, "genes.gtf"),
    message:
        "Comparing merged transcripts to the reference annotation with gffcompare."
    log:
        log = OPJ(log_dir, "gffcompare.log"),
        err = OPJ(log_dir, "gffcompare.err")
    resources:
        io=5
    shell:
        """
        cmd="gffcompare -r {params.annot} -o {params.compare} {input.merged}"
        echo ${{cmd}} 1> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err}
        """


rule quantify_transcripts:
    input:
        merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"),
        sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
    output:
        gtf = OPJ(output_dir, aligner, f"mapped_{genome}", "ballgown", f"{{lib}}_{{rep}}_on_{genome}.gtf"),
    message:
        "Quantifying transcripts for {wildcards.lib}_{wildcards.rep} with stringtie."
    log:
        OPJ(log_dir, "quantify_transcripts_{lib}_{rep}.log")
    threads:
        4
    resources:
        io=5
    shell:
        """
        cmd="stringtie -e -B -p {threads} -G {input.merged} -o {output} {input.sorted_bam}"
        echo ${{cmd}} 1> {log}
        eval ${{cmd}} 1>> {log}
        """


# def htseq_orientation2stranded(wildcards):
#     orientation = wildcards.orientation
#     if orientation == "fwd":
#         if LIB_TYPE[-2:] == "SF":
#             return "yes"
#         elif LIB_TYPE[-2:] == "SR":
#             return "reverse"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "rev":
#         if LIB_TYPE[-2:] == "SF":
#             return "reverse"
#         elif LIB_TYPE[-2:] == "SR":
#             return "yes"
#         else:
#             raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
#     elif orientation == "all":
#         return "no"
#     else:
#         exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")


def biotype2annot(wildcards):
    if wildcards.biotype.endswith("_rmsk_families"):
        biotype = wildcards.biotype[:-9]
    else:
        biotype = wildcards.biotype
    return OPJ(annot_dir, f"{biotype}.gtf")


# rule htseq_count_reads:
#     input:
#         sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
#         bai = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"),
#     output:
#         counts = OPJ(output_dir, aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
#         counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
#     params:
#         stranded = htseq_orientation2stranded,
#         mode = "union",
#         annot = biotype2annot,
#     message:
#         "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with htseq-count."
#     benchmark:
#         OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
#     log:
#         log = OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
#         err = OPJ(log_dir, "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
#     wrapper:
#         "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/htseq_count_reads"


rule feature_count_reads:
    input:
        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
        index = rules.sam2indexedbam.output.index,
    output:
        counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{biotype}_{orientation}_counts_gene_names.txt"),
    params:
        stranded = feature_orientation2stranded(LIB_TYPE),
        annot = biotype2annot,
        # pickled dictionary that associates gene ids to gene names
        converter = genome_dict["converter"]
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} {wildcards.mapped_type} with featureCounts."
    benchmark:
        OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}_benchmark.txt")
    log:
        log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.log"),
        err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{mapped_type}_{biotype}_{orientation}.err"),
    shell:
        """
        tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.mapped_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
        cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
        featureCounts -v 2> {log.log}
        echo ${{cmd}} 1>> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
        """


def biotype2merged(wildcards):
    if wildcards.biotype.endswith("_rmsk_families"):
        biotype = wildcards.biotype[:-9]
    else:
        biotype = wildcards.biotype
    return OPJ(annot_dir, f"{biotype}_merged.bed")


def intersect_orientation2stranded(wildcards):
    orientation = wildcards.orientation
    if orientation == "fwd":
        if LIB_TYPE[-2:] == "SF":
            return "-s"
        elif LIB_TYPE[-2:] == "SR":
            return "-S"
        else:
            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
    elif orientation == "rev":
        if LIB_TYPE[-2:] == "SF":
            return "-S"
        elif LIB_TYPE[-2:] == "SR":
            return "-s"
        else:
            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
    elif orientation == "all":
        return ""
    else:
        exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")


rule intersect_count_reads:
    input:
        sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"),
        bai = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"),
    output:
        counts = OPJ(output_dir, aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
    params:
        stranded = intersect_orientation2stranded,
        annot = biotype2merged,
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with bedtools intersect."
    benchmark:
        OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
    log:
        log = OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
        err = OPJ(log_dir, "intersect_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
    threads: 4  # to limit memory usage, actually
    wrapper:
        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/intersect_count_reads"


rule summarize_counts:
    """For a given library, write a summary of the read counts for the various biotypes."""
    input:
        biotype_counts_files = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", "{{lib}}_{{rep}}_{{mapped_type}}", "{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES),
    output:
        summary = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt")
    run:
        if wildcards.counter == "htseq_count":
            sum_counter = sum_htseq_counts
        elif wildcards.counter == "intersect_count":
            sum_counter = sum_intersect_counts
        elif wildcards.counter == "feature_count":
            sum_counter = sum_feature_counts
        else:
            raise NotImplementedError(f"{wildcards.counter} not handled (yet?)")
        with open(output.summary, "w") as summary_file:
            header = "\t".join(BIOTYPES)
            summary_file.write("%s\n" % header)
            sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
            summary_file.write("%s\n" % sums)


rule gather_read_counts_summaries:
    """Gather read count summaries across libraries: lib_rep -> all."""
    input:
        #summary_tables = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", "summaries", f"{{lib}}_{{rep}}_on_{genome}_{{{orientation}}}_counts.txt"), lib=LIBS, rep=REPS),
        summary_tables = expand(OPJ(
            output_dir, aligner, f"mapped_{genome}", "{{counter}}", "summaries",
            "{name}_on_%s_{{orientation}}_counts.txt" % genome), name=COND_NAMES),
    output:
        summary_table = OPJ(
            output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries",
            f"all_on_{genome}_{{orientation}}_counts.txt"),
    run:
        #summary_files = (OPJ(
        summary_files = [OPJ(
            output_dir, aligner, f"mapped_{genome}", wildcards.counter, "summaries",
            f"{{name}}_on_{genome}_{{orientation}}_counts.txt".format(
                name=cond_name,
                orientation=wildcards.orientation)) for cond_name in COND_NAMES]
                #orientation=wildcards.orientation)) for cond_name in COND_NAMES)
        try:
            summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)
        except ValueError:
            warnings.warn("Some data may be missing. Using floats instead of ints.\n")
            summaries = pd.concat((pd.read_table(summary_file).T.astype(float) for summary_file in summary_files), axis=1)
        summaries.columns = COND_NAMES
        summaries.to_csv(output.summary_table, na_rep="NA", sep="\t")


rule gather_counts:
    """For a given biotype, gather counts from all libraries in one table."""
    input:
        counts_tables = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", f"{{lib}}_{{rep}}_on_{genome}", "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
    output:
        counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
    wildcard_constraints:
        # Avoid ambiguity with join_all_counts
        biotype = "|".join(BIOTYPES)
    run:
        # Gathering the counts data
        ############################
        #counts_files = (OPJ(
        counts_files = [OPJ(
            output_dir,
            aligner,
            f"mapped_{genome}",
            wildcards.counter,
            f"{cond_name}_on_{genome}",
            "{biotype}_{orientation}_counts.txt".format(biotype=wildcards.biotype, orientation=wildcards.orientation)) for cond_name in COND_NAMES]
            #"{biotype}_{orientation}_counts.txt".format(biotype=wildcards.biotype, orientation=wildcards.orientation)) for cond_name in COND_NAMES)
        if wildcards.counter == "htseq_count":
            counts_data = pd.concat(
                map(read_htseq_counts, counts_files),
                axis=1).fillna(0).astype(int)
        elif wildcards.counter == "intersect_count":
            counts_data = pd.concat(
                map(read_intersect_counts, counts_files),
                axis=1).fillna(0).astype(int)
        elif wildcards.counter == "feature_count":
            counts_data = pd.concat(
                map(read_feature_counts, counts_files),
                axis=1).fillna(0).astype(int)
        else:
            raise NotImplementedError(f"{wildcards.counter} not handled (yet?)")
        counts_data.columns = COND_NAMES
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
        # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
        if wildcards.biotype.endswith("_rmsk_families"):
            repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
            # Sum the counts for a given repeat family
            counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")


# TODO: plot log(pseudocounts) vs log(concentration) as in figs sup8 and sup9 of Risso et al. 2014
# Colour-code the spike-in length
# Also do the plot with RPK instead of pseudocounts
rule compute_RPK:
    """For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
    input:
        # TODO: why does wildcards seem to be None here?
        #counts_data = rules.gather_counts.output.counts_table,
        counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
    output:
        rpk_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_RPK.txt"),
    params:
        feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
    run:
        counts_data = pd.read_table(input.counts_data, index_col="gene")
        feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene")
        common = counts_data.index.intersection(feature_lengths.index)
        rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index")
        rpk.to_csv(output.rpk_file, sep="\t")
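# Worked example of the RPK computation (made-up values): a gene with 500
# reads and a union exon length of 2000 nt gets 1000 * 500 / 2000 = 250
# RPK. Genes missing from union_exon_lengths.txt are silently dropped by
# the index intersection above.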


rule compute_sum_million_RPK:
    input:
        rpk_file = rules.compute_RPK.output.rpk_file,
    output:
        sum_rpk_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_sum_million_RPK.txt"),
    run:
        sum_rpk = pd.read_table(
            input.rpk_file,
            index_col=0).sum()
        (sum_rpk / 1000000).to_csv(output.sum_rpk_file, sep="\t")


# Compute TPM using total number of mappers divided by genome length
# (not sum of RPK across biotypes: some mappers may not be counted)
# No, doesn't work: mappers / genome length not really comparable
# Needs to be done on all_types
rule compute_TPM:
    """For a given biotype, compute the corresponding TPM value (reads per kilobase per million mappers)."""
    input:
        rpk_file = rules.compute_RPK.output.rpk_file
    output:
        tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"),
    # The sum must be done over all counted features
    wildcard_constraints:
        biotype = "alltypes"
    run:
        rpk = pd.read_table(input.rpk_file, index_col="gene")
        tpm = 1000000 * rpk / rpk.sum()
        tpm.to_csv(output.tpm_file, sep="\t")
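# Worked example of the TPM computation (made-up values): with only two
# features at 250 and 750 RPK in a library, the column sum is 1000, so
# TPM = 1e6 * 250 / 1000 = 250000 and 1e6 * 750 / 1000 = 750000;
# each library column sums to one million by construction.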


# Useful to compute translation efficiency in the Ribo-seq pipeline
rule compute_mean_TPM:
    input:
        all_tpm_file = rules.compute_TPM.output.tpm_file
    output:
        tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            "{lib}_mean_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
    run:
        # TODO
        tpm = pd.read_table(
            input.all_tpm_file, index_col="gene",
            usecols=["gene", *[f"{wildcards.lib}_{rep}" for rep in REPS]])
        tpm_mean = tpm.mean(axis=1)
        # Label the resulting series with the library name
        tpm_mean.name = wildcards.lib
        tpm_mean.to_csv(output.tpm_file, sep="\t")


rule compute_RPM:
    input:
        counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
        #summary_table = rules.gather_read_counts_summaries.output.summary_table,
        summary_table = OPJ(
            output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries",
            f"all_on_{genome}_fwd_counts.txt"),
    output:
        rpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}",
            f"all_on_{genome}", "{biotype}_{orientation}_RPM.txt"),
    run:
        # Reading column counts from {input.counts_table}