"""
Snakefile to analyse PRO-seq data.
"""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")

# TODO look at 5' and 3' ends of genes

#TODO: add local metaprofiles (around TSS and around TTS), no min length
# TSS or TTS should not be within 200 of a TSS or TTS

# counts using featureCounts, differential expression for transcripts, CDS and UTR (see genes.gtf), and introns (transcript - exons) (so we need counting on exons)
# ratios CDS / introns

import os
OPJ = os.path.join
from glob import glob
from pickle import load
from fileinput import input as finput

import warnings


def formatwarning(message, category, filename, lineno, line):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning


# Useful for functional style
from itertools import chain, product, starmap
from operator import eq

# Useful data structures
from collections import OrderedDict
from collections import defaultdict, Counter

# To parse SAM format
import pysam
import pyBigWig

# For data processing and displaying
from sklearn.decomposition import PCA
import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
mpl.use("PDF")
#mpl.rcParams["figure.figsize"] = 2, 4
mpl.rcParams["font.sans-serif"] = [
    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams["font.family"] = "sans-serif"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import husl

from libdeseq import do_deseq2
from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import wc_applied, ensure_relative, cleanup_and_backup, texscape
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import sum_by_family
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_feature_counts, sum_feature_counts
from smincludes import rules as irules
from smwrappers import wrappers_dir

alignment_settings = {"bowtie2": ""}

#TRIMMERS = ["cutadapt", "fastx_clipper"]
TRIMMERS = ["cutadapt"]
COUNTERS = ["feature_count"]
ORIENTATIONS = ["fwd", "rev", "all"]

COMPL = {"A" : "T", "C" : "G", "G" : "C", "T" : "A", "N" : "N"}

LFC_RANGE = {
    "protein_coding" : (-10, 10),
    "DNA_transposons_rmsk" : (-10, 10),
    "RNA_transposons_rmsk" : (-10, 10),
    "satellites_rmsk" : (-10, 10),
    "simple_repeats_rmsk" : (-10, 10),
    "DNA_transposons_rmsk_families" : (-10, 10),
    "RNA_transposons_rmsk_families" : (-10, 10),
    "satellites_rmsk_families" : (-10, 10),
    "simple_repeats_rmsk_families" : (-10, 10),
    "pseudogene" : (-10, 10),
    "all_rmsk" : (-10, 10),
    "all_rmsk_families" : (-10, 10),
    "alltypes" : (-10, 10)}
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
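# With the cutoffs above this yields, for instance,
# UP_STATUSES = ["up0.5", "up1", "up2"] and DOWN_STATUSES = ["down0.5", "down1", "down2"].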
#status2colour = make_status2colour(DOWN_STATUSES, UP_STATUSES)
#STATUSES = list(reversed(DOWN_STATUSES)) + ["down", "NS", "up"] + UP_STATUSES
#STATUS2COLOUR = dict(zip(STATUSES, sns.color_palette("coolwarm", len(STATUSES))))

# Or use --configfile instead
#configfile:
#    "PRO-seq.config.json"

# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
# key: library name
# value: 3' adapter sequence
lib2adapt = config["lib2adapt"]
# key: library name
# value: [length of 5' UMI, length of 3' UMI]
lib2UMI = config["lib2UMI"]
# key: library name
# value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
REPS = config["replicates"]
COND_PAIRS = config["cond_pairs"]
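# Purely illustrative sketch of the corresponding config entries
# (library names, adapter sequence, UMI lengths and paths are hypothetical):
# "lib2adapt": {"WT": "TGGAATTCTCGG", "mut": "TGGAATTCTCGG"},
# "lib2UMI": {"WT": [4, 4], "mut": [4, 4]},
# "lib2raw": {"WT": "/path/to/WT.fastq.gz", "mut": "/path/to/mut.fastq.gz"},
# "replicates": ["1", "2"],
# "cond_pairs": [["mut", "WT"]]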
msg = "\n".join([
    "Some contrasts do not use known library names.",
    "Contrasts:",
    ", ".join([f"({cond}, {ref})" for (cond, ref) in COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in COND_PAIRS]), msg
CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in COND_PAIRS]
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
CONDITIONS = [{
    "lib" : lib,
    "rep" : rep} for rep in REPS for lib in LIBS]
# Used in various places so that the library order is always the same:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
    cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
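# As an example, with hypothetical libraries ["WT", "mut"] and replicates
# ["1", "2"], COND_NAMES would be ["WT_1", "mut_1", "WT_2", "mut_2"]
# (libraries vary fastest, as in the comprehension defining CONDITIONS).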

COUNT_BIOTYPES = config["count_biotypes"]

RMSK_BIOTYPES = [
    "DNA_transposons_rmsk",
    "RNA_transposons_rmsk",
    "satellites_rmsk",
    "simple_repeats_rmsk"]
RMSK_FAMILIES_BIOTYPES = [
    "DNA_transposons_rmsk_families",
    "RNA_transposons_rmsk_families",
    "satellites_rmsk_families",
    "simple_repeats_rmsk_families"]

BIOTYPES_TO_JOIN = {
    "all_rmsk": [biotype for biotype in COUNT_BIOTYPES if biotype in RMSK_BIOTYPES],
    "all_rmsk_families": [biotype for biotype in COUNT_BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES],
    # We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
    "alltypes": [biotype for biotype in COUNT_BIOTYPES if not biotype.startswith("protein_coding_")]}
JOINED_BIOTYPES = list(BIOTYPES_TO_JOIN.keys())
DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in COUNT_BIOTYPES + JOINED_BIOTYPES]


#ANNOT_BIOTYPES = config["annot_biotypes"]
#METAGENE_BIOTYPES = ["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
#METAGENE_BIOTYPES = ["protein_coding"]
#METAGENE_BIOTYPES = ["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]
METAGENE_BIOTYPES = [biotype for biotype in ["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"] if biotype in COUNT_BIOTYPES]
# default id lists for MA plots
ID_LISTS = [
    "lfc_statuses",
    "germline_specific",
    "histone",
    "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014",
    "piRNA_dependent_prot_si_22G_down4_top200", "piRNA_dependent_prot_si_22G_down4",
    "csr1_prot_si_supertargets_common"]
ID_LISTS = config.get("maplot_gene_lists", ID_LISTS)
aligner = config["aligner"]
########################
# Genome configuration #
########################
genome_dict = config.get("genome_dict", None)
if genome_dict is not None:
    genome = genome_dict["name"]
    chrom_sizes = get_chrom_sizes(genome_dict["size"])
    genomelen = sum(chrom_sizes.values())
    genome_db = genome_dict["db"][aligner]
    # bed file binning the genome in 10nt bins
    genome_binned = genome_dict["binned"]
    annot_dir = genome_dict["annot_dir"]
    # TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
    convert_dir = genome_dict["convert_dir"]
    gene_lists_dir = genome_dict["gene_lists_dir"]
else:
    genome = "C_elegans"
    chrom_sizes = get_chrom_sizes(config["genome_size"])
    genomelen = sum(chrom_sizes.values())
    genome_db = config["index"]
    genome_binned = f"/pasteur/entites/Mhe/Genomes/{genome}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed"
    annot_dir = config["annot_dir"]
    convert_dir = config["convert_dir"]
    gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists"
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
#gene_lists_dir = config["gene_lists_dir"]
#local_annot_dir = config["local_annot_dir"]
#output_dir = config["output_dir"]
#workdir: config["output_dir"]
output_dir = os.path.abspath(".")
log_dir = OPJ("logs")
data_dir = OPJ("data")
local_annot_dir = OPJ("annotations")
# Used to skip some genotype x treatment x replicate number combinations
# when some of them were not sequenced
forbidden = {frozenset(wc_comb.items()) for wc_comb in config["missing"]}
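# A hypothetical config entry skipping replicate 2 of a library named "mut"
# would look like this:
# "missing": [{"lib": "mut", "rep": "2"}]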

SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref"]
assert set(SIZE_FACTORS).issubset(set(COUNT_BIOTYPES) | {"median_ratio_to_pseudo_ref"})
#NORM_TYPES = config["norm_types"]
NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"]
assert set(NORM_TYPES).issubset(set(SIZE_FACTORS))

# For metagene analyses
#META_MARGIN = 300
META_MARGIN = 0
META_SCALE = 2000
#UNSCALED_INSIDE = 500
UNSCALED_INSIDE = 0
#META_MIN_LEN = 1000
META_MIN_LEN = 2 * UNSCALED_INSIDE
MIN_DIST = 2 * META_MARGIN


######
# Colors from https://personal.sron.nl/~pault/
# (via https://personal.sron.nl/~pault/python/distinct_colours.py)
hexcols = ['#332288', '#88CCEE', '#44AA99', '#117733', '#999933', '#DDCC77', 
           '#CC6677', '#882255', '#AA4499', '#661100', '#6699CC', '#AA4466',
           '#4477AA']

greysafecols = ['#809BC8', '#FF6666', '#FFCC66', '#64C204']

xarr = [[12],
        [12, 6],
        [12, 6, 5],
        [12, 6, 5, 3],
        [0, 1, 3, 5, 6],
        [0, 1, 3, 5, 6, 8],
        [0, 1, 2, 3, 5, 6, 8],
        [0, 1, 2, 3, 4, 5, 6, 8],
        [0, 1, 2, 3, 4, 5, 6, 7, 8],
        [0, 1, 2, 3, 4, 5, 9, 6, 7, 8],
        [0, 10, 1, 2, 3, 4, 5, 9, 6, 7, 8],
        [0, 10, 1, 2, 3, 4, 5, 9, 6, 11, 7, 8]]

def get_distinct(nr):
    """Return *nr* distinct colours (1 <= nr <= 12) in HTML hex format."""
    # Check that nr is in the supported range.
    assert 1 <= nr <= 12, "wrong nr of distinct colours!"
    # Build the colour list by stepping through the pre-defined index list
    # and looking the indices up in the colour table.
    return [hexcols[idx] for idx in xarr[nr - 1]]
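# Small usage example: get_distinct(3) looks up the index list [12, 6, 5]
# and returns ['#4477AA', '#CC6677', '#DDCC77'].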

######

# def filter_combinator(combinator, blacklist):
#     """This function builds a wildcards combination generator
#     based on the generator *combinator* and a set of combinations
#     to exclude *blacklist*."""
#     def filtered_combinator(*args, **kwargs):
#         """This function generates wildcards combinations.
#         It is to be used as second argument of *expand*."""
#         #print(*args)
#         for wc_comb in combinator(*args, **kwargs):
#             # Use frozenset instead of tuple
#             # in order to accomodate
#             # unpredictable wildcard order
#             if frozenset(wc_comb) not in blacklist:
#                 yield wc_comb
#     return filtered_combinator


filtered_product = filter_combinator(product, forbidden)

wildcard_constraints:
    lib="|".join(LIBS),
    rep=r"\d+",
    orientation="|".join(ORIENTATIONS),
    biotype="|".join(COUNT_BIOTYPES + JOINED_BIOTYPES)


# Define functions to be used in shell portions
shell.prefix(SHELL_FUNCTIONS)

if len(CONDITIONS) < 2:
    pca_figs = []
else:
    pca_figs = expand(OPJ(
        "{trimmer}", "figures", aligner, "{counter}",
        "{biotype}_{orientation}_PCA.pdf"),
        trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES,
        orientation=ORIENTATIONS)

rule all:
    """This top rule is used to drive the whole workflow by taking as input its final products."""
    input:
        expand(OPJ(
            "{trimmer}", "figures", aligner, "{counter}",
            "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"),
            trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS,
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES, id_list=ID_LISTS),
        expand(OPJ(
            "{trimmer}", "figures", aligner, "{counter}",
            "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"),
            trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS,
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES, fold_type=["log2FoldChange"]),
        expand(OPJ(
            "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"),
            trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS,
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES),
        pca_figs,
        #expand(OPJ(
        #    "{trimmer}", "figures", aligner, "{counter}",
        #    "{biotype}_{orientation}_PCA.pdf"),
        #    trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES,
        #    orientation=ORIENTATIONS),
        #expand(OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), trimmer=TRIMMERS, counter=COUNTERS, orientation=["all"]),
        #expand(expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=["all"]),
        expand(expand(OPJ(
            "{{trimmer}}", "figures", aligner, "{lib}_{rep}",
            "adapt_on_C_elegans_last_bases.pdf"), filtered_product, lib=LIBS, rep=REPS),
            trimmer=TRIMMERS),
        expand(expand(OPJ(
            "{{trimmer}}", aligner, "mapped_C_elegans",
            "{lib}_{rep}_on_C_elegans_by_{{norm_type}}_{{orientation}}.bw"), filtered_product, lib=LIBS, rep=REPS),
            trimmer=TRIMMERS, norm_type=NORM_TYPES, orientation=["all"]),
        expand(OPJ(
            "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"),
            trimmer=TRIMMERS, counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS),
        #expand(OPJ("{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"]),
        #expand(OPJ("{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES),
        # TODO: Add metagene profiles similar to small RNA-seq
        expand(OPJ(
            "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean",
            "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)),
            trimmer=TRIMMERS, lib=LIBS, norm_type=NORM_TYPES, orientation=["all", "fwd", "rev"],
            biotype=METAGENE_BIOTYPES),


include: ensure_relative(irules["link_raw_data"], workflow.basedir)
#include: "../snakemake_wrappers/includes/link_raw_data.rules"


rule trim_and_dedup:
    """The adaptor is trimmed, then reads are treated in two groups depending
    on whether the adapter was found or not. For each group the reads are
    sorted, deduplicated, and the random k-mers (k=4) that helped identify
    PCR duplicates are removed at both ends"""
    input:
        rules.link_raw_data.output,
    output:
        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_deduped.fastq.gz"),
        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_deduped.fastq.gz"),
        nb_raw =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
        nb_adapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
        nb_adapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt_deduped.txt"),
        nb_noadapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
        nb_noadapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt_deduped.txt"),
    params:
        adapter = lambda wildcards : lib2adapt[wildcards.lib],
        process_type = "PRO-seq",
        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
    threads: 4 # Actually, to avoid too much IO
    message:
        "Trimming adaptor from raw data using {wildcards.trimmer}, deduplicating reads, and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
    benchmark:
        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
    log:
        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
    run:
        shell_commands = """
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
    {params.adapter} {params.trim5} {params.trim3} \\
    {output.adapt} {output.noadapt} {log.trim} \\
    {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
    {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
"""
        shell(shell_commands)


rule trim_only:
    """The adaptor is trimmed, then reads are treated in two groups depending
    on whether the adapter was found or not. For each group, the extra k-mers
    are removed at both ends."""
    input:
        rules.link_raw_data.output,
    output:
        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt.fastq.gz"),
        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt.fastq.gz"),
        nb_raw =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
        nb_adapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
        nb_noadapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
    params:
        adapter = lambda wildcards : lib2adapt[wildcards.lib],
        process_type = "PRO-seq",
        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
    threads: 4 # Actually, to avoid too much IO
    message:
        "Trimming adaptor from raw data using {wildcards.trimmer} and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
    benchmark:
        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
    log:
        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
    run:
        shell_commands = """
THREADS="{threads}" {params.process_type}_trim_only.sh {wildcards.trimmer} {input} \\
    {params.adapter} {params.trim5} {params.trim3} \\
    {output.adapt} {output.noadapt} {log.trim} \\
    {output.nb_raw} {output.nb_adapt} {output.nb_noadapt} \\
    1> {log.log} 2> {log.err}
"""
        shell(shell_commands)


def source_fastq(wildcards):
    """
    Determine the correct pre-processed fastq file depending on the pipeline
    configuration and the current wildcards.
    """
    if config.get("deduplicate", False):
        return OPJ(
            data_dir, f"trimmed_{wildcards.trimmer}",
            f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}_deduped.fastq.gz"),
    else:
        return OPJ(
            data_dir, f"trimmed_{wildcards.trimmer}",
            f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}.fastq.gz"),


# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
rule map_on_genome:
    input:
        fastq = source_fastq,
    output:
        # sam files take a lot of space
        sam = temp(OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")),
        nomap_fastq = OPJ("{trimmer}", aligner, "not_mapped_C_elegans", "{lib}_{rep}_{type}_unmapped_on_C_elegans.fastq.gz"),
    params:
        aligner = aligner,
        index = genome_db,
        settings = "",
    message:
        "Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.type} on %s genome." % genome
    log:
        log = OPJ(log_dir, "{trimmer}", "map_{type}_on_genome", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "map_{type}_on_genome", "{lib}_{rep}.err"),
#    shell:
#        """
#        genome_dir="${{HOME}}/Genomes"
#        genome="C_elegans"
#        bowtie2_genome_db="${{genome_dir}}/${{genome}}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/genome"
#        cmd="bowtie2 --seed 123 -t --mm -x ${{bowtie2_genome_db}} -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
#        echo ${{cmd}} 1> {log.log} 2> {log.err}
#        eval ${{cmd}} 1>> {log.log} 2>> {log.err}
#        """
    wrapper:
        f"file://{wrappers_dir[0]}/map_on_genome"


rule sam2indexedbam:
    input:
        sam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam"),
    output:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans_sorted.bam"),
        index = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans_sorted.bam.bai"),
    message:
        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.type}."
    log:
        log = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{type}.log"),
        err = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{type}.err"),
    threads:
        4
    resources:
        mem_mb=4100
    wrapper:
        f"file://{wrappers_dir[0]}/sam2indexedbam"


rule fuse_bams:
    """This rule fuses the two sorted bam files corresponding to the mapping
    of the reads containing the adaptor or not."""
    input:
        noadapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_noadapt_on_C_elegans_sorted.bam"),
        adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam"),
    output:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"),
        bai = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"),
    message:
        "Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}"
    log:
        log = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.err"),
    shell:
        """
        samtools merge -c {output.sorted_bam} {input.noadapt_sorted_bam} {input.adapt_sorted_bam} 1> {log.log} 2> {log.err}
        indexed=""
        while [ ! ${{indexed}} ]
        do
            samtools index {output.sorted_bam} && indexed="OK"
            if [ ! ${{indexed}} ]
            then
                rm -f {output.bai}
                echo "Indexing failed. Retrying" 1>&2
            fi
        done 1>> {log.log} 2>> {log.err}
        """


rule compute_coverage:
    input:
        sorted_bam = rules.fuse_bams.output.sorted_bam,
    output:
        coverage = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_coverage.txt"),
    params:
        genomelen = genomelen,
    shell:
        """
        bases=$(samtools depth {input.sorted_bam} | awk '{{sum += $3}} END {{print sum}}') || error_exit "samtools depth failed"
        python3 -c "print(${{bases}} / {params.genomelen})" > {output.coverage}
        """


rule check_last_base:
    input:
        adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam"),
        adapt_index = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam.bai"),
    output:
        OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
    message:
        "Computing last base proportions for {wildcards.lib}_{wildcards.rep} (mapped reads from which the adaptor had been removed)."
    log:
        log = OPJ(log_dir, "{trimmer}", "check_last_base", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "check_last_base", "{lib}_{rep}.err"),
    run:
        base_counts = defaultdict(Counter)
        with pysam.AlignmentFile(input.adapt_sorted_bam) as samfile:
            for ali in samfile.fetch():
                seq = ali.seq
                # To avoid errors when last base was erroneous:
                #seq = ali.get_reference_sequence()
                if ali.is_reverse:
                    base_counts[len(seq)][COMPL[seq[0].upper()]] += 1
                else:
                    base_counts[len(seq)][seq[-1].upper()] += 1
        with open(output[0], "w") as output_file:
            print("#length\tnb_reads\tA\tC\tG\tT\tN", file=output_file)
            for length, counter in sorted(base_counts.items()):
                nb_reads_this_length = sum(counter.values())
                print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file)


# TODO: use Python to make the plot
# This may remove dependency on R docopt
rule plot_last_base:
    input:
        OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
    output:
        OPJ("{trimmer}", "figures", aligner, "{lib}_{rep}", "adapt_on_C_elegans_last_bases.pdf")
    params:
        title = lambda wildcards : "\"last base frequencies for %s_%s_%s\"" % (wildcards.trimmer, wildcards.lib, wildcards.rep)
    message:
        "Plotting last base proportions for {wildcards.lib}_{wildcards.rep} (mapped reads from which the adaptor had been removed)."
    log:
        OPJ(log_dir, "{trimmer}", "plot_last_base", "{lib}_{rep}.log")
    shell:
        """
        plot_last_base.R -i {input} -o {output} -t {params.title}
        """


def htseq_orientation2stranded(wildcards):
    orientation = wildcards.orientation
    if orientation == "fwd":
        if LIB_TYPE[-2:] == "SF":
            return "yes"
        elif LIB_TYPE[-2:] == "SR":
            return "reverse"
        else:
            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
    elif orientation == "rev":
        if LIB_TYPE[-2:] == "SF":
            return "reverse"
        elif LIB_TYPE[-2:] == "SR":
            return "yes"
        else:
            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
    elif orientation == "all":
        return "no"
    else:
        exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")


def biotype2annot(wildcards):
    #return "/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/%s.gtf" % wildcards.biotype
    if wildcards.biotype.endswith("_rmsk_families"):
        biotype = wildcards.biotype[:-9]
    else:
        biotype = wildcards.biotype
    return OPJ(annot_dir, f"{biotype}.gtf")


rule htseq_count_reads:
    input:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"),
        bai = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"),
    output:
        counts = OPJ("{trimmer}", aligner, "mapped_C_elegans", "htseq_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ("{trimmer}", aligner, "mapped_C_elegans", "htseq_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts_gene_names.txt"),
    params:
        stranded = htseq_orientation2stranded,
        mode = "union",
        annot = biotype2annot,
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with htseq-count."
    benchmark:
        OPJ(log_dir, "{trimmer}", "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
    log:
        log = OPJ(log_dir, "{trimmer}", "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
        err = OPJ(log_dir, "{trimmer}", "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err")
    wrapper:
        f"file://{wrappers_dir[0]}/htseq_count_reads"


def parse_htseq_counts(counts_filename):
    with open(counts_filename) as counts_file:
        for line in counts_file:
            (gene, count) = line.strip().split()
            if gene.startswith("__"):
                return
            yield (gene, int(count))


rule feature_count_reads:
    input:
        sorted_bam = OPJ(
            "{trimmer}", aligner, "mapped_C_elegans",
            "{lib}_{rep}_on_C_elegans_sorted.bam"),
        bai = OPJ(
            "{trimmer}", aligner, "mapped_C_elegans",
            "{lib}_{rep}_on_C_elegans_sorted.bam.bai"),
        # TODO: Why does the following fail?
        #sorted_bam = rules.fuse_bams.output.sorted_bam,
        #index = rules.fuse_bams.output.index,
    output:
        counts = OPJ(
            "{trimmer}", aligner, "mapped_C_elegans", "feature_count",
            "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ(
            "{trimmer}", aligner, "mapped_C_elegans", "feature_count",
            "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts_gene_names.txt"),
    params:
        stranded = feature_orientation2stranded(LIB_TYPE),
        annot = biotype2annot,
        # pickled dictionary that associates gene ids to gene names
        converter = genome_dict["converter"]
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
    log:
        log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.err")
    shell:
        """
        tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
        cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
        featureCounts -v 2> {log.log}
        echo ${{cmd}} 1>> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
        """


def parse_feature_counts(counts_filename):
    with open(counts_filename) as counts_file:
        for line in counts_file:
            # skip comments
            if line[0] == "#":
                continue
            fields = line.strip().split()
            # skip header
            if fields[:6] == ["Geneid", "Chr", "Start", "End", "Strand", "Length"]:
                continue
            gene, count = fields[0], int(fields[6])
            yield (gene, count)
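# parse_feature_counts expects the plain featureCounts output layout, which
# typically looks like the following (counts are in the 7th column; the
# header comment, coordinates and counts below are made up for illustration):
# # Program:featureCounts ...; Command: ...
# Geneid    Chr    Start    End    Strand    Length    sample_sorted.bam
# WBGene00000001    I    1000    2000    +    1001    42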


def same_gene_order(od1, od2):
    """Returns True if the keys of ordered dictionaries *od1* and *od2* are the same, in the same order."""
    if len(od1) != len(od2):
        return False
    return all(starmap(eq, zip(od1.keys(), od2.keys())))


def plot_scatterplot(outfile, data, data_groups, group2colour):
    #axis = plt.gca()
    for (i, (group, colour)) in enumerate(group2colour.items()):
        plt.scatter(
            data[data_groups==i, 0],
            data[data_groups==i, 1],
            color=colour, lw=2, label=texscape(group))
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.tight_layout()
    plt.savefig(outfile, format=outfile.name.split(".")[-1])
    #plt.savefig(outfile, format="pdf")
    plt.cla()


rule do_PCA:
    input:
        expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS),
    output:
        #OPJ(aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.pdf"),
        #OPJ(aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.png"),
        OPJ("{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.pdf"),
    message:
        "Computing PCA on {wildcards.biotype}_{wildcards.orientation} counts."
    threads: 12  # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]"
    log:
        OPJ(log_dir, "{trimmer}", "{counter}", "do_PCA_{biotype}_{orientation}.log")
    run:
        if wildcards.counter == "htseq_count":
            counts_parser = parse_htseq_counts
        elif wildcards.counter == "feature_count":
            counts_parser = parse_feature_counts
        else:
            raise NotImplementedError("%s is not yet handled." % wildcards.counter)
        # We need the order to be fixed for the zip to be meaningful
        counts = OrderedDict([])
        nb_libs = len(LIBS)
        nb_reps = len(REPS)
        counts_array = np.empty((nb_libs * nb_reps,), dtype=object)
        #lib_categories = np.empty([len(LIBS) * len(REPS)], dtype=np.uint32)
        lib_categories = np.fromiter(chain(*[[i] * nb_reps for i in range(nb_libs)]), dtype=np.uint32)
        for i, (lib, rep) in enumerate(product(LIBS, REPS)):
            counts_filename = OPJ(
                wildcards.trimmer, aligner, "mapped_C_elegans", wildcards.counter,
                "%s_%s_on_C_elegans" % (lib, rep), "%s_%s_counts.txt" % (wildcards.biotype, wildcards.orientation))
            #print("Reading", counts_filename)
            counts[(lib, rep)] = OrderedDict(counts_parser(counts_filename))
            counts_array[i] = np.fromiter(counts[(lib, rep)].values(), np.uint32)
            #lib_categories[i] = i // len(REPS)
        # zipping the values with a shifted version and pairwise comparing the gene orders
        assert all(starmap(
            same_gene_order,
            zip(list(counts.values()),
                list(counts.values())[1:]))), "All counts files should have the same genes in the same order."
        #counts_array = np.array([library_array for library_array in counts_array])
        # Faster (http://stackoverflow.com/a/40402682/1878788)
        counts_array = np.concatenate(counts_array).reshape(len(counts_array), -1)
        libs_pca = PCA(n_components=2)
        libs_fitting = libs_pca.fit(counts_array)
        libs_transformed = libs_fitting.transform(counts_array)
        genes_pca = PCA(n_components=2)
        genes_fitting = genes_pca.fit(counts_array.T)
        genes_transformed = genes_fitting.transform(counts_array.T)
        # http://stackoverflow.com/questions/40425036/how-to-extract-the-extreme-two-colours-from-a-matplotlib-colormap
        #TODO: make this configurable
        colormap = "BrBG"
        cmap = plt.cm.get_cmap(colormap)
        # Extract rgb coordinates
        left_rgb = cmap(0)[:-1]
        right_rgb = cmap(cmap.N)[:-1]
        # Convert to husl and take the hue (first element)
        left_hue = husl.rgb_to_husl(*left_rgb)[0]
        right_hue = husl.rgb_to_husl(*right_rgb)[0]
        # Create a dark divergent palette
        palette = sns.diverging_palette(left_hue, right_hue, n=(2 * (nb_libs // 2)) + 1, center="dark")
        lib2colour = OrderedDict(zip(LIBS, [*palette[nb_libs // 2:(nb_libs+1) // 2], *palette[:nb_libs // 2], *palette[1 + nb_libs // 2:]]))
        with open(output[0], "wb") as outfile:
            plot_scatterplot(outfile, libs_transformed, lib_categories, lib2colour)

#def plot_cluster(counts_dir, biotype, orientation, counts_files):
#    libname_finder = re.compile("%s/(.+)_on_C_elegans_%s_%s_counts.txt" % (counts_dir, biotype, orientation))
#    libnames = [libname_finder.match(fname).groups()[0] for fname in counts_files]
#    d = pd.read_csv(filename, sep="\t", header=None, index_col=0)


rule summarize_counts:
    """For a given library, write a summary of the read counts for the various biotypes."""
    input:
        biotype_counts_files = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{{lib}}_{{rep}}_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES),
    output:
        summary = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "summaries", "{lib}_{rep}_on_C_elegans_{orientation}_counts.txt")
    run:
        if wildcards.counter == "htseq_count":
            sum_counter = sum_htseq_counts
        elif wildcards.counter == "feature_count":
            sum_counter = sum_feature_counts
        else:
            raise NotImplementedError(f"{wildcards.counter} not handled (yet?)")
        with open(output.summary, "w") as summary_file:
            header = "\t".join(COUNT_BIOTYPES)
            #summary_file.write("#biotypes\t%s\n" % header)
            summary_file.write("%s\n" % header)
            sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
            #summary_file.write("%s_%s_%s\t%s\n" % (wildcards.lib, wildcards.rep, wildcards.orientation, sums))
            summary_file.write("%s\n" % sums)


rule gather_read_counts_summaries:
    input:
        summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "summaries", "{lib}_{rep}_on_C_elegans_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS),
    output:
        summary_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "summaries", "all_on_C_elegans_{orientation}_counts.txt"),
    run:
        summary_files = (OPJ(
            wildcards.trimmer,
            aligner,
            "mapped_C_elegans",
            wildcards.counter,
            "summaries",
            "{name}_on_C_elegans_{orientation}_counts.txt".format(
                name=cond_name,
                orientation=wildcards.orientation)) for cond_name in COND_NAMES)
        summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)
        summaries.columns = COND_NAMES
        summaries.to_csv(output.summary_table, sep="\t")


rule gather_counts:
    """For a given biotype, gather counts from all libraries in one table."""
    input:
        counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS),
    output:
        counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
    wildcard_constraints:
        # Avoid ambiguity with join_all_counts
        biotype = "|".join(COUNT_BIOTYPES)
    run:
        # Gathering the counts data
        ############################
        counts_files = (OPJ(
            wildcards.trimmer,
            aligner,
            "mapped_C_elegans",
            wildcards.counter,
            "{name}_on_C_elegans".format(name=cond_name),
            "{biotype}_{orientation}_counts.txt".format(biotype=wildcards.biotype, orientation=wildcards.orientation)) for cond_name in COND_NAMES)
        if wildcards.counter == "htseq_count":
            counts_data = pd.concat(
                map(read_htseq_counts, counts_files),
                axis=1).fillna(0).astype(int)
        elif wildcards.counter == "intersect_count":
            counts_data = pd.concat(
                map(read_intersect_counts, counts_files),
                axis=1).fillna(0).astype(int)
        elif wildcards.counter == "feature_count":
            counts_data = pd.concat(
                map(read_feature_counts, counts_files),
                axis=1).fillna(0).astype(int)
        else:
            raise NotImplementedError(f"{wildcards.counter} not handled (yet?)")
        counts_data.columns = COND_NAMES
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
        # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
        if wildcards.biotype.endswith("_rmsk_families"):
            counts_data = sum_by_family(counts_data)
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")


@wc_applied
def source_counts_to_join(wildcards):
    """
    Determines which elementary biotype counts files should be joined to make the desired "joined" biotype.
    """
    return expand(
        OPJ("{{trimmer}}", aligner, "mapped_C_elegans",
            "{{counter}}", "all_on_C_elegans",
            "{biotype}_{{orientation}}_counts.txt"),
        biotype=BIOTYPES_TO_JOIN[wildcards.biotype])


rule join_all_counts:
    """concat counts for all biotypes into all"""
    input:
        counts_tables = source_counts_to_join,
        #counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
        # counts_tables = expand(
        #     OPJ("{{trimmer}}", aligner, "mapped_C_elegans",
        #         "{{counter}}", "all_on_C_elegans",
        #         "{biotype}_{{orientation}}_counts.txt"),
        #     # We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
        #     biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]),
    output:
        counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
    wildcard_constraints:
        biotype = "|".join(JOINED_BIOTYPES)
    run:
        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)).drop_duplicates()
        assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")


@wc_applied
def source_counts(wildcards):
    """Determines from which rule the gathered small counts should be sourced."""
    if wildcards.biotype in JOINED_BIOTYPES:
        return rules.join_all_counts.output.counts_table
    else:
        # "Directly" from the counts gathered across libraries
        return rules.gather_counts.output.counts_table


rule compute_RPK:
    """For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
    input:
        counts_data = source_counts,
        #counts_data = rules.gather_counts.output.counts_table,
        #counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}",
        #    "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
    output:
        rpk_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "all_on_C_elegans", "{biotype}_{orientation}_RPK.txt"),
    params:
        feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
    # run:
    #     counts_data = pd.read_table(input.counts_data, index_col="gene")
    #     feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene")
    #     common = counts_data.index.intersection(feature_lengths.index)
    #     rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index")
    #     rpk.to_csv(output.rpk_file, sep="\t")
    wrapper:
        f"file://{wrappers_dir[0]}/compute_RPK"


rule compute_sum_million_RPK:
    input:
        rpk_file = rules.compute_RPK.output.rpk_file,
    output:
        sum_rpk_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "all_on_C_elegans", "{biotype}_{orientation}_sum_million_RPK.txt"),
    run:
        sum_rpk = pd.read_table(
            input.rpk_file,
            index_col=0).sum()
        (sum_rpk / 1000000).to_csv(output.sum_rpk_file, sep="\t")


rule compute_TPM:
    """For a given biotype, compute the corresponding TPM values (transcripts per million)."""
    input:
        rpk_file = rules.compute_RPK.output.rpk_file
    output:
        tpm_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"),
    # The sum must be done over all counted features
    wildcard_constraints:
        biotype = "|".join(["alltypes"])
    # run:
    #     rpk = pd.read_table(input.rpk_file, index_col="gene")
    #     tpm = 1000000 * rpk / rpk.sum()
    #     tpm.to_csv(output.tpm_file, sep="\t")
    wrapper:
        f"file://{wrappers_dir[0]}/compute_TPM"

