"""
Snakefile to analyse PRO-seq data.
"""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")

# TODO look at 5' and 3' ends of genes

#TODO: add local metaprofiles (around TSS and around TTS), no min length
# TSS or TTS should not be within 200 of a TSS or TTS

# counts using featureCounts, differential expression for transcripts, CDS and UTR (see genes.gtf), and introns (transcript - exons) (so we need counting on exons)
# ratios CDS / introns

import os
OPJ = os.path.join
from glob import glob
from pickle import load
from fileinput import input as finput
# Needed to catch shell command failures in "run" blocks below
from subprocess import CalledProcessError

import warnings


def formatwarning(message, category, filename, lineno, line):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning


# Useful for functional style
from itertools import chain, product, starmap
from operator import eq

# Useful data structures
from collections import OrderedDict
from collections import defaultdict, Counter

# To parse SAM format
import pysam
import pyBigWig

# For data processing and displaying
from sklearn.decomposition import PCA
import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
mpl.use("PDF")
#mpl.rcParams["figure.figsize"] = 2, 4
mpl.rcParams["font.sans-serif"] = [
    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams["font.family"] = "sans-serif"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import husl

from libdeseq import do_deseq2
from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import wc_applied, ensure_relative, cleanup_and_backup, texscape
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import sum_by_family
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_feature_counts, sum_feature_counts
from smincludes import rules as irules
from smwrappers import wrappers_dir

alignment_settings = {"bowtie2": ""}

#TRIMMERS = ["cutadapt", "fastx_clipper"]
TRIMMERS = ["cutadapt"]
COUNTERS = ["feature_count"]
ORIENTATIONS = ["fwd", "rev", "all"]

COMPL = {"A" : "T", "C" : "G", "G" : "C", "T" : "A", "N" : "N"}

LFC_RANGE = {
    "protein_coding" : (-10, 10),
    "DNA_transposons_rmsk" : (-10, 10),
    "RNA_transposons_rmsk" : (-10, 10),
    "satellites_rmsk" : (-10, 10),
    "simple_repeats_rmsk" : (-10, 10),
    "DNA_transposons_rmsk_families" : (-10, 10),
    "RNA_transposons_rmsk_families" : (-10, 10),
    "satellites_rmsk_families" : (-10, 10),
    "simple_repeats_rmsk_families" : (-10, 10),
    "pseudogene" : (-10, 10),
    "all_rmsk" : (-10, 10),
    "all_rmsk_families" : (-10, 10),
    "alltypes" : (-10, 10)}
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
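# For instance, with the cutoffs above, UP_STATUSES == ["up0.5", "up1", "up2"]
# and DOWN_STATUSES == ["down0.5", "down1", "down2"].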
#status2colour = make_status2colour(DOWN_STATUSES, UP_STATUSES)
#STATUSES = list(reversed(DOWN_STATUSES)) + ["down", "NS", "up"] + UP_STATUSES
#STATUS2COLOUR = dict(zip(STATUSES, sns.color_palette("coolwarm", len(STATUSES))))

# Or use --configfile instead
#configfile:
#    "PRO-seq.config.json"

# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
# key: library name
# value: 3' adapter sequence
lib2adapt = config["lib2adapt"]
# key: library name
# value: [length of 5' UMI, length of 3' UMI]
lib2UMI = config["lib2UMI"]
# key: library name
# value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
REPS = config["replicates"]
COND_PAIRS = config["cond_pairs"]
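# Purely illustrative sketch of the corresponding config entries
# (library names, adapter sequence, UMI lengths and paths are all made up):
# {
#     "lib_type": "ISR",
#     "lib2adapt": {"wt": "TGGAATTCTCGGGTGCCAAGG", "mut": "TGGAATTCTCGGGTGCCAAGG"},
#     "lib2UMI": {"wt": [4, 4], "mut": [4, 4]},
#     "lib2raw": {"wt": "/path/to/wt.fastq.gz", "mut": "/path/to/mut.fastq.gz"},
#     "replicates": ["1", "2"],
#     "cond_pairs": [["mut", "wt"]]
# }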
msg = "\n".join([
    "Some contrasts do not use known library names.",
    "Contrasts: " + ", ".join([f"({cond}, {ref})" for (cond, ref) in COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in COND_PAIRS]), msg
CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in COND_PAIRS]
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
CONDITIONS = [{
    "lib" : lib,
    "rep" : rep} for rep in REPS for lib in LIBS]
# We use this in various places so that the library order is always the same:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
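# COND_NAMES entries are of the form "<lib>_<rep>", e.g. "wt_1" for a
# (hypothetical) library "wt", replicate "1".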
COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
    cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")

COUNT_BIOTYPES = config["count_biotypes"]

RMSK_BIOTYPES = [
    "DNA_transposons_rmsk",
    "RNA_transposons_rmsk",
    "satellites_rmsk",
    "simple_repeats_rmsk"]
RMSK_FAMILIES_BIOTYPES = [
    "DNA_transposons_rmsk_families",
    "RNA_transposons_rmsk_families",
    "satellites_rmsk_families",
    "simple_repeats_rmsk_families"]

BIOTYPES_TO_JOIN = {
    "all_rmsk": [biotype for biotype in COUNT_BIOTYPES if biotype in RMSK_BIOTYPES],
    "all_rmsk_families": [biotype for biotype in COUNT_BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES],
    # We only count "protein_coding", not "protein_coding_{5UTR,CDS,3UTR}"
    "alltypes": [biotype for biotype in COUNT_BIOTYPES if not biotype.startswith("protein_coding_")]}
JOINED_BIOTYPES = list(BIOTYPES_TO_JOIN.keys())
DE_BIOTYPES = [biotype for biotype in LFC_RANGE.keys() if biotype in COUNT_BIOTYPES + JOINED_BIOTYPES]


#ANNOT_BIOTYPES = config["annot_biotypes"]
#METAGENE_BIOTYPES = ["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
#METAGENE_BIOTYPES = ["protein_coding"]
#METAGENE_BIOTYPES = ["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]
METAGENE_BIOTYPES = [biotype for biotype in ["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"] if biotype in COUNT_BIOTYPES]
# default id lists for MA plots
ID_LISTS = [
    "lfc_statuses",
    "germline_specific",
    "histone",
    "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014",
    "piRNA_dependent_prot_si_22G_down4_top200", "piRNA_dependent_prot_si_22G_down4",
    "csr1_prot_si_supertargets_common"]
ID_LISTS = config.get("maplot_gene_lists", ID_LISTS)
aligner = config["aligner"]
########################
# Genome configuration #
########################
genome_dict = config.get("genome_dict", None)
if genome_dict is not None:
    genome = genome_dict["name"]
    chrom_sizes = get_chrom_sizes(genome_dict["size"])
    genomelen = sum(chrom_sizes.values())
    genome_db = genome_dict["db"][aligner]
    # bed file binning the genome in 10nt bins
    genome_binned = genome_dict["binned"]
    annot_dir = genome_dict["annot_dir"]
    # TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
    convert_dir = genome_dict["convert_dir"]
    gene_lists_dir = genome_dict["gene_lists_dir"]
else:
    genome = "C_elegans"
    chrom_sizes = get_chrom_sizes(config["genome_size"])
    genomelen = sum(chrom_sizes.values())
    genome_db = config["index"]
    genome_binned = f"/pasteur/entites/Mhe/Genomes/{genome}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed"
    annot_dir = config["annot_dir"]
    convert_dir = config["convert_dir"]
    gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists"
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
#gene_lists_dir = config["gene_lists_dir"]
#local_annot_dir = config["local_annot_dir"]
#output_dir = config["output_dir"]
#workdir: config["output_dir"]
output_dir = os.path.abspath(".")
log_dir = OPJ("logs")
data_dir = OPJ("data")
local_annot_dir = OPJ("annotations")
# Used to skip some genotype x treatment x replicate number combinations
# when some of them were not sequenced
forbidden = {frozenset(wc_comb.items()) for wc_comb in config["missing"]}
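# Hypothetical illustration: with config["missing"] == [{"lib": "wt", "rep": "1"}]
# (made-up names), the wt replicate-1 combination would be excluded wherever
# filtered_product (defined below) is used in expand().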

SIZE_FACTORS = ["protein_coding", "miRNA", "median_ratio_to_pseudo_ref"]
assert set(SIZE_FACTORS).issubset(set(COUNT_BIOTYPES) | {"median_ratio_to_pseudo_ref"})
#NORM_TYPES = config["norm_types"]
NORM_TYPES = ["protein_coding", "median_ratio_to_pseudo_ref"]
assert set(NORM_TYPES).issubset(set(SIZE_FACTORS))

# For metagene analyses
#META_MARGIN = 300
META_MARGIN = 0
META_SCALE = 2000
#UNSCALED_INSIDE = 500
UNSCALED_INSIDE = 0
#META_MIN_LEN = 1000
META_MIN_LEN = 2 * UNSCALED_INSIDE
MIN_DIST = 2 * META_MARGIN


######
# Colors from https://personal.sron.nl/~pault/
# (via https://personal.sron.nl/~pault/python/distinct_colours.py)
hexcols = ['#332288', '#88CCEE', '#44AA99', '#117733', '#999933', '#DDCC77', 
           '#CC6677', '#882255', '#AA4499', '#661100', '#6699CC', '#AA4466',
           '#4477AA']

greysafecols = ['#809BC8', '#FF6666', '#FFCC66', '#64C204']

xarr = [[12],
        [12, 6],
        [12, 6, 5],
        [12, 6, 5, 3],
        [0, 1, 3, 5, 6],
        [0, 1, 3, 5, 6, 8],
        [0, 1, 2, 3, 5, 6, 8],
        [0, 1, 2, 3, 4, 5, 6, 8],
        [0, 1, 2, 3, 4, 5, 6, 7, 8],
        [0, 1, 2, 3, 4, 5, 9, 6, 7, 8],
        [0, 10, 1, 2, 3, 4, 5, 9, 6, 7, 8],
        [0, 10, 1, 2, 3, 4, 5, 9, 6, 11, 7, 8]]

def get_distinct(nr):
    """Return a list of *nr* (1 <= nr <= 12) distinct colours in HTML hex format."""
    assert 1 <= nr <= 12, "wrong nr of distinct colours!"
    # Look up the pre-defined index list for this number of colours
    # and map it onto the colour table.
    return [hexcols[idx] for idx in xarr[nr - 1]]
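
# For example, get_distinct(3) == ['#4477AA', '#CC6677', '#DDCC77'].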

######

# def filter_combinator(combinator, blacklist):
#     """This function builds a wildcards combination generator
#     based on the generator *combinator* and a set of combinations
#     to exclude *blacklist*."""
#     def filtered_combinator(*args, **kwargs):
#         """This function generates wildcards combinations.
#         It is to be used as second argument of *expand*."""
#         #print(*args)
#         for wc_comb in combinator(*args, **kwargs):
#             # Use frozenset instead of tuple
#             # in order to accomodate
#             # unpredictable wildcard order
#             if frozenset(wc_comb) not in blacklist:
#                 yield wc_comb
#     return filtered_combinator


filtered_product = filter_combinator(product, forbidden)

wildcard_constraints:
    lib="|".join(LIBS),
    rep=r"\d+",
    orientation="|".join(ORIENTATIONS),
    biotype="|".join(COUNT_BIOTYPES + JOINED_BIOTYPES)


# Define functions to be used in shell portions
shell.prefix(SHELL_FUNCTIONS)

if len(CONDITIONS) < 2:
    pca_figs = []
else:
    pca_figs = expand(OPJ(
        "{trimmer}", "figures", aligner, "{counter}",
        "{biotype}_{orientation}_PCA.pdf"),
        trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES,
        orientation=ORIENTATIONS)

rule all:
    """This top rule is used to drive the whole workflow by taking as input its final products."""
    input:
        expand(OPJ(
            "{trimmer}", "figures", aligner, "{counter}",
            "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"),
            trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS,
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES, id_list=ID_LISTS),
        expand(OPJ(
            "{trimmer}", "figures", aligner, "{counter}",
            "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"),
            trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS,
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES, fold_type=["log2FoldChange"]),
        expand(OPJ(
            "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"),
            trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS,
            orientation=ORIENTATIONS, biotype=DE_BIOTYPES),
        pca_figs,
        #expand(OPJ(
        #    "{trimmer}", "figures", aligner, "{counter}",
        #    "{biotype}_{orientation}_PCA.pdf"),
        #    trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES,
        #    orientation=ORIENTATIONS),
        #expand(OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), trimmer=TRIMMERS, counter=COUNTERS, orientation=["all"]),
        #expand(expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=["all"]),
        expand(expand(OPJ(
            "{{trimmer}}", "figures", aligner, "{lib}_{rep}",
            "adapt_on_C_elegans_last_bases.pdf"), filtered_product, lib=LIBS, rep=REPS),
            trimmer=TRIMMERS),
        expand(expand(OPJ(
            "{{trimmer}}", aligner, "mapped_C_elegans",
            "{lib}_{rep}_on_C_elegans_by_{{norm_type}}_{{orientation}}.bw"), filtered_product, lib=LIBS, rep=REPS),
            trimmer=TRIMMERS, norm_type=NORM_TYPES, orientation=["all"]),
        expand(OPJ(
            "{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"),
            trimmer=TRIMMERS, counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS),
        #expand(OPJ("{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"]),
        #expand(OPJ("{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES),
        # TODO: Add metagene profiles similar to small RNA-seq
        expand(OPJ(
            "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean",
            "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)),
            trimmer=TRIMMERS, lib=LIBS, norm_type=NORM_TYPES, orientation=["all", "fwd", "rev"],
            biotype=METAGENE_BIOTYPES),


include: ensure_relative(irules["link_raw_data"], workflow.basedir)
#include: "../snakemake_wrappers/includes/link_raw_data.rules"


rule trim_and_dedup:
    """The adapter is trimmed, then reads are treated in two groups depending
    on whether the adapter was found or not. For each group, the reads are
    sorted, deduplicated, and the random k-mers (k=4) that helped identify
    PCR duplicates are removed at both ends."""
    input:
        rules.link_raw_data.output,
    output:
        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt_deduped.fastq.gz"),
        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt_deduped.fastq.gz"),
        nb_raw =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
        nb_adapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
        nb_adapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt_deduped.txt"),
        nb_noadapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
        nb_noadapt_deduped =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt_deduped.txt"),
    params:
        adapter = lambda wildcards : lib2adapt[wildcards.lib],
        process_type = "PRO-seq",
        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
    threads: 8 # Actually, to avoid too much IO
    message:
        "Trimming adaptor from raw data using {wildcards.trimmer}, deduplicating reads, and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
    benchmark:
        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
    log:
        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
    run:
        shell_commands = """
THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
    {params.adapter} {params.trim5} {params.trim3} \\
    {output.adapt} {output.noadapt} {log.trim} \\
    {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
    {output.nb_noadapt} {output.nb_noadapt_deduped} 1> {log.log} 2> {log.err}
"""
        shell(shell_commands)


rule trim_only:
    """The adapter is trimmed, then reads are treated in two groups depending
    on whether the adapter was found or not. For each group, the extra k-mers
    are removed at both ends."""
    input:
        rules.link_raw_data.output,
    output:
        noadapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_noadapt.fastq.gz"),
        adapt = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_adapt.fastq.gz"),
        nb_raw =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_raw.txt"),
        nb_adapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_adapt.txt"),
        nb_noadapt =  OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_nb_noadapt.txt"),
    params:
        adapter = lambda wildcards : lib2adapt[wildcards.lib],
        process_type = "PRO-seq",
        trim5 = lambda wildcards : lib2UMI[wildcards.lib][0],
        trim3 = lambda wildcards : lib2UMI[wildcards.lib][1],
    threads: 8 # Actually, to avoid too much IO
    message:
        "Trimming adaptor from raw data using {wildcards.trimmer} and removing 5' and 3' random n-mers for {wildcards.lib}_{wildcards.rep}."
    benchmark:
        OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim_benchmark.txt")
    log:
        trim = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_trim.log"),
        log = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
    run:
        shell_commands = """
THREADS="{threads}" {params.process_type}_trim_only.sh {wildcards.trimmer} {input} \\
    {params.adapter} {params.trim5} {params.trim3} \\
    {output.adapt} {output.noadapt} {log.trim} \\
    {output.nb_raw} {output.nb_adapt} {output.nb_noadapt} \\
    1> {log.log} 2> {log.err}
"""
        shell(shell_commands)


def source_fastq(wildcards):
    """
    Determine the correct pre-processed fastq file depending on the pipeline
    configuration and the current wildcards.
    """
    if config.get("deduplicate", False):
        return OPJ(
            data_dir, f"trimmed_{wildcards.trimmer}",
            f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}_deduped.fastq.gz"),
    else:
        return OPJ(
            data_dir, f"trimmed_{wildcards.trimmer}",
            f"{wildcards.lib}_{wildcards.rep}_{wildcards.type}.fastq.gz"),


# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
rule map_on_genome:
    input:
        fastq = source_fastq,
    output:
        # sam files take a lot of space
        sam = temp(OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")),
        nomap_fastq = OPJ("{trimmer}", aligner, "not_mapped_C_elegans", "{lib}_{rep}_{type}_unmapped_on_C_elegans.fastq.gz"),
    params:
        aligner = aligner,
        index = genome_db,
        settings = "",
    message:
        "Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.type} on %s genome." % genome
    log:
        log = OPJ(log_dir, "{trimmer}", "map_{type}_on_genome", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "map_{type}_on_genome", "{lib}_{rep}.err"),
    threads:
        8
#    shell:
#        """
#        genome_dir="${{HOME}}/Genomes"
#        genome="C_elegans"
#        bowtie2_genome_db="${{genome_dir}}/${{genome}}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/genome"
#        cmd="bowtie2 --seed 123 -t --mm -x ${{bowtie2_genome_db}} -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
#        echo ${{cmd}} 1> {log.log} 2> {log.err}
#        eval ${{cmd}} 1>> {log.log} 2>> {log.err}
#        """
    wrapper:
        f"file://{wrappers_dir}/map_on_genome"


rule sam2indexedbam:
    input:
        sam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam"),
    output:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans_sorted.bam"),
        index = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans_sorted.bam.bai"),
    message:
        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.type}."
    log:
        log = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{type}.log"),
        err = OPJ(log_dir, "{trimmer}", "sam2indexedbam", "{lib}_{rep}_{type}.err"),
    threads:
        8
    resources:
        mem_mb=4100
    wrapper:
        f"file://{wrappers_dir}/sam2indexedbam"


rule fuse_bams:
    """This rule fuses the two sorted bam files corresponding to the mapping
    of the reads containing the adaptor or not."""
    input:
        noadapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_noadapt_on_C_elegans_sorted.bam"),
        adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam"),
    output:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"),
        bai = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"),
    message:
        "Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}"
    log:
        log = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "fuse_bams", "{lib}_{rep}.err"),
    shell:
        """
        samtools merge -c {output.sorted_bam} {input.noadapt_sorted_bam} {input.adapt_sorted_bam} 1> {log.log} 2> {log.err}
        indexed=""
        while [ ! ${{indexed}} ]
        do
            samtools index {output.sorted_bam} && indexed="OK"
            if [ ! ${{indexed}} ]
            then
                rm -f {output.bai}
                echo "Indexing failed. Retrying" 1>&2
            fi
        done 1>> {log.log} 2>> {log.err}
        """


rule compute_coverage:
    input:
        sorted_bam = rules.fuse_bams.output.sorted_bam,
    output:
        coverage = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_coverage.txt"),
    params:
        genomelen = genomelen,
    shell:
        """
        bases=$(samtools depth {input.sorted_bam} | awk '{{sum += $3}} END {{print sum}}') || error_exit "samtools depth failed"
        python3 -c "print(${{bases}} / {params.genomelen})" > {output.coverage}
        """


rule check_last_base:
    input:
        adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam"),
        adapt_index = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam.bai"),
    output:
        OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
    message:
        "Computing last base proportions for {wildcards.lib}_{wildcards.rep} (mapped reads from which the adaptor had been removed)."
    log:
        log = OPJ(log_dir, "{trimmer}", "check_last_base", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "check_last_base", "{lib}_{rep}.err"),
    run:
        base_counts = defaultdict(Counter)
        with pysam.AlignmentFile(input.adapt_sorted_bam) as samfile:
            for ali in samfile.fetch():
                seq = ali.seq
                # To avoid errors when last base was erroneous:
                #seq = ali.get_reference_sequence()
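                # For reverse-strand alignments, SEQ is stored reverse-complemented,
                # so the read's last (3') base is the complement of SEQ's first base.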
                if ali.is_reverse:
                    base_counts[len(seq)][COMPL[seq[0].upper()]] += 1
                else:
                    base_counts[len(seq)][seq[-1].upper()] += 1
        with open(output[0], "w") as output_file:
            print("#length\tnb_reads\tA\tC\tG\tT\tN", file=output_file)
            for length, counter in sorted(base_counts.items()):
                nb_reads_this_length = sum(counter.values())
                print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file)


# TODO: use Python to make the plot
# This may remove dependency on R docopt
rule plot_last_base:
    input:
        OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
    output:
        OPJ("{trimmer}", "figures", aligner, "{lib}_{rep}", "adapt_on_C_elegans_last_bases.pdf")
    params:
        title = lambda wildcards : "\"last base frequencies for %s_%s_%s\"" % (wildcards.trimmer, wildcards.lib, wildcards.rep)
    message:
        "Plotting last base proportions for {wildcards.lib}_{wildcards.rep} (mapped reads from which the adaptor had been removed)."
    log:
        OPJ(log_dir, "{trimmer}", "plot_last_base", "{lib}_{rep}.log")
    shell:
        """
        plot_last_base.R -i {input} -o {output} -t {params.title}
        """


def htseq_orientation2stranded(wildcards):
    orientation = wildcards.orientation
    if orientation == "fwd":
        if LIB_TYPE[-2:] == "SF":
            return "yes"
        elif LIB_TYPE[-2:] == "SR":
            return "reverse"
        else:
            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
    elif orientation == "rev":
        if LIB_TYPE[-2:] == "SF":
            return "reverse"
        elif LIB_TYPE[-2:] == "SR":
            return "yes"
        else:
            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
    elif orientation == "all":
        return "no"
    else:
        raise ValueError("Orientation must be one of \"fwd\", \"rev\" or \"all\".")


def biotype2annot(wildcards):
    #return "/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/%s.gtf" % wildcards.biotype
    if wildcards.biotype.endswith("_rmsk_families"):
        biotype = wildcards.biotype[:-9]
    else:
        biotype = wildcards.biotype
    return OPJ(annot_dir, f"{biotype}.gtf")


rule htseq_count_reads:
    input:
        sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"),
        bai = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"),
    output:
        counts = OPJ("{trimmer}", aligner, "mapped_C_elegans", "htseq_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ("{trimmer}", aligner, "mapped_C_elegans", "htseq_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts_gene_names.txt"),
    params:
        stranded = htseq_orientation2stranded,
        mode = "union",
        annot = biotype2annot,
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with htseq-count."
    benchmark:
        OPJ(log_dir, "{trimmer}", "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
    log:
        log = OPJ(log_dir, "{trimmer}", "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
        err = OPJ(log_dir, "{trimmer}", "htseq_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err")
    wrapper:
        f"file://{wrappers_dir}/htseq_count_reads"


def parse_htseq_counts(counts_filename):
    with open(counts_filename) as counts_file:
        for line in counts_file:
            (gene, count) = line.strip().split()
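            # htseq-count appends special counters ("__no_feature", "__ambiguous", ...)
            # after the per-gene counts; stop reading once they are reached.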
            if gene.startswith("__"):
                return
            yield (gene, int(count))


rule feature_count_reads:
    input:
        sorted_bam = OPJ(
            "{trimmer}", aligner, "mapped_C_elegans",
            "{lib}_{rep}_on_C_elegans_sorted.bam"),
        bai = OPJ(
            "{trimmer}", aligner, "mapped_C_elegans",
            "{lib}_{rep}_on_C_elegans_sorted.bam.bai"),
        # TODO: Why does the following fail?
        #sorted_bam = rules.fuse_bams.output.sorted_bam,
        #index = rules.fuse_bams.output.index,
    output:
        counts = OPJ(
            "{trimmer}", aligner, "mapped_C_elegans", "feature_count",
            "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
        counts_converted = OPJ(
            "{trimmer}", aligner, "mapped_C_elegans", "feature_count",
            "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts_gene_names.txt"),
    params:
        stranded = feature_orientation2stranded(LIB_TYPE),
        annot = biotype2annot,
        # pickled dictionary that associates gene ids to gene names
        converter = genome_dict["converter"]
    message:
        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
    log:
        log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.log"),
        err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.err")
    shell:
        """
        tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
        cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
        featureCounts -v 2> {log.log}
        echo ${{cmd}} 1>> {log.log}
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        cat {output.counts} | wormid2name > {output.counts_converted}
        # cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
        """


def parse_feature_counts(counts_filename):
    with open(counts_filename) as counts_file:
        for line in counts_file:
            # skip comments
            if line[0] == "#":
                continue
            fields = line.strip().split()
            # skip header
            if fields[:6] == ["Geneid", "Chr", "Start", "End", "Strand", "Length"]:
                continue
            gene, count = fields[0], int(fields[6])
            yield (gene, count)


def same_gene_order(od1, od2):
    """Returns True if the keys of ordered dictionaries *od1* and *od2* are the same, in the same order."""
    if len(od1) != len(od2):
        return False
    return all(starmap(eq, zip(od1.keys(), od2.keys())))


def plot_scatterplot(outfile, data, data_groups, group2colour):
    #axis = plt.gca()
    for (i, (group, colour)) in enumerate(group2colour.items()):
        plt.scatter(
            data[data_groups==i, 0],
            data[data_groups==i, 1],
            color=colour, lw=2, label=texscape(group))
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.legend(loc="best", shadow=False, scatterpoints=1)
    plt.tight_layout()
    plt.savefig(outfile, format=outfile.name.split(".")[-1])
    #plt.savefig(outfile, format="pdf")
    plt.cla()


rule do_PCA:
    input:
        expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS),
    output:
        #OPJ(aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.pdf"),
        #OPJ(aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.png"),
        OPJ("{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.pdf"),
    message:
        "Summarizing counts for {wildcards.biotype}_{wildcards.orientation}"
    threads: 12  # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]"
    log:
        OPJ(log_dir, "{trimmer}", "{counter}", "do_PCA_{biotype}_{orientation}.log")
    run:
        if wildcards.counter == "htseq_count":
            counts_parser = parse_htseq_counts
        elif wildcards.counter == "feature_count":
            counts_parser = parse_feature_counts
        else:
            raise NotImplementedError("%s is not yet handled." % wildcards.counter)
        # We need the order to be fixed for the zip to be meaningful
        counts = OrderedDict([])
        nb_libs = len(LIBS)
        nb_reps = len(REPS)
        counts_array = np.empty((nb_libs * nb_reps,), dtype=object)
        #lib_categories = np.empty([len(LIBS) * len(REPS)], dtype=np.uint32)
        lib_categories = np.fromiter(chain(*[[i] * nb_reps for i in range(nb_libs)]), dtype=np.uint32)
        for i, (lib, rep) in enumerate(product(LIBS, REPS)):
            counts_filename = OPJ(
                wildcards.trimmer, aligner, "mapped_C_elegans", wildcards.counter,
                "%s_%s_on_C_elegans" % (lib, rep), "%s_%s_counts.txt" % (wildcards.biotype, wildcards.orientation))
            #print("Reading", counts_filename)
            counts[(lib, rep)] = OrderedDict(counts_parser(counts_filename))
            counts_array[i] = np.fromiter(counts[(lib, rep)].values(), np.uint32)
            #lib_categories[i] = i // len(REPS)
        # zipping the values with a shifted version and pairwise comparing the gene orders
        assert all(starmap(
            same_gene_order,
            zip(list(counts.values()),
                list(counts.values())[1:]))), "All counts file should have the same genes in the same order."
        #counts_array = np.array([library_array for library_array in counts_array])
        # Faster (http://stackoverflow.com/a/40402682/1878788)
        counts_array = np.concatenate(counts_array).reshape(len(counts_array), -1)
        libs_pca = PCA(n_components=2)
        libs_fitting = libs_pca.fit(counts_array)
        libs_transformed = libs_fitting.transform(counts_array)
        genes_pca = PCA(n_components=2)
        genes_fitting = genes_pca.fit(counts_array.T)
        genes_transformed = genes_fitting.transform(counts_array.T)
        # http://stackoverflow.com/questions/40425036/how-to-extract-the-extreme-two-colours-from-a-matplotlib-colormap
        #TODO: make this configurable
        colormap = "BrBG"
        cmap = plt.cm.get_cmap(colormap)
        # Extract rgb coordinates
        left_rgb = cmap(0)[:-1]
        right_rgb = cmap(cmap.N)[:-1]
        # Convert to husl and take the hue (first element)
        left_hue = husl.rgb_to_husl(*left_rgb)[0]
        right_hue = husl.rgb_to_husl(*right_rgb)[0]
        # Create a dark divergent palette
        palette = sns.diverging_palette(left_hue, right_hue, n=(2 * (nb_libs // 2)) + 1, center="dark")
        lib2colour = OrderedDict(zip(LIBS, [*palette[nb_libs // 2:(nb_libs+1) // 2], *palette[:nb_libs // 2], *palette[1 + nb_libs // 2:]]))
        with open(output[0], "wb") as outfile:
            plot_scatterplot(outfile, libs_transformed, lib_categories, lib2colour)

#def plot_cluster(counts_dir, biotype, orientation, counts_files):
#    libname_finder = re.compile("%s/(.+)_on_C_elegans_%s_%s_counts.txt" % (counts_dir, biotype, orientation))
#    libnames = [libname_finder.match(fname).groups()[0] for fname in counts_files]
#    d = pd.read_csv(filename, sep="\t", header=None, index_col=0)


rule summarize_counts:
    """For a given library, write a summary of the read counts for the various biotypes."""
    input:
        biotype_counts_files = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{{lib}}_{{rep}}_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES),
    output:
        summary = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "summaries", "{lib}_{rep}_on_C_elegans_{orientation}_counts.txt")
    run:
        if wildcards.counter == "htseq_count":
            sum_counter = sum_htseq_counts
        elif wildcards.counter == "feature_count":
            sum_counter = sum_feature_counts
        else:
            raise NotImplementedError(f"{wildcards.counter} not handled (yet?)")
        with open(output.summary, "w") as summary_file:
            header = "\t".join(COUNT_BIOTYPES)
            #summary_file.write("#biotypes\t%s\n" % header)
            summary_file.write("%s\n" % header)
            sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
            #summary_file.write("%s_%s_%s\t%s\n" % (wildcards.lib, wildcards.rep, wildcards.orientation, sums))
            summary_file.write("%s\n" % sums)


rule gather_read_counts_summaries:
    input:
        summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "summaries", "{lib}_{rep}_on_C_elegans_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS),
    output:
        summary_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "summaries", "all_on_C_elegans_{orientation}_counts.txt"),
    run:
        summary_files = (OPJ(
            wildcards.trimmer,
            aligner,
            "mapped_C_elegans",
            wildcards.counter,
            "summaries",
            "{name}_on_C_elegans_{orientation}_counts.txt".format(
                name=cond_name,
                orientation=wildcards.orientation)) for cond_name in COND_NAMES)
        summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)
        summaries.columns = COND_NAMES
        summaries.to_csv(output.summary_table, sep="\t")


rule gather_counts:
    """For a given biotype, gather counts from all libraries in one table."""
    input:
        counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS),
    output:
        counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
    wildcard_constraints:
        # Avoid ambiguity with join_all_counts
        biotype = "|".join(COUNT_BIOTYPES)
    run:
        # Gathering the counts data
        ############################
        counts_files = (OPJ(
            wildcards.trimmer,
            aligner,
            "mapped_C_elegans",
            wildcards.counter,
            "{name}_on_C_elegans".format(name=cond_name),
            "{biotype}_{orientation}_counts.txt".format(biotype=wildcards.biotype, orientation=wildcards.orientation)) for cond_name in COND_NAMES)
        if wildcards.counter == "htseq_count":
            counts_data = pd.concat(
                map(read_htseq_counts, counts_files),
                axis=1).fillna(0).astype(int)
        elif wildcards.counter == "intersect_count":
            counts_data = pd.concat(
                map(read_intersect_counts, counts_files),
                axis=1).fillna(0).astype(int)
        elif wildcards.counter == "feature_count":
            counts_data = pd.concat(
                map(read_feature_counts, counts_files),
                axis=1).fillna(0).astype(int)
        else:
            raise NotImplementedError(f"{wildcards.counter} not handled (yet?)")
        counts_data.columns = COND_NAMES
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
        # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
        # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
        if wildcards.biotype.endswith("_rmsk_families"):
            counts_data = sum_by_family(counts_data)
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")


@wc_applied
def source_counts_to_join(wildcards):
    """
    Determines which elementary biotype counts files should be joined to make the desired "joined" biotype.
    """
    return expand(
        OPJ("{{trimmer}}", aligner, "mapped_C_elegans",
            "{{counter}}", "all_on_C_elegans",
            "{biotype}_{{orientation}}_counts.txt"),
        biotype=BIOTYPES_TO_JOIN[wildcards.biotype])


rule join_all_counts:
    """concat counts for all biotypes into all"""
    input:
        counts_tables = source_counts_to_join,
        #counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
        # counts_tables = expand(
        #     OPJ("{{trimmer}}", aligner, "mapped_C_elegans",
        #         "{{counter}}", "all_on_C_elegans",
        #         "{biotype}_{{orientation}}_counts.txt"),
        #     # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}"
        #     biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]),
    output:
        counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
    wildcard_constraints:
        biotype = "|".join(JOINED_BIOTYPES)
    run:
        counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables))
        assert len(counts_data.index.unique()) == len(counts_data.index), "Some genes appear several times in the counts table."
        counts_data.index.names = ["gene"]
        counts_data.to_csv(output.counts_table, sep="\t")


@wc_applied
def source_counts(wildcards):
    """Determines from which rule the gathered small counts should be sourced."""
    if wildcards.biotype in JOINED_BIOTYPES:
        return rules.join_all_counts.output.counts_table
    else:
        # "Directly" from the counts gathered across libraries
        return rules.gather_counts.output.counts_table


rule compute_RPK:
    """For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
    input:
        counts_data = source_counts,
        #counts_data = rules.gather_counts.output.counts_table,
        #counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}",
        #    "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"),
    output:
        rpk_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "all_on_C_elegans", "{biotype}_{orientation}_RPK.txt"),
    params:
        feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
    # run:
    #     counts_data = pd.read_table(input.counts_data, index_col="gene")
    #     feature_lengths = pd.read_table(params.feature_lengths_file, index_col="gene")
    #     common = counts_data.index.intersection(feature_lengths.index)
    #     rpk = 1000 * counts_data.loc[common].div(feature_lengths.loc[common]["union_exon_len"], axis="index")
    #     rpk.to_csv(output.rpk_file, sep="\t")
    wrapper:
        f"file://{wrappers_dir}/compute_RPK"


rule compute_sum_million_RPK:
    input:
        rpk_file = rules.compute_RPK.output.rpk_file,
    output:
        sum_rpk_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "all_on_C_elegans", "{biotype}_{orientation}_sum_million_RPK.txt"),
    run:
        sum_rpk = pd.read_table(
            input.rpk_file,
            index_col=0).sum()
        (sum_rpk / 1000000).to_csv(output.sum_rpk_file, sep="\t")


rule compute_TPM:
    """For a given biotype, compute the corresponding TPM value (transcripts per million)."""
    input:
        rpk_file = rules.compute_RPK.output.rpk_file
    output:
        tpm_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}",
            "all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"),
    # The sum must be done over all counted features
    wildcard_constraints:
        biotype = "|".join(["alltypes"])
    # run:
    #     rpk = pd.read_table(input.rpk_file, index_col="gene")
    #     tpm = 1000000 * rpk / rpk.sum()
    #     tpm.to_csv(output.tpm_file, sep="\t")
    wrapper:
        f"file://{wrappers_dir}/compute_TPM"


@wc_applied
def source_quantif(wildcards):
    """Determines from which rule the gathered counts should be sourced."""
    if wildcards.quantif_type == "counts":
        return source_counts(wildcards)
        #return rules.gather_counts.output.counts_table
    elif wildcards.quantif_type == "RPK":
        return rules.compute_RPK.output.rpk_file
    elif wildcards.quantif_type == "TPM":
        return rules.compute_TPM.output.tpm_file
    else:
        raise NotImplementedError("%s is not yet handled." % wildcards.quantif_type)


rule compute_median_ratio_to_pseudo_ref_size_factors:
    input:
        counts_table = source_counts,
        #counts_table = rules.gather_counts.output.counts_table,
    output:
        median_ratios_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"),
    run:
        counts_data = pd.read_table(
            input.counts_table,
            index_col=0,
            na_filter=False)
        # http://stackoverflow.com/a/21320592/1878788
        #median_ratios = pd.DataFrame(median_ratio_to_pseudo_ref_size_factors(counts_data)).T
        #median_ratios.index.names = ["median_ratio_to_pseudo_ref"]
        # Easier to grep when not transposed, actually:
        median_ratios = median_ratio_to_pseudo_ref_size_factors(counts_data)
        median_ratios.to_csv(output.median_ratios_file, sep="\t")
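

# Illustrative sketch only (not used by the workflow): the DESeq2-style
# median-of-ratios computation that median_ratio_to_pseudo_ref_size_factors
# (imported from libhts, implementation not shown here) is assumed to perform
# on a genes x libraries counts table.
def _median_ratio_size_factors_sketch(counts_data):
    """Return one size factor per library (column of *counts_data*)."""
    # Pseudo-reference: per-gene geometric mean across libraries (in log space).
    log_pseudo_ref = np.log(counts_data).mean(axis=1)
    # Only genes detected in every library have a finite log pseudo-reference.
    finite = np.isfinite(log_pseudo_ref)
    # Size factor: median across genes of the ratios library / pseudo-reference.
    return np.exp(
        np.log(counts_data[finite]).sub(log_pseudo_ref[finite], axis=0).median(axis=0))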


# Note that bamCoverage inverses strands:
# https://github.com/fidelram/deepTools/issues/494
def bamcoverage_filter(wildcards):
    if wildcards.orientation == "fwd":
        return "--filterRNAstrand reverse"
    elif wildcards.orientation == "rev":
        return "--filterRNAstrand forward"
    else:
        return ""


def source_normalizer(wildcards):
    if wildcards.norm_type == "median_ratio_to_pseudo_ref":
        return OPJ(
            f"{wildcards.trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans",
            "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
    elif wildcards.norm_type in COUNT_BIOTYPES:
        return OPJ(
            f"{wildcards.trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans",
            f"{wildcards.norm_type}_fwd_sum_million_RPK.txt"),
    else:
        raise NotImplementedError(f"{wildcards.norm_type} normalization not implemented yet.")


# Warning: The normalization is done based on a particular count using the first counter
rule make_normalized_bigwig:
    input:
        bam = rules.fuse_bams.output.sorted_bam,
        # TODO: use sourcing function based on norm_type
        size_factor_file = source_normalizer,
        #size_factor_file = rules.compute_coverage.output.coverage
        #median_ratios_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
        # TODO: compute this
        #scale_factor_file = OPJ(aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"),
    output:
        bigwig_norm = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{norm_type}_{orientation}.bw"),
        #bigwig = OPJ(aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_{orientation}.bw"),
    #params:
    #    orient_filter = bamcoverage_filter,
    threads: 4  # to limit memory usage, actually
    benchmark:
        OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}_benchmark.txt")
    params:
        genome_binned = genome_binned,
    log:
        log = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.log"),
        err = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.err"),
    shell:
        """
        bam2bigwig.sh {input.bam} {params.genome_binned} \\
            {wildcards.lib}_{wildcards.rep} {wildcards.orientation} %s \\
            {input.size_factor_file} {output.bigwig_norm} \\
            > {log.log} 2> {log.err} \\
            || error_exit "bam2bigwig.sh failed"
        """ % LIB_TYPE[-1]
        #"""
        #scale=$(python -c "print 1.0 / ${{size}}")
        #bamCoverage -b {input.bam} {params.orient_filter} \\
        #    -of=bigwig -bs 10 -p=1 \\
        #    --scaleFactor ${{scale}} -o {output.bigwig_norm} \\
        #    1>> {log.make_bigwig_log} 2>> {log.make_bigwig_err} \\
        #    || error_exit "bamCoverage failed"
        #bamCoverage -b {input.bam} --skipNAs {params.orient_filter} \\
        #    -of=bigwig -bs 10 -p=1 \\
        #    -o {output.bigwig} \\
        #    1>> {log.make_bigwig_log} 2>> {log.make_bigwig_err} \\
        #    || error_exit "bamCoverage failed"
        #"""


rule make_bigwig:
    input:
        bam = rules.fuse_bams.output.sorted_bam,
        # TODO: use sourcing function based on norm_type
        #size_factor_file = rules.compute_coverage.output.coverage
        median_ratios_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
        # TODO: compute this
        #scale_factor_file = OPJ(aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"),
    output:
        bigwig_norm = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{norm_type}_{orientation}_bamCoverage.bw"),
        #bigwig = OPJ(aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_{orientation}.bw"),
    params:
        orient_filter = bamcoverage_filter,
    threads: 12  # to limit memory usage, actually
    benchmark:
        OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}_benchmark.txt")
    log:
        log = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.log"),
        err = OPJ(log_dir, "{trimmer}", "make_bigwig", "{lib}_{rep}_by_{norm_type}_{orientation}.err"),
    run:
        scale = 1 / float(pd.read_table(
            input.median_ratios_file, index_col=0, header=None).loc[
                f"{wildcards.lib}_{wildcards.rep}"])
        assert scale > 0
        # TODO: make this a function of deeptools version
        no_reads = """Error: The generated bedGraphFile was empty. Please adjust
your deepTools settings and check your input files.
"""
#        no_reads = """[bwClose] There was an error while finishing writing a bigWig file! The output is likely truncated.
#"""
        try:
            shell("""
                cmd="bamCoverage -b {input.bam} {params.orient_filter} \\
                    -of=bigwig -bs 10 -p={threads} \\
                    --scaleFactor %f -o {output.bigwig_norm} \\
                    1>> {log.log} 2>> {log.err}"
                > {log.err}
                echo ${{cmd}} > {log.log}
                eval ${{cmd}} || error_exit "bamCoverage failed"
            """ % scale)
        except CalledProcessError as e:
            if last_lines(log.err, 2) == no_reads:
                with open(output.bigwig_norm, "w") as bwfile:
                    bwfile.write("")
            else:
                raise
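

# A minimal sketch (illustrative only, never called by the workflow) of how
# the --scaleFactor passed to bamCoverage in make_bigwig above is derived.
# It assumes the median-ratios file is a headerless two-column table
# (library name, size factor), which is how the rule reads it; the function
# name is hypothetical.
def _bigwig_scale_factor_sketch(median_ratios_file, lib, rep):
    """Return the factor by which to scale coverage so that it is divided by
    the library's median-ratio size factor."""
    size_factors = pd.read_table(median_ratios_file, index_col=0, header=None)
    size_factor = float(size_factors.loc[f"{lib}_{rep}"])
    assert size_factor > 0, "size factors are expected to be strictly positive"
    return 1 / size_factor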


def source_bigwigs_for_merge(wildcards):
    """Return the paths of the per-replicate normalized bigwig files to be
    averaged for a given library, skipping forbidden (lib, rep) combinations."""
    return [
        OPJ(wildcards.trimmer, aligner, "mapped_C_elegans",
            f"{wildcards.lib}_{rep}_on_C_elegans_by_{wildcards.norm_type}_{wildcards.orientation}.bw")
        for rep in REPS
        if frozenset((wildcards.lib, rep)) not in forbidden]
    #return expand(OPJ(aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_norm_{orientation}.bw"), lib=[wildcards.lib], rep=[rep for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden], orientation=[wildcards.orientation])


rule merge_bigwig_reps:
    """This rule merges bigwig files by computing a mean across replicates."""
    input:
        source_bigwigs_for_merge,
        #expand(OPJ(aligner, "mapped_C_elegans", "{{lib}}_{rep}_on_C_elegans_norm_{{orientation}}.bw"), rep=[rep for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden]),
    output:
        bw = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.bw"),
    log:
        warnings = OPJ(log_dir, "{trimmer}", "merge_bigwig_reps", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.warnings"),
    threads: 2  # to limit memory usage, actually
    run:
        with warn_context(log.warnings) as warn:
            try:
                bws = [pyBigWig.open(bw_filename) for bw_filename in input]
                #for bw_filename in input:
                #    bws.append(pyBigWig.open(bw_filename))
            except RuntimeError as e:
                warn(str(e))
                warn("Generating empty file.\n")
                # Make the file empty
                open(output.bw, "w").close()
            else:
                bw_out = pyBigWig.open(output.bw, "w")
                bw_out.addHeader(list(chrom_sizes.items()))
                for (chrom, chrom_len) in chrom_sizes.items():
                    try:
                        assert all([bw.chroms()[chrom] == chrom_len for bw in bws])
                    except KeyError as e:
                        warn(str(e))
                        warn(f"Chromosome {chrom} might be missing from one of the input files.\n")
                        for filename, bw in zip(input, bws):
                            msg = " ".join([f"{filename}:", *list(bw.chroms().keys())])
                            warn(f"{msg}\n")
                        #raise
                        warn(f"The bigwig files without {chrom} will be skipped.\n")
                    to_use = [bw for bw in bws if chrom in bw.chroms()]
                    if to_use:
                        means = np.nanmean(np.vstack([bw.values(chrom, 0, chrom_len) for bw in to_use]), axis=0)
                    else:
                        means = np.zeros(chrom_len)
                    # The input bigwigs are written with 10 bp bins (bamCoverage
                    # "-bs 10"), so taking every 10th per-base value
                    # reconstructs the binned track.
                    bw_out.addEntries(chrom, 0, values=np.nan_to_num(means[0::10]), span=10, step=10)
                bw_out.close()
                for bw in bws:
                    bw.close()


from rpy2.robjects import Formula, StrVector
#from rpy2.rinterface import RRuntimeError
rule differential_expression:
    input:
        counts_table = source_counts,
        summary_table = rules.gather_read_counts_summaries.output.summary_table,
    output:
        deseq_results = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "deseq2.txt"),
        up_genes = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "up_genes.txt"),
        down_genes = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "down_genes.txt"),
        counts_and_res = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"),
    threads: 4  # to limit memory usage, actually
    run:
        counts_data = pd.read_table(input.counts_table, index_col="gene")
        summaries = pd.read_table(input.summary_table, index_col=0)
        # Running DESeq2
        #################
        (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
        if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS):
            warnings.warn(
                "Reference data is all zero.\nSkipping %s_%s_%s" % (
                    wildcards.contrast, wildcards.orientation, wildcards.biotype))
            for outfile in output:
                shell(f"echo 'NA' > {outfile}")
        else:
            try:
                try:
                    contrast = StrVector(["lib", cond, ref])
                    formula = Formula("~ lib")
                    res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast)
                #except RRuntimeError as e:
                except RuntimeError as e:
                    warnings.warn(
                        "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % (
                            str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype))
                    for outfile in output:
                        shell(f"echo 'NA' > {outfile}")
                else:
                    # Determining fold-change category
                    ###################################
                    set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange")
                    #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
                    res = res.assign(status=res.apply(set_de_status, axis=1))
                    # Converting gene IDs
                    ######################
                    with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
                        res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
                    with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
                        res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
                    # Just to see if column_converter works also with named column, and not just index:
                    # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file:
                    #     res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1))
                    ##########################################
                    # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
                    res.to_csv(output.deseq_results, sep="\t", na_rep="NA")
                    # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status
                    counts_and_res = counts_data
                    for normalizer in SIZE_FACTORS:
                        if normalizer == "median_ratio_to_pseudo_ref":
                            ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
                            ## add pseudo-count to compute the geometric mean, then remove it
                            #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
                            #def median_ratio_to_pseudo_ref(col):
                            #    return (col / pseudo_ref).median()
                            #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
                            size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
                        else:
                            #raise NotImplementedError(f"{normalizer} normalization not implemented")
                            size_factors = summaries.loc[normalizer]
                        by_norm = counts_data / size_factors
                        by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer))
                        counts_and_res = pd.concat((counts_and_res, by_norm), axis=1)
                    #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type")
                    counts_and_res = pd.concat((counts_and_res, res), axis=1)
                    counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA")
                    # Saving lists of up-regulated and down-regulated genes
                    up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index)
                    down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index)
                    with open(output.up_genes, "w") as up_file:
                        if up_genes:
                            up_file.write("%s\n" % "\n".join(up_genes))
                        else:
                            up_file.truncate(0)
                    with open(output.down_genes, "w") as down_file:
                        if down_genes:
                            down_file.write("%s\n" % "\n".join(down_genes))
                        else:
                            down_file.truncate(0)
            # Does not seem to be caught...
            except KeyError as err:
                err_msg = str(err)
                warnings.warn("XXXXXXXXXXXXXXXXX Got KeyError XXXXXXXXXXXXXXXXX")
                if err_msg[:17] == "Trying to release":
                    warnings.warn(err_msg)
                    warnings.warn(f"Skipping {wildcards.contrast}_{wildcards.orientation}_{wildcards.biotype}\n")
                    for outfile in output:
                        shell(f"echo 'NA' > {outfile}")
                else:
                    raise
            except:
                warnings.warn("XXXXXXXXXXXXXXXXX Got another exception XXXXXXXXXXXXXXXXX")
                raise
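

# A minimal sketch (illustrative only, never called by the workflow) of the
# median-ratio-to-pseudo-reference size factors used above, following the
# commented-out adaptation of the DESeq approach
# (doi:10.1186/gb-2010-11-10-r106) inside the rule; the pipeline actually uses
# median_ratio_to_pseudo_ref_size_factors from libhts.
def _median_ratio_size_factors_sketch(counts_data):
    """Return one size factor per library (one column of *counts_data*)."""
    # Pseudo-reference: gene-wise geometric mean across libraries, computed
    # with a pseudo-count so that genes with zeroes do not zero it out.
    pseudo_ref = np.exp(np.log(counts_data + 1).mean(axis=1)) - 1
    # Each library's size factor is the median of its gene-wise ratios
    # to the pseudo-reference.
    return counts_data.divide(pseudo_ref, axis=0).median(axis=0)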


rule make_lfc_distrib_plot:
    input:
        deseq_results = rules.differential_expression.output.deseq_results,
    output:
        lfc_plot = OPJ("{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"),
    run:
        if test_na_file(input.deseq_results):
            warnings.warn(
                "No DESeq2 results for %s_%s_%s. Making dummy output." % (
                    wildcards.contrast, wildcards.orientation, wildcards.biotype))
            for outfile in output:
                shell(f"echo 'NA' > {outfile}")
        else:
            res = pd.read_table(input.deseq_results, index_col=0)
            save_plot(
                output.lfc_plot, plot_lfc_distribution,
                res, wildcards.contrast, wildcards.fold_type,
                title="log fold-change distribution for %s, %s_%s" % (
                    wildcards.contrast, wildcards.orientation, wildcards.biotype))


def set_lfc_range(wildcards):
    return LFC_RANGE[wildcards.biotype]


# get_id_list takes wildcards (whose id_list is either a gene list name or a
# path to a gene list file) and returns the corresponding list of WormBase IDs.
get_id_list = make_id_list_getter(gene_lists_dir, avail_id_lists)
rule make_MA_plot:
    input:
        deseq_results = rules.differential_expression.output.deseq_results,
    output:
        MA_plot = OPJ("{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"),
    params:
        lfc_range = set_lfc_range,
        id_list = get_id_list,
    run:
        if test_na_file(input.deseq_results):
            warnings.warn(
                "No DESeq2 results for %s_%s_%s. Making dummy output." % (
                    wildcards.contrast, wildcards.orientation, wildcards.biotype))
            for outfile in output:
                shell(f"echo 'NA' > {outfile}")
        else:
            res = pd.read_table(input.deseq_results, index_col=0)
            if params.id_list is None:
                grouping = "status"
                group2colour = None
            else:
                grouping = params.id_list
                group2colour = (wildcards.id_list, sns.xkcd_rgb["orange"])
            title = f"MA-plot for {wildcards.contrast}, {wildcards.orientation}_{wildcards.biotype}"
            if mpl.rcParams.get("text.usetex", False):
                title = texscape(title)
            save_plot(
                output.MA_plot, plot_MA, res,
                grouping=grouping,
                group2colour=group2colour,
                lfc_range=params.lfc_range,
                fold_type="log2FoldChange",
                title=title)


##################
# Metagene plots #
##################

# rule gather_annotations:
#     input:
#         expand(OPJ(annot_dir, "{biotype}.gtf"), biotype=ANNOT_BIOTYPES),
#     output:
#         merged_gtf = OPJ(local_annot_dir, "all_annotations.gtf"),
#         merged_gtf_gz = OPJ(local_annot_dir, "all_annotations.gtf.gz"),
#         index = OPJ(local_annot_dir, "all_annotations.gtf.gz.tbi"),
#         #merged_bed = OPJ(local_annot_dir, "all_annotations.bed"),
#     message:
#         "Gathering annotations for {}".format(", ".join(ANNOT_BIOTYPES))
#     shell:
#         """
#         sort -k1,1 -k4,4n -m {input} | tee {output.merged_gtf} | bgzip > {output.merged_gtf_gz}
#         tabix -p gff {output.merged_gtf_gz}
#         #ensembl_gtf2bed.py {output.merged_gtf} > {output.merged_bed}
#         """

# For metagene analyses:
#- Extract transcripts
#- Keep only genes with an identified TSS; among the isoforms of a given gene, take the 3' end closest to the TSS
#- Avoid overlap between the UTR of one gene and the UTR or TSS of another gene (all biotypes except piRNA and antisense), based on the 3' ends furthest from the TSS of their gene
# (a sketch of the isoform-fusing step follows below)

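# A minimal sketch (illustrative only, never called by the workflow) of the
# isoform-fusing step described above, assuming a pandas DataFrame with the
# columns written by the extract_transcripts rule below (chrom, start, end,
# gene_id, biotype, strand); the function name and column names are
# illustrative.
def _fuse_isoforms_sketch(transcripts):
    """For each gene, report its TSS, the 3' end closest to the TSS
    ("tight_end") and the furthest-extending 3' end ("wide_end")."""
    records = []
    for (gene_id, strand), group in transcripts.groupby(["gene_id", "strand"]):
        if strand == "+":
            # The TSS is the smallest start; the 3' ends are the transcript ends.
            tss = group["start"].min()
            tight_end, wide_end = group["end"].min(), group["end"].max()
        else:
            # On the minus strand, the TSS is the largest end coordinate
            # and the 3' ends are the transcript starts.
            tss = group["end"].max()
            tight_end, wide_end = group["start"].max(), group["start"].min()
        records.append(
            (gene_id, group["chrom"].iloc[0], tss, tight_end, wide_end, strand))
    return pd.DataFrame(
        records,
        columns=["gene_id", "chrom", "TSS", "tight_end", "wide_end", "strand"])
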
rule extract_transcripts:
    input:
        # We want to get back to the original "protein_coding" annotations,
        # not separated by UTR or CDS
        #gtf = rules.gather_annotations.output.merged_gtf,
        #gtf = OPJ(annot_dir, "genes.gtf"),
        #DNA_transposon_gtf = OPJ(annot_dir, "DNA_transposons_rmsk.gtf"),
        #RNA_transposon_gtf = OPJ(annot_dir, "RNA_transposons_rmsk.gtf"),
        OPJ(annot_dir, "genes.gtf"),
        OPJ(annot_dir, "DNA_transposons_rmsk.gtf"),
        OPJ(annot_dir, "RNA_transposons_rmsk.gtf"),
    output:
        bed = OPJ(local_annot_dir, "transcripts_all.bed"),
    run:
        with finput(files=input) as gtf_source, open(output.bed, "w") as bed_dest:
            for (chrom, _, feature_type, gtf_start, end,
                 score, strand, _, annot_field) in map(strip_split, gtf_source):
                # Only keep "transcript" records.
                if feature_type != "transcript":
                    continue
                # Parse the GTF attribute field, e.g.
                # 'gene_id "WBGene00000001"; transcript_id "Y74C9A.3.1"; gene_biotype "protein_coding";'
                # into a {key: value} dictionary.
                annots = dict([(k, v.rstrip(";").strip('"')) for (k, v) in [
                    f.split(" ") for f in annot_field[:-1].split("; ")]])
                gene_biotype = annots["gene_biotype"]
                # piRNA should not be highly expressed
                #if gene_biotype in {"miRNA", "piRNA", "antisense"}:
                if gene_biotype not in {"protein_coding", "pseudogene"}:
                    continue
                transcript_id = annots["transcript_id"]
                gene_id = annots["gene_id"]
                print(chrom, int(gtf_start) - 1, end,
                      gene_id, gene_biotype, strand,
                      sep="\t", file=bed_dest)
    # shell:
    #     """
    #     extract_transcripts_from_gtf.py \\
    #         -g Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf \\
    #         -o {params.annot_dir} \\
    #         -i "piRNA" "antisense" \\
    #         || error_exit "extract_transcripts failed"
    #     """

#- Extract transcripts
#- Keep only genes with an identified TSS; among the isoforms of a given gene, take the 3' end closest to the TSS
#- Avoid overlap between the UTR of one gene and the UTR or TSS of another gene (all biotypes except piRNA and antisense), based on the 3' ends furthest from the TSS of their gene


class Gene(object):
    """This is used to fuse coordinates of transcripts deriving from a same gene.
    *wide_end* corresponds to the furthest-extending 3' end. It will be used to
    determine possible "interferences" (risks of small RNA confusion) between genes."""
    __slots__ = ("gene_id", "chrom", "start", "tight_end", "wide_end", "biotype", "strand")