small_RNA-seq.snakefile 225 KB
Newer Older
1
2
"""
Snakefile to analyse small_RNA-seq data.
Blaise Li's avatar
Blaise Li committed
3
4

TODO: Some figures and summaries may be overridden when changing the mapper. The mapper name should be added to their path.
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

There are 2 types of counts. One resulting from small RNA annotations using a custom script, and one resulting from remapping and counting using featureCount.

The first one can contain "chimeric" annotations (possibly resulting from overlapping annotations?):
```
bli@naples:/extra2/Documents/Mhe/bli/small_RNA-seq_analyses$ head results_HRDE1_IP/bowtie2/mapped_C_elegans/annotation/hrde1_flag_HRDE1_IP_1_18-26_on_C_elegans/all_siu_counts.txt
gene	count
DNA?|DNA?|NDNAX1_CE:81	1
DNA?|DNA?|NDNAX2_CE:110_and_WBGene00005903	1
DNA?|DNA?|NDNAX2_CE:45	2
DNA?|DNA?|NDNAX3_CE:31	1
DNA|CMC-Chapaev|Chapaev-1_CE:45	1
DNA|CMC-Chapaev|Chapaev-1_CE:58_and_WBGene00005792	1
DNA|CMC-Chapaev|Chapaev-2_CE:40_and_WBGene00021886	1
DNA|CMC-Chapaev|Chapaev-2_CE:41_and_WBGene00021886	3
DNA|CMC-Chapaev|Chapaev-2_CE:42_and_WBGene00021886	5

Maybe it would be better to not use these counts.
```

25
"""
Blaise Li's avatar
Blaise Li committed
26
27
28
29
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")
30

31
32
33
34
# TODO: Try to find what the proportion of small reads map at unique position within repetitive elements.
# What is the distribution of unique reads among repetitive elements of a given family: is it evenly spread, or biased?
# Is it different if we use all small RNAs instead of just unique ones?
# For bowtie2, allow MAPQ >= 23 to get "not too ambiguously mapped reads"
35

36
37
# TODO: implement RPM folds from feature_counts results (and also for Ribo-seq pipeline)

38
39
40
41
42
43
44
45
46
47
48
# TODO: scatterplots log(IP_RPM) vs log(input_RPM) (or mutant vs WT), colouring with gene list
# TODO: heatmaps with rows (genes) ranked by expression in WT (mean reference level (wild types or inputs)), columns representing fold in mut vs WT
# Possibly remove the low expressed ones
# TODO: MA plot: fold vs "mean" count (to be defined)?

# TODO: extract most abundant piRNA reads (WT_48hph)
#bioawk -c fastx '{print $seq}' results_prg1_HS/bowtie2/mapped_C_elegans/reads/WT_RT_1_18-26_on_C_elegans/piRNA.fastq.gz | sort | uniq -c | sort -n | mawk '$1 >= 25 {print}' | wc -l
# -> 6857
# map them on the genome (without the piRNA loci), or on pseudo-genomes with specific features (histone genes...), tolerating a certain amount of mismatches (using bowtie1)
# Or: scan the genome (or "pseudo-genome") to find where there are matches with 0 to 3 mismatches at specific positions (13-21) (and perfect matches in the 5' zone)

Blaise Li's avatar
Blaise Li committed
49
50
51
52
53
54
# TODO: meta-profiles with IP and input on the same meta profile for Ortiz_oogenic and Ortiz_spermatogenic
# Make external script to generate meta-profiles on a custom list of libraries, and a given list of genes.

# TODO: boxplots with contrasts as series and a selection of gene lists
# Ex: 3 time points and oogenic and spermatogenic -> 6 boxes (custom program needed)
# Or simpler: one figure per gene list -> done for a full group of contrasts in make_gene_list_lfc_boxplots
55
# TODO: boxplots for all genes, for different small RNA categories (same as above but with all genes)
Blaise Li's avatar
Blaise Li committed
56

Blaise Li's avatar
Blaise Li committed
57
# Total number of "non-structural" (mapped - (fwd-(t,sn,sno,r-RNA))) to compute RPKM
58
59
# Quick-and-dirty: use gene span (TES - TSS) -> done for repeats
# Better: use length of union of exons for each gene -> done for genes
60
# Ideal: use transcriptome-based per-isoform computation
Blaise Li's avatar
Blaise Li committed
61
# Actually, we don't want a normalization by length. We deal with small RNAs, not transcripts
62
# TODO: make a bigwig of IP/input: is it possible?
Blaise Li's avatar
Blaise Li committed
63
64
# For RNA-seq?: use (oocyte, spermatogenic) intersected with "CSR-1-loaded" (=targets for different time point), csr1ADH_vs_WT_all_alltypes_up_genes_ids.txt (and intersected with non-spermatogenic) for meta-profiles

65
66
67

# Possibly filter out on RPKM
# Then, compute folds of RP(K)M IP/input (for a given experiment, i.-e. REP)
68
# and give list of genes sorted by such folds -> see /Gene_lists/csr1_prot_si_supertargets*
69
70
71
72
73
74
75
76

# Exploratory:
# Heatmap (fold)
# genes | experiment

# Then either:
# - take the genes (above log-fold threshold) common across replicates
# - look at irreproducible discovery rate (http://dx.doi.org/doi:10.1214%2F11-AOAS466)
77
# -> define CSR-1-loaded
78
79
80

# For metagene: see --metagene option of computeMatrix
# Retrieve gtf info after filtering out interfering genes based on merged bed
81

82
83
# TODO
# Locate potential targets of piRNA (using perfect complementarity), in a given annot_biotypes category for a given set of piRNA (the whole list, or for instance, up or down-regulated ones...).
84
85
# To do this: map all perfect matches, make a bed file, intersect with the annotation file for the given biotype, with antisense (note that this will automatically filter out self matches) -> done but only for all piRNAs
# Plot the count of endo-siRNA around those sites (from 100 upstream up to 100 downstream, using meta-TSS-like analysis) -> done
86
87
88
89


import os
OPJ = os.path.join
90
from glob import glob
Blaise Li's avatar
Blaise Li committed
91
from re import sub
92
93
from pickle import load
from fileinput import input as finput
94
from sys import stderr
95
from subprocess import Popen, PIPE, CalledProcessError
96
97
98
# Garbage-collect (to be used before Popen)
# see comment on https://stackoverflow.com/a/13329386/1878788
from gc import collect
99
100

# Useful for functional style
101
from itertools import chain, combinations, product, repeat, starmap
102
from functools import partial, reduce
103
from operator import or_ as union
104
from cytoolz import concat, flip, merge_with, take_nth, valmap
Blaise Li's avatar
Blaise Li committed
105

106

Blaise Li's avatar
Blaise Li committed
107
108
def concat_lists(lists):
    return list(concat(lists))
109

110

111
112
113
def dont_merge(*values):
    """This function can be passed to *merge_with* so that
    merge fails when dicts share keys."""
114
115
116
117
118
119
    try:
        ((val,),) = values
    except ValueError:
        raise ValueError("The dictionaries are not supposed to share keys.")
    return val

120

121
122
123
124
# Useful data structures
from collections import OrderedDict as od
from collections import defaultdict, Counter

125

126
127
128
129
130
131
132
133
134
135
136
import warnings


def formatwarning(message, category, filename, lineno, line):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning


137
from snakemake.io import apply_wildcards
138
139
# from gatb import Bank
from mappy import fastx_read
140
# To parse SAM format
Blaise Li's avatar
Blaise Li committed
141
import pyBigWig
142

Blaise Li's avatar
Blaise Li committed
143
144
# To compute correlation coefficient
from scipy.stats.stats import pearsonr
145
146
# To catch errors when plotting KDE
from scipy.linalg import LinAlgError
147
# For data processing and displaying
148
from sklearn import preprocessing
149
from sklearn.decomposition import PCA
150
151
import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
152
153
# https://github.com/mwaskom/seaborn/issues/1262
#mpl.use("agg")
154
155
156
157
158
mpl.use("PDF")
#mpl.rcParams["figure.figsize"] = 2, 4
mpl.rcParams["font.sans-serif"] = [
    "Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams["font.family"] = "sans-serif"
Blaise Li's avatar
Blaise Li committed
159
#mpl.rcParams["figure.figsize"] = [16, 30]
160

161
162
163
from matplotlib import cm
from matplotlib.colors import Normalize

164
from matplotlib import numpy as np
165
from math import ceil, floor
Blaise Li's avatar
Blaise Li committed
166
from matplotlib.backends.backend_pdf import PdfPages
167
168
import pandas as pd
import matplotlib.pyplot as plt
169
170
import seaborn as sns
# import seaborn.apionly as sns
171
# import husl
172
# predefined seaborn graphical parameters
Blaise Li's avatar
Blaise Li committed
173
174
sns.set_context("talk")

175
176
from libsmallrna import (
    PI_MIN, PI_MAX, SI_MIN, SI_MAX,
177
178
179
    SI_SUFFIXES, SI_PREFIXES,
    SMALL_TYPES_TREE,
    type2RNA,
180
181
182
183
    types_under,
    SMALL_TYPES,
    SI_TYPES,
    SIU_TYPES,
184
185
186
    SISIU_TYPES,
    RMSK_SISIU_TYPES,
    JOINED_SMALL_TYPES)
Blaise Li's avatar
Blaise Li committed
187
188
# Do this outside the workflow
#from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths
189
from libdeseq import do_deseq2
190
from libhts import aligner2min_mapq
191
from libhts import status_setter, make_empty_bigwig
192
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
193
from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_distribution, plot_boxplots, plot_histo
194
from libworkflows import texscape, wc_applied, ensure_relative, cleanup_and_backup
195
196
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import read_int_from_file, strip_split, file_len, last_lines, save_plot, SHELL_FUNCTIONS
197
from libworkflows import filter_combinator, read_feature_counts, sum_by_family, sum_feature_counts, sum_htseq_counts, warn_context
198
from smincludes import rules as irules
Blaise Li's avatar
Blaise Li committed
199
from smwrappers import wrappers_dir
200
201
strip = str.strip

202
alignment_settings = {"bowtie2": "-L 6 -i S,1,0.8 -N 0"}
203

204
# Positions in small RNA sequences for which to analyse nucleotide distribution
205
206
#POSITIONS = ["first", "last"]
POSITIONS = config["positions"]
207
208
# Orientations of small RNAs with respect to an annotated feature orientation.
# "fwd" and "rev" restrict feature quantification to sense or antisense reads.
209
ORIENTATIONS = config["orientations"]
210
211
212
213
214
# Codes for types of small RNAs
# Codes ending with "u" are for uridinylated endo siRNAs
# (recognized as not having mapped without first trimming a poly-T tail.
# "prot", "te" and "pseu" prefixes indicate the type of mRNA template that is inferred to have been used for endo siRNA synthesis by RdRP
# See small_RNA_seq_annotate.py and libsmallrna.pyx for more details
215
216


217
# small RNA types on which to run DESeq2
218
DE_TYPES = config["de_types"]
219
assert set(DE_TYPES) <= set(SMALL_TYPES + JOINED_SMALL_TYPES), "%s\n%s" % (", ".join(DE_TYPES), ", ".join(set(SMALL_TYPES + JOINED_SMALL_TYPES)))
Blaise Li's avatar
Blaise Li committed
220
# small RNA types on which to compute IP/input RPM folds
221
222
223
# prot_si does not include polyU siRNA, so the counts here may be lower than in pisimi
# siu includes SIU_TYPES, but not SI_TYPES, so the counts here may be lower than in pisimi
# pisimi includes all (SI_TYPES, SIU_TYPES, pi and mi (pi and mi will not be on the same genes as the siRNAs))
224
225
#IP_TYPES = ["pisimi", "siu", "prot_si"]
# TODO: update cross_HTS with pimi22G
226
# TODO: what kind of pimi22G ? -> pi, mi and si_22G, but not siu_22G
227
228
IP_TYPES = [f"pimi{SI_MIN}G", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G",
            f"siu_{SI_MAX}G", f"prot_si_{SI_MAX}G"]
Blaise Li's avatar
Blaise Li committed
229
230
assert set(IP_TYPES) <= set(
    SMALL_TYPES + [f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G"] + JOINED_SMALL_TYPES), ", ".join(IP_TYPES)
231
#IP_TYPES = config["ip_types"]
232
233
234
235
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
Blaise Li's avatar
Blaise Li committed
236
#STANDARDS = ["zscore", "robust", "minmax", "unit"]
237
#STANDARDS = ["zscore", "robust", "minmax"]
Blaise Li's avatar
Blaise Li committed
238
239
# hexbin jointplot for principal components crashes on MemoryError for PCA without standardization
#STANDARDS = ["robust", "identity"]
240
STANDARDS = ["robust"]
241
242
243

COMPL = {"A" : "T", "C" : "G", "G" : "C", "T" : "A", "N" : "N"}

244
# Possible feature ID conversions
245
ID_TYPES = ["name", "cosmid"]
246
247
248
249
250
251
252
253
254
255
256
257
258

#########################
# Reading configuration #
#########################
# key: library name
# value: 3' adapter sequence
lib2adapt = config["lib2adapt"]
trim5 = config["trim5"]
trim3 = config["trim3"]
# key: library name
# value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
259
260
261
262
263
#REF=config["WT"]
#MUT=config["mutant"]
# Used to associate colours to libraries
# key: series name
# value: list of libraries
264
colour_series_dict = config["colour_series"]
Blaise Li's avatar
Blaise Li committed
265
genotype_series = colour_series_dict.get("genotype_series", [])
266
267
268
269
270
271
# Groups of libraries to put together on a same metagene profile
lib_groups = config["lib_groups"]
GROUP_TYPES = list(lib_groups.keys())
merged_groups = merge_with(concat_lists, *lib_groups.values())
ALL_LIB_GROUPS = list(merged_groups.keys())
#all_libs_in_group = concat_lists(merge_with(concat_lists, *lib_groups.values()).values())
272
REPS = config["replicates"]
273
#TREATS = config["treatments"]
274
# Conditions to compare using DESeq2
275
276
277
278
279
# Note: we don't want contrasts of the following type:
# - ({geno}, {geno}) where various treatments are taken into account
# - ({treat}, {treat}) where various genotypes are taken into account
# We also don't want contrasts where more than one factor vary (ex: prg1_RT vs WT_HS30)
#ALL_COND_PAIRS = [(MUT, REF)] + list(combinations(reversed(TREATS), 2)) + [("%s_%s" % (lib2, treat2), "%s_%s" % (lib1, treat1)) for ((lib1, treat1), (lib2, treat2)) in combinations(product(LIBS, TREATS), 2)]
280
281
282
283
284
285
#COND_PAIRS = [(lib2, lib1) for (lib1, lib2) in [
#    ["%s_%s" % (geno, treat) for (geno, treat) in product(genos, treats)] for (genos, treats) in chain(
#        product(combinations(LIBS, 1), combinations(TREATS, 2)),
#        product(combinations(LIBS, 2), combinations(TREATS, 1)))]]
#CONTRASTS = ["%s_vs_%s" % cond_pair for cond_pair in COND_PAIRS]
#CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
286
DE_COND_PAIRS = config["de_cond_pairs"]
287
288
289
290
291
msg = "\n".join([
    "Some contrats do not use known library names.",
    "Contrasts:"
    ", ".join([f"({cond}, {ref})" for (cond, ref) in DE_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in DE_COND_PAIRS]), msg
292
IP_COND_PAIRS = config["ip_cond_pairs"]
293
294
295
296
297
msg = "\n".join([
    "Some contrats do not use known library names.",
    "Contrasts:"
    ", ".join([f"({cond}, {ref})" for (cond, ref) in IP_COND_PAIRS])])
assert all([cond in LIBS and ref in LIBS for (cond, ref) in IP_COND_PAIRS]), ""
298
COND_PAIRS = DE_COND_PAIRS + IP_COND_PAIRS
299
DE_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in DE_COND_PAIRS]
300
IP_CONTRASTS = [f"{cond1}_vs_{cond2}" for [cond1, cond2] in IP_COND_PAIRS]
Blaise Li's avatar
Blaise Li committed
301
contrasts_dict = {"de" : DE_CONTRASTS, "ip" : IP_CONTRASTS}
302
CONTRASTS = DE_CONTRASTS + IP_CONTRASTS
303
CONTRAST2PAIR = dict(zip(CONTRASTS, COND_PAIRS))
304
305
MIN_LEN = config["min_len"]
MAX_LEN = config["max_len"]
Blaise Li's avatar
Blaise Li committed
306
size_selected = "%s-%s" % (MIN_LEN, MAX_LEN)
307
308
309
310
311
312
313
314
315
316
317
# bli@naples:/pasteur/entites/Mhe/Genomes$ cat C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/miRNA.bed | mawk '{hist[$3-$2]++} END {for (l in hist) print l"\t"hist[l]}' | sort -n
# 17	1
# 18	2
# 19	4
# 20	18
# 21	57
# 22	172
# 23	138
# 24	45
# 25	11
# 26	3
318
# This should ideally come from genome configuration:
319
320
321
322
323
324
MI_MAX = 26
read_type_max_len = {
    size_selected: int(MAX_LEN),
    "pi": min(PI_MAX, int(MAX_LEN)),
    "si": min(SI_MAX, int(MAX_LEN)),
    "mi": min(MI_MAX, int(MAX_LEN))}
325
READ_TYPES_FOR_COMPOSITION = [
326
    size_selected, "nomap_siRNA", "all_siRNA",
Blaise Li's avatar
Blaise Li committed
327
    f"all_si_{SI_MIN}GRNA", f"all_si_{SI_MAX}GRNA",
328
    *type2RNA(SMALL_TYPES)]
329
READ_TYPES_FOR_MAPPING = [
330
331
332
333
    *READ_TYPES_FOR_COMPOSITION,
    "all_siuRNA",
    f"si_{SI_MIN}GRNA", f"si_{SI_MAX}GRNA",
    f"siu_{SI_MIN}GRNA", f"siu_{SI_MAX}GRNA"]
334
335
# Types of annotation features, as defined in the "gene_biotype"
# GTF attribute sections of the annotation files.
336
COUNT_BIOTYPES = config["count_biotypes"]
337
BOXPLOT_BIOTYPES = config.get("boxplot_biotypes", [])
338
ANNOT_BIOTYPES = config["annot_biotypes"]
Blaise Li's avatar
Blaise Li committed
339
assert "protein_coding_3UTR" not in set(ANNOT_BIOTYPES), "It seems having 3' or 5' UTR annotations makes 22G RNA disappear from those regions. Only use the more general protein_coding_UTR in annot_biotypes"
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359

RMSK_BIOTYPES = [
    "DNA_transposons_rmsk",
    "RNA_transposons_rmsk",
    "satellites_rmsk",
    "simple_repeats_rmsk"]
RMSK_FAMILIES_BIOTYPES = [
    "DNA_transposons_rmsk_families",
    "RNA_transposons_rmsk_families",
    "satellites_rmsk_families",
    "simple_repeats_rmsk_families"]

BIOTYPES_TO_JOIN = {
    "all_rmsk": [biotype for biotype in COUNT_BIOTYPES + BOXPLOT_BIOTYPES if biotype in RMSK_BIOTYPES],
    "all_rmsk_families": [biotype for biotype in COUNT_BIOTYPES + BOXPLOT_BIOTYPES if biotype in RMSK_FAMILIES_BIOTYPES],
    # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}"
    "alltypes": [biotype for biotype in COUNT_BIOTYPES + BOXPLOT_BIOTYPES if not biotype.startswith("protein_coding_")]}
# Filter out those with empty values
JOINED_BIOTYPES = [joined_biotype for (joined_biotype, biotypes) in BIOTYPES_TO_JOIN.items() if biotypes]

360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
# cf Gu et al. (2009), supplement:
# -----
# To compare 22G-RNAs derived from a gene, transposon, or pseudogene between two
# samples, each sample was normalized using the total number of reads less structural RNAs, i.e.
# sense small RNA reads likely derived from degraded ncRNAs, tRNAs, snoRNAs, rRNAs,
# snRNAs, and scRNAs. Degradation products of structural RNAs map to the sense strand, with a
# poorly defined size profile and 1 st nucleotide distribution. At least 25 22G-RNA reads per million,
# nonstructural reads in one of the two samples was arbitrarily chosen as a cutoff for comparison
# analyses. A change of 2-fold or more between samples was chosen as an enrichment threshold.
# Because some 21U-RNAs or miRNAs overlap with protein coding genes, reads derived from
# miRNA loci within a window of ± 4 nt and all the known 21U-RNAs were filtered out prior to
# comparison analysis.
# -----
# And Germano Cecere, about scRNA:
# -----
# Its an old nomenclature and in anycase there is only one of this annotated scRNAs
# (small cytoplasmic RNA genes).
# https://www.ncbi.nlm.nih.gov/books/NBK19701/table/genestructure_table2/?report=objectonly
# Don't even pay attention to this
# -----
STRUCTURAL_BIOTYPES = ["tRNA", "snRNA", "snoRNA", "rRNA", "ncRNA"]
Blaise Li's avatar
Blaise Li committed
381
GENE_LISTS = config["gene_lists"]
382
383
384
385
386
387
388
BOXPLOT_GENE_LISTS = config["boxplot_gene_lists"]
#BOXPLOT_GENE_LISTS = [
#    "all_genes",
#    "replication_dependent_octamer_histone",
#    "piRNA_dependent_prot_si_down4",
#    "csr1_prot_si_supertargets_48hph",
#    "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"]
389
390
391
392
393
394
395
aligner = config["aligner"]
########################
# Genome configuration #
########################
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
396
chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {})))
397
398
399
400
401
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# bed file binning the genome in 10nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
402
exon_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
403
404
405
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"]
gene_lists_dir = genome_dict["gene_lists_dir"]
406
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
407
index = genome_db
408

409
410
411
412
413
414
#output_dir = config["output_dir"]
#workdir: config["output_dir"]
output_dir = os.path.abspath(".")
local_annot_dir = config.get("local_annot_dir", OPJ("annotations"))
log_dir = config.get("log_dir", OPJ("logs"))
data_dir = config.get("data_dir", OPJ("data"))
415
# To put the results of small_RNA_seq_annotate
416
mapping_dir = OPJ(aligner, f"mapped_{genome}")
417
418
reads_dir = OPJ(mapping_dir, "reads")
annot_counts_dir = OPJ(mapping_dir, "annotation")
419
# The order after pi and mi must match: this is used in make_read_counts_summary
Blaise Li's avatar
Blaise Li committed
420
421
422
423
424
425
426
427
ANNOT_COUNTS_TYPES = [
    "pi", "mi", "all_si",
    f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G",
    *SI_TYPES]
ANNOT_COUNTS_TYPES_U = [
    "all_siu",
    f"all_siu_{SI_MIN}G", f"all_siu_{SI_MAX}G",
    *SIU_TYPES]
428
feature_counts_dir = OPJ(mapping_dir, "feature_count")
429
READ_TYPES_FOR_COUNTING = [f"{size_selected}_and_nomap_siRNA", f"prot_si_{SI_MIN}GRNA_and_prot_siu_{SI_MIN}GRNA", *type2RNA([f"prot_si_{SI_MIN}G", f"prot_si_{SI_MAX}G"])]
430
431
# Reads remapped and counted using featureCounts
REMAPPED_COUNTED = [
432
433
    f"{small_type}_{mapping_type}_{biotype}_{orientation}_transcript" for (
        small_type, mapping_type, biotype, orientation) in product(
434
435
436
437
438
            READ_TYPES_FOR_COUNTING,
            [f"on_{genome}", f"unique_on_{genome}"],
            #[f"on_{genome}"],
            COUNT_BIOTYPES + JOINED_BIOTYPES,
            ORIENTATIONS)]
439
440
# Used to skip some genotype x treatment x replicate number combinations
# when some of them were not sequenced
441
forbidden = {frozenset(wc_comb.items()) for wc_comb in config["missing"]}
Blaise Li's avatar
Blaise Li committed
442
443
CONDITIONS = [{
    "lib" : lib,
444
    "rep" : rep} for rep in REPS for lib in LIBS]
Blaise Li's avatar
Blaise Li committed
445
446
447
448
449
450
# We use this for various things in order to have always the same library order:
COND_NAMES = ["_".join((
    cond["lib"],
    cond["rep"])) for cond in CONDITIONS]
COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
    cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
Blaise Li's avatar
Blaise Li committed
451
452
#SIZE_FACTORS = ["raw", "deduped", size_selected, "mapped", "siRNA", "miRNA"]
#SIZE_FACTORS = [size_selected, "mapped", "miRNA"]
453
454
# TESTED_SIZE_FACTORS = ["mapped", "non_structural", "siRNA", "miRNA", "median_ratio_to_pseudo_ref"]
TESTED_SIZE_FACTORS = ["mapped", "non_structural", "all_sisiuRNA", "miRNA", "median_ratio_to_pseudo_ref"]
Blaise Li's avatar
Blaise Li committed
455
#SIZE_FACTORS = ["mapped", "miRNA", "median_ratio_to_pseudo_ref"]
456
457
458
459
# "median_ratio_to_pseudo_ref" is a size factor adapted from
# the method described in the DESeq paper, but with addition
# and then substraction of a pseudocount, in order to deal with zero counts.
# This seems to perform well (see "test_size_factor" results).
Blaise Li's avatar
Blaise Li committed
460
DE_SIZE_FACTORS = ["non_structural", "median_ratio_to_pseudo_ref"]
461
462
SIZE_FACTORS = ["non_structural"]
#NORMALIZER = "median_ratio_to_pseudo_ref"
463

Blaise Li's avatar
Blaise Li committed
464
# For metagene analyses
465
466
#META_MARGIN = 300
META_MARGIN = 0
467
META_SCALE = 500
468
469
#UNSCALED_INSIDE = 500
UNSCALED_INSIDE = 0
Blaise Li's avatar
Blaise Li committed
470
471
472
#META_MIN_LEN = 1000
META_MIN_LEN = 2 * UNSCALED_INSIDE
MIN_DIST = 2 * META_MARGIN
473
474
READ_TYPES_FOR_METAPROFILES = [
    size_selected,
Blaise Li's avatar
Blaise Li committed
475
    f"all_si_{SI_MIN}GRNA", f"all_si_{SI_MAX}GRNA",
476
477
478
    f"si_{SI_MIN}GRNA", f"si_{SI_MAX}GRNA",
    f"siu_{SI_MIN}GRNA", f"siu_{SI_MAX}GRNA",
    "miRNA", "piRNA", "all_siRNA"]
479

480
481
482
483
484
485
486
487
488
489
490
491
492
# split = str.split
#
#
# def strip_split(text):
#     return split(strip(text), "\t")
#
#
# # http://stackoverflow.com/a/845069/1878788
# def file_len(fname):
#     p = Popen(
#         ['wc', '-l', fname],
#         stdout=PIPE,
#         stderr=PIPE)
493
#     (result, err) = p.communicate()
494
495
496
#     if p.returncode != 0:
#         raise IOError(err)
#     return int(result.strip().split()[0])
497
498


499
500
501
502
503
504
505
506
def add_dataframes(df1, df2):
    return df1.add(df2, fill_value=0)


def sum_dataframes(dfs):
    return reduce(add_dataframes, dfs)


507
def sum_counts(fname):
508
    collect()
509
    p = Popen(
510
511
512
        # ['awk', '$1 ~ /^piRNA$|^miRNA$|^pseudogene$|^satellites_rmsk$|^simple_repeats_rmsk$|^protein_coding_|^.NA_transposons_rmsk$/ {sum += $2} END {print sum}', fname],
        # slightly faster
        ['mawk', '$1 ~ /^piRNA$|^miRNA$|^pseudogene$|^satellites_rmsk$|^simple_repeats_rmsk$|^protein_coding_|^.NA_transposons_rmsk$/ {sum += $2} END {print sum}', fname],
513
514
        stdout=PIPE,
        stderr=PIPE)
515
    (result, err) = p.communicate()
516
517
    if p.returncode != 0:
        raise IOError(err)
518
519
520
521
522
    try:
        return int(result.strip().split()[0])
    except IndexError:
        warnings.warn(f"No counts in {fname}\n")
        return 0
523
524

def sum_te_counts(fname):
525
    collect()
526
527
528
529
    p = Popen(
        ['awk', '$1 !~ /WBGene/ {sum += $2} END {print sum}', fname],
        stdout=PIPE,
        stderr=PIPE)
530
    (result, err) = p.communicate()
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])


import matplotlib.patheffects


class Scale(matplotlib.patheffects.RendererBase):
    def __init__(self, sx, sy=None):
        self._sx = sx
        self._sy = sy

    def draw_path(self, renderer, gc, tpath, affine, rgbFace):
        affine = affine.identity().scale(self._sx, self._sy) + affine
        renderer.draw_path(gc, tpath, affine, rgbFace)


549
550
551
552
553
554
555
556
557
558
559
560
561
562
def print_wc(wildcards):
    print(wildcards)
    return []


def check_wc(wildcards):
    #if hasattr(wildcards, "read_type"):
    #    if wildcards.read_type.endswith("_on_C"):
    #        print(wildcards)
    if any(attr.endswith("_on_C") for attr in vars(wildcards).values() if type(attr) == "str"):
        print(wildcards)
    return []


563
564
565
filtered_product = filter_combinator(product, forbidden)

# Limit risks of ambiguity by imposing replicates to be numbers
566
# and restricting possible forms of some other wildcards
567
wildcard_constraints:
568
    lib="|".join(LIBS),
569
    #treat="|".join(TREATS),
570
    rep="\d+",
Blaise Li's avatar
Blaise Li committed
571
572
    min_dist="\d+",
    min_len="\d+",
573
    #max_len="\d+",
574
    biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES + JOINED_BIOTYPES)),
575
    id_list="|".join(GENE_LISTS),
576
    type_set="|".join(["all", "protein_coding", "protein_coding_TE"]),
577
    mapping_type="|".join([f"on_{genome}", f"unique_on_{genome}"]),
Blaise Li's avatar
Blaise Li committed
578
579
580
581
    #small_type="si|siu|sisiu|all_si|all_siu|all_sisiu|%s" % "|".join(SMALL_TYPES + JOINED_SMALL_TYPES),
    small_type="si|siu|sisiu|all_si|all_siu|all_sisiu|all_si_%sG|all_si_%sG|%s" % (SI_MIN, SI_MAX, "|".join(SMALL_TYPES + JOINED_SMALL_TYPES)),
    #small_type="si|siu|sisiu|all_si|all_siu|all_sisiu|%s" % "|".join(
    #    [f"all_si_{SI_MIN}G", "all_si_{SI_MAX}G"] + SMALL_TYPES + JOINED_SMALL_TYPES),
582
583
584
585
    mapped_type="|".join([size_selected, "nomap_siRNA"]),
    read_type="|".join([*REMAPPED_COUNTED, *READ_TYPES_FOR_MAPPING, *READ_TYPES_FOR_COUNTING, "trimmed", "nomap"]),
    infix="si|siu",
    suffix="|".join(SI_SUFFIXES),
Blaise Li's avatar
Blaise Li committed
586
    standard="zscore|robust|minmax|unit|identity",
Blaise Li's avatar
Blaise Li committed
587
    orientation="all|fwd|rev",
588
    contrast="|".join(CONTRASTS),
589
    norm="|".join(TESTED_SIZE_FACTORS),
590
591
    lib_group="|".join(ALL_LIB_GROUPS),
    group_type="|".join(GROUP_TYPES),
Blaise Li's avatar
Blaise Li committed
592
    fold_type="|".join(["mean_log2_RPM_fold", "log2FoldChange", "lfcMLE"]),
593

Blaise Li's avatar
Blaise Li committed
594
#ruleorder: map_on_genome > sam2indexedbam > compute_coverage > remap_on_genome > resam2indexedbam > recompute_coverage
595
596

# Define functions to be used in shell portions
597
598
599
shell.prefix(SHELL_FUNCTIONS)


600
601
602
603
###################################
# Preparing the input of rule all #
###################################

604
bigwig_files = [
Blaise Li's avatar
Blaise Li committed
605
    # individual libraries
606
607
    expand(
        expand(
608
            OPJ(mapping_dir, "{lib}_{rep}",
609
                "{lib}_{rep}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome),
610
            filtered_product, lib=LIBS, rep=REPS),
611
        read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]),
Blaise Li's avatar
Blaise Li committed
612
    # means of replicates
613
614
    expand(
        expand(
615
            OPJ(mapping_dir, "{lib}_mean",
616
                "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome),
617
            filtered_product, lib=LIBS),
618
        read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]),
619
620
621
622
623
    ]

meta_profiles = [
        #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split()),
        #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split(), biotype=["protein_coding"], min_len=[str(META_MIN_LEN)]),
Blaise Li's avatar
Blaise Li committed
624
        [expand(
625
            OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}",
626
                "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.pdf"),
627
628
            meta_scale=[str(META_SCALE)],
            read_type=READ_TYPES_FOR_METAPROFILES,
629
630
            norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
            biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
631
            group_type=[group_type], lib_group=list(lib_group.keys())) for (group_type, lib_group) in lib_groups.items()],
632
633
        # Same as above ?
        # [expand(
634
        #     OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}",
635
        #         "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.pdf"),
636
637
638
        #     meta_scale=[str(META_SCALE)], read_type=[size_selected, "siRNA", "siuRNA", "miRNA", "piRNA", "all_siRNA"],
        #     norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
        #     biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
639
        #     group_type=[group_type], lib_group=list(lib_group.keys())) for (group_type, lib_group) in lib_groups.items()],
Blaise Li's avatar
Blaise Li committed
640
        [expand(
641
            OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}",
642
                "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.pdf"),
643
644
            meta_scale=[str(META_SCALE)],
            read_type=READ_TYPES_FOR_METAPROFILES,
645
646
            norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding"], min_dist=[str(MIN_DIST)],
            biotype=["protein_coding"], min_len=[str(META_MIN_LEN)],
647
            group_type=[group_type], lib_group=list(lib_group.keys())) for (group_type, lib_group) in lib_groups.items()],
Blaise Li's avatar
Blaise Li committed
648
        [expand(
649
            OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}",
650
                "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{group_type}_{lib_group}_meta_profile.pdf"),
651
652
            meta_scale=[str(META_SCALE)],
            read_type=READ_TYPES_FOR_METAPROFILES,
653
            norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=["0"], id_list=GENE_LISTS,
654
            group_type=[group_type], lib_group=list(lib_group.keys())) for (group_type, lib_group) in lib_groups.items()],
655
656
        ## TODO: Resolve issue with bedtools
        # expand(
657
        #     OPJ("figures", "{lib}_{rep}",
658
        #         "{read_type}_by_{norm}_{orientation}_pi_targets_in_{biotype}_profile.pdf"),
659
        #     lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_METAPROFILES,
660
        #     norm=SIZE_FACTORS, orientation=["all"], biotype=["protein_coding"]),
661
662
663
    ]

read_graphs = [
664
665
    expand(
        expand(
666
            OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_base_composition_from_{position}.pdf"),
667
            read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"]),
668
669
670
        filtered_product, lib=LIBS, rep=REPS),
    expand(
        expand(
671
            OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_base_logo_from_{position}.pdf"),
672
            read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"]),
673
        filtered_product, lib=LIBS, rep=REPS),
674
675
    expand(
        expand(
676
            OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_{position}_base_composition.pdf"),
677
            read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS),
678
679
680
        filtered_product, lib=LIBS, rep=REPS),
    expand(
        expand(
681
            OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_{position}_base_logo.pdf"),
682
            read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS),
683
684
685
        filtered_product, lib=LIBS, rep=REPS),
    expand(
        expand(
686
            OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_size_distribution.pdf"),
687
            read_type=["trimmed", "nomap"]),
688
689
        filtered_product, lib=LIBS, rep=REPS),
    expand(
690
        OPJ("figures", "{lib}_{rep}", f"{size_selected}_smallRNA_barchart.pdf"),
691
        filtered_product, lib=LIBS, rep=REPS),
692
    expand(
693
        OPJ("figures", "{lib}_{rep}", "nb_reads.pdf"),
694
        filtered_product, lib=LIBS, rep=REPS),
695
696
    ]

697
698
699
ip_fold_heatmaps = []
de_fold_heatmaps = []
ip_fold_boxplots = []
700
701
# Temporary, until used for boxplots:
remapped_folds = []
702
remapped_fold_boxplots = []
703
if contrasts_dict["ip"]:
704
705
    if BOXPLOT_BIOTYPES:
        remapped_fold_boxplots = expand(
706
            OPJ("figures", "{contrast}", "by_biotypes",
707
                "{contrast}_{read_type}_{mapping_type}_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.pdf"),
708
709
            contrast=IP_CONTRASTS,
            read_type=READ_TYPES_FOR_COUNTING,
710
711
            mapping_type=[f"on_{genome}", f"unique_on_{genome}"],
            #mapping_type=[f"on_{genome}"],
712
713
714
715
716
            # TODO: Read this from config file
            biotypes=["_and_".join(BOXPLOT_BIOTYPES)],
            orientation=ORIENTATIONS,
            fold_type=["mean_log2_RPM_fold"],
            gene_list=BOXPLOT_GENE_LISTS)
717
    remapped_folds = expand(
718
        OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "remapped", "{counted_type}_mean_log2_RPM_fold.txt"),
Blaise Li's avatar
Blaise Li committed
719
720
        counted_type=REMAPPED_COUNTED)
    # Should be generated as a dependency of the "all" contrast:
721
722
723
724
725
726
727
728
729
730
    ## TODO: check that this is generated
    # remapped_folds += expand(
    #     OPJ(mapping_dir, f"RPM_folds_{size_selected}", "{contrast}", "remapped",
    #         "{contrast}_{read_type}_{mapping_type}_{biotype}_{orientation}_transcript_RPM_folds.txt"),
    #     contrast=IP_CONTRASTS,
    #     read_type=READ_TYPES_FOR_COUNTING,
    #     mapping_type=[f"on_{genome}", f"unique_on_{genome}"],
    #     biotype=["alltypes"],
    #     orientation=ORIENTATIONS)
    ##
731
732
733
    # snakemake -n OK
    # expand(
    #     OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all",
734
735
    #         "{read_type}_{mapping_type}_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt"),
    #     read_type=READ_TYPES_FOR_COUNTING, mapping_type=[f"on_{genome}"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
736
737
738
    # snakemake -n OK
    # expand(
    #     OPJ(mapping_dir, f"RPM_folds_{size_selected}", "{contrast}",
739
740
    #         "{contrast}_{small_type}_{mapping_type}_{biotype}_{orientation}_transcript_RPM_folds.txt"),
    #     contrast=IP_CONTRASTS, small_type=READ_TYPES_FOR_COUNTING, mapping_type=[f"on_{genome}"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
741
    ip_fold_heatmaps = expand(
742
        OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"),
743
        small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"])
744
    ip_fold_boxplots = expand(
745
        OPJ("figures", "all_{contrast_type}",
746
            "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"),
747
        contrast_type=["ip"], small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"],
748
        gene_list=BOXPLOT_GENE_LISTS)
749
    # Subdivided by biotype (to enable distinction between 5'UTR, CDS and 3'UTR)
750
    ## TODO: activate this?
751
    # ip_fold_boxplots = expand(
752
    #     OPJ("figures", "all_{contrast_type}",
753
    #         "{contrast_type}_{small_type}_{fold_type}_{gene_list}_{biotype}_boxplots.pdf"),
754
    #     contrast_type=["ip"], small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"],
755
    #     gene_list=BOXPLOT_GENE_LISTS, biotype=["protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"])
756
    ##
757
758
if contrasts_dict["de"]:
    de_fold_heatmaps = expand(
759
        OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"),
760
        small_type=DE_TYPES, fold_type=["log2FoldChange"])
761
762

exploratory_graphs = [
763
764
    ip_fold_heatmaps,
    de_fold_heatmaps,
765
    # Large figures, not very readable
766
    # expand(
767
768
    #     OPJ(mapping_dir, f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_by_{norm}_pairplots.pdf"),
    #     contrast=DE_CONTRASTS, small_type=DE_TYPES, norm=DE_SIZE_FACTORS),
769
770
    ## TODO: debug PCA
    #expand(
771
    #    OPJ("figures", "{small_type}_{standard}_PCA.pdf"),
772
    #    small_type=["pisimi"], standard=STANDARDS),
773
    #expand(
774
    #    OPJ("figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"),
775
    #    small_type=["pisimi"], standard=STANDARDS),
776
777
    ##
    #expand(
778
    #    OPJ("figures", "{small_type}_clustermap.pdf"),
779
    #    small_type=SMALL_TYPES),
780
    #expand(
781
    #    OPJ("figures", "{small_type}_zscore_clustermap.pdf"),
782
    #    small_type=SMALL_TYPES),
783
    #expand(
784
    #    OPJ("figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"),
785
    #    contrast=CONTRASTS, small_type=DE_TYPES),
786
787
788
    ]

de_fold_boxplots = expand(
789
    OPJ("figures", "{contrast}",
790
        "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"),
Blaise Li's avatar
Blaise Li committed
791
    contrast=DE_CONTRASTS, small_type=DE_TYPES, fold_type=["log2FoldChange", "mean_log2_RPM_fold"],
792
    gene_list=["all_gene_lists"])
793
ip_fold_boxplots_by_contrast = expand(
794
    OPJ("figures", "{contrast}",
795
        "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"),
Blaise Li's avatar
Blaise Li committed
796
    contrast=IP_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"],
797
    gene_list=["all_gene_lists"])
798
799
800

fold_boxplots = [de_fold_boxplots, ip_fold_boxplots_by_contrast, ip_fold_boxplots]

801
802
rule all:
    input:
803
        expand(
804
            OPJ(mapping_dir, "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome),
805
806
807
            filtered_product, lib=LIBS, rep=REPS, read_type=[size_selected]),
        bigwig_files,
        meta_profiles,
808
809
810
        # snakemake -n OK
        # expand(
        #     expand(
811
        #         OPJ(feature_counts_dir, "summaries",
812
813
814
815
816
817
818
        #             "{{lib}}_{{rep}}_{read_type}_on_%s_{orientation}_transcript_counts.txt" % (genome)),
        #         read_type=READ_TYPES_FOR_MAPPING, orientation=ORIENTATIONS),
        #     filtered_product, lib=LIBS, rep=REPS),
        # TODO
        # snakemake -n OK
        # expand(
        #     expand(
819
        #         OPJ(feature_counts_dir,
820
821
822
823
824
825
        #             "{{lib}}_{{rep}}_{small_type}_counts.txt"),
        #         small_type=REMAPPED_COUNTED),
        #     filtered_product, lib=LIBS, rep=REPS),
        # snakemake -n not OK: summaries contains all biotypes
        # expand(
        #     expand(
826
        #         OPJ(feature_counts_dir, "summaries",
827
828
829
830
        #             "{{lib}}_{{rep}}_{small_type}_counts.txt"),
        #         small_type=REMAPPED_COUNTED),
        #     filtered_product, lib=LIBS, rep=REPS),
        # REMAPPED_COUNTED = [f"{small_type}_on_{genome}_{biotype}_{orientation}_transcript" for (small_type, biotype, orientation) in product(READ_TYPES_FOR_MAPPING, set(COUNT_BIOTYPES + ANNOT_BIOTYPES), ORIENTATIONS)]
831
832
        expand(
            expand(
833
                OPJ(feature_counts_dir, "summaries",
834
                    "{{lib}}_{{rep}}_%s_and_nomap_siRNA_on_%s_{orientation}_transcript_counts.txt" % (size_selected, genome)),
835
836
837
838
                orientation=ORIENTATIONS),
            filtered_product, lib=LIBS, rep=REPS),
        expand(
            expand(
839
                OPJ(feature_counts_dir, "summaries",
840
                    "{{lib}}_{{rep}}_prot_si_%sGRNA_and_prot_siu_%sGRNA_on_%s_{orientation}_transcript_counts.txt" % (SI_MIN, SI_MIN, genome)),
841
842
843
                orientation=ORIENTATIONS),
            filtered_product, lib=LIBS, rep=REPS),
        read_graphs,
844
        #expand(OPJ(feature_counts_dir, "summaries", "{lib}_{rep}_nb_non_structural.txt"), filtered_product, lib=LIBS, rep=REPS),
845
        #OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"),
846
        OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}G_mean_log2_RPM_fold.txt"),
847
848
        # Not looking ad deseq2 results any more
        #expand(OPJ(mapping_dir, f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
849
        #expand(OPJ(mapping_dir, "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES),
850
        exploratory_graphs,
851
        remapped_folds,
852
        fold_boxplots,
853
        remapped_fold_boxplots,
854
        #expand(OPJ("figures", "{small_type}_unit_clustermap.pdf"), small_type=SMALL_TYPES),
855
        #expand(OPJ(
856
        #    feature_counts_dir,
857
        #    "all_{read_type}_{mapping_type}_{biotype}_{orientation}_transcript_counts.txt"), read_type=READ_TYPES_FOR_COUNTING, mapping_type=[f"on_{genome}"], biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS),
858
        # expand(OPJ(
859
        #     feature_counts_dir,
860
861
        #     "all_{small_type}_{mapping_type}_{biotype}_{orientation}_transcript_counts.txt"),
        #     small_type=READ_TYPES_FOR_MAPPING, mapping_type=[f"on_{genome}"], biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
862
        expand(
863
            OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
864
            small_type=["mi", *SI_TYPES, *SIU_TYPES,  f"pimi{SI_MIN}G"]),
865
866
        # piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting
        # simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot
867
        expand(
868
            OPJ("figures", "{small_type}_norm_correlations.pdf"),
869
            small_type=["mi", *SI_TYPES, *SIU_TYPES,  f"pimi{SI_MIN}G"]),
870
        expand(
871
            OPJ("figures", "{small_type}_norm_counts_distrib.pdf"),
872
            small_type=["mi", *SI_TYPES, *SIU_TYPES,  f"pimi{SI_MIN}G"]),
873

874
875
876
877
878
879
#absolute = "/pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/includes/link_raw_data.rules"
#relative_include_path = "../snakemake_wrappers/includes/link_raw_data.snakefile"
#absolute_include_path = os.path.join(workflow.basedir, relative_include_path)
#assert os.path.exists(absolute_include_path)
#include: relative_include_path
include: ensure_relative(irules["link_raw_data"], workflow.basedir)
880

881

882
883
884
885
886
rule trim_and_dedup:
    input:
        rules.link_raw_data.output,
        #OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
    params:
887
        adapter = lambda wildcards: lib2adapt[wildcards.lib],
888
889
890
        trim5 = trim5,
        trim3 = trim3,
    output:
891
892
893
894
        trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_trimmed.fastq.gz"),
        nb_raw =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_raw.txt"),
        nb_trimmed =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"),
        nb_deduped =  OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_deduped.txt"),
895
    threads: 2
896
897
    resources:
        mem_mb=1049300
898
    message:
899
        "Trimming adaptor from raw data, deduplicating reads, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
900
901
    benchmark:
        OPJ(log_dir, "trim_and_dedup", "{lib}_{rep}_benchmark.txt")
902
    log:
903
904
        cutadapt = OPJ(log_dir, "cutadapt", "{lib}_{rep}.log"),
        trim_and_dedup = OPJ(log_dir, "trim_and_dedup", "{lib}_{rep}.log"),
905
906
907
908
909
910
    shell:
        """
        zcat {input} \\
            | tee >(count_fastq_reads {output.nb_raw}) \\
            | cutadapt -a {params.adapter} --discard-untrimmed - 2> {log.cutadapt} \\
            | tee >(count_fastq_reads {output.nb_trimmed}) \\
911
            | dedup \\
912
913
914
915
916
917
918
919
            | tee >(count_fastq_reads {output.nb_deduped}) \\
            | trim_random_nt {params.trim5} {params.trim3}  2>> {log.cutadapt} \\
            | gzip > {output.trimmed} \\
            2> {log.trim_and_dedup}
        """


def awk_size_filter(wildcards):
Blaise Li's avatar
Blaise Li committed
920
    """Returns the bioawk filter to select reads of size from MIN_LEN to MAX_LEN."""
921
    return "%s <= length($seq) && length($seq) <= %s" % (MIN_LEN, MAX_LEN)
922

Blaise Li's avatar
Blaise Li committed
923

924
rule select_size_range:
Blaise Li's avatar
Blaise Li committed
925
    """Select (and count) reads in the correct size range."""
926
927
928
    input:
        rules.trim_and_dedup.output.trimmed
    output:
929
930
        selected = OPJ(data_dir, "trimmed", "{lib}_{rep}_%s.fastq.gz" % size_selected),
        nb_selected = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_%s.txt" % size_selected),
931
932
933
    params:
        awk_filter = awk_size_filter,
    message:
934
        "Selecting reads size %s for {wildcards.lib}_{wildcards.rep}." % size_selected
935
936
937
    shell:
        """
        bioawk -c fastx '{params.awk_filter} {{print "@"$name" "$4"\\n"$seq"\\n+\\n"$qual}}' {input} \\
938
939
            | tee >(count_fastq_reads {output.nb_selected}) \\
            | gzip > {output.selected}
940
941
942
        """


943
# TODO: update this
944
@wc_applied
Blaise Li's avatar
Blaise Li committed
945
946
947
948
949
950
951
952
953
954
def source_fastq(wildcards):
    """Determine the fastq file corresponding to a given read type."""
    read_type = wildcards.read_type
    if read_type == "raw":
        return rules.link_raw_data.output
    elif read_type == "trimmed":
        return rules.trim_and_dedup.output.trimmed
    elif read_type == size_selected:
        return rules.select_size_range.output.selected
    elif read_type == "nomap":
Blaise Li's avatar
Blaise Li committed
955
        return rules.map_on_genome.output.nomap_fastq
Blaise Li's avatar
Blaise Li committed
956
957
    elif read_type == "nomap_siRNA":
        return rules.extract_nomap_siRNAs.output.nomap_si
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
    elif read_type[:-3] in annotate_read_output:
        return annotate_read_output[read_type[:-3]]
    elif read_type == f"si_{SI_MIN}GRNA":
        return OPJ(
            reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome),
            f"si_{SI_MIN}GRNA.fastq.gz"),
    elif read_type == f"si_{SI_MAX}GRNA":
        return OPJ(
            reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome),
            f"si_{SI_MAX}GRNA.fastq.gz"),
    elif read_type == f"siu_{SI_MIN}GRNA":
        return OPJ(
            reads_dir, "{lib}_{rep}_%s_on_%s" % ("nomap_siRNA", genome),
            f"siu_{SI_MIN}GRNA.fastq.gz"),
    elif read_type == f"siu_{SI_MAX}GRNA":
        return OPJ(
            reads_dir, "{lib}_{rep}_%s_on_%s" % ("nomap_siRNA", genome),
            f"siu_{SI_MAX}GRNA.fastq.gz"),
    elif read_type[:-3] in annotate_read_output_U:
        return annotate_read_output_U[read_type[:-3]]
    elif read_type == f"sisiu_{SI_MIN}GRNA":
        return OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), f"sisiu_{SI_MIN}GRNA.fastq.gz"),
    elif read_type == f"sisiu_{SI_MAX}GRNA":
        return OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), f"sisiu_{SI_MAX}GRNA.fastq.gz"),
    elif read_type == f"all_sisiu_{SI_MIN}GRNA":
        return OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), f"all_sisiu_{SI_MIN}GRNA.fastq.gz"),
    elif read_type == f"all_sisiu_{SI_MAX}GRNA":
        return OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), f"all_sisiu_{SI_MAX}GRNA.fastq.gz"),
Blaise Li's avatar
Blaise Li committed
986
    else:
Blaise Li's avatar
Blaise Li committed
987
        raise NotImplementedError("Unknown read type: %s" % read_type)
Blaise Li's avatar
Blaise Li committed
988
989


990
991


992
rule map_on_genome:
993
    input:
Blaise Li's avatar
Blaise Li committed
994
        #rules.select_size_range.output.selected,
995
996
        #fastq = source_fastq,
        fastq = rules.select_size_range.output.selected,
997
    output:
998
        sam = temp(OPJ(mapping_dir, "{lib}_{rep}", "%s_on_%s.sam" % (size_selected, genome))),
999
        nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_on_%s.fastq.gz" % (size_selected, genome)),
1000
    params:
For faster browsing, not all history is shown. View entire blame