#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
"""
This script reads bigwig files, and creates a metagene profile.

The profiles can be restricted based on a list of gene identifiers.
"""
# TODO: Set font size or graph width to avoid very large titles (also for the pipeline) ?

# TODO: What about using deeptools?
# deeptools.countReadsPerBin.CountReadsPerBin, using:
# - bedFile, with the gene spans
# - blackListFileName, with the introns
# The above should count reads in the exonic part.
# TODO: Check this is true: one count per region in bedFile
# Or use count_reads_in_region twice, and compute differences?
# Then scale this to the gene exonic length
# This might correspond to the following computeMatrix scale-regions option:
#   --blackListFileName BED file, -bl BED file
#                         A BED file containing regions that should be excluded
#                         from all analyses. Currently this works by rejecting
#                         genomic chunks that happen to overlap an entry.
#                         Consequently, for BAM files, if a read partially
#                         overlaps a blacklisted region or a fragment spans over
#                         it, then the read/fragment might still be considered.
#
# See also --metagene option:
# GTF/BED12 options:
#   --metagene            When either a BED12 or GTF file are used to provide
#                         regions, perform the computation on the merged exons,
#                         rather than using the genomic interval defined by the
#                         5-prime and 3-prime most transcript bound (i.e.,
#                         columns 2 and 3 of a BED file). If a BED3 or BED6 file
#                         is used as input, then columns 2 and 3 are used as an
#                         exon. (default: False)
#   --transcriptID TRANSCRIPTID
#                         When a GTF file is used to provide regions, only
#                         entries with this value as their feature (column 2)
#                         will be processed as transcripts. (default:
#                         transcript)
#   --exonID EXONID       When a GTF file is used to provide regions, only
#                         entries with this value as their feature (column 2)
#                         will be processed as exons. CDS would be another
#                         common value for this. (default: exon)
#   --transcript_id_designator TRANSCRIPT_ID_DESIGNATOR
#                         Each region has an ID (e.g., ACTB) assigned to it,
#                         which for BED files is either column 4 (if it exists)
#                         or the interval bounds. For GTF files this is instead
#                         stored in the last column as a key:value pair (e.g.,
#                         as 'transcript_id "ACTB"', for a key of transcript_id
#                         and a value of ACTB). In some cases it can be
#                         convenient to use a different identifier. To do so,
#                         set this to the desired key. (default: transcript_id)


import argparse
import os
import sys
import warnings
from collections import defaultdict
from copy import deepcopy
# from itertools import chain
from pathlib import Path
from sqlite3 import OperationalError
from tempfile import NamedTemporaryFile

from cytoolz import keyfilter, valmap
from deeptools import heatmapper
from deeptools.plotProfile import Profile
# https://pythonhosted.org/gffutils/
from gffutils import FeatureDB, create_db
from yaml import SafeLoader
from yaml import load as yload


def formatwarning(message, category, filename, lineno, line=None):
    """Return a compact one-line representation of a warning.

    The signature matches :func:`warnings.formatwarning` (including the
    ``line=None`` default, which the original lacked); *line* is accepted
    for compatibility but deliberately not displayed.
    """
    return f"{filename}:{lineno}: {category.__name__}: {message}\n"


# Replace the default warning formatter so that warnings are printed on a
# single "filename:lineno: Category: message" line.
warnings.formatwarning = formatwarning


def open_feature_db(db_filename, annot_filename=None,
                    force_new=False, **kwd_args):
    """
    Open a :class:`gffutils.FeatureDB` stored as *db_filename*.

    If *annot_filename* is provided, it can be used to create *db_filename* in
    case this file does not exist.
    If *force_new* is set to True, the database will be re-created even if
    *db_filename* already exists.

    *kwd_args* are extra keyword arguments to be passed to
    :func:`gffutils.create_db`.

    Raises FileNotFoundError if the database has to be (re-)created but
    *annot_filename* does not exist.
    """
    # Default annotation source: site-specific path used by the pipeline.
    if annot_filename is None:
        annot_filename = "/pasteur/entites/Mhe/Genomes/" \
            "C_elegans/Caenorhabditis_elegans/Ensembl/" \
            "WBcel235/Annotation/Genes/genes.gtf"

    def do_create_db():
        """Create the feature database from the annotation file."""
        # Explicit check instead of assert: asserts vanish under "python -O".
        if not Path(annot_filename).exists():
            raise FileNotFoundError(
                f"{annot_filename} is not found.\n"
                "Cannot create a FeatureDB without a source of annotations.")
        print(
            "Creating feature database. This may take time.",
            file=sys.stderr)
        create_db(annot_filename, dbfn=db_filename, force=True, **kwd_args)
    if force_new:
        do_create_db()
    try:
        fdb = FeatureDB(db_filename)
    except (ValueError, OperationalError) as open_err:
        # Missing or unreadable database file: warn, rebuild, retry once.
        warnings.warn(
            f"Cannot open {db_filename}:\n"
            f"{str(open_err)}\n"
            "Trying to re-create it.")
        do_create_db()
        fdb = FeatureDB(db_filename)
    return fdb


def iter_transcripts(fdb, genes=None):
    """
    Yield the transcript features found in *fdb*.

    If *genes* is not None, only the transcripts of genes whose ``gene_id``
    attribute belongs to *genes* are yielded.
    """
    all_transcripts = fdb.all_features(featuretype="transcript")
    if genes is None:
        yield from all_transcripts
        return
    for transcript in all_transcripts:
        [gene_id] = transcript["gene_id"]
        if gene_id in genes:
            yield transcript


def merged_transcripts(fdb, genes=None):
    """
    Iterate over the transcripts in *fdb*, merged per `gene_id`.
    The merge results in the loss of the `attributes` attribute,
    except `gene_id`.

    If *genes* is not None, only the transcripts of genes whose `gene_id`
    attribute are in *genes* are yielded.
    """
    # Group transcripts by gene identifier (insertion order is preserved).
    transcript_dict = defaultdict(list)
    for transcript in fdb.all_features(featuretype="transcript"):
        [gene_id] = transcript["gene_id"]
        if genes is None or gene_id in genes:
            transcript_dict[gene_id].append(transcript)
    # Plain loop instead of cytoolz.valmap: same result, one less
    # third-party helper involved.
    for (gene_id, transcripts) in transcript_dict.items():
        # fdb.merge is expected to yield exactly one merged feature per gene.
        [feature] = fdb.merge(transcripts)
        # Restore the gene_id attribute lost by the merge
        feature.attributes["gene_id"] = gene_id
        yield feature


def feature2bed(feature):
    """Return a 6-column, tab-separated BED line for the
    :class:`gffutils.feature.Feature` *feature*."""
    try:
        [name] = feature.attributes["gene_id"]
    except KeyError:
        # No gene_id attribute: use the BED placeholder.
        name = "."
    fields = (
        feature.chrom,
        # BED starts are zero-based, hence the -1 on the GFF start.
        str(feature.start - 1),
        str(feature.stop),
        name,
        feature.score,
        feature.strand,
    )
    return "\t".join(fields)


def bedfile_from_gene_list(fdb, genes):
    """Create a temporary bed file for the merged transcripts corresponding
    to gene ids listed in *genes*, and using the :class:`gffutils.FeatureDB`
    *fdb*.
    The name of the bed file is returned.
    After using it, it should be deleted using os.unlink(bedfile_name)"""
    # delete=False so the file survives the "with" block; the caller owns
    # (and must eventually unlink) the file.
    with NamedTemporaryFile(mode="w", delete=False) as bedfile:
        # One BED line per merged transcript, newline-separated.
        print(
            *map(feature2bed, merged_transcripts(fdb, genes)),
            sep="\n", file=bedfile)
    return bedfile.name


def fix_none(param_value):
    """Turn the literal string ``"None"`` into the real Python ``None``.

    Any other value is returned unchanged. This can be used to fix values
    in a yaml-loaded dictionary.
    """
    return None if param_value == "None" else param_value


# Default parameter values handed to deeptools' heatmapper.computeMatrix
# (see compute_matrix below). Any of them can be overridden through the
# yaml file given with the -e command-line option; key names (including
# the embedded spaces) are the ones deeptools expects.
DEFAULT_PARAMETERS = {
    "upstream": 0,
    "downstream": 0,
    # TODO: change to 2000 ?
    "body": 500,
    "bin size": 10,
    "ref point": None,
    "verbose": False,
    "bin avg type": "mean",
    "missing data as zero": False,
    "min threshold": None,
    "max threshold": None,
    "scale": 1,
    "skip zeros": False,
    "nan after end": False,
    "proc number": 4,
    "sort regions": "keep",
    "sort using": "mean",
    "unscaled 5 prime": 0,
    "unscaled 3 prime": 0,
    "start_label": "start",
    "end_label": "end",
    "label_rotation": 90,
}
# Human-readable list of the accepted keys, displayed in the -e option help.
PARAMETER_INFO = "\n".join(DEFAULT_PARAMETERS.keys())


def is_prof_param(key):
    """Tell whether *key* names a parameter accepted by a *Profile*."""
    valid_keys = frozenset((
        "plot_title", "y_axis_label", "y_min", "y_max", "averagetype",
        "reference_point_label", "start_label", "end_label",
        "plot_height", "plot_width", "per_group",
        "plot_type", "image_format", "color_list",
        "legend_location", "plots_per_row", "label_rotation", "dpi"))
    return key in valid_keys


def compute_matrix(bigwig_filenames,
                   bed_filename,
                   plot_filename=None,
                   **extra_parameters):
    """Combine information from bigwig files *bigwig_filenames* and bed file
    *bed_filename*.

    If *plot_filename* is set, write the corresponding meta profile
    in this file.

    *extra_parameters* override entries of DEFAULT_PARAMETERS; the special
    keys "sample_labels" and "group_labels" are applied to the computed
    matrix, and the keys accepted by :func:`is_prof_param` are forwarded
    to the Profile plot.
    """
    # deepcopy so that updating with extra_parameters cannot mutate the
    # module-level defaults.
    parameters = deepcopy(DEFAULT_PARAMETERS)
    parameters.update(extra_parameters)
    heatm = heatmapper.heatmapper()
    heatm.computeMatrix(bigwig_filenames, bed_filename, parameters)
    if "sample_labels" in parameters:
        heatm.matrix.set_sample_labels(parameters["sample_labels"])
    if "group_labels" in parameters:
        heatm.matrix.set_group_labels(parameters["group_labels"])
    # Fixing parameters (as in heatmapper.read_matrix_file
    # and heatmapper.save_matrix)
    nb_samples = len(heatm.matrix.sample_labels)
    hm_params = dict()
    for (key, val) in heatm.parameters.items():
        # An empty list stands for "not set": normalize it to None.
        if isinstance(val, list) and not val:
            val = None
        if key in heatm.special_params and not isinstance(val, list):
            # Per-sample parameters must be lists with one entry per sample.
            val = [val] * nb_samples
            # NOTE(review): [val] * nb_samples can only be empty when
            # nb_samples == 0, so this fallback looks unreachable otherwise
            # — kept to mirror deeptools' own code.
            if not val:
                val = [None] * nb_samples
        hm_params[key] = val
    heatm.parameters = hm_params

    if plot_filename is not None:
        print(f"plotting profile to {plot_filename}")
        # Keep only the keys that deeptools' Profile constructor accepts.
        prof_params = keyfilter(is_prof_param, parameters)
        prof_params["per_group"] = True
        prof = Profile(
            heatm, plot_filename,
            **prof_params)
        prof.plot_profile()


# def get_transcript_structure(fdb, transcript):
#     [t_id] = transcript["transcript_id"]
#     return list(chain(
#         fdb.children(t_id, featuretype="CDS"),
#         fdb.children(t_id, featuretype="UTR")))


def main():
    """Main function of the program: parse command-line options, then
    compute (and optionally plot) the metagene profile."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "-d", "--feature_db",
        default="/pasteur/entites/Mhe/Genomes/WBcel235_genes.db",
        help="Path to the sqlite database to read or create.\n"
        "If you want to create it, also provide annotations "
        "with option *--annot_file*.")
    parser.add_argument(
        "-a", "--annot_file",
        help="Path to the gtf file containing annotations.")
    parser.add_argument(
        "-g", "--gene_list",
        help="Path to a file containing one gene identifier per line.\n"
        "The metagene profile will only take into account those genes.\n"
        "This should not be used at the same time as option -c")
    parser.add_argument(
        "-c", "--bed_coordinates",
        help="Path to a file containing the coordinates of the features "
        "for which to create the meta-profiles.\n"
        "This should not be used at the same time as option -g")
    parser.add_argument(
        "-b", "--bigwig_files",
        nargs="+",
        help="Paths to bigwig files that should be used "
        "to represent meta profiles.")
    parser.add_argument(
        "-l", "--library_labels",
        nargs="+",
        help="Names of the libraries. These should be in the same order "
        "as the files given at the -b option.")
    parser.add_argument(
        "-p", "--profile_plot",
        help="Path to the pdf file in which to write the meta profiles.")
    parser.add_argument(
        "-t", "--plot_title",
        default="Metagene profile",
        help="Title to use for the metagene plot.")
    parser.add_argument(
        "--yMin",
        default=None,
        help="Minimum value for the Y-axis")
    parser.add_argument(
        "--yMax",
        default=None,
        help="Maximum value for the Y-axis")
    parser.add_argument(
        "-e", "--extra_parameters",
        help="Path to a yaml file containing parameters for the "
        "*compute_matrix* function.\n"
        "possible values are:\n" + PARAMETER_INFO)
    args = parser.parse_args()
    if args.extra_parameters:
        with open(args.extra_parameters, "r") as params_file:
            # SafeLoader: the parameter file is plain data and must not be
            # able to instantiate arbitrary Python objects. (PyYAML >= 6
            # also refuses yload without an explicit Loader.)
            extra_parameters = valmap(
                fix_none, yload(params_file, Loader=SafeLoader))
    else:
        extra_parameters = {}
    if args.plot_title:
        extra_parameters["group_labels"] = [args.plot_title]
    if args.library_labels:
        extra_parameters["sample_labels"] = args.library_labels
    if args.yMin is not None:
        extra_parameters["y_min"] = [float(args.yMin)]
    if args.yMax is not None:
        extra_parameters["y_max"] = [float(args.yMax)]
    if args.profile_plot:
        # parser.error (instead of the former asserts): asserts disappear
        # under "python -O" and argparse gives the user a clearer message.
        if args.gene_list and args.bed_coordinates:
            parser.error("Use only one among options -c and -g.")
        if args.gene_list:
            with open(args.gene_list, "r") as gene_list_file:
                gene_ids = {
                    gene_id_line.strip() for gene_id_line in gene_list_file}
            fdb = open_feature_db(
                args.feature_db,
                args.annot_file,
                disable_infer_transcripts=True)
            bed_filename = bedfile_from_gene_list(fdb, gene_ids)
            print(f"temporary bed file: {bed_filename}")
        elif args.bed_coordinates:
            bed_filename = args.bed_coordinates
            print(f"using bed file: {bed_filename}")
        else:
            # The original code fell through to a NameError on bed_filename
            # when neither -c nor -g was given.
            parser.error("One of options -c or -g is required with -p.")
        try:
            compute_matrix(
                args.bigwig_files, bed_filename,
                plot_filename=args.profile_plot, **extra_parameters)
        finally:
            # Always remove the temporary bed file, even if plotting fails.
            if args.gene_list:
                os.unlink(bed_filename)
    return 0


if __name__ == "__main__":
    # Run only when executed as a script; exit with main()'s return code.
    sys.exit(main())