Commit a1725563 authored by Blaise Li's avatar Blaise Li
Browse files

Moved plotting scripts to a submodule.

parent 15d0feef
......@@ -16,3 +16,6 @@
[submodule "libsmallrna"]
path = libsmallrna
url =
[submodule "plotting_scripts"]
path = plotting_scripts
url =
Subproject commit c423fc12a58f7adca64ef2c6f9afd55162bb5dab
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
This script reads bigwig files, and creates a metagene profile.
The profiles can be restricted based on a list of gene identifiers.
# TODO: Set font size or graph width to avoid very large titles (also for the pipeline) ?
# TODO: What about using deeptools?
# deeptools.countReadsPerBin.CountReadsPerBin, using:
# - bedFile, with the gene spans
# - blackListFileName, with the introns
# The above should count reads in the exonic part.
# TODO: Check this is true: one count per region in bedFile
# Or use count_reads_in_region twice, and compute differences?
# Then scale this to the gene exonic length
# This might correspond to the following computeMatrix scale-regions option:
# --blackListFileName BED file, -bl BED file
# A BED file containing regions that should be excluded
# from all analyses. Currently this works by rejecting
# genomic chunks that happen to overlap an entry.
# Consequently, for BAM files, if a read partially
# overlaps a blacklisted region or a fragment spans over
# it, then the read/fragment might still be considered.
# See also --metagene option:
# GTF/BED12 options:
# --metagene When either a BED12 or GTF file are used to provide
# regions, perform the computation on the merged exons,
# rather than using the genomic interval defined by the
# 5-prime and 3-prime most transcript bound (i.e.,
# columns 2 and 3 of a BED file). If a BED3 or BED6 file
# is used as input, then columns 2 and 3 are used as an
# exon. (default: False)
# --transcriptID TRANSCRIPTID
# When a GTF file is used to provide regions, only
# entries with this value as their feature (column 2)
# will be processed as transcripts. (default:
# transcript)
# --exonID EXONID When a GTF file is used to provide regions, only
# entries with this value as their feature (column 2)
# will be processed as exons. CDS would be another
# common value for this. (default: exon)
# --transcript_id_designator TRANSCRIPT_ID_DESIGNATOR
# Each region has an ID (e.g., ACTB) assigned to it,
# which for BED files is either column 4 (if it exists)
# or the interval bounds. For GTF files this is instead
# stored in the last column as a key:value pair (e.g.,
# as 'transcript_id "ACTB"', for a key of transcript_id
# and a value of ACTB). In some cases it can be
# convenient to use a different identifier. To do so,
# set this to the desired key. (default: transcript_id)
import argparse
import os
import sys
import warnings
from copy import deepcopy
from pathlib import Path
from tempfile import NamedTemporaryFile
from collections import defaultdict
# from itertools import chain
from sqlite3 import OperationalError
from yaml import load as yload
from gffutils import FeatureDB, create_db
from deeptools import heatmapper
from deeptools.plotProfile import Profile
from cytoolz import keyfilter, valmap
def formatwarning(message, category, filename, lineno, line):
"""Format warning messages."""
return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
warnings.formatwarning = formatwarning
def open_feature_db(db_filename, annot_filename=None,
force_new=False, **kwd_args):
Open a :class:`gffutil.FeatureDB` stored as *db_filename*.
If *annot_filename* is provided, it can be used to create *db_filename* in
case this file does not exist.
If *force_new* is set to True, the database will be re-created even if
*db_filename* already exists.
*kwd_args* are extra keyword arguments to be passed to
# set default annotation file name, if necessary
if annot_filename is None:
annot_filename = "/pasteur/entites/Mhe/Genomes/" \
"C_elegans/Caenorhabditis_elegans/Ensembl/" \
def do_create_db():
"""Create the feature database."""
assert Path(annot_filename).exists(), \
f"{annot_filename} is not found.\n" \
"Cannot create a FeatureDB without a source of annotations."
"Creating feature database. This may take time.",
create_db(annot_filename, dbfn=db_filename, force=True, **kwd_args)
if force_new:
fdb = FeatureDB(db_filename)
except (ValueError, OperationalError) as open_err:
f"Cannot open {db_filename}:\n"
"Trying to re-create it.")
fdb = FeatureDB(db_filename)
return fdb
def iter_transcripts(fdb, genes=None):
Iterate over the transcripts in *fdb*.
If *gene* is not None, only the transcripts of genes whose `gene_id`
attribute are in *genes* are yielded.
if genes is not None:
for transcript in fdb.all_features(featuretype="transcript"):
[gene_id] = transcript["gene_id"]
if gene_id in genes:
yield transcript
yield from fdb.all_features(featuretype="transcript")
def merged_transcripts(fdb, genes=None):
Iterate over the transcripts in *fdb*, merged per `gene_id`.
The merge results in the loss of the `attributes` attribute,
except `gene_id`.
If *gene* is not None, only the transcripts of genes whose `gene_id`
attribute are in *genes* are yielded.
transcript_dict = defaultdict(list)
for transcript in fdb.all_features(featuretype="transcript"):
[gene_id] = transcript["gene_id"]
if genes is None or gene_id in genes:
# return chain.from_iterable(valmap(fdb.merge, transcript_dict).values())
for (gene_id, (feature,)) in valmap(fdb.merge, transcript_dict).items():
# Restore the gene_id attribute
feature.attributes["gene_id"] = gene_id
yield feature
def feature2bed(feature):
"""Converts a :class:`gffutils.feature.Feature` *feature*
into a bed-formatted line."""
[gene_id] = feature.attributes["gene_id"]
except KeyError:
gene_id = "."
return "\t".join([
str(feature.start - 1), str(feature.stop),
gene_id, feature.score, feature.strand])
def bedfile_from_gene_list(fdb, genes):
"""Create a temporary bed file for the merged transcripts corresponding
to gene ids listed in *genes*, and using the :class:`gffutil.FeatureDB`
The name of the bed file is returned.
After using it, it should be deleted using os.unlink(bedfile_name)"""
with NamedTemporaryFile(mode="w", delete=False) as bedfile:
*map(feature2bed, merged_transcripts(fdb, genes)),
sep="\n", file=bedfile)
def fix_none(param_value):
"""Convert "None" as string into real Python None.
This can be used to fix values in a yaml loaded dictionary."""
if param_value == "None":
return None
return param_value
"upstream": 0,
"downstream": 0,
# TODO: change to 2000 ?
"body": 500,
"bin size": 10,
"ref point": None,
"verbose": False,
"bin avg type": "mean",
"missing data as zero": False,
"min threshold": None,
"max threshold": None,
"scale": 1,
"skip zeros": False,
"nan after end": False,
"proc number": 4,
"sort regions": "keep",
"sort using": "mean",
"unscaled 5 prime": 0,
"unscaled 3 prime": 0,
"start_label": "start",
"end_label": "end",
"label_rotation": 90,
def is_prof_param(key):
"""Determine if *key* corresponds to a valid parameter for a *Profile*."""
return key in {
"plot_title", "y_axis_label", "y_min", "y_max", "averagetype",
"reference_point_label", "start_label", "end_label",
"plot_height", "plot_width", "per_group",
"plot_type", "image_format", "color_list",
"legend_location", "plots_per_row", "label_rotation", "dpi"}
def compute_matrix(bigwig_filenames,
"""Combine information from bigwig files *bigwig_filenames* and bed file
If *plot_filename* is set, write the corresponding meta profile
in this file.
parameters = deepcopy(DEFAULT_PARAMETERS)
heatm = heatmapper.heatmapper()
heatm.computeMatrix(bigwig_filenames, bed_filename, parameters)
if "sample_labels" in parameters:
if "group_labels" in parameters:
# Fixing parameters (as in heatmapper.read_matrix_file
# and heatmapper.save_matrix)
nb_samples = len(heatm.matrix.sample_labels)
hm_params = dict()
for (key, val) in heatm.parameters.items():
if isinstance(val, list) and not val:
val = None
if key in heatm.special_params and not isinstance(val, list):
val = [val] * nb_samples
if not val:
val = [None] * nb_samples
hm_params[key] = val
heatm.parameters = hm_params
if plot_filename is not None:
print(f"plotting profile to {plot_filename}")
prof_params = keyfilter(is_prof_param, parameters)
prof_params["per_group"] = True
prof = Profile(
heatm, plot_filename,
# def get_transcript_structure(fdb, transcript):
# [t_id] = transcript["transcript_id"]
# return list(chain(
# fdb.children(t_id, featuretype="CDS"),
# fdb.children(t_id, featuretype="UTR")))
def main():
"""Main function of the program."""
parser = argparse.ArgumentParser(
"-d", "--feature_db",
help="Path to the sqlite database to read or create.\n"
"If you want to create it, also provide annotations "
"with option *--annot_file*.")
"-a", "--annot_file",
help="Path to the gtf file containing annotations.")
"-g", "--gene_list",
help="Path to a file containing one gene identifier per line.\n"
"The metagene profile will only take into account those genes.\n"
"This should not be used at the same time as option -c")
"-c", "--bed_coordinates",
help="Path to a file containing the coordinates of the features "
"for which to create the meta-profiles.\n"
"This should not be used at the same time as option -g")
"-b", "--bigwig_files",
help="Paths to bigwig files that should be used "
"to represent meta profiles.")
"-l", "--library_labels",
help="Names of the libraries. These should be in the same order "
"as the files given at the -b option.")
"-p", "--profile_plot",
help="Path to the pdf file in wich to write the meta profiles.")
"-t", "--plot_title",
default="Metagene profile",
help="Title to use for the metagene plot.")
help="Minimum value for the Y-axis")
help="Maximum value for the Y-axis")
"-e", "--extra_parameters",
help="Path to a yaml file containing parameters for the "
"*compute_matrix* function.\n"
"possible values are:\n" + PARAMETER_INFO)
args = parser.parse_args()
if args.extra_parameters:
with open(args.extra_parameters, "r") as params_file:
extra_parameters = valmap(fix_none, yload(params_file))
extra_parameters = {}
if args.plot_title:
extra_parameters["group_labels"] = [args.plot_title]
if args.library_labels:
extra_parameters["sample_labels"] = args.library_labels
if args.yMin is not None:
extra_parameters["y_min"] = [float(args.yMin)]
if args.yMax is not None:
extra_parameters["y_max"] = [float(args.yMax)]
if args.profile_plot:
check_options_msg = "Use only one among options -c and -g."
if args.gene_list:
assert not args.bed_coordinates, check_options_msg
with open(args.gene_list, "r") as gene_list_file:
gene_ids = {
gene_id_line.strip() for gene_id_line in gene_list_file}
fdb = open_feature_db(
bed_filename = bedfile_from_gene_list(fdb, gene_ids)
print(f"temporary bed file: {bed_filename}")
if args.bed_coordinates:
assert not args.gene_list, check_options_msg
bed_filename = args.bed_coordinates
print(f"using bed file: {bed_filename}")
args.bigwig_files, bed_filename,
plot_filename=args.profile_plot, **extra_parameters)
if args.gene_list:
return 0
if __name__ == "__main__":
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
"""This script reads data from "tidy" files and makes plots out of
it, at the same scale.
It also outputs a table containing the plotted data points."""
import argparse
import os
import sys
import warnings
from operator import attrgetter, contains
# from functools import partial
import matplotlib as mpl
# To be able to run the script without a defined $DISPLAY
# mpl.use("PDF")
from matplotlib.backends.backend_pgf import FigureCanvasPgf
import pandas as pd
from cytoolz import compose, concat, curry
from libhts import plot_scatter
from libworkflows import save_plot, strip_split
def formatwarning(message, category, filename, lineno, line):
"""Used to format warning messages."""
return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
warnings.formatwarning = formatwarning
OPJ = os.path.join
OPB = os.path.basename
mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)
"text.usetex": True, # use LaTeX to write all text
"pgf.rcfonts": False, # Ignore Matplotlibrc
"pgf.texsystem": "lualatex", # hoping to avoid memory issues
"pgf.preamble": [
r'\usepackage{color}' # xcolor for colours
# mpl.rcParams["figure.figsize"] = 2, 4
mpl.rcParams["font.sans-serif"] = [
"Arial", "Liberation Sans", "Bitstream Vera Sans"]
mpl.rcParams[""] = "sans-serif"
# from matplotlib import numpy as np
# import matplotlib.pyplot as plt
# def grayify_cmap(cmap):
# """Return a grayscale version of the colormap"""
# cmap =
# colors = cmap(np.arange(cmap.N))
# # convert RGBA to perceived greyscale luminance
# # cf.
# rgb_weight = [0.299, 0.587, 0.114]
# luminance = np.sqrt([:, :3] ** 2, rgb_weight))
# colors[:, :3] = luminance[:, np.newaxis]
# return cmap.from_list( + "_grayscale", colors, cmap.N)
def get_gene_list(filename):
"""Given the path to a file containing gene identifiers,
extracts the identifiers and the base name of the file.
Returns a pair (list_name, gene_list)."""
(list_name, _) = os.path.splitext(OPB(filename))
if list_name[-4:] == "_ids":
list_name = list_name[:-4]
with open(filename, "r") as infile:
gene_list = [
strip_split(line)[0] for line in infile.readlines()
if line[0] != "#"]
return (list_name, gene_list)
class Scatterplot:
"""A 2-dimension scatterplot."""
__slots__ = ("data", "x_label", "y_label", "grouping_col")
def fuse_columns(row):
"""Joins text from columns in a pandas DataFrame row with underscores.
Intended to be used with *DataFrame.apply*."""
return "_".join(map(str, row.values))
def __init__(self,
if extra_cols is None:
x_usecols = ["gene", x_column].__contains__
y_usecols = ["gene", y_column].__contains__
x_usecols = ["gene", x_column, *extra_cols].__contains__
y_usecols = ["gene", y_column, *extra_cols].__contains__
x_data = pd.read_table(
x_input_file, index_col="gene", usecols=x_usecols).rename(
columns={x_column: "x"})
y_data = pd.read_table(
y_input_file, index_col="gene", usecols=y_usecols).rename(
columns={y_column: "y"})
# Just some experiments
# from cytoolz import merge_with
# from cytoolz.curried import merge_with as cmerge
# common = x_data.index.intersection(y_data.index)
# as_dicts = merge_with(
# cmerge(set),
# x_data.loc[common].to_dict(orient="index"),
# y_data.loc[common].to_dict(orient="index"))
# data = pd.DataFrame(as_dicts).T = pd.merge(
x_data, y_data,
left_index=True, right_index=True, validate="one_to_one")
if extra_cols is not None:
extra_cols = list(concat((
[colname] if colname in
else [f"{colname}_x", f"{colname}_y"]
for colname in extra_cols)))
# if only one element in extra_cols, it will be overridden:
# "_".join(extra_cols) == extra_cols[0]
if len(extra_cols) > 1: =**{
Scatterplot.fuse_columns, axis=1)})
self.grouping_col = "_".join(extra_cols)
self.grouping_col = None
(self.x_label, self.y_label) = labels
def apply_selector(self, selector, chose_from=None):
"""Returns a list of gene ids based on a *selector* string.
The *selector* string will be used as a query string on **.
If *chose_from* is not empty, gene ids will only been selected
if they belong to *chose_from*."""
if chose_from is None:
chose_from = {}
chose_from = set(chose_from)
if chose_from:
return [gene_id for gene_id in
if gene_id in chose_from]
return [gene_id for gene_id in]
def plot_maker(self, grouping=None, group2colour=None, **kwargs):
"""Builds a plotting function that can colour dots based on them
belonging to a group defined by *grouping*."""
def plot_lfclfc_scatter():
"""Generates the scatterplot, returns its legend so that
*save_plot* can include it in the bounding box."""
# fig, axis = plot_scatter(
# print(kwargs["x_range"])
axis = plot_scatter(,
"x", "y",
except ValueError as err:
if str(err) == "No data to plot.":
warnings.warn("No data to plot.")
return None
# Lines indicating 2-fold threshold.
# Assumes the data are in log2 fold changes
line_style = {
"linewidth": 0.5, "color": "0.5", "linestyle": "dashed"}
if "x_range" in kwargs:
(x_annot_loc, _) = kwargs["x_range"]
x_annot_loc = min(
if "y_range" in kwargs:
(y_annot_loc, _) = kwargs["y_range"]
y_annot_loc = min(
axis.axhline(y=1, **line_style)
f"y = 1", xy=(x_annot_loc, 1), xycoords="data",
axis.axhline(y=-1, **line_style)
f"y = -1", xy=(x_annot_loc, -1), xycoords="data",
axis.axvline(x=1, **line_style)
f"x = -1", xy=(-1, y_annot_loc), xycoords="data",
rotation=90, size="x-small",
axis.axvline(x=-1, **line_style)
f"x = 1", xy=(1, y_annot_loc), xycoords="data",
rotation=90, size="x-small",
# Number of genes beyond lfc thresholds, in each quadrant
# up_up = 100 * len(
# f"x > 1 & y > 1")) / len(
# up_down = 100 * len(
# f"x > 1 & y < 1")) / len(
# down_up = 100 * len(
# f"x < 1 & y > 1")) / len(
# down_down = 100 * len(
# f"x < 1 & y < 1")) / len(