From 08452c07145fff9957b7d7e930b300d7c0599f55 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Mon, 13 May 2019 11:57:19 +0200 Subject: [PATCH] Made libhts a submodule. --- .gitmodules | 3 + libhts | 1 + libhts/.gitignore | 11 - libhts/install.sh | 5 - libhts/libhts/__init__.py | 12 - libhts/libhts/libhts.py | 786 -------------------------------------- libhts/setup.py | 33 -- 7 files changed, 4 insertions(+), 847 deletions(-) create mode 160000 libhts delete mode 100644 libhts/.gitignore delete mode 100755 libhts/install.sh delete mode 100644 libhts/libhts/__init__.py delete mode 100644 libhts/libhts/libhts.py delete mode 100644 libhts/setup.py diff --git a/.gitmodules b/.gitmodules index f723ad7..6430ed0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "libworkflows"] path = libworkflows url = git@gitlab.pasteur.fr:bli/libworkflows.git +[submodule "libhts"] + path = libhts + url = git@gitlab.pasteur.fr:bli/libhts.git diff --git a/libhts b/libhts new file mode 160000 index 0000000..7e25a23 --- /dev/null +++ b/libhts @@ -0,0 +1 @@ +Subproject commit 7e25a230845896610a1a11a8ba91a51c824e9d0f diff --git a/libhts/.gitignore b/libhts/.gitignore deleted file mode 100644 index b4e1667..0000000 --- a/libhts/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -# Compiled python modules. -*.pyc - -# Setuptools distribution folder. -/dist/ - -# Python egg metadata, regenerated from source files by setuptools. -/*.egg-info - -# Backups -*~ diff --git a/libhts/install.sh b/libhts/install.sh deleted file mode 100755 index 5ddc41a..0000000 --- a/libhts/install.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh -python3.6 setup.py build_ext -# .egg-link does not work with PYTHONPATH ? -python3.6 -m pip install -e . -python3.6 -m pip install --no-deps --ignore-installed . diff --git a/libhts/libhts/__init__.py b/libhts/libhts/__init__.py deleted file mode 100644 index a980938..0000000 --- a/libhts/libhts/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .libhts import ( - aligner2min_mapq, - gtf_2_genes_exon_lengths, - id_list_gtf2bed, - make_empty_bigwig, - make_seeding_function, - median_ratio_to_pseudo_ref_size_factors, - plot_boxplots, plot_counts_distribution, plot_histo, - plot_lfc_distribution, plot_MA, - plot_norm_correlations, plot_paired_scatters, plot_scatter, - repeat_bed_2_lengths, size_factor_correlations, - spikein_gtf_2_lengths, status_setter) diff --git a/libhts/libhts/libhts.py b/libhts/libhts/libhts.py deleted file mode 100644 index 5932deb..0000000 --- a/libhts/libhts/libhts.py +++ /dev/null @@ -1,786 +0,0 @@ -from math import floor, ceil, sqrt, log -from functools import reduce -from re import sub -import warnings -from libworkflows import texscape - - -def formatwarning(message, category, filename, lineno, line): - """Used to format warning messages.""" - return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message) - - -warnings.formatwarning = formatwarning - -import numpy as np -import pandas as pd -# To compute correlation coefficient, and compute linear regression -from scipy.stats.stats import pearsonr, linregress -# To compute geometric mean -from scipy.stats.mstats import gmean -import matplotlib as mpl -import matplotlib.pyplot as plt -# https://stackoverflow.com/a/42768093/1878788 -from matplotlib.backends.backend_pgf import FigureCanvasPgf -mpl.backend_bases.register_backend('pdf', FigureCanvasPgf) -TEX_PARAMS = { - "text.usetex": True, # use LaTeX to write all text - "pgf.rcfonts": False, # Ignore Matplotlibrc - "pgf.texsystem": "lualatex", # hoping to avoid memory issues - "pgf.preamble": [ - r'\usepackage{color}' # xcolor for colours - ] -} -mpl.rcParams.update(TEX_PARAMS) -import seaborn as sns -# from rpy2.robjects import r, pandas2ri, Formula, StrVector -# as_df = r("as.data.frame") -# from rpy2.rinterface import RRuntimeError -# from rpy2.robjects.packages import importr -# deseq2 = importr("DESeq2") -from pybedtools import BedTool -import pyBigWig -import networkx as nx - - -class Exon(object): - __slots__ = ("chrom", "start", "end") - def __init__(self, chrom, start, end): - self.chrom = chrom - self.start = start - self.end = end - - def overlap(self, other): - if self.chrom != other.chrom: - return False - return (self.start <= other.start < self.end) or (other.start <= self.start < other.end) - - def merge(self, other): - # Not necessary: can be indirectly linked - #assert overlap(self, other) - return Exon(self.chrom, min(self.start, other.start), max(self.end, other.end)) - - def __len__(self): - return self.end - self.start - -overlap = Exon.overlap -merge = Exon.merge - -class Gene(object): - """This object contains information obtained from a gtf file.""" - __slots__ = ("gene_id", "exons", "union_exon_length") - def __init__(self, gene_id): - self.gene_id = gene_id - #self.transcripts = {} - self.exons = nx.Graph() - self.union_exon_length = None - - #def add_transcript(self, feature): - # the_id = feature.attrs["transcript_id"] - # assert the_id not in self.transcripts - # self.transcripts[the_id] = feature - - def add_exon(self, feature): - #the_id = feature.attrs["exon_id"] - #assert the_id not in self.exons - #self.exons[the_id] = feature - exon = Exon(feature.chrom, feature.start, feature.end) - if exon not in self.exons: - self.exons.add_node(exon) - - # The merging cannot be done on the full BedTool because we dont want - # to merge together exons not belonging to the same gene. - def set_union_exon_length(self): - """The exons are used to make a BedTool, which enables convenient merging of - overlapping features. The sum of the lengths of the merged exons is returned.""" - if len(self.exons) == 1: - # No need to merge when there is only one exon - self.union_exon_length = len(next(iter(self.exons.nodes()))) - else: - # Too slow - #self.union_exon_length = sum(map( - # len, BedTool(self.exons.values()).merge().features())) - #self.union_exon_length = 0 - # We group nodes that overlap, and merge them - #overlapping_exons = nx.quotient_graph(self.exons, overlap) - #for node in overlapping_exons.nodes(): - # mex = reduce(merge, node) - # self.union_exon_length += len(mex) - self.union_exon_length = sum((len(reduce( - merge, node)) for node in nx.quotient_graph(self.exons, overlap).nodes())) - - -def gtf_2_genes_exon_lengths(gtf_filename): - """Returns a pandas DataFrame where union exon lengths are associated to gene IDs.""" - gtf_file = open(gtf_filename, "r") - gtf = BedTool(gtf_file) - genes = {} - for feature in gtf.features(): - feat_type = feature[2] - if feat_type != "exon": - continue - attrs = feature.attrs - gene_id = attrs["gene_id"] - if gene_id not in genes: - genes[gene_id] = Gene(gene_id) - gene = genes[gene_id] - try: - gene.add_exon(feature) - except AssertionError: - # A given exon may be registered for several transcripts, hence several gtf entries - already = gene.exons[feature.attrs["exon_id"]] - assert already.attrs["transcript_id"] != feature.attrs["transcript_id"] - assert (already.start, already.end) == (feature.start, feature.end) - for gene in genes.values(): - gene.set_union_exon_length() - return pd.DataFrame(pd.Series( - {gene.gene_id : gene.union_exon_length for gene in genes.values()}, - name=("union_exon_len")).rename_axis("gene")) - - -def repeat_bed_2_lengths(repeat_bed): - """Computes the lengths of repetitive elements in a bed file, grouped by families. - This assumes that the elements have their names composed of the family name, - then a colon, then a number. For instance: - Simple_repeat|Simple_repeat|(TTTTTTG)n:1 - Simple_repeat|Simple_repeat|(TTTTTTG)n:2 - Simple_repeat|Simple_repeat|(TTTTTTG)n:3 - Simple_repeat|Simple_repeat|(TTTTTTG)n:4 - -> Simple_repeat|Simple_repeat|(TTTTTTG)n - Returns a DataFrame associating the summed lengths to the family names. - """ - # usecols=[1, 2, 3]: start, end, id - # index_col=2: id (relative to the selected columns) - start_ends = pd.read_table(repeat_bed, usecols=[1, 2, 3], header=None, index_col=2) - # bed lengths - lens = start_ends[2] - start_ends[1] - lens.name = "union_exon_len" - repeat_families = [":".join(name.split(":")[:-1]) for name in start_ends.index] - # The reads assigned to a repeated element can come - # from the summed length of all the members of the family - # We call this "gene" for convenience and compatibility - return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum() - - -def spikein_gtf_2_lengths(spikein_gtf): - """Computes the lengths of spike-ins, grouped by families. - Returns a DataFrame associating the summed lengths to the spike-in names. - """ - spikein_ends = {} - with open(spikein_gtf) as gtf_file: - for line in gtf_file: - fields = line.strip().split("\t") - spikein_ends[fields[0]] = int(fields[4]) - return pd.DataFrame(pd.Series( - spikein_ends, - name=("union_exon_len")).rename_axis("gene")) - - -def id_list_gtf2bed(identifiers, gtf_filename, feature_type="transcript", id_kwd="gene_id"): - """ - Extract bed coordinates of an iterable of identifiers from a gtf file. - - *identifiers* is the iterable of identifiers. - *gtf_filename* is the name of the gtf file. - *feature_type* is the type of feature to be considered - in the gtf file (third columns). - *id_kwd* is the keyword under which the feature ID is expected to be found - in the feature annotations in the gtf_file. These feature IDs will be - matched against the elements in *identifiers*. - """ - with open(gtf_filename, "r") as gtf_file: - gtf = BedTool(gtf_file) - if identifiers: - ids = set(identifiers) - def feature_filter(feature): - return feature[2] == feature_type and feature[id_kwd] in ids - return gtf.filter(feature_filter) - else: - # https://stackoverflow.com/a/13243870/1878788 - def empty_bed_generator(): - return - yield - return empty_bed_generator() - - -def make_empty_bigwig(filename, chrom_sizes): - """Writes *filename* so that it is an empty bigwig file. - *chrom_sizes is a dictionary giving chromosome sizes* given - chomosome names. - """ - bw_out = pyBigWig.open(filename, "w") - bw_out.addHeader(list(chrom_sizes.items())) - for (chrom, chrom_len) in bw_out.chroms().items(): - bw_out.addEntries( - chrom, 0, - values=np.nan_to_num(np.zeros(chrom_len)[0::10]), - span=10, step=10) - bw_out.close() - - -################# -# Bowtie2 stuff # -################# -def zero(value): - return 0 - - -def identity(value): - return value - - -bowtie2_function_selector = { - "C": zero, - "L": identity, - "S": sqrt, - "G": log} - - -def make_seeding_function(seeding_string): - """Generates a function that computes the seeding pattern given a - string representing bowtie2 seeding settings (-L and -i options). - - >>> make_seeding_function("-L 6 -i S,1,0.8")(18) - [[0, 6], [4, 10], [8, 14], [12, 18]] - """ - [opt1, val1, opt2, val2] = seeding_string.split() - if opt1 == "-L": - assert opt2 == "-i" - seed_len = int(val1) - interval_string = val2 - else: - assert opt2 == "-L" - seed_len = int(val2) - interval_string = val1 - [func_type, constant, coeff] = interval_string.split(",") - constant = float(constant) - coeff = float(coeff) - func_type = bowtie2_function_selector[func_type] - def seeding_function(read_len): - interval = floor(constant + (coeff * func_type(read_len))) - seeds = [] - seed = [0, seed_len] - while seed[1] <= read_len: - seeds.append(seed) - next_seed_start = seed[0] + interval - seed = [next_seed_start, next_seed_start + seed_len] - return seeds - return seeding_function - - -def aligner2min_mapq(aligner, wildcards): - """ - Option to filter on MAPQ value in featureCounts. - - What minimal MAPQ value should a read have to be considered uniquely mapped? - See <https://sequencing.qcfail.com/articles/mapq-values-are-really-useful-but-their-implementation-is-a-mess/>. - """ - try: - mapping_type = wildcards.mapping_type - except AttributeError: - mapping_type = wildcards.mapped_type - if mapping_type.startswith("unique_"): - if aligner == "hisat2": - return "-Q 60" - elif aligner == "bowtie2": - return "-Q 23" - else: - raise NotImplementedError(f"{aligner} not handled (yet?)") - else: - return "" - - -## Not sure this is a good idea... -# def masked_gmean(a, axis=0, dtype=None): -# """Modified from stats.py.""" -# # Converts the data into a masked array -# ma = np.ma.masked_invalid(a) -# # Apply gmean -# if not isinstance(ma, np.ndarray): -# # if not an ndarray object attempt to convert it -# log_a = np.log(np.array(ma, dtype=dtype)) -# elif dtype: -# # Must change the default dtype allowing array type -# if isinstance(ma, np.ma.MaskedArray): -# log_a = np.log(np.ma.asarray(ma, dtype=dtype)) -# else: -# log_a = np.log(np.asarray(ma, dtype=dtype)) -# else: -# log_a = np.log(ma) -# return np.exp(log_a.mean(axis=axis)) - - - -def median_ratio_to_pseudo_ref_size_factors(counts_data): - """Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) - All libraries are used to define a pseudo-reference, which has - the geometric mean across libraries for a given gene in *counts_data*. - For a given library, the median across genes of the ratios to the - pseudo-reference is used as size factor.""" - # Add pseudo-count to compute the geometric mean, then remove it - #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 - # Ignore lines with zeroes instead (may be bad for IP: many zeroes expected): - pseudo_ref = (counts_data[counts_data.prod(axis=1) > 0]).apply(gmean, axis=1) - # Ignore lines with only zeroes - # pseudo_ref = (counts_data[counts_data.sum(axis=1) > 0]).apply(masked_gmean, axis=1) - def median_ratio_to_pseudo_ref(col): - return (col / pseudo_ref).median() - #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) - median_ratios = counts_data[counts_data.prod(axis=1) > 0].apply( - median_ratio_to_pseudo_ref, axis=0) - # Not sure fillna(0) is appropriate - if any(median_ratios.isna()): - msg = "Could not compute median ratios to pseudo reference.\n" - warnings.warn(msg) - return median_ratios.fillna(1) - else: - return median_ratios - - -def size_factor_correlations(counts_data, summaries, normalizer): - """Is there a correlation, across libraries, between normalized values and size factors? - The size factor type *normalizer* is either computed or taken from *summaries*. - The normalized data are computed by dividing *counts_data* by this size factor.""" - if normalizer == "median_ratio_to_pseudo_ref": - size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) - else: - size_factors = summaries.loc[normalizer] - #by_norm = counts_data / size_factors - def compute_pearsonr_with_size_factor(row): - return pearsonr(row, size_factors)[0] - #return by_norm.apply(compute_pearsonr_with_size_factor, axis=1) - return (counts_data / size_factors).apply(compute_pearsonr_with_size_factor, axis=1) - - -def plot_norm_correlations(correlations): - #fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True) - #correlations.plot.kde(ax=ax1) - #sns.violinplot(data=correlations, orient="h", ax=ax2) - #ax2.set_xlabel("Pearson correlation coefficient") - usetex = mpl.rcParams.get("text.usetex", False) - if usetex: - correlations.columns = [texscape(colname) for colname in correlations.columns] - axis = sns.violinplot(data=correlations, cut=0) - axis.set_ylabel("Pearson correlation coefficient") - - -def plot_counts_distribution(data, xlabel): - # TODO: try to plot with semilog x axis - #axis = data.plot.kde(legend=None) - #axis.set_xlabel(xlabel) - #axis.legend(ncol=len(REPS)) - usetex = mpl.rcParams.get("text.usetex", False) - if usetex: - xlabel = texscape(xlabel) - data.columns = [texscape(colname) for colname in data.columns] - try: - axis = data.plot.kde() - except ValueError as e: - msg = "".join([ - "There seems to be a problem with the data.\n", - "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))]) - warnings.warn(msg) - raise - axis.set_xlabel(xlabel) - - -def plot_histo(outfile, data, title=None): - fig = plt.figure(figsize=(15,7)) - ax = fig.add_subplot(111) - ax.set_xlim([data.index[0] - 0.5, data.index[-1] + 0.5]) - #ax.set_ylim([0, 100]) - bar_width = 0.8 - #letter2legend = dict(zip("ACGT", "ACGT")) - usetex = mpl.rcParams.get("text.usetex", False) - if usetex: - data.columns = [texscape(colname) for colname in data.columns] - #title = sub("_", r"\_", title) - title = texscape(title) - for (read_len, count) in data.iterrows(): - plt.bar( - read_len, - count, - align="center", - width=bar_width) - #color=letter2colour[letter], - #label=letter2legend[letter]) - ax.legend() - ax.set_xticks(data.index) - ax.set_xticklabels(data.index) - ax.set_xlabel("read length") - ax.set_ylabel("number of reads") - plt.setp(ax.get_xticklabels(), rotation=90) - if title is not None: - plt.title(title) - ## debug - try: - plt.savefig(outfile) - except RuntimeError as e: - print(data.index) - print(title) - raise - ## - - -def plot_boxplots(data, ylabel): - fig = plt.figure(figsize=(6, 12)) - axis = fig.add_subplot(111) - usetex = mpl.rcParams.get("text.usetex", False) - if usetex: - ylabel = texscape(ylabel) - data.columns = [texscape(colname) for colname in data.columns] - data.plot.box(ax=axis) - axis.set_ylabel(ylabel) - for label in axis.get_xticklabels(): - label.set_rotation(90) - plt.tight_layout() - - -############ -# DE stuff # -############ -# def do_deseq2(cond_names, conditions, counts_data, -# formula=None, contrast=None, deseq2_args=None): -# """Runs a DESeq2 differential expression analysis.""" -# if formula is None: -# formula = Formula("~ lib") -# if contrast is None: -# # FIXME: MUT and REF are not defined -# # Maybe just make (formula and) contrast mandatory -# contrast = StrVector(["lib", MUT, REF]) -# if deseq2_args is None: -# deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True} -# col_data = pd.DataFrame(conditions).assign( -# cond_name=pd.Series(cond_names).values).set_index("cond_name") -# # In case we want contrasts between factor combinations -# if ("lib" in col_data.columns) and ("treat" in col_data.columns): -# col_data = col_data.assign( -# lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip( -# col_data["lib"], col_data["treat"])]) -# # http://stackoverflow.com/a/31206596/1878788 -# pandas2ri.activate() # makes some conversions automatic -# # r_counts_data = pandas2ri.py2ri(counts_data) -# # r_col_data = pandas2ri.py2ri(col_data) -# # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib")) -# dds = deseq2.DESeqDataSetFromMatrix( -# countData=counts_data, -# colData=col_data, -# design=formula) -# # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"]) -# # Decompose into the 3 steps to have more control on the options -# try: -# dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio") -# except RRuntimeError as e: -# if sum(counts_data.prod(axis=1)) == 0: -# msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, -# "This is probably because every gene has at least one zero.\n", -# "We will try to use the \"poscounts\" method instead."]) -# warnings.warn(msg) -# try: -# dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts") -# except RRuntimeError as e: -# msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, -# "We give up."]) -# warnings.warn(msg) -# raise -# #print(counts_data.dtypes) -# #print(counts_data.columns) -# #print(len(counts_data)) -# #raise -# else: -# raise -# size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds))) -# #for cond in cond_names: -# # #s = size_factors.loc[cond][0] -# # #(*_, s) = size_factors.loc[cond] -# #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',)) -# try: -# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric") -# except RRuntimeError as e: -# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, -# "We will try with fitType=\"local\"."]) -# warnings.warn(msg) -# try: -# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local") -# except RRuntimeError as e: -# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, -# "We will try with fitType=\"mean\"."]) -# warnings.warn(msg) -# try: -# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean") -# except RRuntimeError as e: -# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, -# "We give up."]) -# warnings.warn(msg) -# raise -# dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"]) -# res = pandas2ri.ri2py(as_df(deseq2.results( -# dds, -# contrast=contrast, -# addMLE=deseq2_args["addMLE"], -# independentFiltering=deseq2_args["independentFiltering"]))) -# res.index = counts_data.index -# return res, {cond : size_factors.loc[cond][0] for cond in cond_names} - - -# Cutoffs in log fold change -LFC_CUTOFFS = [0.5, 1, 2] -UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS] -DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS] - - -def status_setter(lfc_cutoffs=None, fold_type="log2FoldChange"): - """*fold_type* can also be "lfcMLE", which is based on uncorrected values. - This may not be good for genes with low expression levels.""" - if lfc_cutoffs is None: - lfc_cutoffs = LFC_CUTOFFS - def set_status(row): - """Determines the up- or down-regulation status corresponding to a given - row of a deseq2 results table.""" - if row["padj"] < 0.05: - #if row["log2FoldChange"] > 0: - lfc = row[fold_type] - if lfc > 0: - # Start from the highest cutoff, - # and decrease until below lfc - for cutoff in sorted(lfc_cutoffs, reverse=True): - if lfc > cutoff: - return f"up{cutoff}" - return "up" - else: - for cutoff in sorted(lfc_cutoffs, reverse=True): - if lfc < -cutoff: - return f"down{cutoff}" - return "down" - else: - return "NS" - return set_status - - -# res = res.assign(is_DE=res.apply(set_de_status, axis=1)) -def set_de_status(row): - """Determines whether a gene is differentially expressed (DE) of not (NS) - based on the adjusted p-value in row of a deseq2 results table.""" - if row["padj"] < 0.05: - return "DE" - else: - return "NS" -DE2COLOUR = { - # black - "DE" : "k", - # pale grey - "NS" : "0.85"} - - -def plot_lfc_distribution(res, contrast, fold_type=None): - """*fold_type* is "log2FoldChange" by default. - It can also be "lfcMLE", which is based on uncorrected values. - This may not be good for genes with low expression levels.""" - #lfc = res.lfcMLE.dropna() - if fold_type is None: - fold_type = "log2FoldChange" - lfc = getattr(res, fold_type).dropna() - usetex = mpl.rcParams.get("text.usetex", False) - if usetex: - lfc.name = texscape(contrast) - else: - lfc.name = contrast - # lfc.columns = [texscape(colname) for colname in lfc.columns] - axis = sns.kdeplot(lfc) - axis.set_xlabel(fold_type) - axis.set_ylabel("frequency") - - -def make_status2colour(down_statuses, up_statuses): - statuses = list(reversed(down_statuses)) + ["down", "NS", "up"] + up_statuses - return dict(zip(statuses, sns.color_palette("coolwarm", len(statuses)))) - - -STATUS2COLOUR = make_status2colour(DOWN_STATUSES, UP_STATUSES) -# TODO: use other labelling than logfold or gene lists, i.e. biotype -def plot_MA(res, - grouping=None, - group2colour=None, - mean_range=None, - lfc_range=None, - fold_type=None): - """*fold_type* is "log2FoldChange" by default. - It can also be "lfcMLE", which is based on uncorrected values. - This may not be good for genes with low expression levels.""" - if not len(res): - raise ValueError("No data to plot.") - fig, ax = plt.subplots() - # Make a column indicating whether the gene is DE or NS - data = res.assign(is_DE=res.apply(set_de_status, axis=1)) - x_column = "baseMean" - data = data.assign(logx=np.log10(data[x_column])) - if fold_type is None: - y_column = "log2FoldChange" - else: - y_column = fold_type - usetex = mpl.rcParams.get("text.usetex", False) - def scatter_group(group, label, colour, size=1): - """Plots the data in *group* on the scatterplot.""" - if usetex: - label = texscape(label) - group.plot.scatter( - #x=x_column, - x="logx", - y=y_column, - s=size, - #logx=True, - c=colour, - label=label, ax=ax) - if usetex: - data.columns = [texscape(colname) for colname in data.columns] - y_column = texscape(y_column) - de_status_column = "is\_DE" - else: - de_status_column = "is_DE" - # First plot the data in grey and black - for (de_status, group) in data.groupby(de_status_column): - label = f"{de_status} ({len(group)})" - colour = DE2COLOUR[de_status] - scatter_group(group, label, colour, size=2) - if grouping is not None: - if isinstance(grouping, str): - # Overlay colours based on the "grouping" column - if group2colour is None: - group2colour = STATUS2COLOUR - for status, group in data.groupby(grouping): - label = f"{status} ({len(group)})" - colour = group2colour[status] - scatter_group(group, label, colour) - else: - (status, colour) = group2colour - row_indices = data.index.intersection(grouping) - try: - label=f"{status} ({len(row_indices)})" - scatter_group(data.ix[row_indices], label, colour) - except ValueError as e: - if str(e) != "scatter requires x column to be numeric": - print(data.ix[row_indices]) - raise - else: - warnings.warn(f"Nothing to plot for {status}\n") - ax.axhline(y=1, linewidth=0.5, color="0.5", linestyle="dashed") - ax.axhline(y=-1, linewidth=0.5, color="0.5", linestyle="dashed") - # TODO: check data basemean range - if mean_range is not None: - # ax.set_xlim(mean_range) - ax.set_xlim(np.log10(mean_range)) - if lfc_range is not None: - (lfc_min, lfc_max) = lfc_range - lfc_here_min = getattr(data, y_column).min() - lfc_here_max = getattr(data, y_column).max() - if (lfc_here_min < lfc_min) or (lfc_here_max > lfc_max): - warnings.warn(f"Cannot plot {y_column} data ([{lfc_here_min}, {lfc_here_max}]) in requested range ([{lfc_min}, {lfc_max}])\n") - else: - ax.set_ylim(lfc_range) - # https://stackoverflow.com/a/24867320/1878788 - x_ticks = np.arange( - floor(np.ma.masked_invalid(data["logx"]).min()), - ceil(np.ma.masked_invalid(data["logx"]).max()), - 1) - x_ticklabels = [r"$10^{{{}}}$".format(tick) for tick in x_ticks] - plt.xticks(x_ticks, x_ticklabels) - ax.set_xlabel(x_column) - - -def plot_scatter(data, - x_column, - y_column, - regression=False, - grouping=None, - group2colour=None, - x_range=None, - y_range=None, - axes_style=None): - if not len(data): - raise ValueError("No data to plot.") - fig, ax = plt.subplots() - # ax.set_adjustable('box') - # First plot the data in grey - data.plot.scatter( - x=x_column, y=y_column, - s=2, c="black", alpha=0.15, edgecolors='none', - ax=ax) - if regression: - linreg = linregress(data[[x_column, y_column]].dropna()) - a = linreg.slope - b = linreg.intercept - def fit(x): - return (a * x) + b - min_x = data[[x_column]].min()[0] - max_x = data[[x_column]].max()[0] - min_y = fit(min_x) - max_y = fit(max_x) - xfit, yfit = (min_x, max_x), (min_y, max_y) - ax.plot(xfit, yfit, linewidth=0.5, color="0.5", linestyle="dashed") - # Overlay colour points - if grouping is not None: - if isinstance(grouping, str): - # Determine colours based on the "grouping" column - if group2colour is None: - statuses = data[grouping].unique() - group2colour = dict(zip(statuses, sns.color_palette("colorblind", len(statuses)))) - for (status, group) in data.groupby(grouping): - group.plot.scatter( - x=x_column, y=y_column, s=1, c=group2colour[status], - label=f"{status} ({len(group)})", ax=ax) - else: - # Apply a colour to a list of genes - (status, colour) = group2colour - row_indices = data.index.intersection(grouping) - try: - data.ix[row_indices].plot.scatter( - x=x_column, y=y_column, s=1, c=colour, - label=f"{status} ({len(row_indices)})", ax=ax) - except ValueError as e: - if str(e) != "scatter requires x column to be numeric": - print(data.ix[row_indices]) - raise - else: - warnings.warn(f"Nothing to plot for {status}\n") - if axes_style is None: - axes_style = {"linewidth": 0.5, "color": "0.5", "linestyle": "dashed"} - ax.axhline(y=0, **axes_style) - ax.axvline(x=0, **axes_style) - # ax.axhline(y=0, linewidth=0.5, color="0.5", linestyle="dashed") - # ax.axvline(x=0, linewidth=0.5, color="0.5", linestyle="dashed") - # Set axis limits - if x_range is not None: - (x_min, x_max) = x_range - x_here_min = getattr(data, x_column).min() - x_here_max = getattr(data, x_column).max() - if (x_here_min < x_min) or (x_here_max > x_max): - warnings.warn(f"Cannot plot {x_column} data ([{x_here_min}, {x_here_max}]) in requested range ([{x_min}, {x_max}])\n") - else: - ax.set_xlim(x_range) - if y_range is not None: - (y_min, y_max) = y_range - y_here_min = getattr(data, y_column).min() - y_here_max = getattr(data, y_column).max() - if (y_here_min < y_min) or (y_here_max > y_max): - warnings.warn(f"Cannot plot {y_column} data ([{y_here_min}, {y_here_max}]) in requested range ([{y_min}, {y_max}])\n") - else: - ax.set_ylim(y_range) - return ax - - -def plot_paired_scatters(data, columns=None, hue=None, log_log=False): - """Alternative to pairplot, in order to avoid histograms on the diagonal.""" - if columns is None: - columns = data.columns - usetex = mpl.rcParams.get("text.usetex", False) - if usetex: - data.columns = [texscape(colname) for colname in data.columns] - columns = [texscape(colname) for colname in columns] - g = sns.PairGrid(data, vars=columns, hue=hue, size=8) - #g.map_offdiag(plt.scatter, marker=".") - g.map_lower(plt.scatter, marker=".") - if log_log: - for ax in g.axes.ravel(): - ax.set_xscale('log') - ax.set_yscale('log') - g.add_legend() diff --git a/libhts/setup.py b/libhts/setup.py deleted file mode 100644 index cbe5dac..0000000 --- a/libhts/setup.py +++ /dev/null @@ -1,33 +0,0 @@ -from setuptools import setup, find_packages -#from Cython.Build import cythonize - -name = "libhts" - -# Adapted from Biopython -__version__ = "Undefined" -for line in open("%s/__init__.py" % name): - if (line.startswith('__version__')): - exec(line.strip()) - - -setup( - name=name, - version=__version__, - description="Miscellaneous things to process high throughput sequencing data.", - author="Blaise Li", - author_email="blaise.li@normalesup.org", - license="MIT", - packages=find_packages(), - install_requires=[ - # "libworkflows", - "matplotlib", - "networkx", - "numpy", - "pandas", - "pyBigWig", - "pybedtools", - "scipy", - "seaborn"]) - #ext_modules = cythonize("libsmallrna/libsmallrna.pyx"), - #install_requires=["cytoolz"], - #zip_safe=False -- GitLab