Commit 2a4d2234 authored by Blaise Li's avatar Blaise Li
Browse files

Function to generate bedtool from gene ids.

It needs a list of identifiers and an annotation file, and generates a
pybedtools.BedTool.
parent a927a057
from .libhts import (
do_deseq2, gtf_2_genes_exon_lengths,
id_list_gtf2bed,
make_empty_bigwig,
make_seeding_function,
median_ratio_to_pseudo_ref_size_factors,
......
......@@ -176,6 +176,33 @@ def spikein_gtf_2_lengths(spikein_gtf):
name=("union_exon_len")).rename_axis("gene"))
def id_list_gtf2bed(identifiers, gtf_filename, feature_type="transcript", id_kwd="gene_id"):
"""
Extract bed coordinates of an iterable of identifiers from a gtf file.
*identifiers* is the iterable of identifiers.
*gtf_filename* is the name of the gtf file.
*feature_type* is the type of feature to be considered
in the gtf file (third columns).
*id_kwd* is the keyword under which the feature ID is expected to be found
in the feature annotations in the gtf_file. These feature IDs will be
matched against the elements in *identifiers*.
"""
gtf_file = open(gtf_filename, "r")
gtf = BedTool(gtf_file)
if identifiers:
ids = set(identifiers)
def feature_filter(feature):
return feature[2] == feature_type and feature[id_kwd] in ids
return gtf.filter(feature_filter)
else:
# https://stackoverflow.com/a/13243870/1878788
def empty_bed_generator():
return
yield
return empty_bed_generator()
def make_empty_bigwig(filename, chrom_sizes):
"""Writes *filename* so that it is an empty bigwig file.
*chrom_sizes is a dictionary giving chromosome sizes* given
......@@ -191,6 +218,9 @@ def make_empty_bigwig(filename, chrom_sizes):
bw_out.close()
#################
# Bowtie2 stuff #
#################
def zero(value):
return 0
......@@ -238,90 +268,6 @@ def make_seeding_function(seeding_string):
return seeding_function
def do_deseq2(cond_names, conditions, counts_data,
formula=None, contrast=None, deseq2_args=None):
"""Runs a DESeq2 differential expression analysis."""
if formula is None:
formula = Formula("~ lib")
if contrast is None:
# FIXME: MUT and REF are not defined
# Maybe just make (formula and) contrast mandatory
contrast = StrVector(["lib", MUT, REF])
if deseq2_args is None:
deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
col_data = pd.DataFrame(conditions).assign(
cond_name=pd.Series(cond_names).values).set_index("cond_name")
# In case we want contrasts between factor combinations
if ("lib" in col_data.columns) and ("treat" in col_data.columns):
col_data = col_data.assign(
lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
col_data["lib"], col_data["treat"])])
# http://stackoverflow.com/a/31206596/1878788
pandas2ri.activate() # makes some conversions automatic
# r_counts_data = pandas2ri.py2ri(counts_data)
# r_col_data = pandas2ri.py2ri(col_data)
# r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
dds = deseq2.DESeqDataSetFromMatrix(
countData=counts_data,
colData=col_data,
design=formula)
# dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
# Decompose into the 3 steps to have more control on the options
try:
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
except RRuntimeError as e:
if sum(counts_data.prod(axis=1)) == 0:
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
"This is probably because every gene has at least one zero.\n",
"We will try to use the \"poscounts\" method instead."])
warnings.warn(msg)
try:
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
"We give up."])
warnings.warn(msg)
raise
#print(counts_data.dtypes)
#print(counts_data.columns)
#print(len(counts_data))
#raise
else:
raise
size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
#for cond in cond_names:
# #s = size_factors.loc[cond][0]
# #(*_, s) = size_factors.loc[cond]
#pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We will try with fitType=\"local\"."])
warnings.warn(msg)
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We will try with fitType=\"mean\"."])
warnings.warn(msg)
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We give up."])
warnings.warn(msg)
raise
dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
res = pandas2ri.ri2py(as_df(deseq2.results(
dds,
contrast=contrast,
addMLE=deseq2_args["addMLE"],
independentFiltering=deseq2_args["independentFiltering"])))
res.index = counts_data.index
return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
## Not sure this is a good idea...
# def masked_gmean(a, axis=0, dtype=None):
# """Modified from stats.py."""
......@@ -461,6 +407,93 @@ def plot_boxplots(data, ylabel):
plt.tight_layout()
############
# DE stuff #
############
def do_deseq2(cond_names, conditions, counts_data,
formula=None, contrast=None, deseq2_args=None):
"""Runs a DESeq2 differential expression analysis."""
if formula is None:
formula = Formula("~ lib")
if contrast is None:
# FIXME: MUT and REF are not defined
# Maybe just make (formula and) contrast mandatory
contrast = StrVector(["lib", MUT, REF])
if deseq2_args is None:
deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
col_data = pd.DataFrame(conditions).assign(
cond_name=pd.Series(cond_names).values).set_index("cond_name")
# In case we want contrasts between factor combinations
if ("lib" in col_data.columns) and ("treat" in col_data.columns):
col_data = col_data.assign(
lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
col_data["lib"], col_data["treat"])])
# http://stackoverflow.com/a/31206596/1878788
pandas2ri.activate() # makes some conversions automatic
# r_counts_data = pandas2ri.py2ri(counts_data)
# r_col_data = pandas2ri.py2ri(col_data)
# r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
dds = deseq2.DESeqDataSetFromMatrix(
countData=counts_data,
colData=col_data,
design=formula)
# dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
# Decompose into the 3 steps to have more control on the options
try:
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
except RRuntimeError as e:
if sum(counts_data.prod(axis=1)) == 0:
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
"This is probably because every gene has at least one zero.\n",
"We will try to use the \"poscounts\" method instead."])
warnings.warn(msg)
try:
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
"We give up."])
warnings.warn(msg)
raise
#print(counts_data.dtypes)
#print(counts_data.columns)
#print(len(counts_data))
#raise
else:
raise
size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
#for cond in cond_names:
# #s = size_factors.loc[cond][0]
# #(*_, s) = size_factors.loc[cond]
#pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We will try with fitType=\"local\"."])
warnings.warn(msg)
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We will try with fitType=\"mean\"."])
warnings.warn(msg)
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We give up."])
warnings.warn(msg)
raise
dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
res = pandas2ri.ri2py(as_df(deseq2.results(
dds,
contrast=contrast,
addMLE=deseq2_args["addMLE"],
independentFiltering=deseq2_args["independentFiltering"])))
res.index = counts_data.index
return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
......@@ -701,6 +734,7 @@ def plot_scatter(data,
ax.set_ylim(y_range)
return ax
def plot_paired_scatters(data, columns=None, hue=None, log_log=False):
"""Alternative to pairplot, in order to avoid histograms on the diagonal."""
if columns is None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment