Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Blaise LI
libhts
Commits
fd4e0679
Commit
fd4e0679
authored
Aug 10, 2018
by
Blaise Li
Browse files
Moved DESEq2 calling function in its own module.
parent
2a4d2234
Changes
3
Hide whitespace changes
Inline
Side-by-side
.gitignore
0 → 100644
View file @
fd4e0679
# Compiled python modules.
*.pyc
# Setuptools distribution folder.
/dist/
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
# Backups
*~
libhts/__init__.py
View file @
fd4e0679
from
.libhts
import
(
do_deseq2
,
gtf_2_genes_exon_lengths
,
gtf_2_genes_exon_lengths
,
id_list_gtf2bed
,
make_empty_bigwig
,
make_seeding_function
,
...
...
libhts/libhts.py
View file @
fd4e0679
...
...
@@ -33,11 +33,11 @@ TEX_PARAMS = {
}
mpl
.
rcParams
.
update
(
TEX_PARAMS
)
import
seaborn
as
sns
from
rpy2.robjects
import
r
,
pandas2ri
,
Formula
,
StrVector
as_df
=
r
(
"as.data.frame"
)
from
rpy2.rinterface
import
RRuntimeError
from
rpy2.robjects.packages
import
importr
deseq2
=
importr
(
"DESeq2"
)
#
from rpy2.robjects import r, pandas2ri, Formula, StrVector
#
as_df = r("as.data.frame")
#
from rpy2.rinterface import RRuntimeError
#
from rpy2.robjects.packages import importr
#
deseq2 = importr("DESeq2")
from
pybedtools
import
BedTool
import
pyBigWig
import
networkx
as
nx
...
...
@@ -410,88 +410,88 @@ def plot_boxplots(data, ylabel):
############
# DE stuff #
############
def
do_deseq2
(
cond_names
,
conditions
,
counts_data
,
formula
=
None
,
contrast
=
None
,
deseq2_args
=
None
):
"""Runs a DESeq2 differential expression analysis."""
if
formula
is
None
:
formula
=
Formula
(
"~ lib"
)
if
contrast
is
None
:
# FIXME: MUT and REF are not defined
# Maybe just make (formula and) contrast mandatory
contrast
=
StrVector
([
"lib"
,
MUT
,
REF
])
if
deseq2_args
is
None
:
deseq2_args
=
{
"betaPrior"
:
True
,
"addMLE"
:
True
,
"independentFiltering"
:
True
}
col_data
=
pd
.
DataFrame
(
conditions
).
assign
(
cond_name
=
pd
.
Series
(
cond_names
).
values
).
set_index
(
"cond_name"
)
# In case we want contrasts between factor combinations
if
(
"lib"
in
col_data
.
columns
)
and
(
"treat"
in
col_data
.
columns
):
col_data
=
col_data
.
assign
(
lib_treat
=
[
"%s_%s"
%
(
lib
,
treat
)
for
(
lib
,
treat
)
in
zip
(
col_data
[
"lib"
],
col_data
[
"treat"
])])
# http://stackoverflow.com/a/31206596/1878788
pandas2ri
.
activate
()
# makes some conversions automatic
# r_counts_data = pandas2ri.py2ri(counts_data)
# r_col_data = pandas2ri.py2ri(col_data)
# r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
dds
=
deseq2
.
DESeqDataSetFromMatrix
(
countData
=
counts_data
,
colData
=
col_data
,
design
=
formula
)
# dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
# Decompose into the 3 steps to have more control on the options
try
:
dds
=
deseq2
.
estimateSizeFactors_DESeqDataSet
(
dds
,
type
=
"ratio"
)
except
RRuntimeError
as
e
:
if
sum
(
counts_data
.
prod
(
axis
=
1
))
==
0
:
msg
=
""
.
join
([
"Error occurred in estimateSizeFactors:
\n
%s
\n
"
%
e
,
"This is probably because every gene has at least one zero.
\n
"
,
"We will try to use the
\"
poscounts
\"
method instead."
])
warnings
.
warn
(
msg
)
try
:
dds
=
deseq2
.
estimateSizeFactors_DESeqDataSet
(
dds
,
type
=
"poscounts"
)
except
RRuntimeError
as
e
:
msg
=
""
.
join
([
"Error occurred in estimateSizeFactors:
\n
%s
\n
"
%
e
,
"We give up."
])
warnings
.
warn
(
msg
)
raise
#print(counts_data.dtypes)
#print(counts_data.columns)
#print(len(counts_data))
#raise
else
:
raise
size_factors
=
pandas2ri
.
ri2py
(
as_df
(
deseq2
.
sizeFactors_DESeqDataSet
(
dds
)))
#for cond in cond_names:
# #s = size_factors.loc[cond][0]
# #(*_, s) = size_factors.loc[cond]
#pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
try
:
dds
=
deseq2
.
estimateDispersions_DESeqDataSet
(
dds
,
fitType
=
"parametric"
)
except
RRuntimeError
as
e
:
msg
=
""
.
join
([
"Error occurred in estimateDispersions:
\n
%s
\n
"
%
e
,
"We will try with fitType=
\"
local
\"
."
])
warnings
.
warn
(
msg
)
try
:
dds
=
deseq2
.
estimateDispersions_DESeqDataSet
(
dds
,
fitType
=
"local"
)
except
RRuntimeError
as
e
:
msg
=
""
.
join
([
"Error occurred in estimateDispersions:
\n
%s
\n
"
%
e
,
"We will try with fitType=
\"
mean
\"
."
])
warnings
.
warn
(
msg
)
try
:
dds
=
deseq2
.
estimateDispersions_DESeqDataSet
(
dds
,
fitType
=
"mean"
)
except
RRuntimeError
as
e
:
msg
=
""
.
join
([
"Error occurred in estimateDispersions:
\n
%s
\n
"
%
e
,
"We give up."
])
warnings
.
warn
(
msg
)
raise
dds
=
deseq2
.
nbinomWaldTest
(
dds
,
betaPrior
=
deseq2_args
[
"betaPrior"
])
res
=
pandas2ri
.
ri2py
(
as_df
(
deseq2
.
results
(
dds
,
contrast
=
contrast
,
addMLE
=
deseq2_args
[
"addMLE"
],
independentFiltering
=
deseq2_args
[
"independentFiltering"
])))
res
.
index
=
counts_data
.
index
return
res
,
{
cond
:
size_factors
.
loc
[
cond
][
0
]
for
cond
in
cond_names
}
#
def do_deseq2(cond_names, conditions, counts_data,
#
formula=None, contrast=None, deseq2_args=None):
#
"""Runs a DESeq2 differential expression analysis."""
#
if formula is None:
#
formula = Formula("~ lib")
#
if contrast is None:
#
# FIXME: MUT and REF are not defined
#
# Maybe just make (formula and) contrast mandatory
#
contrast = StrVector(["lib", MUT, REF])
#
if deseq2_args is None:
#
deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
#
col_data = pd.DataFrame(conditions).assign(
#
cond_name=pd.Series(cond_names).values).set_index("cond_name")
#
# In case we want contrasts between factor combinations
#
if ("lib" in col_data.columns) and ("treat" in col_data.columns):
#
col_data = col_data.assign(
#
lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
#
col_data["lib"], col_data["treat"])])
#
# http://stackoverflow.com/a/31206596/1878788
#
pandas2ri.activate() # makes some conversions automatic
#
# r_counts_data = pandas2ri.py2ri(counts_data)
#
# r_col_data = pandas2ri.py2ri(col_data)
#
# r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
#
dds = deseq2.DESeqDataSetFromMatrix(
#
countData=counts_data,
#
colData=col_data,
#
design=formula)
#
# dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
#
# Decompose into the 3 steps to have more control on the options
#
try:
#
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
#
except RRuntimeError as e:
#
if sum(counts_data.prod(axis=1)) == 0:
#
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
#
"This is probably because every gene has at least one zero.\n",
#
"We will try to use the \"poscounts\" method instead."])
#
warnings.warn(msg)
#
try:
#
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
#
except RRuntimeError as e:
#
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
#
"We give up."])
#
warnings.warn(msg)
#
raise
#
#print(counts_data.dtypes)
#
#print(counts_data.columns)
#
#print(len(counts_data))
#
#raise
#
else:
#
raise
#
size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
#
#for cond in cond_names:
#
# #s = size_factors.loc[cond][0]
#
# #(*_, s) = size_factors.loc[cond]
#
#pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
#
try:
#
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
#
except RRuntimeError as e:
#
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
#
"We will try with fitType=\"local\"."])
#
warnings.warn(msg)
#
try:
#
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
#
except RRuntimeError as e:
#
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
#
"We will try with fitType=\"mean\"."])
#
warnings.warn(msg)
#
try:
#
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
#
except RRuntimeError as e:
#
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
#
"We give up."])
#
warnings.warn(msg)
#
raise
#
dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
#
res = pandas2ri.ri2py(as_df(deseq2.results(
#
dds,
#
contrast=contrast,
#
addMLE=deseq2_args["addMLE"],
#
independentFiltering=deseq2_args["independentFiltering"])))
#
res.index = counts_data.index
#
return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
# Cutoffs in log fold change
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment