diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b4e1667d2aef604250dc2a69f5edb9c94db2a90a --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +# Compiled python modules. +*.pyc + +# Setuptools distribution folder. +/dist/ + +# Python egg metadata, regenerated from source files by setuptools. +/*.egg-info + +# Backups +*~ diff --git a/libhts/__init__.py b/libhts/__init__.py index 525ba518cbebd3895d19fc0d0a83d1cde0bb664a..191868969208205d3e1b98e635ab6ef9504b91a6 100644 --- a/libhts/__init__.py +++ b/libhts/__init__.py @@ -1,5 +1,5 @@ from .libhts import ( - do_deseq2, gtf_2_genes_exon_lengths, + gtf_2_genes_exon_lengths, id_list_gtf2bed, make_empty_bigwig, make_seeding_function, diff --git a/libhts/libhts.py b/libhts/libhts.py index cd3037312ae392b9454a105175a7b0a10b9a6df0..d73e5126daea07d3b273b976040b91d7d7827777 100644 --- a/libhts/libhts.py +++ b/libhts/libhts.py @@ -33,11 +33,11 @@ TEX_PARAMS = { } mpl.rcParams.update(TEX_PARAMS) import seaborn as sns -from rpy2.robjects import r, pandas2ri, Formula, StrVector -as_df = r("as.data.frame") -from rpy2.rinterface import RRuntimeError -from rpy2.robjects.packages import importr -deseq2 = importr("DESeq2") +# from rpy2.robjects import r, pandas2ri, Formula, StrVector +# as_df = r("as.data.frame") +# from rpy2.rinterface import RRuntimeError +# from rpy2.robjects.packages import importr +# deseq2 = importr("DESeq2") from pybedtools import BedTool import pyBigWig import networkx as nx @@ -410,88 +410,88 @@ def plot_boxplots(data, ylabel): ############ # DE stuff # ############ -def do_deseq2(cond_names, conditions, counts_data, - formula=None, contrast=None, deseq2_args=None): - """Runs a DESeq2 differential expression analysis.""" - if formula is None: - formula = Formula("~ lib") - if contrast is None: - # FIXME: MUT and REF are not defined - # Maybe just make (formula and) contrast mandatory - contrast = StrVector(["lib", MUT, REF]) - if deseq2_args is None: - deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True} - col_data = pd.DataFrame(conditions).assign( - cond_name=pd.Series(cond_names).values).set_index("cond_name") - # In case we want contrasts between factor combinations - if ("lib" in col_data.columns) and ("treat" in col_data.columns): - col_data = col_data.assign( - lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip( - col_data["lib"], col_data["treat"])]) - # http://stackoverflow.com/a/31206596/1878788 - pandas2ri.activate() # makes some conversions automatic - # r_counts_data = pandas2ri.py2ri(counts_data) - # r_col_data = pandas2ri.py2ri(col_data) - # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib")) - dds = deseq2.DESeqDataSetFromMatrix( - countData=counts_data, - colData=col_data, - design=formula) - # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"]) - # Decompose into the 3 steps to have more control on the options - try: - dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio") - except RRuntimeError as e: - if sum(counts_data.prod(axis=1)) == 0: - msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, - "This is probably because every gene has at least one zero.\n", - "We will try to use the \"poscounts\" method instead."]) - warnings.warn(msg) - try: - dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts") - except RRuntimeError as e: - msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, - "We give up."]) - warnings.warn(msg) - raise - #print(counts_data.dtypes) - #print(counts_data.columns) - #print(len(counts_data)) - #raise - else: - raise - size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds))) - #for cond in cond_names: - # #s = size_factors.loc[cond][0] - # #(*_, s) = size_factors.loc[cond] - #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',)) - try: - dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric") - except RRuntimeError as e: - msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, - "We will try with fitType=\"local\"."]) - warnings.warn(msg) - try: - dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local") - except RRuntimeError as e: - msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, - "We will try with fitType=\"mean\"."]) - warnings.warn(msg) - try: - dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean") - except RRuntimeError as e: - msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, - "We give up."]) - warnings.warn(msg) - raise - dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"]) - res = pandas2ri.ri2py(as_df(deseq2.results( - dds, - contrast=contrast, - addMLE=deseq2_args["addMLE"], - independentFiltering=deseq2_args["independentFiltering"]))) - res.index = counts_data.index - return res, {cond : size_factors.loc[cond][0] for cond in cond_names} +# def do_deseq2(cond_names, conditions, counts_data, +# formula=None, contrast=None, deseq2_args=None): +# """Runs a DESeq2 differential expression analysis.""" +# if formula is None: +# formula = Formula("~ lib") +# if contrast is None: +# # FIXME: MUT and REF are not defined +# # Maybe just make (formula and) contrast mandatory +# contrast = StrVector(["lib", MUT, REF]) +# if deseq2_args is None: +# deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True} +# col_data = pd.DataFrame(conditions).assign( +# cond_name=pd.Series(cond_names).values).set_index("cond_name") +# # In case we want contrasts between factor combinations +# if ("lib" in col_data.columns) and ("treat" in col_data.columns): +# col_data = col_data.assign( +# lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip( +# col_data["lib"], col_data["treat"])]) +# # http://stackoverflow.com/a/31206596/1878788 +# pandas2ri.activate() # makes some conversions automatic +# # r_counts_data = pandas2ri.py2ri(counts_data) +# # r_col_data = pandas2ri.py2ri(col_data) +# # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib")) +# dds = deseq2.DESeqDataSetFromMatrix( +# countData=counts_data, +# colData=col_data, +# design=formula) +# # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"]) +# # Decompose into the 3 steps to have more control on the options +# try: +# dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio") +# except RRuntimeError as e: +# if sum(counts_data.prod(axis=1)) == 0: +# msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, +# "This is probably because every gene has at least one zero.\n", +# "We will try to use the \"poscounts\" method instead."]) +# warnings.warn(msg) +# try: +# dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts") +# except RRuntimeError as e: +# msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, +# "We give up."]) +# warnings.warn(msg) +# raise +# #print(counts_data.dtypes) +# #print(counts_data.columns) +# #print(len(counts_data)) +# #raise +# else: +# raise +# size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds))) +# #for cond in cond_names: +# # #s = size_factors.loc[cond][0] +# # #(*_, s) = size_factors.loc[cond] +# #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',)) +# try: +# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric") +# except RRuntimeError as e: +# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, +# "We will try with fitType=\"local\"."]) +# warnings.warn(msg) +# try: +# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local") +# except RRuntimeError as e: +# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, +# "We will try with fitType=\"mean\"."]) +# warnings.warn(msg) +# try: +# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean") +# except RRuntimeError as e: +# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, +# "We give up."]) +# warnings.warn(msg) +# raise +# dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"]) +# res = pandas2ri.ri2py(as_df(deseq2.results( +# dds, +# contrast=contrast, +# addMLE=deseq2_args["addMLE"], +# independentFiltering=deseq2_args["independentFiltering"]))) +# res.index = counts_data.index +# return res, {cond : size_factors.loc[cond][0] for cond in cond_names} # Cutoffs in log fold change