Commit 063b4003 authored by Blaise Li's avatar Blaise Li
Browse files

Moved DESEq2 calling function in its own module.

parent f84faa33
......@@ -6,6 +6,8 @@ major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
sys.exit("Need at least python 3.6\n")
# TODO look at 5' and 3' ends of genes
#TODO: add local metaprofiles (around TSS and around TTS), no min length
# TSS or TTS should not be within 200 of a TSS or TTS
......@@ -57,7 +59,8 @@ import matplotlib.pyplot as plt
import seaborn as sns
import husl
from libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libdeseq import do_deseq2,
from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, last_lines, save_plot, test_na_file
......
......@@ -84,7 +84,8 @@ import pyBigWig
from mappy import fastx_read
from rpy2.robjects import Formula, StrVector
from libhts import do_deseq2, status_setter, plot_lfc_distribution, plot_MA
from libdeseq import do_deseq2
from libhts import status_setter, plot_lfc_distribution, plot_MA
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import (
plot_paired_scatters,
......
......@@ -130,7 +130,8 @@ sns.set_context("talk")
from libsmallrna import PI_MIN, PI_MAX, SI_MIN, SI_MAX
# Do this outside the workflow
#from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths
from libhts import do_deseq2, status_setter
from libdeseq import do_deseq2
from libhts import status_setter
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_distribution, plot_boxplots, plot_histo
from libworkflows import texscape, ensure_relative, cleanup_and_backup
......
#!/bin/sh
python3.6 setup.py build_ext
# .egg-link does not work with PYTHONPATH ?
pip3.6 install -e .
pip3.6 install --no-deps --ignore-installed .
from .libdeseq import (
do_deseq2)
import warnings
def formatwarning(message, category, filename, lineno, line):
"""Used to format warning messages."""
return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
warnings.formatwarning = formatwarning
import pandas as pd
from rpy2.robjects import r, pandas2ri, Formula, StrVector
as_df = r("as.data.frame")
from rpy2.rinterface import RRuntimeError
from rpy2.robjects.packages import importr
deseq2 = importr("DESeq2")
def do_deseq2(cond_names, conditions, counts_data,
formula=None, contrast=None, deseq2_args=None):
"""Runs a DESeq2 differential expression analysis."""
if formula is None:
formula = Formula("~ lib")
if contrast is None:
# FIXME: MUT and REF are not defined
# Maybe just make (formula and) contrast mandatory
contrast = StrVector(["lib", MUT, REF])
if deseq2_args is None:
deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
col_data = pd.DataFrame(conditions).assign(
cond_name=pd.Series(cond_names).values).set_index("cond_name")
# In case we want contrasts between factor combinations
if ("lib" in col_data.columns) and ("treat" in col_data.columns):
col_data = col_data.assign(
lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
col_data["lib"], col_data["treat"])])
# http://stackoverflow.com/a/31206596/1878788
pandas2ri.activate() # makes some conversions automatic
# r_counts_data = pandas2ri.py2ri(counts_data)
# r_col_data = pandas2ri.py2ri(col_data)
# r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
dds = deseq2.DESeqDataSetFromMatrix(
countData=counts_data,
colData=col_data,
design=formula)
# dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
# Decompose into the 3 steps to have more control on the options
try:
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
except RRuntimeError as e:
if sum(counts_data.prod(axis=1)) == 0:
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
"This is probably because every gene has at least one zero.\n",
"We will try to use the \"poscounts\" method instead."])
warnings.warn(msg)
try:
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
"We give up."])
warnings.warn(msg)
raise
#print(counts_data.dtypes)
#print(counts_data.columns)
#print(len(counts_data))
#raise
else:
raise
size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
#for cond in cond_names:
# #s = size_factors.loc[cond][0]
# #(*_, s) = size_factors.loc[cond]
#pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We will try with fitType=\"local\"."])
warnings.warn(msg)
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We will try with fitType=\"mean\"."])
warnings.warn(msg)
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We give up."])
warnings.warn(msg)
raise
dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
res = pandas2ri.ri2py(as_df(deseq2.results(
dds,
contrast=contrast,
addMLE=deseq2_args["addMLE"],
independentFiltering=deseq2_args["independentFiltering"])))
res.index = counts_data.index
return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
from setuptools import setup, find_packages
#from Cython.Build import cythonize
name = "libdeseq"
# Adapted from Biopython
__version__ = "Undefined"
for line in open("%s/__init__.py" % name):
if (line.startswith('__version__')):
exec(line.strip())
setup(
name=name,
version=__version__,
description="Interfacing the call to DESEq2 with python.",
author="Blaise Li",
author_email="blaise.li@normalesup.org",
license="MIT",
packages=find_packages())
#ext_modules = cythonize("libsmallrna/libsmallrna.pyx"),
#install_requires=["cytoolz"],
#zip_safe=False
# Compiled python modules.
*.pyc
# Setuptools distribution folder.
/dist/
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
# Backups
*~
from .libhts import (
do_deseq2, gtf_2_genes_exon_lengths,
gtf_2_genes_exon_lengths,
id_list_gtf2bed,
make_empty_bigwig,
make_seeding_function,
......
......@@ -33,11 +33,11 @@ TEX_PARAMS = {
}
mpl.rcParams.update(TEX_PARAMS)
import seaborn as sns
from rpy2.robjects import r, pandas2ri, Formula, StrVector
as_df = r("as.data.frame")
from rpy2.rinterface import RRuntimeError
from rpy2.robjects.packages import importr
deseq2 = importr("DESeq2")
# from rpy2.robjects import r, pandas2ri, Formula, StrVector
# as_df = r("as.data.frame")
# from rpy2.rinterface import RRuntimeError
# from rpy2.robjects.packages import importr
# deseq2 = importr("DESeq2")
from pybedtools import BedTool
import pyBigWig
import networkx as nx
......@@ -410,88 +410,88 @@ def plot_boxplots(data, ylabel):
############
# DE stuff #
############
def do_deseq2(cond_names, conditions, counts_data,
formula=None, contrast=None, deseq2_args=None):
"""Runs a DESeq2 differential expression analysis."""
if formula is None:
formula = Formula("~ lib")
if contrast is None:
# FIXME: MUT and REF are not defined
# Maybe just make (formula and) contrast mandatory
contrast = StrVector(["lib", MUT, REF])
if deseq2_args is None:
deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
col_data = pd.DataFrame(conditions).assign(
cond_name=pd.Series(cond_names).values).set_index("cond_name")
# In case we want contrasts between factor combinations
if ("lib" in col_data.columns) and ("treat" in col_data.columns):
col_data = col_data.assign(
lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
col_data["lib"], col_data["treat"])])
# http://stackoverflow.com/a/31206596/1878788
pandas2ri.activate() # makes some conversions automatic
# r_counts_data = pandas2ri.py2ri(counts_data)
# r_col_data = pandas2ri.py2ri(col_data)
# r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
dds = deseq2.DESeqDataSetFromMatrix(
countData=counts_data,
colData=col_data,
design=formula)
# dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
# Decompose into the 3 steps to have more control on the options
try:
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
except RRuntimeError as e:
if sum(counts_data.prod(axis=1)) == 0:
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
"This is probably because every gene has at least one zero.\n",
"We will try to use the \"poscounts\" method instead."])
warnings.warn(msg)
try:
dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
"We give up."])
warnings.warn(msg)
raise
#print(counts_data.dtypes)
#print(counts_data.columns)
#print(len(counts_data))
#raise
else:
raise
size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
#for cond in cond_names:
# #s = size_factors.loc[cond][0]
# #(*_, s) = size_factors.loc[cond]
#pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We will try with fitType=\"local\"."])
warnings.warn(msg)
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We will try with fitType=\"mean\"."])
warnings.warn(msg)
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
except RRuntimeError as e:
msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
"We give up."])
warnings.warn(msg)
raise
dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
res = pandas2ri.ri2py(as_df(deseq2.results(
dds,
contrast=contrast,
addMLE=deseq2_args["addMLE"],
independentFiltering=deseq2_args["independentFiltering"])))
res.index = counts_data.index
return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
# def do_deseq2(cond_names, conditions, counts_data,
# formula=None, contrast=None, deseq2_args=None):
# """Runs a DESeq2 differential expression analysis."""
# if formula is None:
# formula = Formula("~ lib")
# if contrast is None:
# # FIXME: MUT and REF are not defined
# # Maybe just make (formula and) contrast mandatory
# contrast = StrVector(["lib", MUT, REF])
# if deseq2_args is None:
# deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
# col_data = pd.DataFrame(conditions).assign(
# cond_name=pd.Series(cond_names).values).set_index("cond_name")
# # In case we want contrasts between factor combinations
# if ("lib" in col_data.columns) and ("treat" in col_data.columns):
# col_data = col_data.assign(
# lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
# col_data["lib"], col_data["treat"])])
# # http://stackoverflow.com/a/31206596/1878788
# pandas2ri.activate() # makes some conversions automatic
# # r_counts_data = pandas2ri.py2ri(counts_data)
# # r_col_data = pandas2ri.py2ri(col_data)
# # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
# dds = deseq2.DESeqDataSetFromMatrix(
# countData=counts_data,
# colData=col_data,
# design=formula)
# # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
# # Decompose into the 3 steps to have more control on the options
# try:
# dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
# except RRuntimeError as e:
# if sum(counts_data.prod(axis=1)) == 0:
# msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
# "This is probably because every gene has at least one zero.\n",
# "We will try to use the \"poscounts\" method instead."])
# warnings.warn(msg)
# try:
# dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
# except RRuntimeError as e:
# msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
# "We give up."])
# warnings.warn(msg)
# raise
# #print(counts_data.dtypes)
# #print(counts_data.columns)
# #print(len(counts_data))
# #raise
# else:
# raise
# size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
# #for cond in cond_names:
# # #s = size_factors.loc[cond][0]
# # #(*_, s) = size_factors.loc[cond]
# #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
# try:
# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
# except RRuntimeError as e:
# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
# "We will try with fitType=\"local\"."])
# warnings.warn(msg)
# try:
# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
# except RRuntimeError as e:
# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
# "We will try with fitType=\"mean\"."])
# warnings.warn(msg)
# try:
# dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
# except RRuntimeError as e:
# msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
# "We give up."])
# warnings.warn(msg)
# raise
# dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
# res = pandas2ri.ri2py(as_df(deseq2.results(
# dds,
# contrast=contrast,
# addMLE=deseq2_args["addMLE"],
# independentFiltering=deseq2_args["independentFiltering"])))
# res.index = counts_data.index
# return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
# Cutoffs in log fold change
......
......@@ -135,7 +135,8 @@ sns.set_context("talk")
from libsmallrna import PI_MIN, PI_MAX, SI_MIN, SI_MAX
# Do this outside the workflow
#from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths
from libhts import do_deseq2, status_setter, make_empty_bigwig
from libdeseq import do_deseq2
from libhts import status_setter, make_empty_bigwig
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_distribution, plot_boxplots, plot_histo
from libworkflows import texscape, ensure_relative, cleanup_and_backup
......@@ -516,13 +517,14 @@ meta_profiles = [
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
group_type=[group_type], lib_group=list(lib_group.keys()), fig_format=FIG_FORMATS) for (group_type, lib_group) in lib_groups.items()],
[expand(
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.{fig_format}"),
meta_scale=[str(META_SCALE)], read_type=[size_selected, "siRNA", "siuRNA", "miRNA", "piRNA", "all_siRNA"],
norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
group_type=[group_type], lib_group=list(lib_group.keys()), fig_format=FIG_FORMATS) for (group_type, lib_group) in lib_groups.items()],
# Same as above ?
# [expand(
# OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
# "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.{fig_format}"),
# meta_scale=[str(META_SCALE)], read_type=[size_selected, "siRNA", "siuRNA", "miRNA", "piRNA", "all_siRNA"],
# norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)],
# biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)],
# group_type=[group_type], lib_group=list(lib_group.keys()), fig_format=FIG_FORMATS) for (group_type, lib_group) in lib_groups.items()],
[expand(
OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}",
"{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.{fig_format}"),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment