Function to generate bedtool from gene ids.

It needs a list of identifiers and an annotation file, and generates a pybedtools.BedTool.

Function to generate bedtool from gene ids.
2a4d2234 · Blaise Li · a927a057 · 2a4d2234 · 2a4d2234
Commit 2a4d2234 authored 6 years ago by Blaise Li
--- a/libhts/__init__.py
+++ b/libhts/__init__.py
 from .libhts import (
    do_deseq2, gtf_2_genes_exon_lengths,
+    id_list_gtf2bed,
    make_empty_bigwig,
    make_seeding_function,
    median_ratio_to_pseudo_ref_size_factors,

--- a/libhts/libhts.py
+++ b/libhts/libhts.py
@@ -176,6 +176,33 @@ def spikein_gtf_2_lengths(spikein_gtf):
        name=("union_exon_len")).rename_axis("gene"))


+def id_list_gtf2bed(identifiers, gtf_filename, feature_type="transcript", id_kwd="gene_id"):
+    """
+    Extract bed coordinates of an iterable of identifiers from a gtf file.
+
+    *identifiers* is the iterable of identifiers.
+    *gtf_filename* is the name of the gtf file.
+    *feature_type* is the type of feature to be considered
+    in the gtf file (third columns).
+    *id_kwd* is the keyword under which the feature ID is expected to be found
+    in the feature annotations in the gtf_file. These feature IDs will be
+    matched against the elements in *identifiers*.
+    """
+    gtf_file = open(gtf_filename, "r")
+    gtf = BedTool(gtf_file)
+    if identifiers:
+        ids = set(identifiers)
+        def feature_filter(feature):
+            return feature[2] == feature_type and feature[id_kwd] in ids
+        return gtf.filter(feature_filter)
+    else:
+        # https://stackoverflow.com/a/13243870/1878788
+        def empty_bed_generator():
+            return
+            yield
+        return empty_bed_generator()
+
+
 def make_empty_bigwig(filename, chrom_sizes):
    """Writes *filename* so that it is an empty bigwig file.
    *chrom_sizes is a dictionary giving chromosome sizes* given
@@ -191,6 +218,9 @@ def make_empty_bigwig(filename, chrom_sizes):
    bw_out.close()


+#################
+# Bowtie2 stuff #
+#################
 def zero(value):
    return 0

@@ -238,90 +268,6 @@ def make_seeding_function(seeding_string):
    return seeding_function


-def do_deseq2(cond_names, conditions, counts_data,
-              formula=None, contrast=None, deseq2_args=None):
-    """Runs a DESeq2 differential expression analysis."""
-    if formula is None:
-        formula = Formula("~ lib")
-    if contrast is None:
-        # FIXME: MUT and REF are not defined
-        # Maybe just make (formula and) contrast mandatory
-        contrast = StrVector(["lib", MUT, REF])
-    if deseq2_args is None:
-        deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
-    col_data = pd.DataFrame(conditions).assign(
-        cond_name=pd.Series(cond_names).values).set_index("cond_name")
-    # In case we want contrasts between factor combinations
-    if ("lib" in col_data.columns) and ("treat" in col_data.columns):
-        col_data = col_data.assign(
-            lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
-                col_data["lib"], col_data["treat"])])
-    # http://stackoverflow.com/a/31206596/1878788
-    pandas2ri.activate()  # makes some conversions automatic
-    # r_counts_data = pandas2ri.py2ri(counts_data)
-    # r_col_data = pandas2ri.py2ri(col_data)
-    # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
-    dds = deseq2.DESeqDataSetFromMatrix(
-        countData=counts_data,
-        colData=col_data,
-        design=formula)
-    # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
-    # Decompose into the 3 steps to have more control on the options
-    try:
-        dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
-    except RRuntimeError as e:
-        if sum(counts_data.prod(axis=1)) == 0:
-            msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
-                           "This is probably because every gene has at least one zero.\n",
-                           "We will try to use the \"poscounts\" method instead."])
-            warnings.warn(msg)
-            try:
-                dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
-            except RRuntimeError as e:
-                msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
-                               "We give up."])
-                warnings.warn(msg)
-                raise
-                #print(counts_data.dtypes)
-                #print(counts_data.columns)
-                #print(len(counts_data))
-                #raise
-        else:
-            raise
-    size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
-    #for cond in cond_names:
-    #    #s = size_factors.loc[cond][0]
-    #    #(*_, s) = size_factors.loc[cond]
-    #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
-    try:
-        dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
-    except RRuntimeError as e:
-        msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
-                    "We will try with fitType=\"local\"."])
-        warnings.warn(msg)
-        try:
-            dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
-        except RRuntimeError as e:
-            msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
-                        "We will try with fitType=\"mean\"."])
-            warnings.warn(msg)
-            try:
-                dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
-            except RRuntimeError as e:
-                msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
-                            "We give up."])
-                warnings.warn(msg)
-                raise
-    dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
-    res = pandas2ri.ri2py(as_df(deseq2.results(
-        dds,
-        contrast=contrast,
-        addMLE=deseq2_args["addMLE"],
-        independentFiltering=deseq2_args["independentFiltering"])))
-    res.index = counts_data.index
-    return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
-
-
 ## Not sure this is a good idea...
 # def masked_gmean(a, axis=0, dtype=None):
 #     """Modified from stats.py."""
@@ -461,6 +407,93 @@ def plot_boxplots(data, ylabel):
    plt.tight_layout()


+############
+# DE stuff #
+############
+def do_deseq2(cond_names, conditions, counts_data,
+              formula=None, contrast=None, deseq2_args=None):
+    """Runs a DESeq2 differential expression analysis."""
+    if formula is None:
+        formula = Formula("~ lib")
+    if contrast is None:
+        # FIXME: MUT and REF are not defined
+        # Maybe just make (formula and) contrast mandatory
+        contrast = StrVector(["lib", MUT, REF])
+    if deseq2_args is None:
+        deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
+    col_data = pd.DataFrame(conditions).assign(
+        cond_name=pd.Series(cond_names).values).set_index("cond_name")
+    # In case we want contrasts between factor combinations
+    if ("lib" in col_data.columns) and ("treat" in col_data.columns):
+        col_data = col_data.assign(
+            lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
+                col_data["lib"], col_data["treat"])])
+    # http://stackoverflow.com/a/31206596/1878788
+    pandas2ri.activate()  # makes some conversions automatic
+    # r_counts_data = pandas2ri.py2ri(counts_data)
+    # r_col_data = pandas2ri.py2ri(col_data)
+    # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
+    dds = deseq2.DESeqDataSetFromMatrix(
+        countData=counts_data,
+        colData=col_data,
+        design=formula)
+    # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
+    # Decompose into the 3 steps to have more control on the options
+    try:
+        dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
+    except RRuntimeError as e:
+        if sum(counts_data.prod(axis=1)) == 0:
+            msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
+                           "This is probably because every gene has at least one zero.\n",
+                           "We will try to use the \"poscounts\" method instead."])
+            warnings.warn(msg)
+            try:
+                dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
+            except RRuntimeError as e:
+                msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
+                               "We give up."])
+                warnings.warn(msg)
+                raise
+                #print(counts_data.dtypes)
+                #print(counts_data.columns)
+                #print(len(counts_data))
+                #raise
+        else:
+            raise
+    size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
+    #for cond in cond_names:
+    #    #s = size_factors.loc[cond][0]
+    #    #(*_, s) = size_factors.loc[cond]
+    #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
+    try:
+        dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
+    except RRuntimeError as e:
+        msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
+                    "We will try with fitType=\"local\"."])
+        warnings.warn(msg)
+        try:
+            dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
+        except RRuntimeError as e:
+            msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
+                        "We will try with fitType=\"mean\"."])
+            warnings.warn(msg)
+            try:
+                dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
+            except RRuntimeError as e:
+                msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
+                            "We give up."])
+                warnings.warn(msg)
+                raise
+    dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
+    res = pandas2ri.ri2py(as_df(deseq2.results(
+        dds,
+        contrast=contrast,
+        addMLE=deseq2_args["addMLE"],
+        independentFiltering=deseq2_args["independentFiltering"])))
+    res.index = counts_data.index
+    return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
+
+
 # Cutoffs in log fold change
 LFC_CUTOFFS = [0.5, 1, 2]
 UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
@@ -701,6 +734,7 @@ def plot_scatter(data,
            ax.set_ylim(y_range)
    return ax

+
 def plot_paired_scatters(data, columns=None, hue=None, log_log=False):
    """Alternative to pairplot, in order to avoid histograms on the diagonal."""
    if columns is None: