diff --git a/libhts/__init__.py b/libhts/__init__.py index b1aab224f06b5b3d591665a04587c727ed0e5ca8..525ba518cbebd3895d19fc0d0a83d1cde0bb664a 100644 --- a/libhts/__init__.py +++ b/libhts/__init__.py @@ -1,5 +1,6 @@ from .libhts import ( do_deseq2, gtf_2_genes_exon_lengths, + id_list_gtf2bed, make_empty_bigwig, make_seeding_function, median_ratio_to_pseudo_ref_size_factors, diff --git a/libhts/libhts.py b/libhts/libhts.py index b087970d62aad6d064fda60139f873864dc2ff51..cd3037312ae392b9454a105175a7b0a10b9a6df0 100644 --- a/libhts/libhts.py +++ b/libhts/libhts.py @@ -176,6 +176,33 @@ def spikein_gtf_2_lengths(spikein_gtf): name=("union_exon_len")).rename_axis("gene")) +def id_list_gtf2bed(identifiers, gtf_filename, feature_type="transcript", id_kwd="gene_id"): + """ + Extract bed coordinates of an iterable of identifiers from a gtf file. + + *identifiers* is the iterable of identifiers. + *gtf_filename* is the name of the gtf file. + *feature_type* is the type of feature to be considered + in the gtf file (third columns). + *id_kwd* is the keyword under which the feature ID is expected to be found + in the feature annotations in the gtf_file. These feature IDs will be + matched against the elements in *identifiers*. + """ + gtf_file = open(gtf_filename, "r") + gtf = BedTool(gtf_file) + if identifiers: + ids = set(identifiers) + def feature_filter(feature): + return feature[2] == feature_type and feature[id_kwd] in ids + return gtf.filter(feature_filter) + else: + # https://stackoverflow.com/a/13243870/1878788 + def empty_bed_generator(): + return + yield + return empty_bed_generator() + + def make_empty_bigwig(filename, chrom_sizes): """Writes *filename* so that it is an empty bigwig file. *chrom_sizes is a dictionary giving chromosome sizes* given @@ -191,6 +218,9 @@ def make_empty_bigwig(filename, chrom_sizes): bw_out.close() +################# +# Bowtie2 stuff # +################# def zero(value): return 0 @@ -238,90 +268,6 @@ def make_seeding_function(seeding_string): return seeding_function -def do_deseq2(cond_names, conditions, counts_data, - formula=None, contrast=None, deseq2_args=None): - """Runs a DESeq2 differential expression analysis.""" - if formula is None: - formula = Formula("~ lib") - if contrast is None: - # FIXME: MUT and REF are not defined - # Maybe just make (formula and) contrast mandatory - contrast = StrVector(["lib", MUT, REF]) - if deseq2_args is None: - deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True} - col_data = pd.DataFrame(conditions).assign( - cond_name=pd.Series(cond_names).values).set_index("cond_name") - # In case we want contrasts between factor combinations - if ("lib" in col_data.columns) and ("treat" in col_data.columns): - col_data = col_data.assign( - lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip( - col_data["lib"], col_data["treat"])]) - # http://stackoverflow.com/a/31206596/1878788 - pandas2ri.activate() # makes some conversions automatic - # r_counts_data = pandas2ri.py2ri(counts_data) - # r_col_data = pandas2ri.py2ri(col_data) - # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib")) - dds = deseq2.DESeqDataSetFromMatrix( - countData=counts_data, - colData=col_data, - design=formula) - # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"]) - # Decompose into the 3 steps to have more control on the options - try: - dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio") - except RRuntimeError as e: - if sum(counts_data.prod(axis=1)) == 0: - msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, - "This is probably because every gene has at least one zero.\n", - "We will try to use the \"poscounts\" method instead."]) - warnings.warn(msg) - try: - dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts") - except RRuntimeError as e: - msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, - "We give up."]) - warnings.warn(msg) - raise - #print(counts_data.dtypes) - #print(counts_data.columns) - #print(len(counts_data)) - #raise - else: - raise - size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds))) - #for cond in cond_names: - # #s = size_factors.loc[cond][0] - # #(*_, s) = size_factors.loc[cond] - #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',)) - try: - dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric") - except RRuntimeError as e: - msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, - "We will try with fitType=\"local\"."]) - warnings.warn(msg) - try: - dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local") - except RRuntimeError as e: - msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, - "We will try with fitType=\"mean\"."]) - warnings.warn(msg) - try: - dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean") - except RRuntimeError as e: - msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, - "We give up."]) - warnings.warn(msg) - raise - dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"]) - res = pandas2ri.ri2py(as_df(deseq2.results( - dds, - contrast=contrast, - addMLE=deseq2_args["addMLE"], - independentFiltering=deseq2_args["independentFiltering"]))) - res.index = counts_data.index - return res, {cond : size_factors.loc[cond][0] for cond in cond_names} - - ## Not sure this is a good idea... # def masked_gmean(a, axis=0, dtype=None): # """Modified from stats.py.""" @@ -461,6 +407,93 @@ def plot_boxplots(data, ylabel): plt.tight_layout() +############ +# DE stuff # +############ +def do_deseq2(cond_names, conditions, counts_data, + formula=None, contrast=None, deseq2_args=None): + """Runs a DESeq2 differential expression analysis.""" + if formula is None: + formula = Formula("~ lib") + if contrast is None: + # FIXME: MUT and REF are not defined + # Maybe just make (formula and) contrast mandatory + contrast = StrVector(["lib", MUT, REF]) + if deseq2_args is None: + deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True} + col_data = pd.DataFrame(conditions).assign( + cond_name=pd.Series(cond_names).values).set_index("cond_name") + # In case we want contrasts between factor combinations + if ("lib" in col_data.columns) and ("treat" in col_data.columns): + col_data = col_data.assign( + lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip( + col_data["lib"], col_data["treat"])]) + # http://stackoverflow.com/a/31206596/1878788 + pandas2ri.activate() # makes some conversions automatic + # r_counts_data = pandas2ri.py2ri(counts_data) + # r_col_data = pandas2ri.py2ri(col_data) + # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib")) + dds = deseq2.DESeqDataSetFromMatrix( + countData=counts_data, + colData=col_data, + design=formula) + # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"]) + # Decompose into the 3 steps to have more control on the options + try: + dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio") + except RRuntimeError as e: + if sum(counts_data.prod(axis=1)) == 0: + msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, + "This is probably because every gene has at least one zero.\n", + "We will try to use the \"poscounts\" method instead."]) + warnings.warn(msg) + try: + dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts") + except RRuntimeError as e: + msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e, + "We give up."]) + warnings.warn(msg) + raise + #print(counts_data.dtypes) + #print(counts_data.columns) + #print(len(counts_data)) + #raise + else: + raise + size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds))) + #for cond in cond_names: + # #s = size_factors.loc[cond][0] + # #(*_, s) = size_factors.loc[cond] + #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',)) + try: + dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric") + except RRuntimeError as e: + msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, + "We will try with fitType=\"local\"."]) + warnings.warn(msg) + try: + dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local") + except RRuntimeError as e: + msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, + "We will try with fitType=\"mean\"."]) + warnings.warn(msg) + try: + dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean") + except RRuntimeError as e: + msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e, + "We give up."]) + warnings.warn(msg) + raise + dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"]) + res = pandas2ri.ri2py(as_df(deseq2.results( + dds, + contrast=contrast, + addMLE=deseq2_args["addMLE"], + independentFiltering=deseq2_args["independentFiltering"]))) + res.index = counts_data.index + return res, {cond : size_factors.loc[cond][0] for cond in cond_names} + + # Cutoffs in log fold change LFC_CUTOFFS = [0.5, 1, 2] UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS] @@ -701,6 +734,7 @@ def plot_scatter(data, ax.set_ylim(y_range) return ax + def plot_paired_scatters(data, columns=None, hue=None, log_log=False): """Alternative to pairplot, in order to avoid histograms on the diagonal.""" if columns is None: