diff --git a/ariaec/maplot.py b/ariaec/maplot.py index 820cd9c0956da5afa82ec3c941cb6ad288d15ce3..4e5ebaefe7f2dfbbfb0e82feca8fc19cb1cfbf6a 100644 --- a/ariaec/maplot.py +++ b/ariaec/maplot.py @@ -5,8 +5,6 @@ from __future__ import absolute_import, division, print_function import json import logging -import pandas as pd - from .base import get_filename from .reader import ProtFileListReader from .protmap import MapFilter @@ -110,6 +108,7 @@ class AriaEcContactMap(object): cmplist = self.allresmap[mapt]['scoremap'].sortedset( human_idx=True) elif self.allresmap[mapt].get("contactmap") is not None: + # If no score given, use all contact list cmpmap = self.allresmap[mapt]["contactmap"] cmplist = self.allresmap[mapt]['contactmap'].contact_list( @@ -119,14 +118,17 @@ class AriaEcContactMap(object): "define top list related to this map" % mapt) continue + # TODO: only one function for output files # Write contact list in txt file cmpmap.write_contacts("_".join((self.outprefix, mapt)), outdir=outdir) - # Write comparison stats - cmpmap.compare_contactmap(refmap, scoremap, cmplist, prefix, + # Write cmp stats + cmpmap.compare_contactmap(refmap, cmplist, prefix, distmap=self.refmap["distmap"], human_idx=True, outdir=outdir) + refmap.report(cmpmap, scoremap, outprefix=prefix, outdir=outdir, + plotdir=self.settings.infra.get("graphics", outdir)) # Contact map comparison plot # TODO: elementwise error with compare method refmap.compareplot(cmpmap, outprefix=prefix, @@ -137,5 +139,6 @@ class AriaEcContactMap(object): "alpha"), **plotparams) # Contingency table - logger.info(pd.crosstab(cmpmap.to_series(), refmap.to_series(), - rownames=[mapt], colnames=[self.reftype])) + # print(cmpmap.to_series()) + # logger.info(pd.crosstab(cmpmap.values, refmap.values, + # rownames=[mapt], colnames=[self.reftype])) diff --git a/ariaec/maplot.pyc b/ariaec/maplot.pyc index 50a6cc26f034e5a86f4ebd7eded40b88ad317907..a7ac1385d5476956e4d62c8e5701018b9eae50ff 100644 Binary files a/ariaec/maplot.pyc and b/ariaec/maplot.pyc differ diff --git a/ariaec/protmap.py b/ariaec/protmap.py index d70fe6986d5d48e2d294d863b09c4c2911a93241..06e0676a9c32ccc060f9bfefe8ba3f2e8324f741 100644 --- a/ariaec/protmap.py +++ b/ariaec/protmap.py @@ -16,11 +16,14 @@ import operator import pandas as pd import seaborn as sns import numpy as np +import datetime from matplotlib import pyplot as plt import aria.ConversionTable as ConversionTable import aria.legacy.AminoAcid as AminoAcid from matplotlib.lines import Line2D from .base import (tickmin, tickrot, titleprint) +import sklearn.metrics as skm + logger = logging.getLogger(__name__) @@ -40,7 +43,8 @@ class Map(pd.DataFrame): super(Map, self)._constructor_expanddim() def __init__(self, index=None, columns=None, mtype='distance', - duplicate_levels=False, data=None, dtype=None, sym=True): + duplicate_levels=False, data=None, dtype=None, sym=True, + desc=""): """ :param index: :param columns: @@ -62,6 +66,7 @@ class Map(pd.DataFrame): if mtype == "score": self.sort_list = [] self.sym = sym + self.desc = desc def sortedset(self, human_idx=False): # Remove duplicate in sort_list @@ -94,7 +99,8 @@ class Map(pd.DataFrame): index = ['-'.join(idx) for idx in self.index] return getattr(self, '__init__')(self.sequence, data=self.as_matrix(), - index=index, + index=index, desc=self.desc, + sym=self.sym, mtype=self.mtype, columns=columns, dtype=self.dtype) def remove(self, rm_list): @@ -288,12 +294,113 @@ class ProteinMap(Map): if save_fig: self.saveplot(**kwargs) - def compare_contactmap(self, cmpmap, scoremap, contactlist, outprefix, + def report(self, cmpmap, scoremap, outprefix="", outdir="", plotdir="", + plot_ext="pdf"): + with open("%s/%s.report" % (outdir, outprefix), 'w') as reportf: + y_true = list(self.values.astype(int).flat) + y_pred = list(cmpmap.values.astype(int).flat) + y_scores = list(scoremap.values.astype(float).flat) + + map1name = self.desc + map2name = cmpmap.desc + + # ROC plot + allfpr, alltpr, rocthresholds = skm.roc_curve(y_true, y_scores, + pos_label=1) + roc_auc = skm.roc_auc_score(y_true, y_scores) + plotpath = os.path.join(plotdir, "%s.roc.%s" % (outprefix, + plot_ext)) + plt.figure() + plt.plot(allfpr, alltpr, label='ROC curve (area = %0.2f)' % roc_auc) + plt.plot([0, 1], [0, 1], 'k--') + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel('False Positive Rate') + plt.ylabel('True Positive Rate') + plt.title('Receiver operating characteristic %s vs. %s' % ( + map1name, map2name)) + plt.legend(loc="lower right") + plt.savefig(plotpath) + + allprec, allrec, prthresholds = skm.precision_recall_curve( + y_true, y_scores) + aver_prec = skm.average_precision_score(y_true, y_scores) + plotpath = os.path.join(plotdir, "%s.precall.%s" % (outprefix, + plot_ext)) + # Precision recall curve + plt.clf() + plt.plot(allrec, allprec, label='Precision-Recall curve') + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.ylim([0.0, 1.05]) + plt.xlim([0.0, 1.0]) + plt.title('Precision-Recall example: AUC={0:0.2f}'.format( + aver_prec)) + plt.legend(loc="lower left") + plt.savefig(plotpath) + + msg = """\ +## Report {map1name} vs. {map2name} +## +## Date: {date} +## Path: {outdir} +## +## ----------------------------------------------------------------------------- +## +## Accuracy: {accuracy} +## Precision: {precision} +## Recall (Sensibility): {recall} +## +## Matthews correlation coefficient (MCC): {mcc} +## F1 score: {f1s} +## F2 score: {f2s} +## F0.5 score: {f05s} +## +## Hamming loss: {hamm} +## Hinge loss: {hin} +## +## ----------------------------------------------------------------------------- +## +## Precision recall curve +## +## Precision values: +## {allprec} +## Recall values: +## {allrec} +## Score tresholds ({map2name}): +## {prthres} +## +## ----------------------------------------------------------------------------- +## +## ROC curve +## Area Under Curve: {roc_auc} +## True Positive Rate (Sensibility) values: +## {alltpr} +## False Positive Rate (1 - Specificity) values: +## {allfpr} +## Score tresholds ({map2name}): +## {rocthres} +""".format(map1name=map1name, map2name=map2name, + date=datetime.date.today().strftime("%A %d. %B %Y"), + outdir=outdir, accuracy=skm.accuracy_score(y_true, y_pred), + precision=skm.precision_score(y_true, y_pred), + recall=skm.recall_score(y_true, y_pred), + mcc=skm.matthews_corrcoef(y_true, y_pred), + f1s=skm.f1_score(y_true, y_pred), + f2s=skm.fbeta_score(y_true, y_pred, 2), + f05s=skm.fbeta_score(y_true, y_pred, 0.5), + hamm=skm.hamming_loss(y_true, y_pred), + hin=skm.hinge_loss(y_true, y_pred), + roc_auc=roc_auc, allprec=allprec, allrec=allrec, + prthres=prthresholds, alltpr=alltpr, allfpr=allfpr, + rocthres=rocthresholds) + logger.debug("\n" + msg) + reportf.write(msg) + + def compare_contactmap(self, cmpmap, contactlist, outprefix, outdir="", distmap=None, human_idx=True): # CSV file giving TP/FP contacts - print(*self.astype(int).values.flat) - print(cmpmap.values) - outfile = open("%s/%s.contact.csv" % (outdir, outprefix), 'w') + outfile = open("%s/%s.contactcmp.csv" % (outdir, outprefix), 'w') logger.info("Writing stat file %s" % outfile) offset = 1 if human_idx else 0 extra_header = "" if distmap is None else ",dmin" @@ -424,7 +531,8 @@ class ResAtmMap(ProteinMap): def reduce(self, groupby="min"): if self.index.nlevels == 1 or not groupby: return self - newmap = ResMap(self.sequence, dtype=self.dtype) + newmap = ResMap(self.sequence, dtype=self.dtype, desc=self.desc, + sym=self.sym) if self.dtype == bool: newmap[:] = self.copy().stack().groupby(level=0).any() elif groupby == "mean": @@ -448,8 +556,9 @@ class ResAtmMap(ProteinMap): # TODO: issue with sc_sc treshold !!!!! logger.info("Generate contact map using contact definition %s" % contactdef) - # Contact map initialize to a boolean matrix filled with False - contact_map = ResAtmMap(sequence=self.sequence, mtype="contact") + # Initialize contact map to a boolean matrix filled with False + contact_map = ResAtmMap(sequence=self.sequence, mtype="contact", + desc=self.desc, sym=self.sym) if type(contactdef) == float: contact_map[:] = self.applymap(lambda x: x < contactdef) @@ -548,7 +657,8 @@ class ResMap(ResAtmMap): return None def contact_map(self, contactdef, **kwargs): - contact_map = ResMap(self.sequence, mtype="contact") + contact_map = ResMap(self.sequence, mtype="contact", desc=self.desc, + sym=self.sym) def treshconv(x): return x <= treshold diff --git a/ariaec/protmap.pyc b/ariaec/protmap.pyc index ac2b46dee027b4cf8725acf3eb271fe162dbbad4..bede6b40316bd8b8d1e8ad022c7db6c710a145de 100644 Binary files a/ariaec/protmap.pyc and b/ariaec/protmap.pyc differ diff --git a/ariaec/reader.py b/ariaec/reader.py index cc187b669c9759c51ca912664a7f3be2407dfc3b..4d0525eed0942836acf6542573c590a3ace71b1b 100644 --- a/ariaec/reader.py +++ b/ariaec/reader.py @@ -308,15 +308,17 @@ class ContactMapFile(MapFile): contactmap = ResMap(protein.aa_sequence.sequence, mtype='contact', flaglist=kwargs['flaglist'], seqidx=protein.index, index=idxnames, - columns=colnames, sym=kwargs['sym']) + columns=colnames, sym=kwargs['sym'], + desc=self.filetype) # DataFrame containing ec scores scoremap = ResMap(protein.aa_sequence.sequence, mtype='score', seqidx=protein.index, index=idxnames, - columns=colnames, sym=kwargs['sym']) if self.sort \ - else None + columns=colnames, sym=kwargs['sym'], + desc=self.filetype) if self.sort else None distmap = ResMap(protein.aa_sequence.sequence, mtype='distance', seqidx=protein.index, index=idxnames, columns=colnames, - sym=kwargs['sym']) if self.filetype == "metapsicovhb" else None + sym=kwargs['sym'], desc=self.filetype) if \ + self.filetype == "metapsicovhb" else None for contact_id in self.lines: if self.filetype == "metapsicovhb": @@ -379,7 +381,7 @@ class PDBFile(MapFile): flaglist=None, sym=True): resmap = ResAtmMap(protein.aa_sequence.sequence, mtype='distance', flaglist=flaglist, - seqidx=protein.index) + seqidx=protein.index, desc=self.filetype) resmap[:] = self.update_map(resmap, sym=sym) logger.debug("pdb distance map:\n%s" % resmap) self.mapdict["alldistmap"] = resmap diff --git a/ariaec/reader.pyc b/ariaec/reader.pyc index 814ce5de228ce94dd8ffce5581f771f1a6eed02f..7b17d809df6a864e5219e612054e909b72d30e9d 100644 Binary files a/ariaec/reader.pyc and b/ariaec/reader.pyc differ