Commit d57cf959 authored by fabrice's avatar fabrice

Bug fixe: Missing parameter desc in pdb maps after reduce call. Report file implementation

parent 8f850252
......@@ -5,8 +5,6 @@ from __future__ import absolute_import, division, print_function
import json
import logging
import pandas as pd
from .base import get_filename
from .reader import ProtFileListReader
from .protmap import MapFilter
......@@ -110,6 +108,7 @@ class AriaEcContactMap(object):
cmplist = self.allresmap[mapt]['scoremap'].sortedset(
human_idx=True)
elif self.allresmap[mapt].get("contactmap") is not None:
# If no score given, use all contact list
cmpmap = self.allresmap[mapt]["contactmap"]
cmplist = self.allresmap[mapt]['contactmap'].contact_list(
......@@ -119,14 +118,17 @@ class AriaEcContactMap(object):
"define top list related to this map" % mapt)
continue
# TODO: only one function for output files
# Write contact list in txt file
cmpmap.write_contacts("_".join((self.outprefix, mapt)),
outdir=outdir)
# Write comparison stats
cmpmap.compare_contactmap(refmap, scoremap, cmplist, prefix,
# Write cmp stats
cmpmap.compare_contactmap(refmap, cmplist, prefix,
distmap=self.refmap["distmap"],
human_idx=True,
outdir=outdir)
refmap.report(cmpmap, scoremap, outprefix=prefix, outdir=outdir,
plotdir=self.settings.infra.get("graphics", outdir))
# Contact map comparison plot
# TODO: elementwise error with compare method
refmap.compareplot(cmpmap, outprefix=prefix,
......@@ -137,5 +139,6 @@ class AriaEcContactMap(object):
"alpha"),
**plotparams)
# Contingency table
logger.info(pd.crosstab(cmpmap.to_series(), refmap.to_series(),
rownames=[mapt], colnames=[self.reftype]))
# print(cmpmap.to_series())
# logger.info(pd.crosstab(cmpmap.values, refmap.values,
# rownames=[mapt], colnames=[self.reftype]))
No preview for this file type
......@@ -16,11 +16,14 @@ import operator
import pandas as pd
import seaborn as sns
import numpy as np
import datetime
from matplotlib import pyplot as plt
import aria.ConversionTable as ConversionTable
import aria.legacy.AminoAcid as AminoAcid
from matplotlib.lines import Line2D
from .base import (tickmin, tickrot, titleprint)
import sklearn.metrics as skm
logger = logging.getLogger(__name__)
......@@ -40,7 +43,8 @@ class Map(pd.DataFrame):
super(Map, self)._constructor_expanddim()
def __init__(self, index=None, columns=None, mtype='distance',
duplicate_levels=False, data=None, dtype=None, sym=True):
duplicate_levels=False, data=None, dtype=None, sym=True,
desc=""):
"""
:param index:
:param columns:
......@@ -62,6 +66,7 @@ class Map(pd.DataFrame):
if mtype == "score":
self.sort_list = []
self.sym = sym
self.desc = desc
def sortedset(self, human_idx=False):
# Remove duplicate in sort_list
......@@ -94,7 +99,8 @@ class Map(pd.DataFrame):
index = ['-'.join(idx) for idx in self.index]
return getattr(self, '__init__')(self.sequence,
data=self.as_matrix(),
index=index,
index=index, desc=self.desc,
sym=self.sym, mtype=self.mtype,
columns=columns, dtype=self.dtype)
def remove(self, rm_list):
......@@ -288,12 +294,113 @@ class ProteinMap(Map):
if save_fig:
self.saveplot(**kwargs)
def compare_contactmap(self, cmpmap, scoremap, contactlist, outprefix,
def report(self, cmpmap, scoremap, outprefix="", outdir="", plotdir="",
plot_ext="pdf"):
with open("%s/%s.report" % (outdir, outprefix), 'w') as reportf:
y_true = list(self.values.astype(int).flat)
y_pred = list(cmpmap.values.astype(int).flat)
y_scores = list(scoremap.values.astype(float).flat)
map1name = self.desc
map2name = cmpmap.desc
# ROC plot
allfpr, alltpr, rocthresholds = skm.roc_curve(y_true, y_scores,
pos_label=1)
roc_auc = skm.roc_auc_score(y_true, y_scores)
plotpath = os.path.join(plotdir, "%s.roc.%s" % (outprefix,
plot_ext))
plt.figure()
plt.plot(allfpr, alltpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic %s vs. %s' % (
map1name, map2name))
plt.legend(loc="lower right")
plt.savefig(plotpath)
allprec, allrec, prthresholds = skm.precision_recall_curve(
y_true, y_scores)
aver_prec = skm.average_precision_score(y_true, y_scores)
plotpath = os.path.join(plotdir, "%s.precall.%s" % (outprefix,
plot_ext))
# Precision recall curve
plt.clf()
plt.plot(allrec, allprec, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(
aver_prec))
plt.legend(loc="lower left")
plt.savefig(plotpath)
msg = """\
## Report {map1name} vs. {map2name}
##
## Date: {date}
## Path: {outdir}
##
## -----------------------------------------------------------------------------
##
## Accuracy: {accuracy}
## Precision: {precision}
## Recall (Sensibility): {recall}
##
## Matthews correlation coefficient (MCC): {mcc}
## F1 score: {f1s}
## F2 score: {f2s}
## F0.5 score: {f05s}
##
## Hamming loss: {hamm}
## Hinge loss: {hin}
##
## -----------------------------------------------------------------------------
##
## Precision recall curve
##
## Precision values:
## {allprec}
## Recall values:
## {allrec}
## Score tresholds ({map2name}):
## {prthres}
##
## -----------------------------------------------------------------------------
##
## ROC curve
## Area Under Curve: {roc_auc}
## True Positive Rate (Sensibility) values:
## {alltpr}
## False Positive Rate (1 - Specificity) values:
## {allfpr}
## Score tresholds ({map2name}):
## {rocthres}
""".format(map1name=map1name, map2name=map2name,
date=datetime.date.today().strftime("%A %d. %B %Y"),
outdir=outdir, accuracy=skm.accuracy_score(y_true, y_pred),
precision=skm.precision_score(y_true, y_pred),
recall=skm.recall_score(y_true, y_pred),
mcc=skm.matthews_corrcoef(y_true, y_pred),
f1s=skm.f1_score(y_true, y_pred),
f2s=skm.fbeta_score(y_true, y_pred, 2),
f05s=skm.fbeta_score(y_true, y_pred, 0.5),
hamm=skm.hamming_loss(y_true, y_pred),
hin=skm.hinge_loss(y_true, y_pred),
roc_auc=roc_auc, allprec=allprec, allrec=allrec,
prthres=prthresholds, alltpr=alltpr, allfpr=allfpr,
rocthres=rocthresholds)
logger.debug("\n" + msg)
reportf.write(msg)
def compare_contactmap(self, cmpmap, contactlist, outprefix,
outdir="", distmap=None, human_idx=True):
# CSV file giving TP/FP contacts
print(*self.astype(int).values.flat)
print(cmpmap.values)
outfile = open("%s/%s.contact.csv" % (outdir, outprefix), 'w')
outfile = open("%s/%s.contactcmp.csv" % (outdir, outprefix), 'w')
logger.info("Writing stat file %s" % outfile)
offset = 1 if human_idx else 0
extra_header = "" if distmap is None else ",dmin"
......@@ -424,7 +531,8 @@ class ResAtmMap(ProteinMap):
def reduce(self, groupby="min"):
if self.index.nlevels == 1 or not groupby:
return self
newmap = ResMap(self.sequence, dtype=self.dtype)
newmap = ResMap(self.sequence, dtype=self.dtype, desc=self.desc,
sym=self.sym)
if self.dtype == bool:
newmap[:] = self.copy().stack().groupby(level=0).any()
elif groupby == "mean":
......@@ -448,8 +556,9 @@ class ResAtmMap(ProteinMap):
# TODO: issue with sc_sc treshold !!!!!
logger.info("Generate contact map using contact definition %s" %
contactdef)
# Contact map initialize to a boolean matrix filled with False
contact_map = ResAtmMap(sequence=self.sequence, mtype="contact")
# Initialize contact map to a boolean matrix filled with False
contact_map = ResAtmMap(sequence=self.sequence, mtype="contact",
desc=self.desc, sym=self.sym)
if type(contactdef) == float:
contact_map[:] = self.applymap(lambda x: x < contactdef)
......@@ -548,7 +657,8 @@ class ResMap(ResAtmMap):
return None
def contact_map(self, contactdef, **kwargs):
contact_map = ResMap(self.sequence, mtype="contact")
contact_map = ResMap(self.sequence, mtype="contact", desc=self.desc,
sym=self.sym)
def treshconv(x):
return x <= treshold
......
No preview for this file type
......@@ -308,15 +308,17 @@ class ContactMapFile(MapFile):
contactmap = ResMap(protein.aa_sequence.sequence, mtype='contact',
flaglist=kwargs['flaglist'],
seqidx=protein.index, index=idxnames,
columns=colnames, sym=kwargs['sym'])
columns=colnames, sym=kwargs['sym'],
desc=self.filetype)
# DataFrame containing ec scores
scoremap = ResMap(protein.aa_sequence.sequence, mtype='score',
seqidx=protein.index, index=idxnames,
columns=colnames, sym=kwargs['sym']) if self.sort \
else None
columns=colnames, sym=kwargs['sym'],
desc=self.filetype) if self.sort else None
distmap = ResMap(protein.aa_sequence.sequence, mtype='distance',
seqidx=protein.index, index=idxnames, columns=colnames,
sym=kwargs['sym']) if self.filetype == "metapsicovhb" else None
sym=kwargs['sym'], desc=self.filetype) if \
self.filetype == "metapsicovhb" else None
for contact_id in self.lines:
if self.filetype == "metapsicovhb":
......@@ -379,7 +381,7 @@ class PDBFile(MapFile):
flaglist=None, sym=True):
resmap = ResAtmMap(protein.aa_sequence.sequence, mtype='distance',
flaglist=flaglist,
seqidx=protein.index)
seqidx=protein.index, desc=self.filetype)
resmap[:] = self.update_map(resmap, sym=sym)
logger.debug("pdb distance map:\n%s" % resmap)
self.mapdict["alldistmap"] = resmap
......
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment