protmap.py 79.6 KB
Newer Older
1
# coding=utf-8
fabrice's avatar
fabrice committed
2
3
4
5
"""
                    ARIA Evolutionary Constraints Tools
"""
from __future__ import absolute_import, division, print_function
6

Fabrice Allain's avatar
Fabrice Allain committed
7
8
from ..core import ConversionTable as ConversionTable
import collections
9
import csv
10
11
import datetime
import itertools
Fabrice Allain's avatar
Fabrice Allain committed
12
import logging
13
import matplotlib; matplotlib.use("Agg")
14
import numpy as np
Fabrice Allain's avatar
Fabrice Allain committed
15
16
import operator
import os
fabrice's avatar
fabrice committed
17
import pandas as pd
Fabrice Allain's avatar
Fabrice Allain committed
18
import re
fabrice's avatar
fabrice committed
19
import seaborn as sns
Fabrice Allain's avatar
Fabrice Allain committed
20
import six
21
import sklearn.metrics as skm
Fabrice Allain's avatar
Fabrice Allain committed
22
23
24
25
import string
import textwrap
from collections import defaultdict
from copy import deepcopy
26
from matplotlib import pyplot as plt
fabrice's avatar
fabrice committed
27
from matplotlib.lines import Line2D
28

Fabrice Allain's avatar
Fabrice Allain committed
29
from ..core.legacy import AminoAcid as AminoAcid
30
from .common import (tickmin, tickrot, titleprint, addtup)
31
from .ndconv import net_deconv
32

33

34
LOG = logging.getLogger(__name__)
fabrice's avatar
fabrice committed
35
36
37
38
# TODO: check dataframe symmetry or always use unstack
# TODO: objet MapContainer contenant les differentes maps en attributs ! (et
# non en clef de dict)

39

fabrice's avatar
fabrice committed
40
class Map(pd.DataFrame):
41
42
43
44
45
46
    """
    Distance/contact matrix abstract class accepting only one type of value according to mtype arg

    Examples
    --------
    Score/distance matrix
47

48
49
50
51
52
53
54
55
56
57
58
59
60
61
    >>> d = Map(index=[0,1,2], columns=[0,1,2])
    >>> d
        0   1   2
    0 NaN NaN NaN
    1 NaN NaN NaN
    2 NaN NaN NaN
    >>> d = Map(index=[0,1,2], columns=[0,1,2], mtype='contact')
    >>> d
           0      1      2
    0  False  False  False
    1  False  False  False
    2  False  False  False

    """
fabrice's avatar
fabrice committed
62
63

    mtype_choices = {'contact': bool, 'distance': float, "score": float}
64
    _metadata = pd.DataFrame._metadata + ['_sort_list']
fabrice's avatar
fabrice committed
65

Fabrice Allain's avatar
Fabrice Allain committed
66
    def update(self, *args, **kwargs):
67
        """
68
        
69

Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
70
71
72
73
74
75
76
77
78
79
80
81
82
        Parameters
        ----------
        args :
            param kwargs:
        *args :
            
        **kwargs :
            

        Returns
        -------

        
83
        """
Fabrice Allain's avatar
Fabrice Allain committed
84
85
86
87
        super(Map, self).update(*args, **kwargs)

    # def _constructor_expanddim(self):
    #     super(Map, self)._constructor_expanddim()
fabrice's avatar
fabrice committed
88
89

    def __init__(self, index=None, columns=None, mtype='distance',
90
                 duplicate_levels=False, data=None, dtype=None, sym=True,
91
                 desc="", path="", **kwargs):
fabrice's avatar
fabrice committed
92
        """
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
93
94
95
96
97
98
99
100
101
102
103
104
105
        
        Parameters
        ----------
        index
        columns
        mtype
        duplicate_levels
            Allow duplicate levels in dataframe
        data
        dtype
        sym
        desc
        path
fabrice's avatar
fabrice committed
106
107
108
        """
        if not dtype:
            dtype = self.check_type(mtype)
109
        # TODO: should probably init to np.NaN
fabrice's avatar
fabrice committed
110
        if data is None:
Fabrice Allain's avatar
Fabrice Allain committed
111
            data = False if dtype == bool else np.NaN
112
        super(Map, self).__init__(data=data, dtype=dtype, index=index,
fabrice's avatar
fabrice committed
113
114
                                  columns=columns)
        self.duplicate_levels = duplicate_levels
Fabrice Allain's avatar
Fabrice Allain committed
115
        self.mtype = mtype
fabrice's avatar
fabrice committed
116
117
        self.dtype = dtype
        if mtype == "score":
118
            self._sort_list = []
fabrice's avatar
fabrice committed
119
        self.sym = sym
120
        self.desc = desc
121
        self.path = path
fabrice's avatar
fabrice committed
122

123
124
125
    def __str__(self):
        return super(Map, self).__str__()

126
127
128
129
    @property
    def sort_list(self):
        return self._sort_list

fabrice's avatar
fabrice committed
130
    def sortedset(self, human_idx=False):
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
131
        """
132
        
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
133
134
135
136
137
138
139
140
141
142
143

        Parameters
        ----------
        human_idx :
            (Default value = False)

        Returns
        -------

        
        """
fabrice's avatar
fabrice committed
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
        # Remove duplicate in sort_list
        n = 1 if human_idx else 0
        if hasattr(self, "sort_list"):
            if self.sym:
                # Use OrderedDict to keep the order
                return [(x + n, y + n)
                        for x, y in
                        collections.OrderedDict.fromkeys(frozenset(x)
                                                         for x in
                                                         self.sort_list)]
            else:
                # Asym matrix, no need to remove duplicate
                return [(x + n, y + n) for x, y in self.sort_list]
        else:
            return None

    def check_type(self, mtype):
161
        """
162
        
163

Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
164
165
166
167
168
169
170
171
172
        Parameters
        ----------
        mtype :
            return:

        Returns
        -------

        
173
        """
fabrice's avatar
fabrice committed
174
175
176
177
        value = self.mtype_choices.get(mtype)
        if value:
            return value
        else:
178
179
            LOG.error("Map type should be in list %s",
                      self.mtype_choices.keys())
fabrice's avatar
fabrice committed
180
181
182
            return None

    def reduce(self):
Fabrice Allain's avatar
Fabrice Allain committed
183
184
        """Low complexcity dataframe"""
        raise NotImplementedError
185

Fabrice Allain's avatar
Fabrice Allain committed
186
    def copy(self, **kwargs):
187
188
        """
        Copy the current map
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
189
190
191
192
193
194
195
196
197
198
199

        Parameters
        ----------
        **kwargs :
            

        Returns
        -------

        
        """
Fabrice Allain's avatar
Fabrice Allain committed
200
        raise NotImplementedError
fabrice's avatar
fabrice committed
201
202

    def remove(self, rm_list):
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
203
        """
204
        
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
205
206
207
208
209
210
211
212
213
214
215

        Parameters
        ----------
        rm_list :
            

        Returns
        -------

        
        """
fabrice's avatar
fabrice committed
216
        # Reset values at positions in rm_list
Fabrice Allain's avatar
Fabrice Allain committed
217
        value = False if self.dtype == bool else np.NaN
fabrice's avatar
fabrice committed
218
219
220
221
222
223
224
225
226
227
228
229
230
        for contact in rm_list:
            idx1, idx2 = self.index[contact[0]], self.index[contact[1]]
            self.set_value(idx1, idx2, value)
            # self.iat[(contact[0], contact[1])] = value
            # self.iat[(contact[1], contact[0])] = value
            if hasattr(self, 'sort_list'):
                # ! sort_list start at 1
                if (contact[0], contact[1]) in self.sort_list:
                    self.sort_list.remove((contact[0], contact[1]))
                if (contact[1], contact[0]) in self.sort_list and self.sym:
                    self.sort_list.remove((contact[1], contact[0]))

    def to_series(self):
231
        """Return panda series related to lower triangle values"""
fabrice's avatar
fabrice committed
232
233
234
235
236
237
238
239
        df = self.copy()
        df = df.astype(float)
        # remove values from upper triangle
        df.values[np.triu_indices_from(df, k=1)] = np.nan
        # pd.series with only lower triangle values
        return df.unstack().dropna()

    def topmap(self, scoremap, nb_topcontact):
240
        """
241
        
242

Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
243
244
245
246
247
248
249
250
251
252
253
        Parameters
        ----------
        scoremap :
            param nb_topcontact:
        nb_topcontact :
            

        Returns
        -------

        
254
        """
fabrice's avatar
fabrice committed
255
        if self.dtype != bool:
256
            LOG.info("Error when retrieving top contact map. The type of "
257
                     "the given map is not a contact type!")
fabrice's avatar
fabrice committed
258
259
260
261
262
263
264
265
266
            return self
        self[:] = False
        if self.shape == scoremap.shape:
            pair_list = scoremap.sortedset()[:nb_topcontact]
            for contact in pair_list:
                self.iat[(contact[1], contact[0])] = True
                self.iat[(contact[0], contact[1])] = True
            return self
        else:
267
            LOG.error("Given scoremap has not the same dimension !")
fabrice's avatar
fabrice committed
268
269
270
            return None

    def subfill(self, pairdict, level=0):
271
272
        """
        Fill map with dict giving
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
273
274
275
276
277
278
279
280
281
282
283
284

        Parameters
        ----------
        pairdict :
            param level:
        level :
            (Default value = 0)

        Returns
        -------

        
fabrice's avatar
fabrice committed
285
286
287
288
289
290
291
292
        """
        pairdict = {k.upper(): v for k, v in pairdict.items()}
        if "def" in pairdict:
            self[:] = pairdict["def"]
        for idxval in pairdict:
            idx = self.index.get_level_values(level) == idxval
            self.loc[idx, idx] = pairdict[idxval]

293
    # TODO: remove sym attribute --> information is duplicated !!!!
fabrice's avatar
fabrice committed
294
    def set_value(self, index, col, value, **kwargs):
295
        """
296
297
298
299
300
301
        Assign value at [index, col] in the related dataframe

        >>> d = Map(index=[0,1,2], columns=[0,1,2])
        >>> d.set_value(1, 2, 2)
        >>> d.at[1, 2]
        2.0
302

Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
        Parameters
        ----------
        index :
            param col:
        value :
            param kwargs:
        col :
            
        **kwargs :
            

        Returns
        -------

        
318
        """
319
320
321
322
        LOG.debug("Update {index}, {col} with {value} value {default}".format(index=index, col=col, value=value,
                                                                              default=self.at[index, col]))
        # super(Map, self).set_value(index, col, value, **kwargs)
        super(Map, self).at[index, col] = value
fabrice's avatar
fabrice committed
323
        if self.sym:
324
325
            super(Map, self).at[col, index] = value
            # super(Map, self).set_value(col, index, value, **kwargs)
fabrice's avatar
fabrice committed
326
327


328
# TODO: Matrices PosAaAtmMap, AaAtmMap, AtmMap
fabrice's avatar
fabrice committed
329
class ProteinMap(Map):
330
    """
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
331
    Abstract class for protein contact map objects
332
    """
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
333
334
335
336
337
338
    # TODO: matrix should be not only for heavy atoms
    # Matrix only for heavy atoms.
    heavy_reg = re.compile(r"[CNOS][ABGDEZH][0-9]?")
    all_reg = re.compile(r"^((?!cns|dyana).*)$")
    # TODO: Autre methodes de dist
    distance_method = 'euclidean'
339
340
    _metadata = Map._metadata + ['_evflags', '_maplot', '_sequence',
                                 '_atom_groups']
341

342
    def __init__(self, sequence, flaglist=None, atom_groups="min", **kwargs):
343
344
        if kwargs.get("index") is None or kwargs.get("columns") is None:
            kwargs["index"], kwargs["columns"] = self.create_index(
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
345
                sequence, **kwargs)
fabrice's avatar
fabrice committed
346
        super(ProteinMap, self).__init__(**kwargs)
347
        self._evflags = flaglist
fabrice's avatar
fabrice committed
348
        self._maplot = None
349
350
        self._sequence = sequence
        self._atom_groups = atom_groups
fabrice's avatar
fabrice committed
351

Fabrice Allain's avatar
Fabrice Allain committed
352
353
    # def _constructor_expanddim(self):
    #    return self._constructor_expanddim()
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
354
355
356
357
358
359
360
361
362
363
364
    @property
    def sequence(self):
        """
        Specific sequence related to the map in a string
        Returns
        -------
        str

        """
        raise NotImplementedError

365
366
    @property
    def contact_flags(self):
367
        return self._evflags
368

Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
    def create_heatmap(self):
        """
        Generate heatmap

        Parameters
        ----------

        Returns
        -------


        """
        raise NotImplementedError

    def contact_map(self, contactdef, scsc_min=None):
        """
        Generate contact map

        Parameters
        ----------
        contactdef :
            param scsc_min:
        scsc_min :
            (Default value = None)

        Returns
        -------


        """
        raise NotImplementedError

    def create_index(self, sequence, **kwargs):
        """

        Parameters
        ----------
        sequence
        kwargs

        Returns
        -------

        """
        # Indexation matrice (tous les atomes ou tous les residus)
        raise NotImplementedError
fabrice's avatar
fabrice committed
415

Fabrice Allain's avatar
Fabrice Allain committed
416
    def reduce(self):
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
417
        """Lower index level if multi index"""
Fabrice Allain's avatar
Fabrice Allain committed
418
419
420
421
422
423
424
425
426
427
428
        # TODO: check if multiindex
        columns = ['-'.join(idx) for idx in self.columns]
        index = ['-'.join(idx) for idx in self.index]
        return getattr(self, '__init__')(self.sequence,
                                         path=self.path,
                                         data=self.as_matrix(),
                                         index=index, desc=self.desc,
                                         sym=self.sym, mtype=self.mtype,
                                         columns=columns, dtype=self.dtype)

    def copy(self, **kwargs):
429
430
        """
        Copy dataframe and related attributes of the map
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
431
432
433
434
435
436
437
438
439
440

        Parameters
        ----------
        **kwargs :
            

        Returns
        -------

        
Fabrice Allain's avatar
Fabrice Allain committed
441
442
        """
        df = super(Map, self).copy()
443
444
445
        return ProteinMap(
            sequence=self.sequence, path=self.path, data=df, desc=self.desc,
            sym=self.sym, mtype=self.mtype, dtype=self.dtype)
Fabrice Allain's avatar
Fabrice Allain committed
446

fabrice's avatar
fabrice committed
447
    def plotflush(self):
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
448
        """Flush contact map plot"""
fabrice's avatar
fabrice committed
449
450
451
452
453
        plt.clf()
        plt.cla()
        plt.close()
        self._maplot = None

fabrice's avatar
fabrice committed
454
    @property
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
455
    def maplot(self, linewidths=0.0):
456
457
        """
        Contact map plot
458
        :return:
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
459
460
461
462
463
464
465
466
467
468

        Parameters
        ----------
        linewidths :
            (Default value = 0.0)

        Returns
        -------

        
469
        """
fabrice's avatar
fabrice committed
470
471
        # Contact map Plot
        if not self._maplot:
472
            # Flush matplot
473
            LOG.debug("Build maplot")
fabrice's avatar
fabrice committed
474
475
476
            minticks = tickmin(self, shift=1)  # Nb graduations

            self._maplot = sns.heatmap(self, square=True, cbar=False,
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
477
                                       linewidths=linewidths, vmax=1, vmin=-1,
fabrice's avatar
fabrice committed
478
479
480
                                       cmap=sns.diverging_palette(20, 220, n=7,
                                                                  as_cmap=True),
                                       xticklabels=minticks[0],
481
482
                                       yticklabels=minticks[1],
                                       linecolor="grey")
fabrice's avatar
fabrice committed
483
484
485
486
        return self._maplot

    def saveplot(self, outdir='', outprefix="protein", size_fig=10,
                 plot_ext="pdf", plot_dpi=200):
487
488
        """
        Save plot
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505

        Parameters
        ----------
        outdir :
            param outprefix: (Default value = '')
        size_fig :
            param plot_ext: (Default value = 10)
        plot_dpi :
            (Default value = 200)
        outprefix :
            (Default value = "protein")
        plot_ext :
            (Default value = "pdf")

        Returns
        -------

506

507
        """
508
        plotpath = os.path.join(outdir, "%s.maplot.%s" % (
509
            outprefix, plot_ext))
510
        LOG.info("Generate contact map plot (%s)", plotpath)
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
511
        f, ax = plt.subplots()
512
        tickrot(self.maplot.axes, self.maplot.figure)
fabrice's avatar
fabrice committed
513
514
515
516
517
        self.maplot.figure.set_size_inches(size_fig, size_fig)
        map_title = "%s contacts map" % outprefix
        self.maplot.set_title(map_title)
        self.maplot.figure.tight_layout()

518
        f.tight_layout()
519
        self.maplot.figure.savefig(plotpath, dpi=plot_dpi)
520
        plt.close('all')
fabrice's avatar
fabrice committed
521

fabrice's avatar
fabrice committed
522
    def contactset(self, human_idx=False):
523
524
        """
        Remove duplicate in contact_list
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
525
526
527
528
529
530
531
532

        Parameters
        ----------
        human_idx :
            return: (Default value = False)

        Returns
        -------
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
533
        list
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
534
        
535
        """
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
536
537
538
539
540
        contacts = self.contact_list(human_idx)
        if contacts:
            contacts = sorted(set((tuple(sorted((x, y)))
                                   for x, y in contacts)))
        return contacts
fabrice's avatar
fabrice committed
541
542

    def contact_list(self, human_idx=False):
543
544
        """
        Return contact list
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
545
546
547
548
549
550
551
552

        Parameters
        ----------
        human_idx :
            return: (Default value = False)

        Returns
        -------
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
553
        list
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
554
        
555
        """
fabrice's avatar
fabrice committed
556
557
558
        contact_list = []
        n = 1 if human_idx else 0
        if self.dtype is bool:
Fabrice Allain's avatar
Fabrice Allain committed
559
560
            # for irow, row in enumerate(self):
            for irow, row in enumerate(self.index):
fabrice's avatar
fabrice committed
561
562
563
                for icol, value in enumerate(self[row]):
                    if value:
                        contact_list.append((irow + n, icol + n))
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
564
        return contact_list
fabrice's avatar
fabrice committed
565

566
    def compareplot(self, protmap, save_fig=True, alpha=None, **kwargs):
567
        """
568
        Compare 2 contact map and plot the differences
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
569
570
571
572
573

        Parameters
        ----------
        protmap :
        save_fig :
574
575
        alpha :
        kwargs :
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
576
577
578
579

        Returns
        -------

580
        """
581
        self.plotflush()
fabrice's avatar
fabrice committed
582
583
584
585
586
587
        # Contact map plot
        if getattr(protmap, "shape") and self.shape != protmap.shape:
            logging.error("Cant't compare %s map with %s" % (
                protmap.__class__.__name__, self.__class__.__name__))
            return None
        else:
588
            cmplist = protmap.contact_list(human_idx=True)
fabrice's avatar
fabrice committed
589
590
591
592
593
594
595
596
597

            ymax = len(self.sequence) - 1
            if protmap.contact_flags:
                flags = set(protmap.contact_flags.values())
                # Color palette
                pal = sns.color_palette("hls", len(flags))
                for i, flag in enumerate(flags):
                    conlist = [contact for contact in protmap.contact_flags if
                               protmap.contact_flags[contact] == flag]
598
                    xind = [x - .5 for x in
fabrice's avatar
fabrice committed
599
                            zip(*conlist)[0] + zip(*conlist)[1]]
600
                    yind = [ymax - y + 1.5 for y in
fabrice's avatar
fabrice committed
601
602
603
604
                            zip(*conlist)[1] + zip(*conlist)[0]]
                    color = pal[i]
                    mark = Line2D.filled_markers[i]
                    for x, y in zip(xind, yind):
605
606
                        self.maplot.axes.scatter(x, y, s=8, c=color,
                                                 linewidths=0.1, alpha=alpha,
fabrice's avatar
fabrice committed
607
                                                 marker=mark)
fabrice's avatar
fabrice committed
608
            else:
609
                LOG.info("Contact list: %s", cmplist)
610
                xind = [x - .5 for x in
fabrice's avatar
fabrice committed
611
                        zip(*cmplist)[0] + zip(*cmplist)[1]]
612
                yind = [ymax - y + 1.5 for y in
fabrice's avatar
fabrice committed
613
                        zip(*cmplist)[1] + zip(*cmplist)[0]]
614
615
                LOG.debug("Xind: %s", xind)
                LOG.debug("Yind: %s", yind)
fabrice's avatar
fabrice committed
616
617
618
619
                color = "red"
                # width = [0.3 for _ in xind]
                # for x, y, h in zip(xind, yind, width):
                for x, y in zip(xind, yind):
620
621
                    self.maplot.axes.scatter(x, y, s=30, c=color,
                                             linewidths=0,
fabrice's avatar
fabrice committed
622
                                             alpha=alpha)
fabrice's avatar
fabrice committed
623
            if save_fig:
fabrice's avatar
fabrice committed
624
                self.saveplot(**kwargs)
fabrice's avatar
fabrice committed
625

626
627
    @staticmethod
    def _write_report(reportpath, **scores):
628
629
        """
        Write report file from score dict
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
630

631
632
        Parameters
        ----------
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
633
634
635
636
637
638
        reportpath :
            
        scores :
            
        **scores :
            
639
640
641
642

        Returns
        -------

Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
643
        
644
645
        """
        with open(reportpath, 'w') as reportf:
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
            msg = string.Formatter().vformat("""\
## Report {map1name} vs. {map2name}
##
## Date: {date}
##
## Plots: {outdir}
## Reference map: {map1path}
## Contact map: {map2path}
##
## -----------------------------------------------------------------------------
##
## Sequence:
## {seq}
## Protein length: {protlen}
##
## -----------------------------------------------------------------------------
##
## Matthews correlation coefficient (MCC): {mcc}
## F1 score: {f1s}
## F2 score: {f2s}
## F0.5 score: {f05s}
##
## Precision: {precision}
## Recall (Sensibility): {recall}
## Accuracy: {accuracy}
##
## Hamming loss: {hamm}
## Hinge loss: {hin}
##
## -----------------------------------------------------------------------------
##
##                                  Plot scores
##
## ROC Area Under Curve: {roc_auc}
## Average precision score: {aver_prec}
##
## -----------------------------------------------------------------------------
##
##                          Precision recall curve
##
## Precision values:
## {allprec}
## Recall values:
## {allrec}
## Score tresholds ({map2name}):
## {prthres}
##
## -----------------------------------------------------------------------------
##
##                               ROC curve
##
## True Positive Rate (Sensibility) values:
## {alltpr}
## False Positive Rate (1 - Specificity) values:
## {allfpr}
## Score tresholds ({map2name}):
## {rocthres}""", (), defaultdict(str, **scores))
703
704
705
706
707
            LOG.debug("\n" + msg)
            reportf.write(msg)

    @staticmethod
    def classification_metrics(y_true, y_pred, y_scores=None):
708
709
        """
        Compute classification metrics
710
711
712

        Parameters
        ----------
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
713
714
715
716
717
718
        y_true : numpy array of true values
            
        y_pred : numpy array of predicted values
            
        y_scores : numpy array of prediction scores
            (Default value = None)
719
720
721
722

        Returns
        -------

Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
723
        
724
        """
725
        metrics = {}
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741

        if 1 in y_pred and 1 in y_true:
            metrics.update({
                'accuracy': skm.accuracy_score(y_true, y_pred),
                'precision': skm.precision_score(y_true, y_pred),
                'recall': skm.recall_score(y_true, y_pred),
                'mcc': skm.matthews_corrcoef(y_true, y_pred),
                'f1s': skm.f1_score(y_true, y_pred),
                'f2s': skm.fbeta_score(y_true, y_pred, 2),
                'f05s': skm.fbeta_score(y_true, y_pred, 0.5),
                'hamm': skm.hamming_loss(y_true, y_pred),
                'hin': skm.hinge_loss(y_true, y_pred)
            })

        if y_scores:
            # ROC plot
Fabrice Allain's avatar
Fabrice Allain committed
742
743
            # Replace nan values in y_scores
            y_scores[np.isnan(y_scores)] = np.floor(np.nanmin(y_scores))
744
745
746
747
748
749
750
751
752
753
754
755
756
757
            metrics.update({
                'roc_auc': skm.roc_auc_score(y_true, y_scores),
                'aver_prec': skm.average_precision_score(y_true, y_scores)
            })
            metrics['allfpr'], metrics['alltpr'], metrics['rocthres'] = \
                skm.roc_curve(y_true, y_scores, pos_label=1)
            metrics['allprec'], metrics['allrec'], metrics['prthres'] = \
                skm.precision_recall_curve(y_true, y_scores)

        return metrics

    @staticmethod
    def _metrics_plot(metrics, mapnames=('', ''), plotdir="", outprefix="",
                      plot_ext="pdf"):
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
758
        """
759
        
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774

        Parameters
        ----------
        metrics :
            
        mapnames :
            (Default value = ('')
        '' :
            

        Returns
        -------

        
        """
Fabrice Allain's avatar
Fabrice Allain committed
775
        outprefix = outprefix if outprefix else "maplot"
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
        csv_roc = os.path.join(plotdir, "%s.roc.csv" % outprefix)
        LOG.info("Generate roc file (%s)", csv_roc)
        with open(csv_roc, "w") as f:
            f.write("TPR,FPR,Treshold\n")
            writer = csv.writer(f)
            writer.writerows(zip(metrics['alltpr'], metrics['allfpr'],
                                 metrics['rocthres']))

        plotpath = os.path.join(plotdir, "%s.roc.%s" % (outprefix,
                                                        plot_ext))
        LOG.info("Generate roc plot (%s)", plotpath)
        plt.figure()
        plt.plot(metrics['allfpr'], metrics['alltpr'],
                 label='ROC curve (area = %0.2f)' % metrics['roc_auc'])
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic %s vs. %s' % (
            mapnames[0], mapnames[1]))
        plt.legend(loc="lower right")
        plt.savefig(plotpath)

        csv_precall = os.path.join(plotdir, "%s.roc.csv" % outprefix)
        LOG.info("Generate precall file (%s)", csv_precall)
        with open(csv_precall, "w") as f:
            f.write("Precision,Recall,Treshold\n")
            writer = csv.writer(f)
            writer.writerows(zip(metrics['allprec'], metrics['allrec'],
                                 metrics['prthres']))

        plotpath = os.path.join(plotdir, "%s.precall.%s" % (outprefix,
                                                            plot_ext))
        LOG.info("Generate precall plot (%s)", plotpath)
        # Precision recall curve
        plt.clf()
        plt.plot(metrics['allrec'], metrics['allprec'],
                 label='Precision-Recall curve')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.0])
        plt.xlim([0.0, 1.0])
        plt.title('Precision-Recall {1} vs. {2}: AUC={0:0.2f}'.format(
            metrics['aver_prec'], mapnames[0], mapnames[1]))
        plt.legend(loc="lower left")
        plt.savefig(plotpath)

    def report(self, cmpmap, scoremap=None, outprefix="", outdir="", plotdir="",
825
826
827
828
               # plot_ext="pdf", plotag=True, n_factors=(0.5, 1.0, 1.5)):
               plot_ext="pdf", plotag=True):
        """
        Generate contact map report file
829
830
831

        Parameters
        ----------
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
832
833
834
835
836
837
838
839
840
841
842
843
844
845
        cmpmap :
            
        scoremap :
            (Default value = None)
        outprefix :
            (Default value = "")
        outdir :
            (Default value = "")
        plotdir :
            (Default value = "")
        plot_ext :
            (Default value = "pdf")
        plotag :
            (Default value = True)
846
847
848
849

        Returns
        -------

850
        """
851
852
853
        filename = ".".join((outprefix, "mapreport")) if outprefix else \
            "mapreport"
        reportpath = "%s/%s" % (outdir, filename)
854
        LOG.info("Generate map report file (%s)", reportpath)
855

856
        # TODO: evaluate map statistics for several treshold (use submap var)
857
858
        # nb_c = int(len(self.sequence) * float(n_factors[0]))
        # submap = cmpmap.copy()
859
860
        # print(np.array_equal(cmpmap.values.astype(int).flat,
        #                      submap.topmap(scoremap, nb_c).values.astype(int).flat))
861
862
863

        y_true = list(self.values.astype(int).flat)
        y_pred = list(cmpmap.values.astype(int).flat)
Fabrice Allain's avatar
Fabrice Allain committed
864
        y_scores = scoremap.values.astype(float).flat if scoremap is not None else None
865

Fabrice Allain's avatar
Fabrice Allain committed
866
        # Compute classification metrics for the entire map
867
868
869
870
871
872
873
874
875
        metrics = self.classification_metrics(y_true, y_pred, y_scores)

        self._write_report(
            reportpath, map1name=self.desc, map2name=cmpmap.desc,
            map1path=self.path, map2path=cmpmap.path,
            seq="\n## ".join(textwrap.wrap(self.sequence, width=77)),
            protlen=len(self.sequence),
            date=datetime.date.today().strftime("%A %d. %B %Y"),
            outdir=outdir, **metrics)
876

877
        if plotag and scoremap is not None:
878
879
880
            self._metrics_plot(metrics, mapnames=(self.desc, cmpmap.desc),
                               outprefix=outprefix, plot_ext=plot_ext,
                               plotdir=plotdir)
881

882
    def compare_contactmap(self, cmpmap, contactlist, outprefix,
883
                           outdir="", distmap=None, human_idx=True):
884
885
        """
        Compare 2 contact map and plot differences
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905

        Parameters
        ----------
        cmpmap :
            param contactlist:
        outprefix :
            param outdir:
        distmap :
            param human_idx: (Default value = None)
        contactlist :
            
        outdir :
            (Default value = "")
        human_idx :
            (Default value = True)

        Returns
        -------

        
906
        """
fabrice's avatar
fabrice committed
907
        # CSV file giving TP/FP contacts
Fabrice Allain's avatar
Fabrice Allain committed
908
909
        outpath = "%s/%s.contactcmp.csv" % (outdir,
                                            outprefix if outprefix else "maplot")
910
        LOG.info("Generate stat file (%s)", outpath)
911
912
913
        with open(outpath, 'w') as outfile:
            offset = 1 if human_idx else 0
            extra_header = "" if distmap is None else ",dmin"
914
            outfile.write("#resid1,resid2,res1,res2,TP/FP%s\n" % extra_header)
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
            for x in contactlist:
                if extra_header:
                    dmin = "," + str(distmap.iat[(int(x[0]) - offset, int(x[1]) -
                                                  offset)])
                else:
                    dmin = ""
                contact = self.iat[(int(x[0]) - offset, int(x[1]) - offset)]
                cmpcontact = cmpmap.iat[(int(x[0]) - offset, int(x[1]) - offset)]
                if contact and cmpcontact:
                    eq = "TP"
                elif contact and not cmpcontact:
                    eq = "FP"
                elif not contact and cmpcontact:
                    eq = "FN"
                else:
                    eq = "TN"
931
                msg = "%s,%s,%s,%s,%s%s\n" % (x[0], x[1],
932
                                            self.sequence[int(x[0]) - offset],
933
934
                                            self.sequence[int(x[1]) - offset],
                                            eq,
935
                                            dmin)
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
936
                outfile.write(msg)
fabrice's avatar
fabrice committed
937

Fabrice Allain's avatar
Fabrice Allain committed
938
    def write_contacts(self, filename, outdir="", prefix="", human_idx=True,
939
                       scoremap=None):
940
        """
941
        
942

Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
943
944
        Parameters
        ----------
Fabrice Allain's avatar
Fabrice Allain committed
945
        prefix
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
946
947
948
949
950
951
952
953
954
955
956
957
958
        filename :
            param outdir:
        human_idx :
            param scoremap: (Default value = True)
        outdir :
            (Default value = "")
        scoremap :
            (Default value = None)

        Returns
        -------

        
959
        """
Fabrice Allain's avatar
Fabrice Allain committed
960
961
        prefix = prefix + "_" if prefix else prefix
        filepath = "%s/%s%s.contact.txt" % (outdir, prefix, filename.replace(".", "_"))
962
        LOG.info("Generate contact file (%s)", filepath)
963
964
965
966
967
968
        with open(filepath, 'w') as outfile:
            offset = 1 if human_idx else 0
            # contacts = [sorted(contact) for contact in self.contactset()]
            # for contact in sorted(contacts):
            for contact in self.contactset():
                contact = sorted([contact[0] + offset, contact[1] + offset])
969
970
971
                if scoremap is not None:
                    score = scoremap.iat[(int(contact[0]) - offset,
                                          int(contact[1]) - offset)]
972
                    outfile.write("%d %d %.4f\n" %
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
973
                                  (contact[0], contact[1], score))
974
                else:
975
                    outfile.write("%d %d\n" % (contact[0], contact[1]))
fabrice's avatar
fabrice committed
976

fabrice's avatar
fabrice committed
977

fabrice's avatar
fabrice committed
978
class ResAtmMap(ProteinMap):
979
980
    """
    Protein distance/contact matrix for all atom pairs. If no sequence given,
fabrice's avatar
fabrice committed
981
    protein distance/contact matrix for all amino acids
982
983

    Attributes
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
984
985
    ----------

986
987
    Examples
    --------
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
988

fabrice's avatar
fabrice committed
989
990
    """

Fabrice Allain's avatar
Fabrice Allain committed
991
992
    # def _constructor_expanddim(self):
    #     super(ResAtmMap, self)._constructor_expanddim()
fabrice's avatar
fabrice committed
993

994
    def __init__(self, sequence, flaglist=None, atom_groups="min", **kwargs):
fabrice's avatar
fabrice committed
995
996
997
998
999
        # Sequence: 1L string or MultiIndex object
        # Dataframe is in 3L code
        if not sequence:
            sequence = ConversionTable.ConversionTable().table['AMINO_ACID'][
                'iupac'].keys()
Fabrice  ALLAIN's avatar
Fabrice ALLAIN committed
1000
        # Super call will initialize index and columns with self.create_index()
For faster browsing, not all history is shown. View entire blame