reader.py 25.3 KB
Newer Older
1
# coding=utf-8
fabrice's avatar
fabrice committed
2
3
4
5
6
7
8
9
"""
                            Reader objects
"""
from __future__ import absolute_import, division, print_function

import os
import re
import logging
10
import os.path
fabrice's avatar
fabrice committed
11
12
13
14
15
import collections
import scipy.spatial.distance as distance
from .base import sort_2dict
from .protmap import (ResMap, ResAtmMap)

16
LOG = logging.getLogger(__name__)
fabrice's avatar
fabrice committed
17
18
19
20
Atom = collections.namedtuple("Atom", ["name", "coords"])


class RegexFile(object):
21
22
23
24
    """
    File which can be parsed with a regex
    """

fabrice's avatar
fabrice committed
25
26
27
28
29
    def __init__(self, filepath, filetype='', regex='', sort=''):
        self.regex = regex
        self.sort = sort
        self.filepath = filepath
        self.filetype = filetype
30
        self.filename = os.path.basename(os.path.splitext(filepath)[0])
fabrice's avatar
fabrice committed
31
32
33
34
35
36
37
38
39
40
        self.lines = {}

    def load(self):
        """
        Fill lines with dictionary. Each key is a line number in the given file
        :return: None
        """
        lines_dict = {}

        if not self.regex:
41
            LOG.error("Can't parse file %s", self.filepath)
fabrice's avatar
fabrice committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58

        with open(self.filepath) as f:
            for index, line in enumerate(f):
                match = self.regex.match(line)
                if match:
                    lines_dict[index] = match.groupdict()

        if self.sort:
            lines_dict = sort_2dict(lines_dict, self.sort)

        self.lines = lines_dict


class MapFile(RegexFile):
    # List of 3tuples ("regex_file", "filetype", "sort_field")
    # sort_field allow sorting lines with values into this field
    # TODO: wrong regex for native_full ?
59
    # TODO: smarter dict ...
60
61
62
    """
    Map file class
    """
fabrice's avatar
fabrice committed
63
64
    types = {
        "plmdca": {
fabrice's avatar
fabrice committed
65
66
67
68
            "regex": re.compile(r"^(?P<res1_nb>\d+)\s+(?P<res1_name>\w)\s+"
                                r"(?P<res2_nb>\d+)\s+(?P<res2_name>\w)\s+"
                                r"(?P<mi_score>\d)\s+"
                                r"(?P<plm_score>\-?\d+\.\d+)\s*$"),
fabrice's avatar
fabrice committed
69
70
71
72
            "score_field": "plm_score"
        },
        "evfold": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
73
74
75
76
77
78
79
                r'^(?P<res1_nb>\d+),(?P<res2_nb>\d+),'
                r'(?P<ec_score>\-?\d+\.\d+e?\-?\d*),'
                r'(?P<placeholder>\d),(?P<res1_cons>\d+),'
                r'(?P<res2_cons>\d+),(?P<ss_filter>\d|\d{3}),'
                r'(?P<high_cons_filter>\d|\d{3}),'
                r'(?P<cc_filter>\d|\d{3}),(?P<res1_1l_code>\w),'
                r'(?P<res2_1l_code>\w)$'),
fabrice's avatar
fabrice committed
80
81
82
83
            "score_field": "ec_score"
        },
        "pconsc": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
84
85
                r'^(?P<res1_nb>\d+) (?P<res2_nb>\d+) '
                r'(?P<ec_score>\-?\d+\.\d+e?\-?\d*)$'),
fabrice's avatar
fabrice committed
86
87
            "score_field": "ec_score"
        },
88
89
        "pconsc1": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
90
91
                r'^(?P<res1_nb>\d+) (?P<res2_nb>\d+) '
                r'(?P<ec_score>\-?\d+\.\d+e?\-?\d*)$'),
92
93
94
95
            "score_field": "ec_score"
        },
        "pconsc2": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
96
97
                r'^(?P<res1_nb>\d+) (?P<res2_nb>\d+) '
                r'(?P<ec_score>\-?\d+\.\d+e?\-?\d*)$'),
98
99
100
101
            "score_field": "ec_score"
        },
        "metapsicov_stg1": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
102
103
                r'^(?P<res1_nb>\d+) (?P<res2_nb>\d+) \d (?P<res_dist>-?\d+.?\d*) '
                r'(?P<ec_score>\-?\d+.\d+e?\-?\d*)$'),
104
105
106
107
            "score_field": "ec_score"
        },
        "metapsicov_stg2": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
108
109
                r'^(?P<res1_nb>\d+) (?P<res2_nb>\d+) \d (?P<res_dist>-?\d+.?\d*) '
                r'(?P<ec_score>\-?\d+.\d+e?\-?\d*)$'),
110
111
112
            "score_field": "ec_score"
        },
        "psicov": {
fabrice's avatar
fabrice committed
113
            "regex": re.compile(
fabrice's avatar
fabrice committed
114
115
                r'^(?P<res1_nb>\d+) (?P<res2_nb>\d+) \d (?P<res_dist>-?\d+.?\d*) '
                r'(?P<ec_score>\-?\d+.\d+e?\-?\d*)$'),
fabrice's avatar
fabrice committed
116
117
118
119
            "score_field": "ec_score"
        },
        "gremlin": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
120
121
122
123
124
125
                r'^(?P<res1_nb>\d+)\t(?P<res2_nb>\d+)\t'
                r'(?P<res1_id>\d+_[AC-IK-NP-TVWYZ])\t'
                r'(?P<res2_id>\d+_[AC-IK-NP-TVWYZ])\t'
                r'(?P<raw_score>\-?\d+\.\d+e?\-?\d*)\t'
                r'(?P<scale_score>\-?\d+\.\d+e?\-?\d*)\t'
                r'(?P<prob>\-?\d+\.\d+e?\-?\d*)'),
fabrice's avatar
fabrice committed
126
127
128
129
            "score_field": "scale_score"
        },
        "native": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
130
131
132
                r'^\s*(?P<res1_nb>\d+)\s+(?P<res2_nb>\d+)\s+('
                r'?P<ca_ca>\d+\.\d+)\s+(?P<cb_cb>\d+\.\d+)\s+'
                r'(?P<sc_sc>\d+\.\d+)\s+(?P<valid>\w+)'),
fabrice's avatar
fabrice committed
133
134
135
136
            "score_field": None
        },
        "native_full": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
137
138
139
                r'^\s*(?P<res1_nb>\d+)\s+(?P<res2_nb>\d+)\s+('
                r'?P<ca_ca>\d+\.\d+)\s+(?P<cb_cb>\d+\.\d+)\s+'
                r'(?P<sc_sc>\d+\.\d+)\s+(?P<valid>\w+)'),
fabrice's avatar
fabrice committed
140
141
142
143
            "score_field": None
        },
        "bbcontacts": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
144
145
146
147
148
149
                r'\s*(?P<identifier>\w+)\s+(?P<diversity>-?\d+\.?\d*)'
                r'\s+(?P<direction>Parallel|Antiparallel)'
                r'\s+(?P<viterbiscore>-?\d+\.?\d*)'
                r'\s+(?P<indexpred>\d+)'
                r'\s+(?P<state>first|internal|last)'
                r'\s+(?P<res1_nb>\d+)\s+(?P<res2_nb>\d+)'),
fabrice's avatar
fabrice committed
150
151
152
153
            "score_field": None
        },
        "contactlist": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
154
155
                r'^\s*(?P<res1_nb>\d+)[\s,;]+(?P<res2_nb>\d+)[\s,'
                r';]*(?P<con_flag>\w*)'),
fabrice's avatar
fabrice committed
156
157
158
159
            "score_field": None
        },
        "metapsicovhb": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
160
161
162
163
                r'^\s*(?P<res_donor>\d+)[\s,;]+'
                r'(?P<res_acceptor>\d+)[\s,;]+\d[\s,;]+'
                r'(?P<res_dist>-?\d+.?\d*)[\s,;]+'
                r'(?P<hbscore>-?\d+\.?\d*)'),
fabrice's avatar
fabrice committed
164
165
            "score_field": "hbscore"
        },
166
167
        "default_1": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
168
169
170
171
                r'^\s*(?P<res1_nb>\d+)\s+(?P<res2_nb>\d+)\s+'
                r'(?P<resn1>\w+)\s+'
                r'(?P<resn2>\w+)\s+'
                r'(?P<score>[\w\d\.\+\-]+)'),
172
173
174
175
            "score_field": "score"
        },
        "default_2": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
176
177
                r'^\s*(?P<res1_nb>\d+)\s+(?P<res2_nb>\d+)\s+'
                r'(?P<score>[\w\d\.\+\-]+)'),
178
179
180
            "score_field": "score"
        },
        "default_3": {
fabrice's avatar
fabrice committed
181
            "regex": re.compile(
fabrice's avatar
fabrice committed
182
183
184
                r'^\s*(?P<res1_nb>\d+)\s+(?P<res2_nb>\d+)\s+'
                r'(?P<resn1>\w+)\s+'
                r'(?P<resn2>\w+)\s+'),
185
186
187
188
            "score_field": None
        },
        "default_4": {
            "regex": re.compile(
fabrice's avatar
fabrice committed
189
                r'^\s*(?P<res1_nb>\d+)\s+(?P<res2_nb>\d+)\s+'),
190
            "score_field": None
191
192
193
194
        },
        "empty": {
            "regex": re.compile(r'^\s*$'),
            "score_field": None
fabrice's avatar
fabrice committed
195
196
197
198
199
        }
    }
    check_type = True

    def __init__(self, *args, **kwargs):
fabrice's avatar
fabrice committed
200
201
202
203
204
205
        """

        :param args:
        :param kwargs:
        :return:
        """
fabrice's avatar
fabrice committed
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
        super(MapFile, self).__init__(*args, **kwargs)
        if self.check_type:
            self.regex, self.filetype, self.sort = self.check_maptype()
        self.mapdict = {"alldistmap": None,
                        "allcontactmap": None,
                        "distmap": None,
                        "contactmap": None,
                        "scoremap": None}
        self.clashlist = None
        self.contactlist = None
        self.flaglist = None
        # self.contactmap = None
        # self.distmap = None

    def create_map(self, protein, contactdef, flaglist=None, offset=0, sym=True,
221
                   path="", **kwargs):
fabrice's avatar
fabrice committed
222
223
        """

224
        :param path:
fabrice's avatar
fabrice committed
225
226
227
228
229
230
231
232
        :param protein:
        :param contactdef:
        :param flaglist:
        :param offset:
        :param sym:
        :param kwargs:
        :return:
        """
fabrice's avatar
fabrice committed
233
234
235
236
237
238
239
240
        raise NotImplementedError("Class %s doesn't implement create_map" %
                                  self.__class__.__name__)

    def check_contacts(self, aa_seq):
        """
        Check if plm_dict is consistent with input sequence
        :param aa_seq:
        """
241
        LOG.info("Checking consistency of contacts with input sequence")
fabrice's avatar
fabrice committed
242
243
244
        for line in self.lines:
            if self.lines[line]['res1_name'] != aa_seq[int(self.lines[line]['res1_nb']) - 1] \
                    or self.lines[line]['res2_name'] != aa_seq[int(self.lines[line]['res2_nb']) - 1]:
245
                LOG.error("Difference between given sequence and residu "
246
                          "names in contact file at line %d !", line)
fabrice's avatar
fabrice committed
247
248

    def update_map(self, resmap):
fabrice's avatar
fabrice committed
249
250
251
252
253
        """

        :param resmap:
        :return:
        """
fabrice's avatar
fabrice committed
254
255
256
257
        raise NotImplementedError("Class %s doesn't implement update_map" %
                                  self.__class__.__name__)

    def check_maptype(self):
fabrice's avatar
fabrice committed
258
259
260
261
        """

        :return:
        """
262
263
        LOG.info("Checking if file %s correspond to %s format", self.filepath,
                 self.filetype)
264
        # Check if given type is supported
fabrice's avatar
fabrice committed
265
        # TODO: report this check into commands section
266
267
268
269
270
271
272
273
        defaults = ("default_1", "default_2", "default_3", "default_4", "empty")
        if os.stat(self.filepath).st_size == 0:
            LOG.warning("File %s is empty !", self.filepath)
            return [
                self.types["empty"].get("regex"),
                self.filetype,
                self.types["empty"].get("score_field")
            ]
fabrice's avatar
fabrice committed
274
275
276
        with open(self.filepath) as infile:
            # Check first and second line of file
            for index, line in enumerate(infile):
fabrice's avatar
fabrice committed
277
                if self.filetype in self.types:
278
279
                    LOG.info("Given format (%s) should be supported",
                             self.filetype)
fabrice's avatar
fabrice committed
280
281
282
                    match = self.types[self.filetype].get("regex").match(line)
                else:
                    match = None
283
                    LOG.error("Format %s not supported !", self.filetype)
fabrice's avatar
fabrice committed
284
                if match:
285
                    LOG.debug("Format type correct")
fabrice's avatar
fabrice committed
286
287
288
289
290
                    return [
                        self.types[self.filetype].get("regex"),
                        self.filetype,
                        self.types[self.filetype].get("score_field")
                    ]
291
292
293
294
295
296
297
298
299
300
301
                else:
                    LOG.warning("Given type do not correspond, checking default"
                                " format for contactlist or empty file...")
                    for subformat in defaults:
                        if self.types.get(subformat)["regex"].match(line):
                            LOG.debug("Format type correct %s", subformat)
                            return [
                                self.types[subformat].get("regex"),
                                self.filetype,
                                self.types[subformat].get("score_field")
                            ]
fabrice's avatar
fabrice committed
302
                if index > 2:
303
                    # Stop checking after second line
304
                    LOG.error("Error reading %s file.", self.filetype)
fabrice's avatar
fabrice committed
305
                    break
306
        LOG.error("Wrong format type given ...")
fabrice's avatar
fabrice committed
307
308
309
310
        return [None] * 3

    def read(self, protein=None, contactdef=5.0, groupby_method="min",
             scsc=None):
fabrice's avatar
fabrice committed
311
312
313
314
315
316
317
318
        """

        :param protein:
        :param contactdef:
        :param groupby_method:
        :param scsc:
        :return:
        """
319
        LOG.info("Reading %s file", self.filepath)
fabrice's avatar
fabrice committed
320
321
322
        if self.filetype:
            # Read file with regex related to filetype
            self.load()
323
            LOG.debug(self.lines)
fabrice's avatar
fabrice committed
324
        if protein:
325
            LOG.info("Loading contact list")
fabrice's avatar
fabrice committed
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
            if self.filetype == "contactlist":
                self.flaglist = {
                    tuple(sorted(
                        [int(self.lines[contact].get("res1_nb")),
                         int(self.lines[contact].get("res2_nb"))]
                    )): self.lines[contact].get("con_flag")
                    for contact in self.lines if
                    self.lines[contact].get("res1_nb") and
                    self.lines[contact].get("res2_nb")}
            if self.filetype == "metapsicovhb":
                # HB contacts aren't sorted since the first res correspond to
                #  donor and second to acceptor
                self.contactlist = [
                    tuple([int(self.lines[contact].get("res_donor")),
                           int(self.lines[contact].get("res_acceptor"))])
                    for contact in self.lines
                    if self.lines[contact].get("res_donor") and
                    self.lines[contact].get("res_acceptor")]
            else:
                self.contactlist = [
                    tuple(sorted(
                        [int(self.lines[contact].get("res1_nb")),
                         int(self.lines[contact].get("res2_nb"))]))
                    for contact in self.lines
                    if self.lines[contact].get("res1_nb") and
                    self.lines[contact].get("res2_nb")]
352
            LOG.debug(self.contactlist)
fabrice's avatar
fabrice committed
353
354
355
            sym = False if self.filetype == "metapsicovhb" else True
            self.create_map(protein, contactdef,
                            groupby_method=groupby_method, scsc=scsc,
356
                            flaglist=self.flaglist, sym=sym, path=self.filepath)
fabrice's avatar
fabrice committed
357
358
359
360
361
            if self.filetype == "plmdca":
                # If contact filetype contain residues name, check if it is
                # consistent with given sequence
                self.check_contacts(protein.aa_sequence.sequence)
            if self.filetype == "evfold":
362
                LOG.info("Loading evfold clash list")
fabrice's avatar
fabrice committed
363
364
365
366
367
368
369
370
                self.clashlist = [
                    next((el for el in (self.lines[contact].get("ss_filter"),
                                        self.lines[contact].get("high_cons_filter"),
                                        self.lines[contact].get("cc_filter")) if el != "0"), "0")
                    for contact in self.lines if
                    self.lines[contact].get("res1_nb") and
                    self.lines[contact].get("res2_nb")]
                if len(self.contactlist) != len(self.clashlist):
371
                    LOG.error("When reading input file, clash list is not "
372
                              "the same length than contactlist")
373
                LOG.debug(self.clashlist)
fabrice's avatar
fabrice committed
374
375
376
377


class ContactMapFile(MapFile):
    # "plmdca", "evfold", "bbcontacts", "pconsc", "gremlin", "metapsicov",
378
379
380
381
    """
    Contact map file
    """

fabrice's avatar
fabrice committed
382
    def __init__(self, filepath, filetype):
fabrice's avatar
fabrice committed
383
384
385
386
387
388
        """

        :param filepath:
        :param filetype:
        :return:
        """
fabrice's avatar
fabrice committed
389
390
391
        super(self.__class__, self).__init__(filepath, filetype)

    def update_map(self, resmap):
392
393
394
395
396
        """

        :param resmap:
        :return:
        """
fabrice's avatar
fabrice committed
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
        # TODO: swap dataframe factory here
        raise NotImplementedError

    def create_map(self, protein, *args, **kwargs):
        """
        Get res - res map
        :param protein:
        :param args:
        :param kwargs:
        :return:
        """
        offset = min(protein.index)  # Should be 1 or upper (human_idx)
        idxnames = ["residuex"] if self.filetype != "metapsicovhb" else [
            "donor"]
        colnames = ["residuey"] if self.filetype != "metapsicovhb" else [
            "acceptor"]
        contactmap = ResMap(protein.aa_sequence.sequence, mtype='contact',
                            flaglist=kwargs['flaglist'],
                            seqidx=protein.index, index=idxnames,
416
                            columns=colnames, sym=kwargs['sym'],
417
                            desc=self.filetype, path=kwargs.get("path"))
fabrice's avatar
fabrice committed
418
419
420
        # DataFrame containing ec scores
        scoremap = ResMap(protein.aa_sequence.sequence, mtype='score',
                          seqidx=protein.index, index=idxnames,
421
                          columns=colnames, sym=kwargs['sym'],
422
                          path=kwargs.get("path"),
423
                          desc=self.filetype) if self.sort else None
fabrice's avatar
fabrice committed
424
        distmap = ResMap(protein.aa_sequence.sequence, mtype='distance',
425
426
427
                         seqidx=protein.index, index=idxnames, path=kwargs.get("path"),
                         columns=colnames, sym=kwargs['sym'],
                         desc=self.filetype) if self.filetype == "metapsicovhb" else None
fabrice's avatar
fabrice committed
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443

        for contact_id in self.lines:
            if self.filetype == "metapsicovhb":
                resid1 = int(self.lines[contact_id].get('res_donor'))
                resid2 = int(self.lines[contact_id].get('res_acceptor'))
                dist = float(self.lines[contact_id].get('res_dist'))
            else:
                resid1 = int(self.lines[contact_id].get('res1_nb'))
                resid2 = int(self.lines[contact_id].get('res2_nb'))
                dist = None
            # Res id start from 0 in res-res map
            residx1 = contactmap.index[resid1 - offset]
            residx2 = contactmap.index[resid2 - offset]

            if (int(residx1.split("-")[0]) != resid1) or \
                    (resid2 != int(residx2.split("-")[0])):
444
                LOG.error("Wrong resid humanidx (%d, %d) in contact (%d) is "
445
                          "not the same in resmap (%d, %d)",
446
447
448
                          resid1, resid2, contact_id,
                          int(residx1.split("-")[0]),
                          int(residx2.split("-")[0]))
fabrice's avatar
fabrice committed
449
450
451
452
453
454
455
456
457

            contactmap.set_value(residx1, residx2, True)
            if self.sort:
                scoremap.sort_list.append((resid1 - offset, resid2 - offset))
                scoremap.set_value(residx1, residx2,
                                   float(self.lines[contact_id].get(self.sort)))
            if distmap is not None:
                distmap.set_value(residx1, residx2, dist)

458
        LOG.debug("%s contact map:\n%s", self.filetype, contactmap)
fabrice's avatar
fabrice committed
459
        self.mapdict["contactmap"] = contactmap
460
        LOG.debug("%s score map:\n%s", self.filetype, scoremap)
fabrice's avatar
fabrice committed
461
462
463
464
465
466
        self.mapdict["scoremap"] = scoremap
        if distmap is not None:
            self.mapdict["distmap"] = distmap


class PDBFile(MapFile):
467
468
469
    """
    PDB file
    """
fabrice's avatar
fabrice committed
470
471
472
473
474
475
476
477
478
479
480
    pdbreg = re.compile(r'^(?P<record>ATOM  |HETATM)(?P<serial>[\s\w]{5})'
                        r'\s(?P<name>[\s\w]{4})'
                        r'(?P<altLoc>[\s\w])'
                        r'(?P<resName>\w{3})\s(?P<chainID>\w)'
                        r'(?P<resSeq>[\s\w]{4})(?P<iCode>[\s\w])'
                        r'\s{3}(?P<x>[\s\d-]{4}\.\d{3})(?P<y>[\s\d-]{4}\.\d{3})'
                        r'(?P<z>[\s\d-]{4}\.\d{3})'
                        r'(?P<occupancy>[\s\d-]{3}\.\d{2})'
                        r'(?P<tempFactor>[\s\d-]{3}\.\d{2})'
                        r'\s{10}(?P<element>[\s\w]{2})'
                        r'(?P<charge>[\s\w]{2})')
fabrice's avatar
fabrice committed
481
482
483
484
485
486
487
488
489

    def __init__(self, *args, **kwargs):
        # TODO: use PDB object in aria
        # TODO: write dataframe in a separated file
        self.check_type = False
        super(PDBFile, self).__init__(*args, regex=self.pdbreg, filetype="pdb",
                                      **kwargs)

    def create_map(self, protein, contactdef, groupby_method="min", scsc=None,
490
                   flaglist=None, sym=True, path=""):
491
492
493
494
495
496
497
498
499
500
        """

        :param protein:
        :param contactdef:
        :param groupby_method:
        :param scsc:
        :param flaglist:
        :param sym:
        :param path:
        """
fabrice's avatar
fabrice committed
501
        resmap = ResAtmMap(protein.aa_sequence.sequence, mtype='distance',
502
                           flaglist=flaglist, path=path,
503
                           seqidx=protein.index, desc=self.filetype)
504
        # noinspection PyTypeChecker
fabrice's avatar
fabrice committed
505
        resmap[:] = self.update_map(resmap, sym=sym)
506
        LOG.debug("pdb distance map:\n%s", resmap)
fabrice's avatar
fabrice committed
507
508
509
510
511
512
513
        self.mapdict["alldistmap"] = resmap
        self.mapdict["distmap"] = resmap.reduce(groupby=groupby_method)
        self.mapdict["allcontactmap"] = resmap.contact_map(
            contactdef=contactdef, scsc_min=scsc)
        self.mapdict["contactmap"] = self.mapdict["allcontactmap"].reduce()

    def update_map(self, resmap, sym=True):
514
515
516
517
518
519
        """

        :param resmap:
        :param sym:
        :return:
        """
fabrice's avatar
fabrice committed
520
521
        # Map only on heavy atoms
        # TODO: check if same sequence in pdb file
522
        LOG.info("Updating distance map with pdb file")
fabrice's avatar
fabrice committed
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
        newmap = resmap.copy()
        heavylist = []
        error_list = set()
        for atom in self.lines:
            if resmap.heavy_reg.match(self.lines[atom]['name'].strip()):
                heavylist.append(atom)

        # For each heavy atom
        for x, atomx in enumerate(heavylist):
            for atomy in heavylist[x:]:
                # TODO: Check first residue number in pdb file
                indx = "%03d-%s" % (int(self.lines[atomx]['resSeq']),
                                    self.lines[atomx]['resName'].strip()), \
                       self.lines[atomx]['name'].strip()
                indy = "%03d-%s" % (int(self.lines[atomy]['resSeq']),
                                    self.lines[atomy]['resName'].strip()), \
                       self.lines[atomy]['name'].strip()
                coordx = (float(self.lines[atomx]['x']),
                          float(self.lines[atomx]['y']),
                          float(self.lines[atomx]['z']))
                coordy = (float(self.lines[atomy]['x']),
                          float(self.lines[atomy]['y']),
                          float(self.lines[atomy]['z']))

                dist = distance.euclidean(coordx, coordy)
                if indx[0] in list(resmap.index.get_level_values("residuex"))\
                        and indy[0] in list(resmap.index.get_level_values("residuex")):
550
                    LOG.debug("Update distance value (%s, %s)", indx, indy)
fabrice's avatar
fabrice committed
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
                    newmap.at[indx, indy] = dist
                    if sym:
                        # If symmetric matrix
                        newmap.at[indy, indx] = dist
                elif indx[0] not in list(resmap.index.get_level_values("residuex")):
                    error_list.add(indx[0])
                elif indy[0] not in list(resmap.index.get_level_values("residuex")):
                    error_list.add(indy[0])
        if error_list:
            # Listing related humanidx in the initial df
            idxlist = list(resmap.index.get_level_values("residuex"))
            erridx = [idx.split("-")[0] for idx in list(error_list)]
            missidx = list(set([idx for idx in idxlist
                                if idx.split("-")[0] in erridx]))
            for idx in missidx:
                newmap.loc[idx] = None
                if sym:
                    newmap.loc[:][idx] = None
569
            LOG.error("Can't update pdb distance map for pos in pdb file "
570
                      "%s with %s", list(error_list), missidx)
fabrice's avatar
fabrice committed
571
572
573
574
        return newmap


class DistanceMapFile(MapFile):
575
576
577
    """
    Distance matrix file
    """
fabrice's avatar
fabrice committed
578
579
580
581
582
    def __init__(self, filepath, filetype):
        super(MapFile).__init__(filepath, filetype)
        raise NotImplementedError

    def create_map(self, aa_seq, contactdef, **kwargs):
583
584
585
586
587
588
589
        """

        :param aa_seq:
        :param contactdef:
        :param kwargs:
        :return:
        """
fabrice's avatar
fabrice committed
590
591
592
593
        pass

    # Native dist
    def update_map(self, resmap):
594
595
596
597
598
        """

        :param resmap:
        :return:
        """
fabrice's avatar
fabrice committed
599
600
601
602
603
        pass
        # Construit map avec la liste de residus +  infos de distance du fichier
        # return DistanceMap


604
605
606
607
class ProtFileListReader(object):
    """
    List of file object
    """
fabrice's avatar
fabrice committed
608
609
610
611
612
    def __init__(self, cont_def=5.0):
        self.filelist = []
        self.contactdef = cont_def

    def clear(self):
613
614
615
616
        """
        Initiatise from scratch object
        :return:
        """
fabrice's avatar
fabrice committed
617
618
619
620
        # TODO: Init supprime bien les fichiers du cache ?
        self.__init__(self.contactdef)

    def add_file(self, filepathlist, filetypelist=None):
621
622
623
624
625
626
        """

        :param filepathlist:
        :param filetypelist:
        :return:
        """
fabrice's avatar
fabrice committed
627
628
629
630
631
632
        filepathlist = [filepathlist] if type(
            filepathlist) != list else filepathlist
        filetypelist = [filetypelist] if type(
            filetypelist) != list else filetypelist
        if not filetypelist or len(filepathlist) != len(filetypelist):
            filetypelist = [os.path.splitext(_)[1][1:] for _ in filepathlist]
633
634
        LOG.info("Reader focused on file(s) %s %s", filepathlist,
                 filetypelist)
fabrice's avatar
fabrice committed
635
636
637
        for i, filepath in enumerate(filepathlist):
            if os.path.exists(filepath):
                # TODO: check_type functionstr
638
                LOG.debug("Adding %s file to watchlist", filetypelist[i])
639
640
                if filetypelist[i].lower() == "pdb" and \
                        os.path.splitext(filepath)[1][1:] == "pdb":
fabrice's avatar
fabrice committed
641
642
643
644
                        self.filelist.append(PDBFile(filepath))
                else:
                    self.filelist.append(ContactMapFile(filepath,
                                                        filetypelist[i]))
645
646
647
648
                # else:
                #     self.filelist.append(DistanceMapFile(filepath,
                #                                          filetypelist[i]))
                # TODO: DistanceMapFile condition
fabrice's avatar
fabrice committed
649
                if not self.filelist[-1].regex:
650
                    LOG.warning("Can't read %s", filepath)
fabrice's avatar
fabrice committed
651
652
653
654
                    self.filelist.pop()

    def read(self, filepathlist, filetypelist=None, protein=None, scsc=None,
             **kwargs):
655
656
657
658
659
660
661
662
663
        """

        :param filepathlist:
        :param filetypelist:
        :param protein:
        :param scsc:
        :param kwargs:
        :return:
        """
fabrice's avatar
fabrice committed
664
665
666
667
668
        self.clear()
        self.add_file(filepathlist, filetypelist=filetypelist)
        for fo in self.filelist:
            fo.read(protein=protein, contactdef=self.contactdef,
                    scsc=scsc, **kwargs)