diff --git a/ariaec/__init__.py b/ariaec/__init__.py index 139597f9cb07c5d48bed18984ec4747f4b4f3438..697ee03ec6e1fa96182fd9268b0a0324de2228c5 100644 --- a/ariaec/__init__.py +++ b/ariaec/__init__.py @@ -1,2 +1,3 @@ - - +""" + Aria EC module +""" diff --git a/ariaec/__init__.pyc b/ariaec/__init__.pyc index cc787257a928f0d8da01c0067a690129bc1b5871..aaad1100aa43704a4c4d15ca44c9edcec7c634b0 100644 Binary files a/ariaec/__init__.pyc and b/ariaec/__init__.pyc differ diff --git a/ariaec/base.py b/ariaec/base.py index 0705509986bb54b0e8dd472b5c775a6f9c77bfe0..ce0501d6e5db92c78a02bc77aaa1d499c460b166 100644 --- a/ariaec/base.py +++ b/ariaec/base.py @@ -102,9 +102,9 @@ def format_str(string): :param string: :return: """ - if re.search(r"^\s*true\s*$", string, re.I): + if re.search(r"^\s*(true|yes)\s*$", string, re.I): return True - elif re.search(r"^\s*false\s*$", string, re.I): + elif re.search(r"^\s*(false|no)\s*$", string, re.I): return False elif re.search(r"^\s*\d+\s*$", string): return int(string) @@ -114,7 +114,7 @@ def format_str(string): return string.split(',') elif "+" in string: return string.split('+') - elif re.search(r"[/\\w]+", string): + elif re.search(r"[/\w]+", string): return string else: if string: @@ -133,12 +133,23 @@ def format_str(string): def format_dict(indict): + """ + + :param indict: + :return: + """ for key in indict: indict[key] = format_str(indict[key]) return indict def ppdict(indict, indent=2): + """ + + :param indict: + :param indent: + :return: + """ return json.dumps(indict, indent=indent) @@ -197,6 +208,12 @@ class CustomLogging: default_file = "conf/logging.json" def __init__(self, level=logging.INFO, desc=None): + """ + + :param level: + :param desc: + :return: + """ # TODO: detect path log filenames and makedirs if not exists logging.basicConfig(level=level) if desc: @@ -206,6 +223,11 @@ class CustomLogging: self.config = self.default_config() def update_msg(self, desc): + """ + + :param desc: + :return: + """ if type(self.msg) == list: self.msg += desc self.msg = " - ".join(self.msg) @@ -213,6 +235,10 @@ class CustomLogging: self.msg = " - ".join((self.msg, desc.capitalize())) def default_config(self): + """ + + :return: + """ # with open(self.default_file, 'rt') as f: with pkgr.resource_stream(__name__, self.default_file) as f: config = json.load(f) @@ -237,6 +263,10 @@ class CustomLogging: logging.config.dictConfig(self.config) def welcome(self): + """ + + :return: + """ desc = ''' ================================================================================ @@ -249,6 +279,10 @@ class CustomLogging: class Capturing(list): def __enter__(self): + """ + + :return: + """ # Stock default stdout and redirect current stdout to this class self._stdout = sys.stdout # All print calls are saved into self._stringio @@ -256,6 +290,11 @@ class Capturing(list): return self def __exit__(self, *args): + """ + + :param args: + :return: + """ self.extend("\n".join(self._stringio.getvalue().splitlines())) sys.stdout = self._stdout diff --git a/ariaec/base.pyc b/ariaec/base.pyc index 005803ab55abad265e668c20803035f1de32b0a7..6ba57492ff0c8c8321e0c1a639df8d983482e33f 100644 Binary files a/ariaec/base.pyc and b/ariaec/base.pyc differ diff --git a/ariaec/econtacts.py b/ariaec/econtacts.py index a130f0779f69bf3ffd07cb3577b0b69f1370898b..66206d8ff495a93bc165bebfbdf37491e3aca783 100644 --- a/ariaec/econtacts.py +++ b/ariaec/econtacts.py @@ -3,13 +3,13 @@ """ from __future__ import absolute_import, division, print_function -import math -import re -import numpy as np -import pandas as pd +# import re +# import math +# import numpy as np +# import pandas as pd from collections import namedtuple -from .base import cart_dist +# from .base import cart_dist Contact = namedtuple("Contact", ["resid1", "resid2"]) @@ -18,154 +18,154 @@ Atom = namedtuple("Atom", ["name", "coords"]) # BBcontacts scripts # # ---------------------------------------------------------------------------- # -def diversityvalue(msa, l): - """ - Compute diversityvalue for bbcontacts.py : sqrt(n)/L where n is the - number of sequence in MSA and L the protein length - :param msa: filepath (fasta format) - :param l: protein length - :return: diversityvalue (float) - """ - - # Compute n value - msa_reg = re.compile(r"^>[A-Za-z0-9]+_[A-Za-z0-9]+") - n = 0 - - with open(msa) as f: - for index, line in enumerate(f): - match = msa_reg.match(line) - if match and "deselect" not in line: - n += 1 - - return math.sqrt(n) / float(l) - - -def indexup_bbcontact(bbcontact_dict, startind, out, prefix): - """ - Index update if startind of coupling matrix is not eq 0 - :param out: - :param prefix: - :param bbcontact_dict: - :param startind: - :return: - """ - up = int(startind) - 1 - col = ('identifier', 'diversity', 'direction', 'viterbiscore', - 'indexpred', 'state', 'res1', 'res2') - with open('%s/%s.filteredoutput_up.txt' % (out, prefix), 'w') as f: - f.write("#{:>14}\t{:>10}\t{:>15}\t{:>15}\t{:>10}\t{:>10}\t{:>5}\t{" - ":>5}".format(*col)) - - for contact in bbcontact_dict: - resid1 = int(bbcontact_dict[contact]['res1_nb']) - resid2 = int(bbcontact_dict[contact]['res2_nb']) - bbcontact_dict[contact]['res1'] = resid1 + up - bbcontact_dict[contact]['res1_nb'] = resid1 + up - bbcontact_dict[contact]['res2'] = resid2 + up - bbcontact_dict[contact]['res2_nb'] = resid2 + up - line = [bbcontact_dict[contact][x] for x in col] - f.write("\n{:>15}\t{:>10}\t{:>15}\t{:>15}\t{:>10}\t{:>10}\t{" - ":>5}\t{:>5}".format(*line)) - - -def contact_insertion(contacts_dict, insert_list, insertvalue, seq): - """ - Insert - :param contacts_dict: - :param insert_list: - :param insertvalue: - :param seq: - :return: - """ - maxind = max(contacts_dict.keys()) - for insert in insert_list: - if insert not in contacts_dict: - contacts_dict[insert] = {'resname': seq[insert - 1]} - tmp = {} - # Init all contacts related to insert - for contact in range(insert + 1, maxind + 1): - tmp[contact] = float(insertvalue) - contacts_dict[insert].update(tmp) +# def diversityvalue(msa, l): +# """ +# Compute diversityvalue for bbcontacts.py : sqrt(n)/L where n is the +# number of sequence in MSA and L the protein length +# :param msa: filepath (fasta format) +# :param l: protein length +# :return: diversityvalue (float) +# """ +# +# Compute n value +# msa_reg = re.compile(r"^>[A-Za-z0-9]+_[A-Za-z0-9]+") +# n = 0 +# +# with open(msa) as f: +# for index, line in enumerate(f): +# match = msa_reg.match(line) +# if match and "deselect" not in line: +# n += 1 +# +# return math.sqrt(n) / float(l) + + +# def indexup_bbcontact(bbcontact_dict, startind, out, prefix): +# """ +# Index update if startind of coupling matrix is not eq 0 +# :param out: +# :param prefix: +# :param bbcontact_dict: +# :param startind: +# :return: +# """ +# up = int(startind) - 1 +# col = ('identifier', 'diversity', 'direction', 'viterbiscore', +# 'indexpred', 'state', 'res1', 'res2') +# with open('%s/%s.filteredoutput_up.txt' % (out, prefix), 'w') as f: +# f.write("#{:>14}\t{:>10}\t{:>15}\t{:>15}\t{:>10}\t{:>10}\t{:>5}\t{" +# ":>5}".format(*col)) +# +# for contact in bbcontact_dict: +# resid1 = int(bbcontact_dict[contact]['res1_nb']) +# resid2 = int(bbcontact_dict[contact]['res2_nb']) +# bbcontact_dict[contact]['res1'] = resid1 + up +# bbcontact_dict[contact]['res1_nb'] = resid1 + up +# bbcontact_dict[contact]['res2'] = resid2 + up +# bbcontact_dict[contact]['res2_nb'] = resid2 + up +# line = [bbcontact_dict[contact][x] for x in col] +# f.write("\n{:>15}\t{:>10}\t{:>15}\t{:>15}\t{:>10}\t{:>10}\t{" +# ":>5}\t{:>5}".format(*line)) + + +# def contact_insertion(contacts_dict, insert_list, insertvalue, seq): +# """ +# Insert +# :param contacts_dict: +# :param insert_list: +# :param insertvalue: +# :param seq: +# :return: +# """ +# maxind = max(contacts_dict.keys()) +# for insert in insert_list: +# if insert not in contacts_dict: +# contacts_dict[insert] = {'resname': seq[insert - 1]} +# tmp = {} +# Init all contacts related to insert +# for contact in range(insert + 1, maxind + 1): +# tmp[contact] = float(insertvalue) +# contacts_dict[insert].update(tmp) # Contacts scripts # # ---------------------------------------------------------------------------- # -def pdb_to_contact(pdbpath, threshold): - """ - List of contacts inside given pdb file - :param threshold: - :author: bardiaux - :param pdbpath: - :return: - """ - mapy = [] - descmap = {} - - with open(pdbpath, 'r') as pdb: - coords = {} - - while 1: - - l = pdb.readline() - - if not l: - break - - elif l.startswith('ATOM'): - # Foreach atom - - fields = l - atom = fields[12:16].strip() - resid = int(fields[23:26].strip()) - - if atom[0] != "H": - - coords.setdefault(resid, []) - coord = np.array([float(a) for a in [fields[30:38], - fields[38:46], - fields[46:54]]]) - coords[resid].append(Atom(name=atom, coords=coord)) - - k = coords.keys() - k.sort() # sorted resid list - - for ii in range(0, len(k)): - # Foreach res i - i = k[ii] - at1 = coords[i] # (x, y, z) coords list (N, CA, C, ...) - for jj in range(ii, len(k)): - # foreach res j (i+1, i+2, ...) - j = k[jj] - found = False - at2 = coords[j] - - for a in at1: - # foreach atm of res i - for b in at2: - # foreach atm of res j - if not found and cart_dist(a.coords, b.coords) < \ - float( - threshold): - # If no contact found between 2 res and cart dist < t - found = True - if (i, j) not in mapy: - # If contact not already found - mapy.append((i, j)) - descmap[Contact(resid1=i, resid2=j)] = {"dist": "%.2f" % cart_dist(a.coords, b.coords), - "atoms": (a.name, b.name)} - - return mapy, descmap - - -def contact2dataframe(contacts, refseq): - dim = len(refseq) - matrix = np.zeros(shape=(dim, dim)) - matrix = pd.DataFrame(matrix) - # print matrix - # print contacts - for contact in contacts: - # Contactmatrix humanidx starts at 0 and input contacts at 1 - matrix.iloc[int(contact[0]) - 1, int(contact[1]) - 1] = 1 - matrix.iloc[int(contact[1]) - 1, int(contact[0]) - 1] = 1 - - return matrix +# def pdb_to_contact(pdbpath, threshold): +# """ +# List of contacts inside given pdb file +# :param threshold: +# :author: bardiaux +# :param pdbpath: +# :return: +# """ +# mapy = [] +# descmap = {} +# +# with open(pdbpath, 'r') as pdb: +# coords = {} +# +# while 1: +# +# l = pdb.readline() +# +# if not l: +# break +# +# elif l.startswith('ATOM'): +# Foreach atom +# +# fields = l +# atom = fields[12:16].strip() +# resid = int(fields[23:26].strip()) +# +# if atom[0] != "H": +# +# coords.setdefault(resid, []) +# coord = np.array([float(a) for a in [fields[30:38], +# fields[38:46], +# fields[46:54]]]) +# coords[resid].append(Atom(name=atom, coords=coord)) +# +# k = coords.keys() +# k.sort() # sorted resid list +# +# for ii in range(0, len(k)): +# Foreach res i +# i = k[ii] +# at1 = coords[i] # (x, y, z) coords list (N, CA, C, ...) +# for jj in range(ii, len(k)): +# foreach res j (i+1, i+2, ...) +# j = k[jj] +# found = False +# at2 = coords[j] +# +# for a in at1: +# foreach atm of res i +# for b in at2: +# foreach atm of res j +# if not found and cart_dist(a.coords, b.coords) < \ +# float( +# threshold): +# If no contact found between 2 res and cart dist < t +# found = True +# if (i, j) not in mapy: +# If contact not already found +# mapy.append((i, j)) +# descmap[Contact(resid1=i, resid2=j)] = {"dist": "%.2f" % cart_dist(a.coords, b.coords), +# "atoms": (a.name, b.name)} +# +# return mapy, descmap + + +# def contact2dataframe(contacts, refseq): +# dim = len(refseq) +# matrix = np.zeros(shape=(dim, dim)) +# matrix = pd.DataFrame(matrix) +# print matrix +# print contacts +# for contact in contacts: +# Contactmatrix humanidx starts at 0 and input contacts at 1 +# matrix.iloc[int(contact[0]) - 1, int(contact[1]) - 1] = 1 +# matrix.iloc[int(contact[1]) - 1, int(contact[0]) - 1] = 1 +# +# return matrix diff --git a/ariaec/econverter.pyc b/ariaec/econverter.pyc index f40bc0ebb91b323884eae2aaff8222fe42f6584e..cb31273d5e9ba6ae64853689ff210effa8750620 100644 Binary files a/ariaec/econverter.pyc and b/ariaec/econverter.pyc differ diff --git a/ariaec/protein.py b/ariaec/protein.py index de9efd87df34ae9c256a4b24765a764879f5c2c7..d0802bcac3646b4c293e93bcbce6c2adf78273b3 100644 --- a/ariaec/protein.py +++ b/ariaec/protein.py @@ -13,8 +13,6 @@ import aria.legacy.SequenceList as SequenceList import aria.legacy.AminoAcid as AmnAcd from .base import (reg_load, Capturing, ppdict) # import skbio.Protein as skprot - - # TODO: interface skbio ?? @@ -38,6 +36,11 @@ class SsList: ss_dist_reg = re.compile(r"\s+(\d+\.\d+) \( (\d+\.\d+)\)") def __init__(self, sett): + """ + + :param sett: + :return: + """ self.settings = sett self.ss_matrix = [] self.ssdict = {} @@ -46,6 +49,10 @@ class SsList: @property def index(self): + """ + + :return: + """ if self.ss_matrix: # Assuming human idx (1 ...) correspond to ss_matrix first column !!! return [int(_) for _ in zip(*self.ss_matrix)[0]] @@ -53,10 +60,21 @@ class SsList: return [] def check_filetype(self, filename): + """ + + :param filename: + :return: + """ self.filetype = os.path.splitext(filename)[1][1:] # TODO: check if given file is supported def read(self, filename, sequence=''): + """ + + :param filename: + :param sequence: + :return: + """ self.check_filetype(filename) logger.info("Reading secondary structure file %s [%s]" % ( filename, @@ -73,6 +91,11 @@ class SsList: self.seq_sublist(sequence) def read_psipred(self, filename): + """ + + :param filename: + :return: + """ self.ssdict = reg_load(self.psipred_reg, filename) # TODO: supprimer psipred_list dans les futures implementations @@ -95,11 +118,22 @@ class SsList: self.ssdict[line_id]['ss_conf']]) def write_ssfasta(self, filename, desc="pdbid"): + """ + + :param filename: + :param desc: + :return: + """ with open(filename, 'w') as psipred: psipred.write("> %s\n" % desc) psipred.write("".join([_[0] for _ in zip(*self.ss_matrix)[2]])) def read_indextableplus(self, filename): + """ + + :param filename: + :return: + """ c = 0 error_list = [] ss_index_dict = {'H': 1, 'C': 1, 'E': 1} @@ -134,6 +168,12 @@ class SsList: str(error_list))) def _read_ssdist(self, infile, filename=''): + """ + + :param infile: + :param filename: + :return: + """ logger.info("Reading distance file {0}".format(filename)) c = 0 atom_list = [] @@ -228,6 +268,13 @@ class AminoAcidSequence(SequenceList.SequenceList): } def __init__(self, topologyfile, *args, **kwargs): + """ + + :param topologyfile: + :param args: + :param kwargs: + :return: + """ SequenceList.SequenceList.__init__(self, *args, **kwargs) self._topfile = topologyfile self._topology = None @@ -236,6 +283,10 @@ class AminoAcidSequence(SequenceList.SequenceList): @property def humanidx(self): + """ + + :return: + """ return range(1, len(self.sequence) + 1) @property @@ -254,13 +305,26 @@ class AminoAcidSequence(SequenceList.SequenceList): return self._topology def __getitem__(self, key): + """ + + :param key: + :return: + """ return self.aalist[key] def __repr__(self): + """ + + :return: + """ return "Amino Acid sequence %s (%d)" % (self.sequence, len(self.sequence)) def readtopo(self): + """ + + :return: + """ topo = {} with pkgr.resource_stream(__name__, self._topfile) as tpf: res_flag = False @@ -302,6 +366,11 @@ class AminoAcidSequence(SequenceList.SequenceList): return topo def read(self, filename): + """ + + :param filename: + :return: + """ # TODO: smarter reader checking type of file (fasta, etc ...) with Capturing() as output: self.ReadFasta(filename) @@ -315,6 +384,11 @@ class AminoAcidSequence(SequenceList.SequenceList): class Protein: def __init__(self, sett): + """ + + :param sett: + :return: + """ self.aa_sequence = AminoAcidSequence(sett.TOPO) self.sec_struct = SsList(sett) self.index = [] # Index starting from 1 @@ -345,9 +419,19 @@ class Protein: @property def topology(self): + """ + + :return: + """ return self.aa_sequence.topology def set_aa_sequence(self, filename, ssidx=False): + """ + + :param filename: + :param ssidx: + :return: + """ self.aa_sequence.read(filename) self.index = range(1, len(self.aa_sequence.sequence) + 1) if self.sec_struct.ss_matrix: @@ -359,6 +443,13 @@ class Protein: self.sec_struct.index) def set_sec_struct(self, filename, ssdist_filename='', ssidx=False): + """ + + :param filename: + :param ssdist_filename: + :param ssidx: + :return: + """ # TODO: Add test checking if both amino acid and sec_struct sequence # have the same length after seq_sublist call # Read secondary structure prediction file @@ -376,6 +467,11 @@ class Protein: self.sec_struct.index) def write_seq(self, outfile): + """ + + :param outfile: + :return: + """ with Capturing() as output: self.aa_sequence.WriteSeq(outfile) logger.info(''.join(output)) diff --git a/ariaec/protein.pyc b/ariaec/protein.pyc index d1705b8cec306250c959a81e7042ce8c4a265b99..bb3ccaa9234ccfc7e5c497e3221e2035e56ec577 100644 Binary files a/ariaec/protein.pyc and b/ariaec/protein.pyc differ diff --git a/ariaec/protmap.pyc b/ariaec/protmap.pyc index e62ac3d42b8bf007616fbbb32a42b4105c14abac..1491a80fcb6c161b11f21ee12cae5c2610f3ec42 100644 Binary files a/ariaec/protmap.pyc and b/ariaec/protmap.pyc differ diff --git a/ariaec/reader.py b/ariaec/reader.py index 6065ef7d8eb6540ffcfe7e0cf75561c4ad658814..62a14d39dee39989899c60d11470177b3bd2b8fe 100644 --- a/ariaec/reader.py +++ b/ariaec/reader.py @@ -183,6 +183,12 @@ class MapFile(RegexFile): check_type = True def __init__(self, *args, **kwargs): + """ + + :param args: + :param kwargs: + :return: + """ super(MapFile, self).__init__(*args, **kwargs) if self.check_type: self.regex, self.filetype, self.sort = self.check_maptype() @@ -199,6 +205,16 @@ class MapFile(RegexFile): def create_map(self, protein, contactdef, flaglist=None, offset=0, sym=True, **kwargs): + """ + + :param protein: + :param contactdef: + :param flaglist: + :param offset: + :param sym: + :param kwargs: + :return: + """ raise NotImplementedError("Class %s doesn't implement create_map" % self.__class__.__name__) @@ -215,10 +231,19 @@ class MapFile(RegexFile): "names in contact file at line %d !" % line) def update_map(self, resmap): + """ + + :param resmap: + :return: + """ raise NotImplementedError("Class %s doesn't implement update_map" % self.__class__.__name__) def check_maptype(self): + """ + + :return: + """ logger.info("Checking format for file %s" % self.filepath) # Check if given type is supported # TODO: report this check into commands section @@ -279,6 +304,14 @@ class MapFile(RegexFile): def read(self, protein=None, contactdef=5.0, groupby_method="min", scsc=None): + """ + + :param protein: + :param contactdef: + :param groupby_method: + :param scsc: + :return: + """ logger.info("Reading %s file" % self.filepath) if self.filetype: # Read file with regex related to filetype @@ -339,6 +372,12 @@ class MapFile(RegexFile): class ContactMapFile(MapFile): # "plmdca", "evfold", "bbcontacts", "pconsc", "gremlin", "metapsicov", def __init__(self, filepath, filetype): + """ + + :param filepath: + :param filetype: + :return: + """ super(self.__class__, self).__init__(filepath, filetype) def update_map(self, resmap): diff --git a/ariaec/reader.pyc b/ariaec/reader.pyc index 54d903a53dd0ac6cbfe5622a16d372710a27c4ef..cbaf7ae40f5c54325f1073688eef100c73a386b0 100644 Binary files a/ariaec/reader.pyc and b/ariaec/reader.pyc differ diff --git a/ariaec/setup.py b/ariaec/setup.py index f6e830fb8a379481162a7cd37c8244f543114d47..b357532b253facbbcf6191bb0e5eedeef7ad5a5f 100644 --- a/ariaec/setup.py +++ b/ariaec/setup.py @@ -21,6 +21,11 @@ logger = logging.getLogger(__name__) class AriaEcSetup: def __init__(self, settings): + """ + + :param settings: + :return: + """ # TODO: check_type settings (AriaEcSettings) self.settings = settings self.protein = Protein(settings) @@ -34,6 +39,10 @@ class AriaEcSetup: self.converter = AriaEcXMLConverter(settings) def run(self): + """ + + :return: + """ # Check input logger.debug("Settings:\n" + json.dumps(self.settings.setup.config, indent=4)) @@ -162,6 +171,10 @@ class AriaEcSetup: self.write_optional_files() def write_optional_files(self): + """ + + :return: + """ # Indextableplus file (submatrix) # Contacts_refined.out for maptype in self.allresmap: diff --git a/ariaec/test/__init__.py b/ariaec/test/__init__.py index 43decae88a2205964a8a7881f0409802584ece80..0852314d099c329d1702dc8929831706d9a912cc 100644 --- a/ariaec/test/__init__.py +++ b/ariaec/test/__init__.py @@ -1 +1,3 @@ -__author__ = 'raionic' +""" + ARIA EC test module +"""