Commit 91ebaf02 authored by Fabrice  ALLAIN's avatar Fabrice ALLAIN
Browse files

Updated sspred parser in order to support ss3 file format

parent c6ef2b4f
......@@ -28,25 +28,49 @@ LOG = logging.getLogger(__name__)
# TODO: implement psipred format 3.5
class SsList(object):
"""Reader for secondary prediction structure files"""
psipred_reg = re.compile(r'^(?P<up_index>\d+)'
r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<ss_conf>\d?)')
psipred2_reg = re.compile(r'^(?P<ss_pred>[HEC]+)')
psipred3_reg = re.compile(r'^\s*(?P<up_index>\d+)'
types = {
'psipred': re.compile(r'^(?P<up_index>\d+)'
r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<dunno1>\d?\.?\d*)'
r'\s+(?P<dunno2>\d?\.?\d*)'
r'\s+(?P<dunno3>\d?\.?\d*)')
indxplus_reg = re.compile(
r'^(?P<up_index>\d+)\s+(?P<up_residue>[AC-IK-NP-TVWYZ])\s+'
r'(?P<ss_pred>[CEH])\s+(?P<ss_conf>\d)\s+(?P<msa_index>[\d\-]+)\s+'
r'(?P<msa_consper>[\d\-]+)\s+(?P<msa_cons>[*~\-])\s+'
r'(?P<in_const>[*~\-])\s+(?P<pdb_atom>[\d\-]+)\s+'
r'(?P<pdb_chain>[\-\w])\s+(?P<pdb_index>[\d\-]+\w?)\s+'
r'(?P<pdb_residue>[AC-IK-NP-TVWYZ\-])\s+(?P<pdb_x_pos>[\d.\-]+)\s+'
r'(?P<pdb_y_pos>[\d.\-]+)\s+(?P<pdb_z_pos>[\d\-.]+)')
r'\s+(?P<ss_conf>\d?)'),
'psipred2': re.compile(r'^(?P<up_index>\d+)'
r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<ss_conf>\d?)'),
'psipred3': re.compile(r'^(?P<up_index>\d+)'
r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<ss_conf>\d?)'),
'indextableplus': re.compile(r'^(?P<up_index>\d+)'
r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<ss_conf>\d?)'),
'ss3': re.compile(r'^\s*(?P<up_index>\d+)'
r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<h_conf>\d?\.?\d*)'
r'\s+(?P<e_conf>\d?\.?\d*)'
r'\s+(?P<c_conf>\d?\.?\d*)'),
}
# psipred_reg = re.compile(r'^(?P<up_index>\d+)'
# r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
# r'\s+(?P<ss_pred>[HEC])'
# r'\s+(?P<ss_conf>\d?)')
# psipred2_reg = re.compile(r'^(?P<ss_pred>[HEC]+)')
# psipred3_reg = re.compile(r'^\s*(?P<up_index>\d+)'
# r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
# r'\s+(?P<ss_pred>[HEC])'
# r'\s+(?P<h_conf>\d?\.?\d*)'
# r'\s+(?P<e_conf>\d?\.?\d*)'
# r'\s+(?P<c_conf>\d?\.?\d*)')
# indxplus_reg = re.compile(
# r'^(?P<up_index>\d+)\s+(?P<up_residue>[AC-IK-NP-TVWYZ])\s+'
# r'(?P<ss_pred>[CEH])\s+(?P<ss_conf>\d)\s+(?P<msa_index>[\d\-]+)\s+'
# r'(?P<msa_consper>[\d\-]+)\s+(?P<msa_cons>[*~\-])\s+'
# r'(?P<in_const>[*~\-])\s+(?P<pdb_atom>[\d\-]+)\s+'
# r'(?P<pdb_chain>[\-\w])\s+(?P<pdb_index>[\d\-]+\w?)\s+'
# r'(?P<pdb_residue>[AC-IK-NP-TVWYZ\-])\s+(?P<pdb_x_pos>[\d.\-]+)\s+'
# r'(?P<pdb_y_pos>[\d.\-]+)\s+(?P<pdb_z_pos>[\d\-.]+)')
ss_dist_reg = re.compile(r"\s+(\d+\.\d+) \( (\d+\.\d+)\)")
def __init__(self, sett):
......@@ -101,10 +125,45 @@ class SsList(object):
Returns
-------
"""
self.filetype = os.path.splitext(filename)[1][1:]
# TODO: check if given file is supported
filetype = os.path.splitext(filename)[1][1:]
filetype = filetype if 'txt' not in filetype else \
os.path.splitext(os.path.splitext(filename)[0])[1][1:]
LOG.info("Checking if file %s correspond to %s format", filename,
filetype)
# Check if given type is supported
# TODO: make only one file type checking (cf reader.MapFile)
# TODO: report this check into commands section
if os.stat(filename).st_size == 0:
LOG.warning("File %s is empty !", filename)
return [None] * 2
with open(filename) as infile:
# Check first and second line of file
for index, line in enumerate(infile):
if filetype in self.types:
match = self.types[filetype].match(line)
else:
LOG.error("Format %s not supported !", filetype)
match = None
if match:
LOG.info("Format type correct (%s)", filetype)
return filetype, self.types[filetype]
if index > 3:
LOG.warning("Given type do not correspond, checking if "
"the file corresponds to default format for "
"contact lists...")
for subformat in self.types:
if self.types.get(subformat).match(line):
LOG.info("Default format type found") \
if subformat != "empty" else \
LOG.warning("The file seems to be empty ...")
return subformat, self.types[subformat]
# Stop checking after second line
LOG.error("Can't read %s file.", filetype)
break
LOG.error("Wrong format type given ...")
return [None] * 2
def read(self, filename):
"""
......@@ -120,21 +179,42 @@ class SsList(object):
"""
self.check_filetype(filename)
sstype, ssregex = self.check_filetype(filename)
LOG.info("Reading secondary structure file %s [%s]",
filename, self.filetype)
# TODO: better read with getattr
if self.filetype == "indextableplus":
self.read_indextableplus(filename)
elif self.filetype == "ss2":
self.read_psipred(filename, ss2=True)
else:
self.read_psipred(filename)
filename, sstype)
self.ssdict = reg_load(ssregex, filename)
ss_index_dict = {'H': 1, 'C': 1, 'E': 1}
for line_id in sorted(self.ssdict.keys()):
# Modif champ ss_pred
# Si line_id
if line_id != min(self.ssdict.keys()) and \
self.ssdict[line_id]['ss_pred'] not in \
self.ssdict[line_id - 1]['ss_pred']:
# If next ss isn't the same, increment relative struct in
# ss_index_dict
ss_index_dict[self.ssdict[line_id - 1]['ss_pred'][0]] += 1
ss_pred = "".join((
self.ssdict[line_id]['ss_pred'],
str(ss_index_dict[self.ssdict[line_id]['ss_pred']])))
# TODO: change ssmatrix in order to accept h, e and c conf scores
score = self.ssdict[line_id].get('ss_conf') if \
self.ssdict[line_id].get('ss_conf') else \
self.ssdict[line_id].get(
self.ssdict[line_id].get('ss_pred', "").lower() +
'_conf', "")
self.ss_matrix.append([
self.ssdict[line_id].get('up_index', ""),
self.ssdict[line_id].get('up_residue', ""),
ss_pred,
score])
LOG.debug("Secondary structure list:\n%s\n"
"Secondary structure dict:\n%s", self.ss_matrix,
self.ssdict)
# TODO: remove in future updates
def read_psipred(self, filename, ss2=False):
"""
......@@ -194,9 +274,10 @@ class SsList(object):
psipred.write(str("> %s\n" % desc))
psipred.write(str("".join([_[0] for _ in zip(*self.ss_matrix)[2]])))
# TODO: remove in future updates
def read_indextableplus(self, filename):
"""
Parameters
----------
......@@ -345,7 +426,8 @@ class SsList(object):
class AminoAcidSequence(SequenceList.SequenceList, object):
"""Amino acid sequence"""
startres_reg = re.compile(r"^\s*residue\s+(?P<name>[A-Za-z]{1,4})", flags=re.I)
startres_reg = re.compile(r"^\s*residue\s+(?P<name>[A-Za-z]{1,4})",
flags=re.I)
end_reg = re.compile(r"^\s*end", flags=re.I)
restatement_reg = {
"atom": re.compile(r"atom\s+(?P<name>\w{1,4})\s*(?:.*?"
......@@ -366,8 +448,9 @@ class AminoAcidSequence(SequenceList.SequenceList, object):
r"\s+(?P<atm4>\w{1,4})", flags=re.I),
"dono": re.compile(r"dono\s+(?P<hatm>\w{1,4})\s+(?:(?P<heavatm>\w{1,"
r"4})|(?:\"\s*\"))?", flags=re.I),
"acce": re.compile(r"^\s*acce\s+(?P<accatm>\w{1,4})\s+(?:(?P<antatm>\w{1,"
r"4})|(?:\"\s*\"))", flags=re.I)
"acce": re.compile(
r"^\s*acce\s+(?P<accatm>\w{1,4})\s+(?:(?P<antatm>\w{1,"
r"4})|(?:\"\s*\"))", flags=re.I)
}
def __init__(self, topologyfile, *args, **kwargs):
......@@ -476,7 +559,8 @@ class AminoAcidSequence(SequenceList.SequenceList, object):
topo[resname][regid].append(match.groups())
else:
# Add dict
topo[resname][regid].append(match.groupdict())
topo[resname][regid].append(
match.groupdict())
break
LOG.debug("Topology used:\n%s", ppdict(topo))
return topo
......@@ -648,6 +732,7 @@ class Protein(object):
if __name__ == "__main__":
from .settings import AriaEcSettings
settings = AriaEcSettings("setup")
prot = Protein(settings)
prot.set_aa_sequence("../examples/data/BPT1_BOVIN.fa")
......
......@@ -411,14 +411,14 @@ class MapFile(RegexFile):
"the file corresponds to default format for "
"contact lists...")
for subformat in self.unknown:
if self.types.get(subformat)["regex"].match(line):
if self.unknown.get(subformat)["regex"].match(line):
LOG.info("Default format type found") \
if subformat != "empty" else \
LOG.warning("The file seems to be empty ...")
return [
self.types[subformat].get("regex"),
self.unknown[subformat].get("regex"),
self.filetype,
self.types[subformat].get("score_field")
self.unknown[subformat].get("score_field")
]
# Stop checking after second line
LOG.error("Can't read %s file.", self.filetype)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment