Commit a63afa91 authored by Fabrice  ALLAIN's avatar Fabrice ALLAIN
Browse files

Redefined several commands in order to be more explicit

parent ff1880f5
......@@ -13,7 +13,8 @@ from glob import glob
from collections import OrderedDict
from Bio.PDB import PDBParser, PDBIO
from aria.AriaXML import AriaXMLPickler
from .econverter import AriaEcXMLConverter
from aria.SuperImposer import SuperImposer
from .converter import AriaEcXMLConverter
from .base import NotDisordered, Capturing
from aria.DataContainer import DATA_SEQUENCE
from aria.StructureEnsemble import StructureEnsemble, StructureEnsembleSettings
......@@ -22,70 +23,16 @@ from aria.StructureEnsemble import StructureEnsemble, StructureEnsembleSettings
LOG = logging.getLogger(__name__)
class PDBViol(object):
class EnsembleAnalysis(object):
"""
Violation analysis of distance restraints on structure ensemble
ARIA extended ensemble analysis
"""
def __init__(self, settings):
self.settings = settings
def run(self):
"""
Execute Violation analysis
"""
# Args
project_path = self.settings.pdbviol.args["project"]
# restraints_path = self.settings.pdbviol.args["restraints"]
native_path = self.settings.pdbviol.args.get("ref")
out_path = self.settings.pdbviol.args["output_directory"]
out_file = os.path.join(out_path, 'violations.csv')
list_name = self.settings.pdbviol.args["listname"]
# Aria objects
pickler = AriaXMLPickler()
project = pickler.load(project_path)
molecule_path = project.getData(DATA_SEQUENCE)[0].getLocation()[0]
molecule = pickler.load(molecule_path)
# If we are at the first iteration, we select the related ensemble and
# restraints, otherwise we take the ensemble from the previous iteration
iteration_path = self.settings.pdbviol.args["iteration"]
iteration_id = int(re.search(
'[0-9]+', os.path.basename(iteration_path)).group(0))
if iteration_id != 0:
iteration_path = os.path.join(
os.path.dirname(iteration_path), "it%d" % (iteration_id - 1))
LOG.info("Violation analysis will be done on restraints and "
"ensemble from it%d with violation criteria of it%d",
(iteration_id - 1), iteration_id)
if not os.path.exists(iteration_path):
LOG.error("Can not found previous iteration (%s)",
iteration_path)
restraints = glob(os.path.join(iteration_path, '*restraints.xml'))
if not restraints:
# Load tbl restraints and convert them into aria xml format
restraints = glob(os.path.join(iteration_path, '*.tbl'))
restraints = AriaEcXMLConverter.tbl2xml(
iteration_path, molecule_path, restraints, list_name)
restraints = [pickler.load(restraint).restraint
for restraint in restraints]
# Setup
protein_id = project.getSettings()['name']
nbest = project.getProtocol().getIterationSettings(iteration_id)[
"number_of_best_structures"]
sort_crit = project.getProtocol().getIterationSettings(iteration_id)[
"sort_criterion"]
cutoff = project.getProtocol().getIterationSettings(iteration_id)[
"violation_analyser_settings"]["violation_tolerance"]
se_settings = StructureEnsembleSettings()
se_settings['sort_criterion'] = sort_crit
se_settings['number_of_best_structures'] = nbest
@staticmethod
def _get_pdblist(iteration_path):
# Get the list of all the generated structures in iteration path if no
# clustering. Otherwise, get the list of the lowest energy cluster
if os.path.exists(os.path.join(iteration_path, "report.clustering")):
......@@ -100,56 +47,38 @@ class PDBViol(object):
# no clustering, pdb list correspond to all generated structures
LOG.info("No cluster found in this iteration, compute analysis for"
"iteration ensemble")
list_of_pdb = [[
foo for foo in glob(os.path.join(iteration_path, "*.pdb"))
if not os.path.basename(foo).startswith('fitted_')]]
list_of_pdb = [
[foo for foo in glob(os.path.join(iteration_path, "*.pdb"))
if not os.path.basename(foo).startswith('fitted_')]]
LOG.debug("Lists of structures:\n%s", "\n".join(
[str(_) for _ in list_of_pdb]))
return list_of_pdb
# Read structure ensembles
LOG.info("Reading structure ensemble(s)")
with Capturing() as output:
ensembles = [
StructureEnsemble(se_settings) for _ in list_of_pdb]
[ensemble.read(list_of_pdb[i], molecule, format='cns')
for i, ensemble in enumerate(ensembles)]
[ensemble.sort()
for i, ensemble in enumerate(ensembles)]
if output:
LOG.info(output)
# Get the lowest energy ensemble
LOG.info("Sorting structure ensemble(s) with %s criteria", sort_crit)
energies = np.array([
np.mean([d['total_energy'] for d in ens.getInfo()[:, 1]][:nbest])
if len(ens) >= nbest else None for ens in ensembles], dtype=np.float)
ensemble = ensembles[np.argmin(energies)]
# Get native structure
# # Issue with several pdb files which have the same residue_number in
# atm and hetatm sections ... We remove them with bio pdb
dists_ref = None
if native_path:
LOG.info("Reading native structure")
logging.captureWarnings(True)
native = PDBParser().get_structure(protein_id, native_path)
native_path = os.path.join(out_path, protein_id + "_ordered.native.pdb")
io = PDBIO()
io.set_structure(native)
io.save(native_path, select=NotDisordered())
native = StructureEnsemble(se_settings)
native.display_warnings = 0
native.read([native_path], molecule, format='cns')
dists_ref = native.getDistances
dists = ensemble.getDistances
@staticmethod
def violation_analysis(project, iteration_id, restraints, dists, out_file,
dists_ref=None, headerflag=True):
"""
headerflag = True
Parameters
----------
project
iteration_id
restraints
dists
out_file
dists_ref
headerflag
LOG.info("Writing violation csv file")
Returns
-------
"""
protein_id = project.getSettings()['name']
nbest = project.getProtocol().getIterationSettings(iteration_id)[
"number_of_best_structures"]
cutoff = project.getProtocol().getIterationSettings(iteration_id)[
"violation_analyser_settings"]["violation_tolerance"]
with open(out_file, 'w') as out:
for restraintlist in restraints:
......@@ -272,4 +201,167 @@ class PDBViol(object):
out.write("\n" + ",".join(["%s" % output[x][k]
for k in output[x].keys()]))
LOG.info("PDBviol finished")
LOG.info("Wrote %s file", out_file)
@staticmethod
def pca_projection(iter_dir, ensemble, molecule, atmask="CA"):
"""
Parameters
----------
iter_dir
ensemble
molecule
atmask
Returns
-------
"""
mask = [a.getId() for c in molecule.get_chains() for r in c.getResidues()
for a in r.getAtoms() if a.getName() == atmask]
# Align all structures on ca backbone
si = SuperImposer(ensemble, molecule)
si.getSettings()['number_of_best_structures'] = 'all'
si._fit(mask)
fitcoords = si.getFittedCoordinates()
fitcoords = np.take(fitcoords, mask, axis=1)
ns, na, xyz = fitcoords.shape
# Change the shape of coords matrix in order to use pca, kmeans, ...
fitcoords.shape = ns, na * xyz
ensemble.getSettings()['number_of_best_structures'] = 15
# return fitcoords, infos
def run(self):
"""
Execute Violation analysis
"""
# Args
project_path = self.settings.analysis.args["project"]
# restraints_path = self.settings.analysis.args["restraints"]
native_path = self.settings.analysis.args.get("ref")
out_path = self.settings.analysis.args["output_directory"]
list_name = self.settings.analysis.args["listname"]
# Aria objects
pickler = AriaXMLPickler()
project = pickler.load(project_path)
molecule_path = project.getData(DATA_SEQUENCE)[0].getLocation()[0]
molecule = pickler.load(molecule_path)
# If we are at the first iteration, we select the related ensemble and
# restraints, otherwise we take the ensemble from the previous iteration
iteration_path = self.settings.analysis.args["iteration"]
iteration_id = int(re.search(
'[0-9]+', os.path.basename(iteration_path)).group(0))
if iteration_id != 0:
iteration_path = os.path.join(
os.path.dirname(iteration_path), "it%d" % (iteration_id - 1))
LOG.info("Ensemble analysis will be done on restraints and "
"ensemble from it%d with violation criteria of it%d",
(iteration_id - 1), iteration_id)
if not os.path.exists(iteration_path):
LOG.error("Can not found previous iteration (%s)",
iteration_path)
restraints = glob(os.path.join(iteration_path, '*restraints.xml'))
if not restraints:
# Load tbl restraints and convert them into aria xml format
restraints = glob(os.path.join(iteration_path, '*.tbl'))
restraints = AriaEcXMLConverter.tbl2xml(
iteration_path, molecule_path, restraints, list_name)
restraints = [pickler.load(restraint).restraint
for restraint in restraints]
# Setup
protein_id = project.getSettings()['name']
nbest = project.getProtocol().getIterationSettings(iteration_id)[
"number_of_best_structures"]
sort_crit = project.getProtocol().getIterationSettings(iteration_id)[
"sort_criterion"]
se_settings = StructureEnsembleSettings()
se_settings['sort_criterion'] = sort_crit
se_settings['number_of_best_structures'] = nbest
# Get list of pdb related to structure ensemble(s)
list_of_pdb = self._get_pdblist(iteration_path)
# Read structure ensembles
LOG.info("Reading structure ensemble(s)")
with Capturing() as output:
ensembles = [
StructureEnsemble(se_settings) for _ in list_of_pdb]
[ensemble.read(list_of_pdb[i], molecule, format='cns')
for i, ensemble in enumerate(ensembles)]
[ensemble.sort()
for i, ensemble in enumerate(ensembles)]
if output:
LOG.info(output)
# Get the lowest energy ensemble
LOG.info("Sorting structure ensemble(s) with %s criteria", sort_crit)
energies = np.array([
np.mean([d['total_energy'] for d in ens.getInfo()[:, 1]][:nbest])
if len(ens) >= nbest else None for ens in ensembles], dtype=np.float)
ensemble = ensembles[np.argmin(energies)]
# Get native structure
# # Issue with several pdb files which have the same residue_number in
# atm and hetatm sections ... We remove them with bio pdb
dists_ref = None
if native_path:
LOG.info("Reading native structure")
logging.captureWarnings(True)
native = PDBParser().get_structure(protein_id, native_path)
native_path = os.path.join(out_path, protein_id + "_ordered.native.pdb")
io = PDBIO()
io.set_structure(native)
io.save(native_path, select=NotDisordered())
native = StructureEnsemble(se_settings)
native.display_warnings = 0
native.read([native_path], molecule, format='cns')
dists_ref = native.getDistances
# We get here the distance of ALL THE STRUCTURES IN THE CLUSTER for viol
# analysis
dists = ensemble.getDistances
LOG.info("Violation analysis")
out_file = os.path.join(out_path, 'violations.csv')
self.violation_analysis(project, iteration_id, restraints, dists,
out_file, dists_ref=dists_ref)
infos = [inf for inf in ensemble.getInfo()]
print(len(infos))
print(len(list_of_pdb[0]))
clustlists = [open(listpath).readlines() for listpath in
glob(os.path.join(iteration_path, '*_clust*.list'))]
# with open(os.path.join(iter_dir,
# "analysis/pyfit/accuracydssp/RMSD.dat")) as rmsdfile:
# accdssp = {key: float(value) for key, value in
# [line.split() for line in rmsdfile if
# re.search("pdb", line)]}
# Add clust label key for each pdb struct
# [info[1].update({'clust': idx}) for info in infos for idx, clustlist in
# enumerate(clustlists) if filter(re.compile(info[0]).match, clustlist)]
# [info[1].update({'ensemble': False})
# for info in infos]
# [info[1].update({'ensemble': True})
# for info in infos
# for clustlist in [clustlist[0:15] for clustlist in clustlists] if
# filter(re.compile(info[0]).match, clustlist)]
# [info[1].update(
# {'accdssp': accdssp.get(os.path.basename(info[0]), None)})
# for info in infos]
\ No newline at end of file
......@@ -10,13 +10,13 @@ import argparse as argp
from . import __doc__
from .base import format_dict, CustomLogging
from .ecsettings import AriaEcSettings
from .settings import AriaEcSettings
from .maplot import AriaEcContactMap
from .econverter import AriaEcBbConverter, AriaEcXMLConverter
from .converter import AriaEcBbConverter, AriaEcXMLConverter
from .setup import AriaEcSetup
from .pdbqual import AriaEcPdbqual
from .pdbdist import PDBDist
from .pdbviol import PDBViol
from .analysis import EnsembleAnalysis
LOG = logging.getLogger(__name__)
......@@ -57,27 +57,26 @@ class ReadableFile(argp.Action):
# TODO: Make parent Command class with _create_argparser, self.args,
# update_logger and run
class AriaEcCommand(object):
class AriaEcCommands(object):
"""
Argparse interface for aria_ec
"""
command_list = ("setup", "bbconv", "contactmap", "pdbqual", "pdbviol",
command_list = ("setup", "bbconv", "maplot", "pdbqual", "analysis",
"tbl2xml", "pdbdist")
desc_list = (u"Setup ARIA infrastructure with given contacts translated "
desc_list = (u"Setup ARIA infrastructure with contact maps translated "
u"into ARIA restraints",
u"Convert contacts in bbcontact format",
u"Contactmap tool",
u"Convert a contact map in bbcontact format",
u"Contactmap visualization tool",
u"Quality tool for pdb file(s)",
u"Extended ARIA Violation analysis on a specific iteration "
u"vs. native structure",
u"Extended ARIA ensemble analysis on a specific iteration ",
u"XML converter for tbl distance restraint",
u"Extract distance distribution from culled list of pdb files")
contact_types = ("evfold", "plmev", "plm", "plmdca", "plmc", "bbcontacts",
"pconsc", "pconsc1", "pconsc2", "psicov", "metapsicovhb",
"metapsicov_stg1", "metapsicov_stg2", "gremlin", "pdb",
"native", "native_full", "contactlist")
default_confile = "conf/aria_ec.ini"
default_confile = "conf/config.ini"
def __init__(self, custom_logging=None):
# Def Argument pdbparser
......@@ -190,9 +189,9 @@ class AriaEcCommand(object):
"type")
return parser
def _contactmap_argparser(self, desc=None):
def _maplot_argparser(self, desc=None):
"""
contactmap opt & args
maplot opt & args
:param desc: command descriptor
:return: argparser object
"""
......@@ -238,7 +237,7 @@ class AriaEcCommand(object):
return parser
@staticmethod
def _pdbviol_argparser(desc=None):
def _analysis_argparser(desc=None):
parser = argp.ArgumentParser(description=desc,
add_help=False)
parser.add_argument("project", action=ReadableFile,
......@@ -332,7 +331,7 @@ class AriaEcCommand(object):
bbconverter = AriaEcBbConverter(self.create_settings())
bbconverter.run()
def contactmap(self):
def maplot(self):
"""
instantiate AriaEcContactmap with AriaSettings
"""
......@@ -360,22 +359,22 @@ class AriaEcCommand(object):
inst = PDBDist(self.create_settings())
inst.run()
def pdbviol(self):
def analysis(self):
"""
Violation analysis of distance restraints on structure ensemble
Extended ensemble analysis of distance restraints
"""
inst = PDBViol(self.create_settings())
inst = EnsembleAnalysis(self.create_settings())
inst.run()
def ec2aria():
def main():
"""
Launch ariaec command interface
"""
mylog = CustomLogging(desc=__doc__)
command = AriaEcCommand(custom_logging=mylog)
command = AriaEcCommands(custom_logging=mylog)
command.run()
......@@ -384,4 +383,4 @@ if __name__ == "__main__":
# Test AriaEcCommand object
logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger("IO")
AriaEcCommand()
AriaEcCommands()
......@@ -22,7 +22,7 @@ clashlist_executable:
[contactdef]
; Contact definition section used to define contactmap from pdb file.
; Contact definition section used to define maplot from pdb file.
; Decrease this threshold if using other cutoff (e.g. 5.0)
default_cutoff: 8.0
; Add contact cutoff folowwing the syntax atm1_atm2
......@@ -229,7 +229,7 @@ clustering_mask: CA
clustering_nclusters: 2
clustering_method: kmeans
[contactmap]
[maplot]
; -------------------------- Contactmap parameters --------------------------- #
; Report settings
; n_factors: Number of EC tested: n * n_factor (n: sequence length)
......@@ -271,7 +271,7 @@ obsolete_directory: /tmp/obsolete
remove_pdbs: False
pair_list: min
[pdbviol]
[analysis]
violation_treshold: 0.5
nbest_structures: 15
sort_criterion: total_energy
......@@ -401,8 +401,8 @@ assign (resid {res1} and name o) (resid {res2} and name hn) 1.8 {dminus} {dplus
# print(topo[residx1][1]['dono'])
# print(topo[residx2][1]['acce'])
# print([x[0] for x in topo])
# print(hbmap['contactmap'].sequence[residx1])
# print(hbmap['contactmap'].sequence[residx2])
# print(hbmap['maplot'].sequence[residx1])
# print(hbmap['maplot'].sequence[residx2])
# print(distmap.index[residx2])
# print(dist)
# print(hbmap["scoremap"])
......@@ -665,7 +665,7 @@ class AriaEcXMLConverter(AriaXMLConverter):
def write_map_restraint(self, contactmap, nb_c, targetdist, scoremap=None,
listname=""):
"""
Translate contactmap in ARIA XML restraint
Translate maplot in ARIA XML restraint
:param listname:
:param contactmap: ResAtmMap for a protein
:param nb_c: Number of restraints selected
......@@ -676,8 +676,8 @@ class AriaEcXMLConverter(AriaXMLConverter):
# Number of selected contact
# If there isn't enough contacts in input contact map after filtering
# step, change nb_c
# nb_c = nb_c if nb_c < len(contactmap.contactset()) else len(
# contactmap.contactset())
# nb_c = nb_c if nb_c < len(maplot.contactset()) else len(
# maplot.contactset())
if scoremap is not None:
# TODO: implement viterbiscore as scoremap for bbcontacts ?
......@@ -919,7 +919,7 @@ class AriaEcXMLConverter(AriaXMLConverter):
for maptype in maplist:
LOG.info("Writing %s ARIA XML distance restraints", maptype)
outfile, pairlist = self.write_map_restraint(
maplist[maptype]['contactmap'], maplist[maptype]["nb_c"],
maplist[maptype]['maplot'], maplist[maptype]["nb_c"],
targetmap, listname=maptype,
scoremap=maplist[maptype].get("scoremap"))
out[0].append(outfile)
......
(dp0
S'A'
p1
(dp2
g1
S'CB, CB'
p3
sS'C'
p4
S'CB, SG'
p5
sS'E'
p6
S'CB, OE1'
p7
sS'D'
p8
S'CB, OD1'
p9
sS'G'
p10
S'CB, CA'
p11
sS'F'
p12
S'CB, CZ'
p13
sS'I'
p14
S'CB, CD1'
p15
sS'H'
p16
S'CB, CE1'
p17
sS'K'
p18
S'CB, NZ'
p19
sS'M'
p20
S'CB, CE'
p21
sS'L'
p22
g15
sS'N'
p23
g9
sS'Q'
p24
g7
sS'P'
p25
S'CB, CG'
p26
sS'S'
p27
S'CB, OG'
p28
sS'R'
p29
S'CB, NH1'
p30
sS'T'
p31
S'CB, OG1'
p32
sS'W'
p33
S'CB, CH2'
p34
sS'V'
p35
S'CB, CG1'
p36
sS'Y'
p37
S'CB, OH'
p38
ssg4
(dp39
g1
S'SG, CB'
p40
sg4
S'SG, SG'
p41
sg6
S'SG, OE1'
p42
sg8
S'SG, OD1'
p43
sg10
S'SG, CA'
p44
sg12
S'SG, CZ'
p45
sg14
S'SG, CD1'
p46
sg16
S'SG, CE1'
p47
sg18
S'SG, NZ'
p48
sg20