Commit 6c3162f7 authored by Fabrice Allain's avatar Fabrice Allain
Browse files

merging not fully functional

parent 912fc3b2
......@@ -27,19 +27,4 @@ $Revision: 1.1.1.1 $
$Date: 2010/03/23 15:27:24 $
"""
# Names exported by a star-import of this package.
# NOTE: every entry must be followed by a comma. Adjacent string literals
# are silently concatenated by Python, which previously merged
# "NOESYSpectrumFilter" and "OrderedDict" into a single bogus name.
__all__ = ["Analyser", "ariabase", "AriaPeak", "AriaXML", "Assignment",
"AssignmentFilter", "Atom", "Calibrator", "ccpn2top",
"ccpn_conversion", "Chain", "ChemicalShiftFilter", "Cluster",
"ChemicalShiftList", "cns", "Contribution", "ContributionAssigner",
"conversion", "ConversionTable", "CovalentDistances", "CrossPeak",
"CrossPeakFilter", "DataContainer", "Datum", "Experiment",
"exportToCcpn", "Factory", "FloatFile", "importFromCcpn",
"Infrastructure", "Iteration", "JobManager", "mathutils", "Merger",
"Molecule", "MolMol", "Molprobity", "Network", "NOEModel",
"NOESYSpectrum", "NOESYSpectrumFilter", "OrderedDict", "PDBReader",
"PeakAssigner", "Project", "Protocol", "Relaxation", "Report",
"Residue", "RmsReport", "Settings", "ShiftAssignment",
"ShiftAssignmentFilter", "Singleton", "SpinPair",
"StructureEnsemble", "SuperImposer", "tools", "Topology",
"TypeChecking", "ViolationAnalyser", "WhatifProfile", "xmlparser",
"xmlutils"]
from core import *
\ No newline at end of file
......@@ -4,26 +4,26 @@ Created on 4/7/17
@author: fallain
"""
import re
import os
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from collections import OrderedDict
from Bio.PDB import PDBParser, PDBIO
from sklearn.decomposition import PCA
from aria.AriaXML import AriaXMLPickler
from ..core.AriaXML import AriaXMLPickler
from ..core.DataContainer import DATA_SEQUENCE
from ..core.SuperImposer import SuperImposer
from collections import OrderedDict
from glob import glob
from matplotlib.colors import ListedColormap
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
from aria.SuperImposer import SuperImposer
from .converter import AriaEcXMLConverter
from .common import NotDisordered, Capturing
from matplotlib.colors import ListedColormap
from aria.DataContainer import DATA_SEQUENCE
from aria.StructureEnsemble import StructureEnsemble, StructureEnsembleSettings
from sklearn.decomposition import PCA
from ..core.StructureEnsemble import StructureEnsemble, StructureEnsembleSettings
from .common import NotDisordered, Capturing
from .converter import AriaEcXMLConverter
LOG = logging.getLogger(__name__)
......
......@@ -177,8 +177,8 @@ class AriaEcCommands(object):
group = parser.add_argument_group('required arguments')
group.add_argument("seq", action=ReadableFile,
help="sequence file [FASTA]")
group.add_argument("sspred", action=ReadableFile,
help="secondary structure prediction file")
# group.add_argument("sspred", action=ReadableFile,
# help="secondary structure prediction file")
group.add_argument("infiles", nargs="+", metavar="infile",
action=ReadableFile,
help="contact or pdb file(s) used to build aria "
......@@ -189,6 +189,12 @@ class AriaEcCommands(object):
"use distances in the given file as "
"target distance to build distance "
"restraints")
group.add_argument("-s", "--ssfile", dest="sspred", action=ReadableFile,
help="secondary structure prediction file")
group.add_argument("-p", "--ariaproject", dest="ariaproject",
action=ReadableFile,
help="ARIA project file. This file will be used as"
"an initialization file if")
group.add_argument("-t", "--type", required=True,
nargs="+", dest="contact_types",
choices=self.contact_types, help="Infile(s) contact "
......@@ -201,6 +207,10 @@ class AriaEcCommands(object):
default=False, help="Use secondary structure index")
group.add_argument("--no-filter", dest="no_filter", action="store_true",
default=False, help="Do not filter contact map.")
group.add_argument("--extract-all", dest="extractall", action="store_true",
default=False, help="Extract data or all data and"
"parameters if an ARIA project"
"is defined with -p option")
return parser
def _bbconv_argparser(self, desc=None):
......@@ -272,12 +282,17 @@ class AriaEcCommands(object):
parser.add_argument("--onlyreport", dest="onlyreport",
action="store_true",
default=False, help="Generate only report file")
parser.add_argument("--no-filter", dest="no_filter", action="store_true",
default=False, help="Do not filter contact map.")
parser.add_argument("--ssidx", dest="ssidx", action="store_true",
default=False,
help="Use secondary structure index")
parser.add_argument("--prefix", dest="prefix", default=False,
action="store_true",
help="Add specific prefix to generated file names")
parser.add_argument("--prefix", dest="prefix", action="store_true",
default="",
help="Generate prefix for file names")
parser.add_argument("--prefixname", dest="prefixname",
default="",
help="Prefix name for file names")
return parser
@staticmethod
......
......@@ -62,7 +62,6 @@ class TqdmToLogger(io.StringIO):
"""
self.logger.log(self.level, self.buf)
# Code below adapted from an answer by klaus se on Stack Overflow
# (http://stackoverflow.com/a/16071616)
def worker(f, task_queue, done_queue):
......
......@@ -31,10 +31,11 @@ clashlist_executable:
; Contact definition section used to define maplot from pdb file.
; Decrease this threshold if using other cutoff (e.g. 5.0)
default_cutoff: 8.0
; Add contact cutoff following the syntax atm1_atm2
;ca_ca: 7.0
;cb_cb: 7.0
;sc_sc: 5.0
; Add contact cutoff following the syntax all, atm1_atm2 or sc_sc for side chains
;all:
;ca_ca:
;cb_cb:
;sc_sc:
[setup]
; ------------------------------ TBL parameters ------------------------------ #
......@@ -68,10 +69,6 @@ hb_dplus: 0.5
; neighborhood_contact : True, False [False]
; Generate restraints for neighbors foreach
; contact in the contact map
; pair_list : all, heavy, min [min]
; use all, heavy atms or from a minimized
; list (CA, CB, SC) for contribution list for
; each distance restraint
; atoms_type : all, heavy, min [min]
; use all, heavy atms or from a minimized
; list (CA, CB, SC) for contribution list for
......
This diff is collapsed.
......@@ -24,7 +24,8 @@ class AriaEcContactMap(object):
self.protein = Protein(settings)
self.file_reader = MapFileListReader(
cont_def=settings.contactdef.config)
self.filter = MapFilter(settings.setup.config)
self.filter = MapFilter(settings.setup.config,
nofilter=settings.maplot.args.get("no_filter"))
self.protname = ''
self.allresmap = {}
self.refmap = None
......@@ -140,17 +141,21 @@ class AriaEcContactMap(object):
# ------------------------------ Output ------------------------------ #
for mapname, mapt, mapath in self.allresmap.keys():
prefix = "%s_%svs%s" % (self.protname, mapt, self.reftype) if self.settings.maplot.args.get("prefix") else ""
prefix = self.settings.maplot.args.get("prefixname") if self.settings.maplot.args.get("prefixname") else ""
if mapname == self.refname:
if self.settings.maplot.args.get("onlyreport", False) is not False:
refmap.write_contacts(mapname,
if not self.settings.maplot.args.get("onlyreport", False):
refmap.write_contacts(mapname, prefix=prefix,
outdir=outdir,
scoremap=self.refmap.get("scoremap",
None))
continue
prefix = "%s_%svs%s" % (self.protname, mapt, self.reftype) if \
self.settings.maplot.args.get("prefix") and not \
self.settings.maplot.args.get("prefixname") else \
self.settings.maplot.args.get("prefixname") if \
self.settings.maplot.args.get("prefixname") else ""
scoremap = self.allresmap[(mapname, mapt, mapath)].get(
'scoremap', None)
# if self.allresmap[mapt].get("maplot") is not None and \
......
......@@ -3,25 +3,23 @@
PDB distance distribution generation
"""
import os
import sys
import logging
import os
import pandas as pd
import pbxplore as pbx
from glob import glob
import sys
from Bio.PDB import PDBList, PDBParser, Selection, is_aa, NeighborSearch, \
MMCIFParser
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
from future.utils import iteritems
from collections import defaultdict, OrderedDict
from future.utils import iteritems
from glob import glob
from ..core.legacy.AminoAcid import AminoAcid
# from .base import ppdict
from .common import Capturing
from .reader import CulledPdbFile
from .protmap import ResAtmMap
from aria.legacy.AminoAcid import AminoAcid
from .reader import CulledPdbFile
LOG = logging.getLogger(__name__)
......
......@@ -5,14 +5,13 @@ Created on 9/5/16
Derived from qual.py script by Dr. Benjamin Bardiaux
"""
import logging
import os
import shutil
import logging
from aria.legacy.QualityChecks import QualityChecks
from ..core.legacy import QualityChecks
from .common import CommandProtocol
LOG = logging.getLogger(__name__)
......
......@@ -3,22 +3,21 @@
PDB distance distribution analysis
"""
import os
import re
import pickle
import logging
import itertools
import logging
import numpy as np
import os
import pandas as pd
import pickle
import re
import sklearn.mixture as mixture
from tqdm import tqdm
from .protmap import SsAaAtmMap
from .common import TqdmToLogger
from aria.legacy.AminoAcid import AminoAcid
from aria.ConversionTable import ConversionTable
from ..core.ConversionTable import ConversionTable
from pathos.multiprocessing import ProcessingPool
from tqdm import tqdm
from ..core.legacy.AminoAcid import AminoAcid
from .common import TqdmToLogger
from .protmap import SsAaAtmMap
LOG = logging.getLogger(__name__)
......@@ -173,6 +172,7 @@ class PDBStat(object):
df = df.append(tmp)
# TODO: CHANGE DEFAULT SELECTION CRITERIA TO AIC
if bic < lowest_bic:
lowest_bic = bic
best_gmm = gmm
......
......@@ -6,15 +6,18 @@
from __future__ import absolute_import, division, print_function, \
unicode_literals
import os
import sys
import re
# from ..core import legacy.SequenceList as SequenceList
from ..core.legacy import SequenceList as SequenceList
import logging
import os
import pkg_resources as pkgr
import aria.legacy.SequenceList as SequenceList
import aria.legacy.AminoAcid as AmnAcd
import re
import sys
from six import iteritems, text_type
from ..core.legacy import AminoAcid as AmnAcd
from .common import (reg_load, ppdict)
# import skbio.Protein as skprot
# TODO: interface skbio ??
......@@ -30,6 +33,12 @@ class SsList(object):
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<ss_conf>\d?)')
psipred2_reg = re.compile(r'^(?P<ss_pred>[HEC]+)')
psipred3_reg = re.compile(r'^\s*(?P<up_index>\d+)'
r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<dunno1>\d?\.?\d*)'
r'\s+(?P<dunno2>\d?\.?\d*)'
r'\s+(?P<dunno3>\d?\.?\d*)')
indxplus_reg = re.compile(
r'^(?P<up_index>\d+)\s+(?P<up_residue>[AC-IK-NP-TVWYZ])\s+'
r'(?P<ss_pred>[CEH])\s+(?P<ss_conf>\d)\s+(?P<msa_index>[\d\-]+)\s+'
......@@ -52,6 +61,12 @@ class SsList(object):
self.ssdist = {}
self.filetype = ''
# Truth value of the instance: True when self.ss_matrix is truthy
# (e.g. a non-empty list), False otherwise.
def __bool__(self):
return True if self.ss_matrix else False
# Python 2 truth-testing hook; delegates to __bool__ so the truth value is
# the same under both Python 2 and Python 3.
def __nonzero__(self):
return self.__bool__()
@property
def index(self):
""":return:"""
......@@ -111,6 +126,8 @@ class SsList(object):
# TODO: better read with getattr
if self.filetype == "indextableplus":
self.read_indextableplus(filename)
elif self.filetype == "ss2":
self.read_psipred(filename, ss2=True)
else:
self.read_psipred(filename)
......@@ -118,7 +135,7 @@ class SsList(object):
"Secondary structure dict:\n%s", self.ss_matrix,
self.ssdict)
def read_psipred(self, filename):
def read_psipred(self, filename, ss2=False):
"""
......@@ -132,14 +149,17 @@ class SsList(object):
"""
self.ssdict = reg_load(self.psipred_reg, filename)
if ss2:
self.ssdict = reg_load(self.psipred3_reg, filename)
else:
self.ssdict = reg_load(self.psipred_reg, filename)
# TODO: supprimer psipred_list dans les futures implementations
ss_index_dict = {'H': 1, 'C': 1, 'E': 1}
for line_id in sorted(self.ssdict.keys()):
# Modif champ ss_pred
# Si line_id
if line_id > 1 and self.ssdict[line_id]['ss_pred'] not in \
if line_id != min(self.ssdict.keys()) and \
self.ssdict[line_id]['ss_pred'] not in \
self.ssdict[line_id - 1]['ss_pred']:
# If next ss isn't the same, increment relative struct in
# ss_index_dict
......@@ -152,7 +172,7 @@ class SsList(object):
self.ss_matrix.append([self.ssdict[line_id]['up_index'],
self.ssdict[line_id]['up_residue'],
self.ssdict[line_id]['ss_pred'],
self.ssdict[line_id]['ss_conf']])
self.ssdict[line_id].get('ss_conf')])
def write_ssfasta(self, filename, desc="pdbid"):
"""
......@@ -478,7 +498,10 @@ class AminoAcidSequence(SequenceList.SequenceList, object):
# TODO: smarter reader checking type of file (fasta, etc ...)
# TODO: capturing has some troubles with unicode ...
# with Capturing() as output:
self.ReadFasta(text_type(filename))
if os.path.splitext(filename)[1] == '.seq':
self.ReadSeq(text_type(filename))
else:
self.ReadFasta(text_type(filename))
# LOG.info(''.join(output))
self.sequence = "".join((AmnAcd.AminoAcid(str(_))[0] for _ in
......
This diff is collapsed.
......@@ -8,9 +8,11 @@ import os
import re
import logging
import os.path
import numpy as np
import collections
import pkg_resources as pkgr
import scipy.spatial.distance as distance
from Bio import pairwise2
from .common import sort_2dict
from .protmap import (ResMap, ResAtmMap)
......@@ -182,8 +184,8 @@ class MapFile(RegexFile):
r'(?P<placeholder>\d),(?P<res1_cons>\d+),'
r'(?P<res2_cons>\d+),(?P<ss_filter>\d|\d{3}),'
r'(?P<high_cons_filter>\d|\d{3}),'
r'(?P<cc_filter>\d|\d{3}),(?P<res1_1l_code>\w),'
r'(?P<res2_1l_code>\w)$'),
r'(?P<cc_filter>\d|\d{3}),(?P<res1_name>\w),'
r'(?P<res2_name>\w)$'),
"score_field": "ec_score"
},
"pconsc": {
......@@ -318,9 +320,11 @@ class MapFile(RegexFile):
"distmap": None,
"maplot": None,
"scoremap": None}
self.clashlist = None
self.contactlist = None
self.flaglist = None
self.clashlist = []
self.distlist = []
self.contactlist = []
self.flaglist = {}
self.scorelist = []
# self.maplot = None
# self.distmap = None
......@@ -356,27 +360,6 @@ class MapFile(RegexFile):
raise NotImplementedError("Class %s doesn't implement create_map" %
self.__class__.__name__)
# Compare the residue names stored in each parsed line of self.lines with
# the residues found in aa_seq at the positions given by res1_nb / res2_nb.
# Mismatches are only reported through LOG.error; nothing is returned or
# raised by this method itself.
def check_contacts(self, aa_seq):
"""
Check if plm_dict is consistent with input sequence
Parameters
----------
aa_seq :
Returns
-------
"""
LOG.info("Checking consistency of contacts with input sequence")
for line in self.lines:
# res1_nb / res2_nb are converted to int and shifted by -1 before
# indexing aa_seq, i.e. they are treated as 1-based positions.
if self.lines[line]['res1_name'] != aa_seq[int(self.lines[line]['res1_nb']) - 1] \
or self.lines[line]['res2_name'] != aa_seq[int(self.lines[line]['res2_nb']) - 1]:
LOG.error("Difference between given sequence and residu "
"names in contact file at line %d !", line)
def update_map(self, resmap):
"""
......@@ -440,7 +423,7 @@ class MapFile(RegexFile):
LOG.error("Wrong format type given ...")
return [None] * 3
def read(self, protein=None, contactdef=5.0, groupby_method="min",
def read(self, protein, contactdef=5.0, groupby_method="min",
scsc=None):
"""
......@@ -461,61 +444,107 @@ class MapFile(RegexFile):
"""
LOG.info("Reading %s file", self.filepath)
# res1_1l_code
aaseq = {}
if self.filetype:
LOG.info("Reading %s file", self.filepath)
# Read file with regex related to filetype
self.load()
LOG.debug(self.lines)
if protein:
LOG.info("Loading contact list")
if self.filetype == "contactlist":
self.flaglist = {
tuple(sorted(
[int(self.lines[contact].get("res1_nb")),
int(self.lines[contact].get("res2_nb"))]
)): self.lines[contact].get("con_flag")
for contact in self.lines if
self.lines[contact].get("res1_nb") and
self.lines[contact].get("res2_nb")}
if self.filetype == "metapsicovhb":
# HB contacts aren't sorted since the first res correspond to
# donor and second to acceptor
self.contactlist = [
tuple([int(self.lines[contact].get("res_donor")),
int(self.lines[contact].get("res_acceptor"))])
for contact in self.lines
if self.lines[contact].get("res_donor") and
self.lines[contact].get("res_acceptor")]
else:
else:
LOG.error("Unrecognized file type")
return None
LOG.info("Loading contact file")
sym = False if self.filetype == "metapsicovhb" else True
confields = ("res_donor", "res_acceptor") if \
self.filetype == "metapsicovhb" else ('res1_nb', 'res2_nb')
for contact in self.lines:
# If contact defined
if self.lines[contact].get(confields[0]) and \
self.lines[contact].get(confields[1]):
conkeys = tuple(sorted([
int(self.lines[contact].get(confields[0])),
int(self.lines[contact].get(confields[1]))]))
self.contactlist.append(conkeys)
if self.sort:
self.scorelist.append(float(self.lines[contact].get(self.sort)))
if self.filetype == "contactlist":
self.flaglist[conkeys] = self.lines[contact].get("con_flag")
if self.filetype == "metapsicovhb":
self.distlist.append(self.lines[contact].get("res_dist"))
if self.filetype in ("evfold", "plmdca", "plm", "plmev"):
self.clashlist.append(next(
(el for el in (
self.lines[contact].get("ss_filter"),
self.lines[contact].get("high_cons_filter"),
self.lines[contact].get("cc_filter")) if el != "0"),
"0"))
if self.lines[contact].get("res1_name") and \
self.lines[contact].get("res2_name"):
aaseq[int(self.lines[contact].get("res1_nb"))] = self.lines[
contact].get("res1_name")
aaseq[int(self.lines[contact].get("res2_nb"))] = self.lines[
contact].get("res2_name")
if aaseq:
# Align evfold amino acid sequence with sequence obtained from seq
# file
print(aaseq)
seq = ''.join([
aaseq[key] if key in aaseq else '*'
for key in range(1, max(aaseq) + 1)])
# With gap penalty set to -1, we should only have an alignment without
# gap since mismatch is the preferred way (with score set to 0)
alignment = pairwise2.align.localxs(
seq, protein.aa_sequence.sequence, -1, -1,
one_alignment_only=True)[0]
LOG.info('Alignment of amino acid sequence with contact file\n'
'%s' % pairwise2.format_alignment(*alignment))
shift = re.match(r'^-*', alignment[1])
shift = len(shift.group(0)) if shift else 0
if shift:
LOG.warning("Found a shift of %d residues in positions given"
" within contact list", shift)
LOG.info("Update index in contact list and remove unassigned "
"contacts")
LOG.debug("Old contact list\n%s", self.contactlist)
self.contactlist = [
tuple(sorted(
[int(self.lines[contact].get("res1_nb")),
int(self.lines[contact].get("res2_nb"))]))
for contact in self.lines
if self.lines[contact].get("res1_nb") and
self.lines[contact].get("res2_nb")]
LOG.debug(self.contactlist)
sym = False if self.filetype == "metapsicovhb" else True
self.create_map(protein, contactdef,
groupby_method=groupby_method, scsc=scsc,
flaglist=self.flaglist, sym=sym, path=self.filepath)
if self.filetype == "plmdca":
# If contact filetype contain residues name, check if it is
# consistent with given sequence
self.check_contacts(protein.aa_sequence.sequence)
if self.filetype == "evfold":
LOG.info("Loading evfold clash list")
self.clashlist = [
next((el for el in (self.lines[contact].get("ss_filter"),
self.lines[contact].get("high_cons_filter"),
self.lines[contact].get("cc_filter")) if el != "0"), "0")
for contact in self.lines if
self.lines[contact].get("res1_nb") and
self.lines[contact].get("res2_nb")]
if len(self.contactlist) != len(self.clashlist):
LOG.error("When reading input file, clash list is not "
"the same length than contactlist")
LOG.debug(self.clashlist)
(contact[0] - shift, contact[1] - shift)
for contact in self.contactlist
]
LOG.debug("New contact list\n%s", self.contactlist)
LOG.info("Remove contacts outside sequence bonds")
# Checking for unassigned contacts
validx = range(1, len(protein.aa_sequence.sequence) + 1)
unascon = [contactidx
for contactidx, contact in enumerate(self.contactlist)
if contact[0] not in v