Commit 6c3162f7 authored by Fabrice Allain's avatar Fabrice Allain
Browse files

merging not fully functional

parent 912fc3b2
......@@ -27,19 +27,4 @@ $Revision: 1.1.1.1 $
$Date: 2010/03/23 15:27:24 $
"""
# Public API of the package: names re-exported by "from <package> import *".
# NOTE: a missing comma previously fused "NOESYSpectrumFilter" and
# "OrderedDict" into one bogus entry via implicit string concatenation.
__all__ = ["Analyser", "ariabase", "AriaPeak", "AriaXML", "Assignment",
           "AssignmentFilter", "Atom", "Calibrator", "ccpn2top",
           "ccpn_conversion", "Chain", "ChemicalShiftFilter", "Cluster",
           "ChemicalShiftList", "cns", "Contribution", "ContributionAssigner",
           "conversion", "ConversionTable", "CovalentDistances", "CrossPeak",
           "CrossPeakFilter", "DataContainer", "Datum", "Experiment",
           "exportToCcpn", "Factory", "FloatFile", "importFromCcpn",
           "Infrastructure", "Iteration", "JobManager", "mathutils", "Merger",
           "Molecule", "MolMol", "Molprobity", "Network", "NOEModel",
           "NOESYSpectrum", "NOESYSpectrumFilter", "OrderedDict", "PDBReader",
           "PeakAssigner", "Project", "Protocol", "Relaxation", "Report",
           "Residue", "RmsReport", "Settings", "ShiftAssignment",
           "ShiftAssignmentFilter", "Singleton", "SpinPair",
           "StructureEnsemble", "SuperImposer", "tools", "Topology",
           "TypeChecking", "ViolationAnalyser", "WhatifProfile", "xmlparser",
           "xmlutils"]
from core import *
\ No newline at end of file
......@@ -4,26 +4,26 @@ Created on 4/7/17
@author: fallain
"""
import re
import os
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from collections import OrderedDict
from Bio.PDB import PDBParser, PDBIO
from sklearn.decomposition import PCA
from aria.AriaXML import AriaXMLPickler
from ..core.AriaXML import AriaXMLPickler
from ..core.DataContainer import DATA_SEQUENCE
from ..core.SuperImposer import SuperImposer
from collections import OrderedDict
from glob import glob
from matplotlib.colors import ListedColormap
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
from aria.SuperImposer import SuperImposer
from .converter import AriaEcXMLConverter
from .common import NotDisordered, Capturing
from matplotlib.colors import ListedColormap
from aria.DataContainer import DATA_SEQUENCE
from aria.StructureEnsemble import StructureEnsemble, StructureEnsembleSettings
from sklearn.decomposition import PCA
from ..core.StructureEnsemble import StructureEnsemble, StructureEnsembleSettings
from .common import NotDisordered, Capturing
from .converter import AriaEcXMLConverter
LOG = logging.getLogger(__name__)
......
......@@ -177,8 +177,8 @@ class AriaEcCommands(object):
group = parser.add_argument_group('required arguments')
group.add_argument("seq", action=ReadableFile,
help="sequence file [FASTA]")
group.add_argument("sspred", action=ReadableFile,
help="secondary structure prediction file")
# group.add_argument("sspred", action=ReadableFile,
# help="secondary structure prediction file")
group.add_argument("infiles", nargs="+", metavar="infile",
action=ReadableFile,
help="contact or pdb file(s) used to build aria "
......@@ -189,6 +189,12 @@ class AriaEcCommands(object):
"use distances in the given file as "
"target distance to build distance "
"restraints")
group.add_argument("-s", "--ssfile", dest="sspred", action=ReadableFile,
help="secondary structure prediction file")
group.add_argument("-p", "--ariaproject", dest="ariaproject",
action=ReadableFile,
help="ARIA project file. This file will be used as"
"an initialization file if")
group.add_argument("-t", "--type", required=True,
nargs="+", dest="contact_types",
choices=self.contact_types, help="Infile(s) contact "
......@@ -201,6 +207,10 @@ class AriaEcCommands(object):
default=False, help="Use secondary structure index")
group.add_argument("--no-filter", dest="no_filter", action="store_true",
default=False, help="Do not filter contact map.")
group.add_argument("--extract-all", dest="extractall", action="store_true",
default=False, help="Extract data or all data and"
"parameters if an ARIA project"
"is defined with -p option")
return parser
def _bbconv_argparser(self, desc=None):
......@@ -272,12 +282,17 @@ class AriaEcCommands(object):
parser.add_argument("--onlyreport", dest="onlyreport",
action="store_true",
default=False, help="Generate only report file")
parser.add_argument("--no-filter", dest="no_filter", action="store_true",
default=False, help="Do not filter contact map.")
parser.add_argument("--ssidx", dest="ssidx", action="store_true",
default=False,
help="Use secondary structure index")
parser.add_argument("--prefix", dest="prefix", default=False,
action="store_true",
help="Add specific prefix to generated file names")
parser.add_argument("--prefix", dest="prefix", action="store_true",
default="",
help="Generate prefix for file names")
parser.add_argument("--prefixname", dest="prefixname",
default="",
help="Prefix name for file names")
return parser
@staticmethod
......
......@@ -62,7 +62,6 @@ class TqdmToLogger(io.StringIO):
"""
self.logger.log(self.level, self.buf)
# Code below adapted from an answer by klaus se on stackoverflow
# (http://stackoverflow.com/a/16071616)
def worker(f, task_queue, done_queue):
......
......@@ -31,10 +31,11 @@ clashlist_executable:
; Contact definition section used to define maplot from pdb file.
; Decrease this threshold if using other cutoff (e.g. 5.0)
default_cutoff: 8.0
; Add contact cutoff following the syntax atm1_atm2
;ca_ca: 7.0
;cb_cb: 7.0
;sc_sc: 5.0
; Add contact cutoff following the syntax all, atm1_atm2 or sc_sc for side chains
;all:
;ca_ca:
;cb_cb:
;sc_sc:
[setup]
; ------------------------------ TBL parameters ------------------------------ #
......@@ -68,10 +69,6 @@ hb_dplus: 0.5
; neighborhood_contact : True, False [False]
; Generate restraints for neighbors foreach
; contact in the contact map
; pair_list : all, heavy, min [min]
; use all, heavy atms or from a minimized
; list (CA, CB, SC) for contribution list for
; each distance restraint
; atoms_type : all, heavy, min [min]
; use all, heavy atms or from a minimized
; list (CA, CB, SC) for contribution list for
......
This diff is collapsed.
......@@ -24,7 +24,8 @@ class AriaEcContactMap(object):
self.protein = Protein(settings)
self.file_reader = MapFileListReader(
cont_def=settings.contactdef.config)
self.filter = MapFilter(settings.setup.config)
self.filter = MapFilter(settings.setup.config,
nofilter=settings.maplot.args.get("no_filter"))
self.protname = ''
self.allresmap = {}
self.refmap = None
......@@ -140,17 +141,21 @@ class AriaEcContactMap(object):
# ------------------------------ Output ------------------------------ #
for mapname, mapt, mapath in self.allresmap.keys():
prefix = "%s_%svs%s" % (self.protname, mapt, self.reftype) if self.settings.maplot.args.get("prefix") else ""
prefix = self.settings.maplot.args.get("prefixname") if self.settings.maplot.args.get("prefixname") else ""
if mapname == self.refname:
if self.settings.maplot.args.get("onlyreport", False) is not False:
refmap.write_contacts(mapname,
if not self.settings.maplot.args.get("onlyreport", False):
refmap.write_contacts(mapname, prefix=prefix,
outdir=outdir,
scoremap=self.refmap.get("scoremap",
None))
continue
prefix = "%s_%svs%s" % (self.protname, mapt, self.reftype) if \
self.settings.maplot.args.get("prefix") and not \
self.settings.maplot.args.get("prefixname") else \
self.settings.maplot.args.get("prefixname") if \
self.settings.maplot.args.get("prefixname") else ""
scoremap = self.allresmap[(mapname, mapt, mapath)].get(
'scoremap', None)
# if self.allresmap[mapt].get("maplot") is not None and \
......
......@@ -3,25 +3,23 @@
PDB distance distribution generation
"""
import os
import sys
import logging
import os
import pandas as pd
import pbxplore as pbx
from glob import glob
import sys
from Bio.PDB import PDBList, PDBParser, Selection, is_aa, NeighborSearch, \
MMCIFParser
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
from future.utils import iteritems
from collections import defaultdict, OrderedDict
from future.utils import iteritems
from glob import glob
from ..core.legacy.AminoAcid import AminoAcid
# from .base import ppdict
from .common import Capturing
from .reader import CulledPdbFile
from .protmap import ResAtmMap
from aria.legacy.AminoAcid import AminoAcid
from .reader import CulledPdbFile
LOG = logging.getLogger(__name__)
......
......@@ -5,14 +5,13 @@ Created on 9/5/16
Derived from qual.py script by Dr. Benjamin Bardiaux
"""
import logging
import os
import shutil
import logging
from aria.legacy.QualityChecks import QualityChecks
from ..core.legacy import QualityChecks
from .common import CommandProtocol
LOG = logging.getLogger(__name__)
......
......@@ -3,22 +3,21 @@
PDB distance distribution analysis
"""
import os
import re
import pickle
import logging
import itertools
import logging
import numpy as np
import os
import pandas as pd
import pickle
import re
import sklearn.mixture as mixture
from tqdm import tqdm
from .protmap import SsAaAtmMap
from .common import TqdmToLogger
from aria.legacy.AminoAcid import AminoAcid
from aria.ConversionTable import ConversionTable
from ..core.ConversionTable import ConversionTable
from pathos.multiprocessing import ProcessingPool
from tqdm import tqdm
from ..core.legacy.AminoAcid import AminoAcid
from .common import TqdmToLogger
from .protmap import SsAaAtmMap
LOG = logging.getLogger(__name__)
......@@ -173,6 +172,7 @@ class PDBStat(object):
df = df.append(tmp)
# TODO: CHANGE DEFAULT SELECTION CRITERIA TO AIC
if bic < lowest_bic:
lowest_bic = bic
best_gmm = gmm
......
......@@ -6,15 +6,18 @@
from __future__ import absolute_import, division, print_function, \
unicode_literals
import os
import sys
import re
# from ..core import legacy.SequenceList as SequenceList
from ..core.legacy import SequenceList as SequenceList
import logging
import os
import pkg_resources as pkgr
import aria.legacy.SequenceList as SequenceList
import aria.legacy.AminoAcid as AmnAcd
import re
import sys
from six import iteritems, text_type
from ..core.legacy import AminoAcid as AmnAcd
from .common import (reg_load, ppdict)
# import skbio.Protein as skprot
# TODO: interface skbio ??
......@@ -30,6 +33,12 @@ class SsList(object):
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<ss_conf>\d?)')
psipred2_reg = re.compile(r'^(?P<ss_pred>[HEC]+)')
psipred3_reg = re.compile(r'^\s*(?P<up_index>\d+)'
r'\s+(?P<up_residue>[AC-IK-NP-TVWYZ])'
r'\s+(?P<ss_pred>[HEC])'
r'\s+(?P<dunno1>\d?\.?\d*)'
r'\s+(?P<dunno2>\d?\.?\d*)'
r'\s+(?P<dunno3>\d?\.?\d*)')
indxplus_reg = re.compile(
r'^(?P<up_index>\d+)\s+(?P<up_residue>[AC-IK-NP-TVWYZ])\s+'
r'(?P<ss_pred>[CEH])\s+(?P<ss_conf>\d)\s+(?P<msa_index>[\d\-]+)\s+'
......@@ -52,6 +61,12 @@ class SsList(object):
self.ssdist = {}
self.filetype = ''
def __bool__(self):
    """Truthy once a secondary-structure matrix has been loaded."""
    return bool(self.ss_matrix)

def __nonzero__(self):
    """Python 2 truth-value protocol; delegates to :meth:`__bool__`."""
    return self.__bool__()
@property
def index(self):
""":return:"""
......@@ -111,6 +126,8 @@ class SsList(object):
# TODO: better read with getattr
if self.filetype == "indextableplus":
self.read_indextableplus(filename)
elif self.filetype == "ss2":
self.read_psipred(filename, ss2=True)
else:
self.read_psipred(filename)
......@@ -118,7 +135,7 @@ class SsList(object):
"Secondary structure dict:\n%s", self.ss_matrix,
self.ssdict)
def read_psipred(self, filename):
def read_psipred(self, filename, ss2=False):
"""
......@@ -132,14 +149,17 @@ class SsList(object):
"""
if ss2:
self.ssdict = reg_load(self.psipred3_reg, filename)
else:
self.ssdict = reg_load(self.psipred_reg, filename)
# TODO: supprimer psipred_list dans les futures implementations
ss_index_dict = {'H': 1, 'C': 1, 'E': 1}
for line_id in sorted(self.ssdict.keys()):
# Modif champ ss_pred
# Si line_id
if line_id > 1 and self.ssdict[line_id]['ss_pred'] not in \
if line_id != min(self.ssdict.keys()) and \
self.ssdict[line_id]['ss_pred'] not in \
self.ssdict[line_id - 1]['ss_pred']:
# If next ss isn't the same, increment relative struct in
# ss_index_dict
......@@ -152,7 +172,7 @@ class SsList(object):
self.ss_matrix.append([self.ssdict[line_id]['up_index'],
self.ssdict[line_id]['up_residue'],
self.ssdict[line_id]['ss_pred'],
self.ssdict[line_id]['ss_conf']])
self.ssdict[line_id].get('ss_conf')])
def write_ssfasta(self, filename, desc="pdbid"):
"""
......@@ -478,6 +498,9 @@ class AminoAcidSequence(SequenceList.SequenceList, object):
# TODO: smarter reader checking type of file (fasta, etc ...)
# TODO: capturing has some troubles with unicode ...
# with Capturing() as output:
if os.path.splitext(filename)[1] == '.seq':
self.ReadSeq(text_type(filename))
else:
self.ReadFasta(text_type(filename))
# LOG.info(''.join(output))
......
......@@ -3,36 +3,33 @@
ARIA Evolutionary Constraints Tools
"""
from __future__ import absolute_import, division, print_function
from collections import defaultdict
from copy import deepcopy
import os
import re
from ..core import ConversionTable as ConversionTable
import collections
import csv
import string
import logging
import textwrap
import datetime
import operator
import itertools
import collections
import logging
import matplotlib
import numpy as np
import operator
import os
import pandas as pd
import re
import seaborn as sns
import six
import sklearn.metrics as skm
import aria.ConversionTable as ConversionTable
import aria.legacy.AminoAcid as AminoAcid
import string
import textwrap
from collections import defaultdict
from copy import deepcopy
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from ..core.legacy import AminoAcid as AminoAcid
from .common import (tickmin, tickrot, titleprint, addtup)
from .ndconv import net_deconv
matplotlib.use("Agg", warn=False)
LOG = logging.getLogger(__name__)
# TODO: check dataframe symmetry or always use unstack
......@@ -90,7 +87,7 @@ class Map(pd.DataFrame):
dtype = self.check_type(mtype)
# TODO: should probably init to np.NaN
if data is None:
data = False if mtype == bool else 0.
data = False if dtype == bool else np.NaN
super(Map, self).__init__(data=data, dtype=dtype, index=index,
columns=columns)
self.duplicate_levels = duplicate_levels
......@@ -201,7 +198,7 @@ class Map(pd.DataFrame):
:param rm_list:
"""
value = False if self.dtype == bool else 0.0
value = False if self.dtype == bool else np.NaN
for contact in rm_list:
idx1, idx2 = self.index[contact[0]], self.index[contact[1]]
self.set_value(idx1, idx2, value)
......@@ -713,6 +710,8 @@ class ProteinMap(Map):
if y_scores:
# ROC plot
# Replace nan values in y_scores
y_scores[np.isnan(y_scores)] = np.floor(np.nanmin(y_scores))
metrics.update({
'roc_auc': skm.roc_auc_score(y_true, y_scores),
'aver_prec': skm.average_precision_score(y_true, y_scores)
......@@ -744,6 +743,7 @@ class ProteinMap(Map):
"""
outprefix = outprefix if outprefix else "maplot"
csv_roc = os.path.join(plotdir, "%s.roc.csv" % outprefix)
LOG.info("Generate roc file (%s)", csv_roc)
with open(csv_roc, "w") as f:
......@@ -832,7 +832,7 @@ class ProteinMap(Map):
y_true = list(self.values.astype(int).flat)
y_pred = list(cmpmap.values.astype(int).flat)
y_scores = list(scoremap.values.astype(float).flat) if scoremap is not None else None
y_scores = scoremap.values.astype(float).flat if scoremap is not None else None
# Compute classification metrics for the entire map
metrics = self.classification_metrics(y_true, y_pred, y_scores)
......@@ -876,7 +876,8 @@ class ProteinMap(Map):
"""
# CSV file giving TP/FP contacts
outpath = "%s/%s.contactcmp.csv" % (outdir, outprefix)
outpath = "%s/%s.contactcmp.csv" % (outdir,
outprefix if outprefix else "maplot")
LOG.info("Generate stat file (%s)", outpath)
with open(outpath, 'w') as outfile:
offset = 1 if human_idx else 0
......@@ -905,13 +906,14 @@ class ProteinMap(Map):
dmin)
outfile.write(msg)
def write_contacts(self, filename, outdir="", human_idx=True,
def write_contacts(self, filename, outdir="", prefix="", human_idx=True,
scoremap=None):
"""
Parameters
----------
prefix
filename :
param outdir:
human_idx :
......@@ -926,7 +928,8 @@ class ProteinMap(Map):
"""
filepath = "%s/%s.contact.txt" % (outdir, filename)
prefix = prefix + "_" if prefix else prefix
filepath = "%s/%s%s.contact.txt" % (outdir, prefix, filename.replace(".", "_"))
LOG.info("Generate contact file (%s)", filepath)
with open(filepath, 'w') as outfile:
offset = 1 if human_idx else 0
......@@ -1091,11 +1094,11 @@ class ResAtmMap(ProteinMap):
newmap = ResMap(self.sequence, dtype=self.dtype, desc=self.desc,
sym=self.sym, path=self.path)
if self.dtype == bool:
newmap[:] = self.copy().stack().groupby(level=0).any()
newmap[:] = self.copy().stack(dropna=False).groupby(level=0).any()
elif groupby == "mean":
newmap[:] = self.copy().stack().groupby(level=0).mean()
newmap[:] = self.copy().stack(dropna=False).groupby(level=0).mean()
elif groupby == "min":
newmap[:] = self.copy().stack().groupby(level=0).min()
newmap[:] = self.copy().stack(dropna=False).groupby(level=0).min()
return newmap
def contact_map(self, contactdef, scsc_min=None, def_cut=5.0):
......@@ -1106,7 +1109,7 @@ class ResAtmMap(ProteinMap):
Parameters
----------
def_cut :
param scsc_min: (Default value = 5.0)
(Default value = 5.0)
contactdef :
for all atom pair
scsc_min :
......@@ -1146,17 +1149,20 @@ class ResAtmMap(ProteinMap):
atm_list = set(self.index.get_level_values(1))
atms_list = set([(a, b) for a in atm_list for b in atm_list])
for pair in contactdef.keys():
treshold = contactdef[pair]
if pair == "default_cutoff":
# Since there is more than one treshold, we do not take
# default cutoff into account
continue
if pair == "all":
tmp = self.applymap(lambda x: x < treshold)
contact_map.update(tmp)
continue
pair = tuple(pair.upper().split("_"))
if contactdef[pair] > def_cutoff:
treshold = contactdef[pair]
else:
LOG.warning("Treshold for %s ignored (lower than "
"the default cutoff)", str(pair))
pair = tuple(pair.upper().split("_"))
LOG.info(
"Filtering values in matrix related to %s (%s)",
str(pair), str(treshold) if treshold else def_cutoff)
str(pair), str(treshold))
if pair in (("SC", "SC"), ("sc", "sc")):
# Use scsc_min to apply treshold updateonly for selected atom
# sidechain
......@@ -1471,8 +1477,9 @@ class AaAtmMap(AaMap):
def __init__(self, *args, **kwargs):
    """Initialise the parent map, then record the atom granularity."""
    super(AaAtmMap, self).__init__(*args, **kwargs)
    # Default to the minimal atom selection ("min") unless a caller
    # explicitly passed atom_types in kwargs.
    self.atom_types = kwargs.get("atom_types", "min")
def create_index(self, sequence, **kwargs):
def create_index(self, sequence, atom_types="min", **kwargs):
"""
Update Aa index with atoms
Returns
......@@ -1487,15 +1494,41 @@ class AaAtmMap(AaMap):
res_list = [
"%s" % aa for i, aa in enumerate(seq)
for _ in filter(self.heavy_reg.match, atomtable[aa].keys())]
atm_list = [atm for aa in seq
for atm in filter(self.heavy_reg.match,
atomtable[aa].keys())]
atm_list = [
atm for aa in seq for atm in ('CA', 'CB', 'SC')] \
if atom_types == "min" else [
atm for aa in seq for atm in filter(self.heavy_reg.match,
atomtable[aa].keys())] \
if atom_types == "heavy" else [
atm for aa in seq for atm in atomtable[aa].keys()]
idx = pd.MultiIndex.from_tuples(list(zip(*[res_list, atm_list])),
names=('AminoAcid', 'Atom'))
col = pd.MultiIndex.from_tuples(list(zip(*[res_list, atm_list])),
names=('AminoAcid', 'Atom'))
return idx, col
def copy(self, **kwargs):
    """Return a copy of this map as a fresh :class:`AaAtmMap`.

    The underlying data is duplicated at the DataFrame level
    (``super(Map, self)`` bypasses the intermediate Map subclasses),
    then wrapped in a new instance that carries over the descriptive
    attributes, including ``atom_types``.

    Parameters
    ----------
    kwargs :
        **kwargs :

    Returns
    -------

    """
    frame = super(Map, self).copy()
    return AaAtmMap(path=self.path, data=frame, desc=self.desc,
                    sym=self.sym, mtype=self.mtype, dtype=self.dtype,
                    atom_types=self.atom_types)
class SsAaAtmMap(AaAtmMap):