Commit 66604a25 authored by Bertrand  NÉRON's avatar Bertrand NÉRON

adapt parser to new data

GEM provides new data in which some fields have been added and others have disappeared.
The parser is modified to match this new data.
parent 69e0df87
......@@ -10,226 +10,256 @@ from collections import namedtuple
from couchdbkit.client import Server
from couchdbkit.exceptions import ResourceNotFound
from couchdbkit.resource import CouchdbResource
from couchdbkit.schema import Document
from couchdbkit.schema.properties import *
from restkit import Resource, BasicAuth
import restkit.errors
def replicon_parser(replicon_data):
    """
    Parse a tab-separated file containing the information about replicons.

    Lines starting with '#' and blank lines are skipped. Every other line
    must provide, in this order: the replicon name, the NCBI id, the taxid
    (an integer), the strain, the taxonomy (terms separated by '; ') and the
    replicon type.

    :param replicon_data: the path of replicon information file
    :type replicon_data: string
    :return: a dict containing Replicon_info as values and Replicon name (field 0) as key
    :rtype: dict
    :raise KeyError: if the same replicon name appears on two lines
    :raise ValueError: if a line has too few fields or a non integer taxid
    """
    replicon_db = {}
    Replicon_info = namedtuple('Replicon_info', 'name, ncbi_id, taxid, strain, taxonomy, type')
    with open(replicon_data, 'r') as replicon_file:
        for line in replicon_file:
            if line.startswith('#'):
                continue
            line = line.strip()
            if not line:
                # a blank line (typically at the end of the file) describes no replicon
                continue
            fields = line.split('\t')
            replicon_id = fields[0]
            if replicon_id in replicon_db:
                raise KeyError("duplicate replicon:" + replicon_id)
            try:
                ncbi_id = fields[1]
                taxid = int(fields[2])
                strain = fields[3]
                taxonomy = fields[4].split('; ')
                replicon_type = fields[5]
            except (IndexError, ValueError) as err:
                # report the offending line along with the low level error
                raise ValueError("Error during parsing line : {0} : {1}".format(line, err))
            # remove ending dot or semi-colon from the last term of taxonomy
            if taxonomy[-1].endswith(('.', ';')):
                taxonomy[-1] = taxonomy[-1][:-1]
            replicon_db[replicon_id] = Replicon_info(replicon_id, ncbi_id, taxid, strain,
                                                     taxonomy, replicon_type)
    return replicon_db
def _optional_field(raw, convert=None):
    """Return None for the '-' placeholder, otherwise raw passed through convert (if given)."""
    if raw == '-':
        return None
    if convert is None:
        return raw
    return convert(raw)


def _to_float(raw):
    """Convert raw to float, accepting ',' as decimal separator (used in old data)."""
    return float(raw.replace(',', '.'))


def system_parser(system_data):
    """
    Parse a tab-separated file describing the genes of secretion systems.

    Lines starting with '#' and blank lines are skipped. Every other line
    describes one gene and must provide 18 fields, in this order: gene code,
    gene id, protein length, strand, begin, end, match, score, i-evalue,
    coverage, match begin, match end, replicon id, predicted system,
    system id, system status, gene name and description.
    A '-' in fields 3 to 11 and 13 to 15 is stored as None; an empty gene
    name or description is stored as None too.

    :param system_data: the path of secretion system information file
    :type system_data: string
    :return: a mapping with system id as keys and System_info as values,
             each System_info holding its genes in a dict indexed by gene code
    :rtype: dict
    :raise KeyError: if a gene code appears twice in the same system
    :raise ValueError: if a line has too few fields or a field cannot be converted
    """
    system_db = {}
    System_info = namedtuple('System_info', 'code, predicted_system, system_status, replicon, genes')
    # namedtuple field names must be valid identifiers: 'i_evalue', not 'i-evalue'
    Gene = namedtuple('Gene',
                      'code, id, protein_length, strand, begin, end, match, score, i_evalue, '
                      'coverage, match_begin, match_end, name, description')
    expected_fields = 18
    with open(system_data, 'r') as system_file:
        for line_nb, line in enumerate(system_file, 1):
            if line.startswith('#'):
                continue
            # do not strip tabs: a trailing tab means that the last field(s)
            # (gene name, description) are present but empty
            line = line.strip(' \r\n')
            if not line:
                continue
            fields = line.split('\t')
            if len(fields) < expected_fields:
                raise ValueError("Error during parsing line {0}: {1} fields expected, {2} found : {3}".format(
                    line_nb, expected_fields, len(fields), line))
            gene_code = fields[0]
            # NOTE(review): system_db is indexed by system id, so this compares a gene
            # code against system ids; kept as is, confirm what it is meant to detect
            if gene_code in system_db:
                raise KeyError("duplicate replicon:" + gene_code)
            # NOTE(review): a '-' strand is stored as None, like a missing value;
            # confirm that the data never use '-' for the reverse strand
            gene = Gene(gene_code,
                        fields[1],                            # id
                        int(fields[2]),                       # protein_length
                        _optional_field(fields[3]),           # strand
                        _optional_field(fields[4], int),      # begin
                        _optional_field(fields[5], int),      # end
                        _optional_field(fields[6]),           # match
                        _optional_field(fields[7], _to_float),  # score
                        _optional_field(fields[8], _to_float),  # i_evalue
                        _optional_field(fields[9], _to_float),  # coverage
                        _optional_field(fields[10], int),     # match_begin
                        _optional_field(fields[11], int),     # match_end
                        fields[16] if fields[16] else None,   # name
                        fields[17] if fields[17] else None,   # description
                        )
            replicon_id = fields[12]
            predicted_system = _optional_field(fields[13])
            # NOTE(review): genes whose system id is '-' all end up under the None key
            system_id = _optional_field(fields[14])
            system_status = _optional_field(fields[15])

            if system_id not in system_db:
                # create a new System_info entry
                system_db[system_id] = System_info(system_id, predicted_system, system_status,
                                                   replicon_id, genes={gene.code: gene})
            elif gene.code in system_db[system_id].genes:
                raise KeyError("duplicate gene: replicon= {0}; gene= {1};".format(replicon_id, gene.code))
            else:
                # append this gene to System_info genes
                system_db[system_id].genes[gene.code] = gene
    return system_db
class SecretionSystem(Document):
    """
    a representation of a secretion System to be use with couchdb
    """
    # code of the system; the only mandatory property
    code = StringProperty(required=True)
    # filled by fill_db from System_info.predicted_system
    predicted_system = StringProperty()
    # filled by fill_db with the name, ncbi_id, taxid, strain, taxonomy and type of the replicon
    replicon = DictProperty()
    # filled by fill_db with one dict per gene, holding only the fields which are not None
    genes = ListProperty()
def fill_db(server_uri, db_name, user, passwd, replicon_db, system_db, force_update=False):
    """
    Save one SecretionSystem document per entry of system_db in a couchdb
    data base. The data base is created if it does not exist yet. Each
    document uses the system code as _id and embeds the information about
    its replicon and the list of its genes.

    :param server_uri: the url of the couchdb server (with port)
    :type server_uri: string
    :param db_name: the name of the db in the couchdb server
    :type db_name: string
    :param user: a login representing a user who is granted to modify the DB
    :type user: string
    :param passwd: the password that allow to authenticate the user
    :type passwd: string
    :param replicon_db: the set of replicons info as return by replicon_parser
    :type replicon_db: dict
    :param system_db: the set of secretion systems info as return by system_parser
    :type system_db: dict
    :param force_update: if true force the entry to be updated even if the _rev number is not provided
    :type force_update: boolean
    :raise KeyError: if a system refers to a replicon which is not in replicon_db
    """
    auth = BasicAuth(user, passwd)
    resource = CouchdbResource(server_uri, filters=[auth])
    server = Server(resource_instance=resource)
    secreton_db = server.get_or_create_db(db_name)

    # systems, then genes, are handled in sorted order of their codes
    for syst_code in sorted(system_db):
        system = system_db[syst_code]
        replicon = replicon_db[system.replicon]

        secretion_system = SecretionSystem()
        secretion_system._id = system.code
        secretion_system.code = system.code
        secretion_system.predicted_system = system.predicted_system
        secretion_system.replicon = {'name': replicon.name,
                                     'ncbi_id': replicon.ncbi_id,
                                     'taxid': replicon.taxid,
                                     'strain': replicon.strain,
                                     'taxonomy': replicon.taxonomy,
                                     'type': replicon.type
                                     }
        genes = []
        for gene_code in sorted(system.genes):
            gene = system.genes[gene_code]
            # fields without value (None) are not stored in the document
            g = {}
            for field in gene._fields:
                value = getattr(gene, field)
                if value is not None:
                    g[field] = value
            genes.append(g)
        secretion_system.genes = genes
        secreton_db.save_doc(secretion_system, force_update=force_update)
if __name__ == '__main__':
    from optparse import OptionParser, OptionGroup
    import sys
    import getpass

    def get_credentials():
        """
        Ask interactively for the login and the password (the password is not echoed).

        :return: the login and the password
        :rtype: tuple of 2 strings
        """
        user = raw_input('login: ')
        password = getpass.getpass('password: ')
        return user, password

    usage = """
%prog [options]
parse a file containing replicon informations and a file containing system informations
and fill a couchDB data base with these informations
"""
    parser = OptionParser(usage=usage)

    server_opt = OptionGroup(parser, "Server Options")
    server_opt.add_option("-S", "--server",
                          action="store",
                          type="string",
                          dest="server_url",
                          help="the url of the couchDB server (with the port)")
    server_opt.add_option("-d", "--database",
                          action="store",
                          type="string",
                          dest="db_name",
                          help="the name of the data base")
    parser.add_option_group(server_opt)

    parsing_opt = OptionGroup(parser, "Parsing Options")
    parsing_opt.add_option("-r", "--replicon",
                           action="store",
                           type="string",
                           dest="replicon_path",
                           help="the path to the replicon file to parse")
    parsing_opt.add_option("-s", "--system",
                           action="store",
                           type="string",
                           dest="system_path",
                           help="the path to the system secretion file to parse")
    parsing_opt.add_option("-f", "--force_update",
                           action="store_true",
                           dest="force_update",
                           default=False,
                           help="force the entries to be updated even if the _rev number is not provided")
    parser.add_option_group(parsing_opt)

    options, args = parser.parse_args()

    # all these options are mandatory: the first missing one stops the script
    # with its message followed by the help, and an exit status of 1
    required_options = (
        ('server_url', "You must specify a server url"),
        ('db_name', "You must specify a data base name"),
        ('replicon_path', "You must specify the path to the replicon information file"),
        ('system_path', "You must specify the path to the secretion system information file"),
    )
    for dest, message in required_options:
        if not getattr(options, dest):
            sys.stderr.write(message + "\n")
            parser.print_help(sys.stderr)
            sys.exit(1)

    replicon_db = replicon_parser(options.replicon_path)
    system_db = system_parser(options.system_path)
try_again = 0
while True:
user, password = get_credentials()
try:
fill_db(options.server_url, options.db_name, user, password, replicon_db, system_db, force_update = options.force_update)
fill_db(options.server_url, options.db_name, user, password,
replicon_db, system_db, force_update=options.force_update)
break
except restkit.errors.Unauthorized, err:
print >> sys.stderr, "Bad authentication, try again"
......@@ -239,4 +269,3 @@ if __name__ == '__main__':
except Exception, err:
print >> sys.stderr, err
sys.exit(2)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment