diff --git a/src/parser/parser.py b/src/parser/parser.py index ba5436ec0c5e29f77571a328c87333dd0ac19390..fce83032190d92810f3e38014d46d29f16fb290c 100755 --- a/src/parser/parser.py +++ b/src/parser/parser.py @@ -5,7 +5,7 @@ Created on 27 dec. 2011 @author: Bertrand Néron """ - +from __future__ import print_function from collections import namedtuple from couchdbkit.client import Server from couchdbkit.exceptions import ResourceNotFound @@ -14,7 +14,7 @@ from restkit import Resource, BasicAuth import restkit.errors -def replicon_parser( replicon_data ): +def replicon_parser(replicon_data): """ parse a file containing the informations about replicons @param replicon_data: the path of replicon information file @@ -23,27 +23,31 @@ def replicon_parser( replicon_data ): @rtype: dict """ replicon_db = {} - Replicon_info = namedtuple('Replicon_info', 'name, taxid, strain, taxonomy, type') - with open( replicon_data , 'r' ) as replicon_file : + Replicon_info = namedtuple('Replicon_info', ('name', 'taxid', 'strain', 'taxonomy', 'type')) + with open(replicon_data, 'r') as replicon_file: for line in replicon_file: if line[0] != '#': line = line.strip() - fields = line.split( '\t') + fields = line.split('\t') if fields[0] in replicon_db: - raise KeyError( "duplicate replicon:" + fields[0]) + raise KeyError("duplicate replicon:" + fields[0]) else: try: - replicon_db[ fields[0] ] = Replicon_info( fields[0] , int(fields[1]) , fields[2] , fields[3].split('; ') , fields[4]) - except Exception, err: - raise Exception( "Error during parsing line :"+line ) - #remove ending dot or semi-colon from the last term of taxonnomy - if( replicon_db[ fields[0] ].taxonomy[-1].endswith('.') or replicon_db[ fields[0] ].taxonomy[-1].endswith(';')): - replicon_db[ fields[0] ].taxonomy[-1] = replicon_db[ fields[0] ].taxonomy[-1][:-1] + replicon_db[fields[0]] = Replicon_info(fields[0], + int(fields[1]), + fields[2], + fields[3].split('; '), + fields[4]) + except Exception as err: + raise Exception("Error during parsing line: {0}\n{1}".format(line, err)) + # remove ending dot or semi-colon from the last term of taxonomy + tax_last_char = replicon_db[fields[0]].taxonomy[-1] + if tax_last_char.endswith('.') or tax_last_char.endswith(';'): + replicon_db[fields[0]].taxonomy[-1] = replicon_db[fields[0]].taxonomy[-1][:-1] return replicon_db - -def system_parser( system_data ): +def system_parser(system_data): """ @param system_data: the path of secretion system information file @type system_data: string @@ -51,44 +55,46 @@ def system_parser( system_data ): @rtype: dict """ system_db = {} - System_info = namedtuple('System_info', 'code, T3SS_family, replicon, genes' ) - Gene = namedtuple( 'Gene', 'code, id, protein_length, strand, begin, end, match, full_score, e_value, best_domain_score, best_domain_evalue, c_value, coverage_profile, match_begin, match_end, name, description') + System_info = namedtuple('System_info', 'code, T3SS_family, replicon, genes') + Gene = namedtuple('Gene', ('code', 'id', 'protein_length', 'strand', 'begin', 'end', 'match', 'full_score', + 'e_value', 'best_domain_score', 'best_domain_evalue', 'c_value', 'coverage_profile', + 'match_begin', 'match_end', 'name', 'description')) - with open( system_data , 'r' ) as system_file : + with open(system_data, 'r') as system_file: for line in system_file: if line[0] != '#': line = line.strip() - fields = line.split( '\t') + fields = line.split('\t') if fields[0] in system_db: - raise KeyError( "duplicate replicon:" + fields[0]) + raise KeyError("duplicate replicon:" + fields[0]) else: - gene = Gene( fields[0],# code - fields[1],# id - int(fields[2]),# protein_length - fields[3] if fields[3] != '-' else None,# strand - int(fields[4]) if fields[4] != '-' else None ,# begin - int(fields[5]) if fields[5] != '-' else None,# end - fields[6] if fields[6] != '-' else None,# match - float(fields[7].replace( ',' , '.')) if fields[7] != '-' else None , # full_score - float(fields[8].replace( ',' , '.')) if fields[8] != '-' else None , # e_value - float(fields[9].replace( ',' , '.')) if fields[9] != '-' else None , # best_domain_score - float(fields[10].replace( ',' , '.')) if fields[10] != '-' else None ,# best_domain_evalue - float(fields[11].replace( ',' , '.')) if fields[11] != '-' else None ,# c_value - float(fields[12].replace( ',' , '.')) if fields[12] != '-' else None ,# coverage_profile - int(fields[13]) if fields[13] != '-' else None ,# match_begin - int(fields[14]) if fields[14] != '-' else None ,# match_end - fields[18] if fields[18] else None, # name - fields[19] if fields[19] else None, # description - ) + gene = Gene(fields[0], # code + fields[1], # id + int(fields[2]), # protein_length + fields[3] if fields[3] != '-' else None, # strand + int(fields[4]) if fields[4] != '-' else None, # begin + int(fields[5]) if fields[5] != '-' else None, # end + fields[6] if fields[6] != '-' else None, # match + float(fields[7].replace(',', '.')) if fields[7] != '-' else None, # full_score + float(fields[8].replace(',', '.')) if fields[8] != '-' else None, # e_value + float(fields[9].replace(',', '.')) if fields[9] != '-' else None, # best_domain_score + float(fields[10].replace(',', '.')) if fields[10] != '-' else None, # best_domain_evalue + float(fields[11].replace(',', '.')) if fields[11] != '-' else None, # c_value + float(fields[12].replace(',', '.')) if fields[12] != '-' else None, # coverage_profile + int(fields[13]) if fields[13] != '-' else None, # match_begin + int(fields[14]) if fields[14] != '-' else None, # match_end + fields[18] if fields[18] else None, # name + fields[19] if fields[19] else None, # description + ) if fields[16] in system_db: - if gene.code in system_db[ fields[16] ]: - raise KeyError( "duplicate gene:" + fields[16]) + if gene.code in system_db[fields[16]]: + raise KeyError("duplicate gene:" + fields[16]) else: - #append this gene to System_info genes - system_db[ fields[16] ].genes[ gene.code ] = gene + # append this gene to System_info genes + system_db[fields[16]].genes[gene.code] = gene else: - #create a new Sysem_info entry - system_db[ fields[16] ] = System_info( fields[16] , fields[17] , fields[15] , genes = { gene.code : gene } ) + # create a new System_info entry + system_db[fields[16]] = System_info(fields[16], fields[17], fields[15], genes={gene.code: gene}) return system_db @@ -96,17 +102,17 @@ from couchdbkit.schema import Document from couchdbkit.schema.properties import * -class SecretionSystem( Document ): +class SecretionSystem(Document): """ a representation of a secretion System to be use with couchdb """ - code = StringProperty( required=True ) + code = StringProperty(required=True) T3SS_family = StringProperty() - replicon = DictProperty() - genes = ListProperty() + replicon = DictProperty() + genes = ListProperty() -def fill_db( server_uri, db_name, user, passwd, replicon_db , system_db , force_update = False): +def fill_db(server_uri, db_name, user, passwd, replicon_db, system_db, force_update=False): """ @param server_uri: the url of the couchdb server (with port) @type server_uri: string @@ -119,124 +125,117 @@ def fill_db( server_uri, db_name, user, passwd, replicon_db , system_db , force_ @param force_update: if true force the entry to be updated even if the _rev number is not provided @type force_update: boolean """ - auth = BasicAuth(user , passwd ) - resource = CouchdbResource( server_uri , filters=[auth]) - server = Server( resource_instance= resource ) - secreton_db = server.get_or_create_db( db_name ) + auth = BasicAuth(user, passwd ) + resource = CouchdbResource(server_uri, filters=[auth]) + server = Server(resource_instance=resource) + secreton_db = server.get_or_create_db(db_name) system_codes = system_db.keys() system_codes.sort() for syst_code in system_codes: system = system_db[syst_code] - replicon = replicon_db[ system.replicon ] + replicon = replicon_db[system.replicon] secretion_system = SecretionSystem() - secretion_system._id = system.code + secretion_system._id = system.code secretion_system.code = system.code secretion_system.T3SS_family = system.T3SS_family - secretion_system.replicon = { 'name' : replicon.name , - 'taxid' : replicon.taxid, - 'strain' : replicon.strain, - 'taxonomy' : replicon.taxonomy, - 'type' : replicon.type + secretion_system.replicon = {'name': replicon.name, + 'taxid': replicon.taxid, + 'strain': replicon.strain, + 'taxonomy': replicon.taxonomy, + 'type': replicon.type } genes_code = system.genes.keys() genes_code.sort() genes = [] for gene_code in genes_code: - gene = system.genes[ gene_code ] + gene = system.genes[gene_code] g = {} - for field in gene._fields: - if getattr(gene , field ) is not None: - g[ field ]= getattr( gene , field ) - genes.append( g ) + for field in gene._fields: + if getattr(gene, field) is not None: + g[field] = getattr(gene, field) + genes.append(g) secretion_system.genes = genes - secreton_db.save_doc( secretion_system , force_update= force_update ) + secreton_db.save_doc(secretion_system, force_update=force_update) if __name__ == '__main__': - from optparse import OptionParser , OptionGroup + import argparse import sys import getpass def get_credentials(): - user = raw_input('login: ') - password = getpass.getpass('password: ') - return user, password + user = raw_input('login: ') + password = getpass.getpass('password: ') + return user, password - usage=""" + usage = """ %prog [options] parse a file containing replicon informations and a file containing system informations and fill a couchDB data base with these informations """ - parser = OptionParser( usage= usage ) - server_opt = OptionGroup(parser, "Server Options") - server_opt.add_option( "-S" , "--server" , - action = "store", - type= "string" , - dest = "server_url" , - help = "the url of the couchDB server (with the port)") - server_opt.add_option( "-d" , "--database" , - action = "store", - type= "string" , - dest = "db_name" , - help = "the name of the data base") - parser.add_option_group( server_opt ) - - parsing_opt = OptionGroup(parser, "Parsing Options") - parsing_opt.add_option( "-r" , "--replicon" , - action = "store", - type= "string" , - dest = "replicon_path" , - help = "the path to the replicon file to parse") - parsing_opt.add_option( "-s" , "--system" , - action = "store", - type= "string" , - dest = "system_path" , - help = "the path to the system secretion file to parse") - parsing_opt.add_option( "-f" , "--force_update" , - action= "store_true", - dest = "force_update" , - default = False, - help = "") - parser.add_option_group( parsing_opt ) - + parser = argparse.ArgumentParser(usage=usage) + server_opt = parser.add_argument_group(title="Server Options") + server_opt.add_argument("-S", "--server", + action="store", + type="string", + dest="server_url", + help="the url of the couchDB server (with the port)") + server_opt.add_argument("-d", "--database", + action="store", + type="string", + dest="db_name", + help="the name of the data base") + parsing_opt = parser.add_argument_group(title="Parsing Options") + parsing_opt.add_argument("-r", "--replicon", + action="store", + type="string", + dest="replicon_path", + help="the path to the replicon file to parse") + parsing_opt.add_argument("-s", "--system", + action="store", + type="string", + dest="system_path", + help="the path to the system secretion file to parse") + parsing_opt.add_argument("-f", "--force_update", + action="store_true", + dest="force_update", + default=False, + help="") options, args = parser.parse_args() if not options.server_url: - print >> sys.stderr , "You must specify a server url" + print("You must specify a server url", file=sys.stderr) parser.print_help(sys.stderr) sys.exit(1) if not options.db_name: - print >> sys.stderr , "You must specify a data base name" + print("You must specify a data base name", file=sys.stderr) parser.print_help(sys.stderr) sys.exit(1) if not options.replicon_path: - print >> sys.stderr , "You must specify the path to the replicon information file" + print("You must specify the path to the replicon information file", file=sys.stderr) parser.print_help(sys.stderr) sys.exit(1) if not options.system_path: - print >> sys.stderr , "You must specify the path to the secretion system information file" + print("You must specify the path to the secretion system information file", file=sys.stderr) parser.print_help(sys.stderr) sys.exit(1) - - replicon_db = replicon_parser( options.replicon_path ) - system_db = system_parser( options.system_path ) + replicon_db = replicon_parser(options.replicon_path) + system_db = system_parser(options.system_path) try_again = 0 while True: user, password = get_credentials() try: - fill_db(options.server_url, options.db_name, user, password, replicon_db, system_db, force_update = options.force_update) + fill_db(options.server_url, options.db_name, user, password, replicon_db, system_db, + force_update=options.force_update) break - except restkit.errors.Unauthorized, err: - print >> sys.stderr, "Bad authentication, try again" + except restkit.errors.Unauthorized as err: + print("Bad authentication, try again", file=sys.stderr) try_again += 1 if try_again > 2: sys.exit("Authentication failure") - except Exception, err: - print >> sys.stderr, err - sys.exit(2) - \ No newline at end of file +