Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
capsuledb
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
gem
capsuledb
Commits
9ac9ba7f
Commit
9ac9ba7f
authored
9 years ago
by
Bertrand NÉRON
Browse files
Options
Downloads
Patches
Plain Diff
fix pep8 and python2.7 syntax
parent
2319b154
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/parser/parser.py
+115
-116
115 additions, 116 deletions
src/parser/parser.py
with
115 additions
and
116 deletions
src/parser/parser.py
+
115
−
116
View file @
9ac9ba7f
...
...
@@ -5,7 +5,7 @@ Created on 27 dec. 2011
@author: Bertrand Néron
"""
from
__future__
import
print_function
from
collections
import
namedtuple
from
couchdbkit.client
import
Server
from
couchdbkit.exceptions
import
ResourceNotFound
...
...
@@ -14,7 +14,7 @@ from restkit import Resource, BasicAuth
import
restkit.errors
def replicon_parser(replicon_data):
    """
    Parse a tab-separated file containing the information about replicons.

    Lines starting with '#' are comments and blank lines are ignored.
    Every other line must provide at least five tab-separated columns, in
    this order: name, taxid, strain, taxonomy and type. The taxonomy column
    is split on ';' and one trailing '.' or ';' is removed from its last term.

    @param replicon_data: the path of replicon information file
    @type replicon_data: string
    @return: the replicons described in the file, indexed by replicon name
    @rtype: dict
    @raise KeyError: if the same replicon name appears on two lines
    @raise Exception: if a line cannot be converted into a Replicon_info
                      (missing column, non integer taxid, ...)
    """
    replicon_db = {}
    Replicon_info = namedtuple('Replicon_info',
                               ('name', 'taxid', 'strain', 'taxonomy', 'type'))
    with open(replicon_data, 'r') as replicon_file:
        for line in replicon_file:
            if line.startswith('#'):
                continue
            line = line.strip()
            if not line:
                # a blank line carries no record; without this guard it would
                # reach int(fields[1]) and abort the whole parsing
                continue
            fields = line.split('\t')
            name = fields[0]
            if name in replicon_db:
                raise KeyError("duplicate replicon: " + name)
            try:
                taxonomy = fields[3].split(';')
                # remove ending dot or semi-colon from the last term of taxonomy
                if taxonomy[-1].endswith(('.', ';')):
                    taxonomy[-1] = taxonomy[-1][:-1]
                replicon_db[name] = Replicon_info(name,
                                                  int(fields[1]),
                                                  fields[2],
                                                  taxonomy,
                                                  fields[4])
            except Exception as err:
                raise Exception("Error during parsing line: {0}\n{1}".format(line, err))
    return replicon_db
def system_parser(system_data):
    """
    Parse a tab-separated file containing the information about secretion
    systems and their genes (one gene per line).

    Lines starting with '#' are comments and blank lines are ignored.
    Every other line must provide 20 tab-separated columns. Columns 0 to 14
    and 18, 19 describe the gene (see the Gene fields below), column 15 is
    the replicon, column 16 the system code and column 17 the T3SS family.
    In columns 3 to 14 the value '-' means "no value" and is stored as None;
    empty name or description are stored as None too. Decimal numbers may
    use ',' as decimal separator.

    @param system_data: the path of secretion system information file
    @type system_data: string
    @return: the systems described in the file, indexed by system code.
             The 'genes' field of each System_info is a dict of Gene
             indexed by gene code.
    @rtype: dict
    @raise KeyError: if a gene code appears twice in the same system
    @raise IndexError: if a line has less than 20 columns
    @raise ValueError: if a numeric column cannot be converted
    """
    System_info = namedtuple('System_info', 'code, T3SS_family, replicon, genes')
    Gene = namedtuple('Gene', ('code', 'id', 'protein_length', 'strand', 'begin', 'end',
                               'match', 'full_score', 'e_value', 'best_domain_score',
                               'best_domain_evalue', 'c_value', 'coverage_profile',
                               'match_begin', 'match_end', 'name', 'description'))

    def text_or_none(value):
        # '-' is the "no value" marker of the file
        return None if value == '-' else value

    def int_or_none(value):
        return None if value == '-' else int(value)

    def float_or_none(value):
        # the file may use a comma as decimal separator
        return None if value == '-' else float(value.replace(',', '.'))

    system_db = {}
    with open(system_data, 'r') as system_file:
        for line in system_file:
            if line.startswith('#'):
                continue
            # remove only the line terminator: stripping all the whitespaces
            # would also remove the trailing tabs of empty last columns
            # (name, description) and shift the number of fields
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            fields = line.split('\t')
            # NOTE(review): this compares a gene code with the system codes and
            # reports a "replicon"; it looks like a leftover copied from
            # replicon_parser. Kept unchanged until the intent is confirmed.
            if fields[0] in system_db:
                raise KeyError("duplicate replicon: " + fields[0])
            gene = Gene(fields[0],                  # code
                        fields[1],                  # id
                        int(fields[2]),             # protein_length
                        text_or_none(fields[3]),    # strand
                        int_or_none(fields[4]),     # begin
                        int_or_none(fields[5]),     # end
                        text_or_none(fields[6]),    # match
                        float_or_none(fields[7]),   # full_score
                        float_or_none(fields[8]),   # e_value
                        float_or_none(fields[9]),   # best_domain_score
                        float_or_none(fields[10]),  # best_domain_evalue
                        float_or_none(fields[11]),  # c_value
                        float_or_none(fields[12]),  # coverage_profile
                        int_or_none(fields[13]),    # match_begin
                        int_or_none(fields[14]),    # match_end
                        fields[18] or None,         # name
                        fields[19] or None,         # description
                        )
            replicon, system_code, family = fields[15], fields[16], fields[17]
            system = system_db.get(system_code)
            if system is None:
                # create a new System_info entry
                system_db[system_code] = System_info(system_code, family, replicon,
                                                     genes={gene.code: gene})
            elif gene.code in system.genes:
                # the membership test must be done on the genes dict, not on
                # the System_info tuple itself
                raise KeyError("duplicate gene: {0} in system {1}".format(gene.code,
                                                                          system_code))
            else:
                # append this gene to System_info genes
                system.genes[gene.code] = gene
    return system_db
...
...
@@ -96,17 +102,17 @@ from couchdbkit.schema import Document
from
couchdbkit.schema.properties
import
*
class SecretionSystem(Document):
    """
    couchdbkit document used to store one secretion system in couchdb
    """
    # the code of the system, the only property which is mandatory
    code = StringProperty(required=True)
    # the T3SS family of the system
    T3SS_family = StringProperty()
    # the description of the replicon, stored as a dict
    replicon = DictProperty()
    # the genes of the system, stored as a list
    genes = ListProperty()
def fill_db(server_uri, db_name, user, passwd, replicon_db, system_db, force_update=False):
    """
    Save one SecretionSystem document per system of system_db in a couchdb
    data base. The data base is created if it does not exist yet. The
    systems, and the genes inside each system, are handled in sorted code
    order. The gene fields whose value is None are not stored.

    @param server_uri: the url of the couchdb server (with port)
    @type server_uri: string
    @param db_name: the name of the data base
    @type db_name: string
    @param user: the login used for the basic authentication
    @type user: string
    @param passwd: the password used for the basic authentication
    @type passwd: string
    @param replicon_db: the replicons, indexed by replicon name. Each value
                        must provide name, taxid, strain, taxonomy and type
                        attributes.
    @type replicon_db: dict
    @param system_db: the systems, indexed by system code. Each value must
                      provide code, T3SS_family, replicon and genes
                      attributes, genes being a dict of namedtuple.
    @type system_db: dict
    @param force_update: if true force the entry to be updated even if the _rev number is not provided
    @type force_update: boolean
    @raise KeyError: if the replicon of a system is not in replicon_db
    """
    auth = BasicAuth(user, passwd)
    resource = CouchdbResource(server_uri, filters=[auth])
    server = Server(resource_instance=resource)
    secreton_db = server.get_or_create_db(db_name)

    # sorted() works on python2 and python3, dict.keys().sort() only on python2
    for syst_code in sorted(system_db):
        system = system_db[syst_code]
        replicon = replicon_db[system.replicon]
        secretion_system = SecretionSystem()
        secretion_system._id = system.code
        secretion_system.code = system.code
        secretion_system.T3SS_family = system.T3SS_family
        secretion_system.replicon = {'name': replicon.name,
                                     'taxid': replicon.taxid,
                                     'strain': replicon.strain,
                                     'taxonomy': replicon.taxonomy,
                                     'type': replicon.type
                                     }
        genes = []
        for gene_code in sorted(system.genes):
            gene = system.genes[gene_code]
            # keep only the fields which have a value
            genes.append(dict((field, value)
                              for field, value in zip(gene._fields, gene)
                              if value is not None))
        secretion_system.genes = genes
        secreton_db.save_doc(secretion_system, force_update=force_update)
if __name__ == '__main__':
    # Command line entry point: parse the replicon and the system files then
    # fill the couchDB data base, asking the credentials interactively
    # (3 attempts at most).
    # The print(..., file=...) calls rely on the
    # "from __future__ import print_function" of the top of the file.
    import argparse
    import sys
    import getpass

    try:
        read_input = raw_input  # python2
    except NameError:
        read_input = input  # python3

    def get_credentials():
        """
        ask the login and the password on the terminal

        @return: the login and the password
        @rtype: tuple (string, string)
        """
        user = read_input('login: ')
        password = getpass.getpass('password: ')
        return user, password

    # argparse expands %(prog)s, the optparse %prog syntax makes it fail
    usage = """
%(prog)s [options]
parse a file containing replicon informations and a file containing system informations
and fill a couchDB data base with these informations
"""
    parser = argparse.ArgumentParser(usage=usage)

    # no type= keyword: argparse expects a callable (the optparse "string"
    # name is rejected) and the values are strings by default
    server_opt = parser.add_argument_group(title="Server Options")
    server_opt.add_argument("-S", "--server",
                            action="store",
                            dest="server_url",
                            help="the url of the couchDB server (with the port)")
    server_opt.add_argument("-d", "--database",
                            action="store",
                            dest="db_name",
                            help="the name of the data base")

    parsing_opt = parser.add_argument_group(title="Parsing Options")
    parsing_opt.add_argument("-r", "--replicon",
                             action="store",
                             dest="replicon_path",
                             help="the path to the replicon file to parse")
    parsing_opt.add_argument("-s", "--system",
                             action="store",
                             dest="system_path",
                             help="the path to the system secretion file to parse")
    parsing_opt.add_argument("-f", "--force_update",
                             action="store_true",
                             dest="force_update",
                             default=False,
                             help="")

    # argparse returns a single Namespace, not an (options, args) pair
    options = parser.parse_args()

    # the options are checked by hand, in this order, to keep the messages
    # and the exit status 1
    mandatory_options = (
        ('server_url', "You must specify a server url"),
        ('db_name', "You must specify a data base name"),
        ('replicon_path', "You must specify the path to the replicon information file"),
        ('system_path', "You must specify the path to the secretion system information file"),
    )
    for opt_name, message in mandatory_options:
        if not getattr(options, opt_name):
            print(message, file=sys.stderr)
            parser.print_help(sys.stderr)
            sys.exit(1)

    replicon_db = replicon_parser(options.replicon_path)
    system_db = system_parser(options.system_path)

    try_again = 0
    while True:
        user, password = get_credentials()
        try:
            fill_db(options.server_url, options.db_name, user, password,
                    replicon_db, system_db,
                    force_update=options.force_update)
            break
        except restkit.errors.Unauthorized:
            print("Bad authentication, try again", file=sys.stderr)
            try_again += 1
            if try_again > 2:
                sys.exit("Authentication failure")
        except Exception as err:
            print(err, file=sys.stderr)
            sys.exit(2)
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment