Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Hervé MENAGER
bioweb
Commits
d62f53c6
Commit
d62f53c6
authored
Sep 06, 2016
by
Eric DEVEAUD
Browse files
port to BIODOCS.yaml
parent
060e575d
Changes
3
Show whitespace changes
Inline
Side-by-side
gensoft/Makefile
0 → 100644
View file @
d62f53c6
# Build, install and remove the biodocs2mongo command-line tool.
#
# PREFIX is the only location that needs overriding; every other path,
# including the interpreter written into the shebang, derives from it.
NAM=biodocs2mongo
PREFIX=/local/gensoft2/adm
BIN=$(PREFIX)/bin
PYMODULEDIR=$(PREFIX)/share/gensoft/pymodules
DAT=$(PREFIX)/share/gensoft/$(NAM)

# None of these targets names a file it creates; declaring them phony keeps
# make from skipping them if a file called "build", "install", ... exists.
.PHONY: build install clean uninstall

# Rewrite the shebang of the python sources in place, then generate the
# installable script with the data and module directories baked in.
build:
	sed -i -e 's,^\#!.*python,\#! $(BIN)/python,' *.py
	sed -e 's|^DAT=.*|DAT="$(DAT)"|' \
	    -e 's|^PYMODULEDIR=.*|PYMODULEDIR="$(PYMODULEDIR)"|' $(NAM).py > $(NAM)

# Install the generated script and its configuration, then drop build leftovers.
install: build
	test -d $(DAT) || mkdir -p $(DAT)
	install -m 0775 $(NAM) $(BIN)
	install -m 0664 config.cfg $(DAT)
	$(MAKE) clean

# Remove the generated script and compiled python files.
clean:
	rm -f $(NAM)
	rm -f *.pyc

# Remove the installed script and its data directory.
uninstall:
	rm -f $(BIN)/$(NAM) $(DAT)/*
	rm -rf $(DAT)
gensoft/biodocs2mongo.py
View file @
d62f53c6
#!/usr/bin/env python2.7
#! /local/gensoft2/adm/bin/python
from
__future__
import
print_function
import
argparse
import
arrow
# date manipulation made easy.
import
collection
s
import
ConfigParser
import
o
s
import
pymongo
import
re
import
os
import
string
import
ssl
import
string
import
sys
import
BiodocParser
import
mobyledefs
# tweak modulecmd in order to use 3.3.a updated (not yet released)
os
.
environ
[
'PY_MODULECMD'
]
=
'/local/gensoft2/adm/Modules/3.3.a/bin/modulecmd'
import
module
as
M
#from biowebmongo import *
#from pymongo.errors import BulkWriteError
#--------------------------------------------------
# SOME GLOBALS
#--------------------------------------------------
LOGFH
=
sys
.
stdout
ERRFH
=
sys
.
stderr
FATAL
=
1
WARN
=
0
VERBOSE
=
1
MTRUE
=
True
# "true" value to insert in mongodb
MFALSE
=
False
# "false" value to insert in mongodb
from
pprint
import
pprint
ID_SEPARATOR
=
'@'
#### TODO !!!!!
#
# add config entry for use module.
# if set => check program listing from biodocs versus program listing from modules
MOBYLEURL
=
'http://mobyle.pasteur.fr'
HOST
=
'bioweb-prod.web.pasteur.fr'
PORT
=
27017
JOURNALING
=
True
# implies w=1
WRITECONCERN
=
1
DB_DEF
=
'bioweb'
COL_DEF
=
'catalog'
#---- import homebrew modules
known_ops
=
[
'install'
,
'remove'
,
'setdefault'
,
'unsetdefault'
,
'update'
]
PYMODULEDIR
=
"/local/gensoft2/adm/share/gensoft/pymodules"
sys
.
path
.
insert
(
0
,
PYMODULEDIR
)
import
BiodocParser
mapper
=
{}
#---- some ugly global variables
#--------------------------------------------------
# INTERNAL USE
#--------------------------------------------------
DAT
=
'.'
ERRFH
=
sys
.
stderr
LOGFH
=
sys
.
stdout
FATAL
=
1
WARN
=
0
def err(exit_val, *msg):
    '''
    report a diagnostic on ERRFH and exit when it is fatal.
    exit_val: index in ('Warning', 'Error'); a true value (FATAL) also
              terminates the program with that exit status.
    msg:      message parts, converted with str() and joined with ' - '.
    returns None (only reached when exit_val is false, i.e. WARN).
    '''
    head = ['Warning', 'Error']
    # print function form: the module imports print_function from __future__,
    # which makes the former "print >>ERRFH, ..." statement a syntax error.
    print("%s: %s" % (head[exit_val], " - ".join(map(str, msg))), file=ERRFH)
    if exit_val:
        sys.exit(exit_val)
    return None


# former name of err(), still used by callers in this file
error = err
def log(*msg):
    '''
    write the message parts, space separated, on LOGFH.
    nothing is written unless VERBOSE is set.
    '''
    if not VERBOSE:
        return
    # print function form: the module imports print_function from __future__,
    # which makes the former "print >>LOGFH, ..." statement a syntax error.
    print("%s" % (' '.join(map(str, msg))), file=LOGFH)
    # push the line out immediately rather than waiting for the buffer to fill
    LOGFH.flush()
def get_DB(host, port, db_name=DB_DEF, j=JOURNALING, w=WRITECONCERN):
    '''
    connect to a mongodb server over ssl and return a database handle.
    host, port: server to connect to
    db_name:    name of the database to return (default DB_DEF)
    j:          journaling option handed to MongoClient
    w:          write concern handed to MongoClient
    returns client[db_name]; a connection failure is reported as FATAL.
    '''
    log('connect to', host, 'on port:', port)
    try:
        # honour the caller's j / w instead of the module-level defaults
        # NOTE(review): ssl.CERT_NONE disables server certificate
        # verification -- kept as in the original, confirm it is intended.
        client = pymongo.MongoClient(host, port, j=j, w=w, ssl=True,
                                     ssl_cert_reqs=ssl.CERT_NONE)
    except pymongo.errors.ConnectionFailure as exc:
        # the exception is not bound to "err": that name is the reporting helper
        error(FATAL, "mongodb %s/%s" % (host, port), exc)
    return client[db_name]
def _id_generator(*arg):
    '''
    build a document id by joining the given parts with ID_SEPARATOR.
    '''
    # '%s' formatting lets a non-string part (for instance a version that was
    # loaded as a number) take part in the id instead of breaking str.join
    return ID_SEPARATOR.join('%s' % (part,) for part in arg)


def getmodule_infos(package):
    '''
    query "module help" for a package//module and extract
        programs provided by the package//module
        htmldocs provided by the package//module
        pack dependencies
    information is more uptodate on programs than the
    one provided by BIODOCS.
    information may vary given the pack version, while
    BIODOCS only document default version
    returns a dict with keys 'progs', 'htmldocs', 'docs' and 'depends'.
    raises M.ModuleError when the modulefile cannot be queried.
    '''
    m = M.Module()

    def get_progs(info):
        '''
        get programs provided by installed module version
        info: module help output as a list of lines
        returns a list of program_names as str
        '''
        progs = []
        keep = False
        for elem in info:
            elem = elem.strip()
            if not elem:
                continue
            if keep:
                progs.append(elem)
            # everything after this header line is a command name
            if elem == "package provides following commands:":
                keep = True
        return progs

    def get_docs(info):
        '''
        get local documentation available for given package
        info: module help output as a list of lines
        returns a tuple (htmldocs, txtdocs) of list as str
        '''
        htmldocs = []
        txtdocs = []
        keep = False
        for elem in info:
            elem = elem.strip()
            if not elem:
                continue
            if elem == "local documentation available:":
                keep = True
                continue
            elif elem == "package provides following commands:":
                # the documentation section ends where the command list starts
                break
            if keep:
                if 'bioweb2' in elem:
                    htmldocs.append(elem)
                else:
                    txtdocs.append(elem)
        return htmldocs, txtdocs

    def get_dependencies(info):
        '''
        get the package/version that a given module depends or relies on.
        info: module help output as a list of lines
        only the first line holding 'WARNING' is examined; the names are
        read between '<' and '>'.
        returns a list of packages_names, without duplicates
        '''
        for elem in info:
            elem = elem.strip()
            if 'WARNING' not in elem:
                continue
            fields = elem[elem.find('<') + 1:elem.find('>')].split()
            deps = []
            for field in fields:
                # drop the 'and' / 'or' connectors independently of each
                # other: a missing 'or' must not leave 'and' in the result
                if field in ('or', 'and') or field in deps:
                    continue
                deps.append(field)
            return deps
        return []

    if testmodule:
        package = 'test/%s' % (package)
        # debug trace of the module actually queried
        print(">>>>>>>>>", package)
    try:
        infos = m.help(package)
    except M.ModuleError:
        raise M.ModuleError("no modulefile %s" % (package))
    # ignore everything from the first '*****' marker on
    garbage = infos.find('*****')
    if garbage != -1:
        infos = infos[:garbage]
    infos = infos.split('\n')
    ret = {}
    ret['progs'] = get_progs(infos)
    ret['htmldocs'], ret['docs'] = get_docs(infos)
    ret['depends'] = get_dependencies(infos)
    return ret
def format_references(ref_lst):
    '''
    format reference information as a list of dictionaries
    suitable to be embedded in mongo package document.
    each non empty reference yields {'ID', 'IDtype', 'citation'} taken
    from its 'ID', 'idType' and 'title' entries; empty ones are dropped.
    '''
    def extract_doi(item):
        '''
        placeholder without implementation, never called in this function.
        extract publication unique id (doi, pmid or pmcid) if
        available on the reference description from BIODOCS.
        return a tuple:
            citation curated from unique id
            name of unique identifier resource or None
            unique id or None
        '''

    return [{'ID': ref['ID'],
             'IDtype': ref['idType'],
             'citation': ref['title']}
            for ref in ref_lst if ref]


def check_missing_biodocs(biodocs_lst):
    '''
    filter a list of BIODOCS file paths, keeping the ones that exist.
    a warning is issued for each missing file.
    returns the list of existing paths.
    '''
    ret = []
    for biodoc in biodocs_lst:
        if not os.path.isfile(biodoc):
            err(WARN, biodoc, 'no such file')
            continue
        ret.append(biodoc)
    return ret


def version_extract(version):
    '''
    extract version from BIODOCS version name
    returns a tuple.
        version, without the '(default)' tag and surrounding blanks
        MTRUE if the name carried the '(default)' tag, else MFALSE
    '''
    default = False
    if '(default)' in version:
        # cut at the tag itself: other parentheses in the name must not
        # break the extraction, and no trailing blank may leak into ids
        version = version[:version.index('(default)')].strip()
        default = True
    default = MTRUE if default else MFALSE
    return version, default


def history_maker(history_info):
    '''
    format history in a mongo compatible way
    the 'date' entry of every non empty item is replaced in place by a
    naive datetime parsed from the 'YYYY/MM/DD' format; an unparsable date
    is a FATAL error.
    returns list of dict
          { 'date'      : datetime.datetime object
          , 'operation' : keyword from 'install', 'remove', 'setdefault', 'unsetdefault', 'update'
          , 'message'   : free text as str }
    '''
    for info in history_info:
        if not info:
            continue
        # keep the raw value: it is what must be shown when parsing fails
        raw_date = info['date']
        try:
            info['date'] = arrow.get(raw_date, 'YYYY/MM/DD').naive
        except arrow.parser.ParserError as msg:
            error(FATAL, raw_date, msg)
    return history_info


def merge_htmldocs(biodocs_lst, module_lst):
    '''
    merge BIODOCS documentation with the one provided by module
    WARNING: currently discard BIODOCS info
    '''
    return module_lst


def getMapping(fileName):
    '''
    read a two column (old new) mapping file.
    lines starting with '#' and blank lines are ignored.
    returns a dict {old: new}.
    '''
    mapper = {}
    with open(fileName, 'r') as fh:
        for line in fh:
            line = line.strip()
            # blank lines carry no mapping and would break the unpacking below
            if not line or line.startswith('#'):
                continue
            old, new = line.split()
            mapper[old] = new
    return mapper


def updateCategories(lst, mapper=None):
    '''
    translate category names through mapper.
    names found in mapper are replaced, names found in EDAM_READY are kept
    as is, any other name is dropped with a warning.
    mapper defaults to the module level mapper, looked up at call time.
    returns the list of translated names.
    '''
    if mapper is None:
        mapper = globals().get('mapper', {})
    ret = []
    for elem in lst:
        if elem in mapper:
            ret.append(mapper[elem])
        elif elem in EDAM_READY:
            ret.append(elem)
        else:
            error(WARN, elem, "non matching edam operationn or topic")
    return ret


def getOrigin(pack):
    '''
    returns the blank separated words of the package 'ORIGIN' entry as a list.
    '''
    return pack['ORIGIN'].split()
#--------------------------------------------------
# DO THE JOB
#--------------------------------------------------
def pack2mongo(pack):
    '''
    format mongo package document from BIODOCS package informations
    returns a mongo package document
    '''
    #---- get infos from BIODOCS parsed holder
    pack_name = pack['name']
    pack_authors = [auth.strip() for auth in pack['authors'] if auth]
    pack_refs = format_references(pack['references'])
    pack_library = MTRUE if pack['library'] else MFALSE
    pack_private = MTRUE if pack['private'] else MFALSE
    hist = history_maker(pack['history'])
    if hist is None:
        error(WARN, pack_name, "invalid history")
        hist = []
    pack_history = hist
    pack_id = "pack%s%s" % (ID_SEPARATOR, pack_name)
    pack_collections = [pack['origin']]
    #---- map gensoft Categories to edam relevant operation and topic terms
    # categories = updateCategories(pack['categories'])
    mongopack = {'_id': pack_id,
                 'type': 'package',
                 'name': pack_name,
                 'description': pack['description'],
                 'home': pack['home'],
                 'source': pack['sources'],
                 'categories': pack['operations'] + pack['topics'],
                 'authors': pack_authors,
                 'references': pack_refs,
                 'library': pack_library,
                 'private': pack_private,
                 'history': pack_history,
                 'collections': pack_collections,
                 }
    return mongopack


def get_biodocs(packdir):
    '''
    list the BIODOCS.yaml files of a package directory.
    only sub directories whose name starts with the package name (the
    last component of packdir) are considered.
    returns the paths that exist, as filtered by check_missing_biodocs.
    '''
    # normpath drops a trailing separator, which would otherwise give an
    # empty package name matching every entry of the directory
    pack_name = os.path.basename(os.path.normpath(packdir))
    # filter to keep only pack_version
    pack_version_lst = [os.path.join(packdir, d)
                        for d in sorted(os.listdir(packdir))
                        if d.startswith(pack_name)]
    biodocs_lst = [os.path.join(d, 'BIODOCS.yaml') for d in pack_version_lst]
    return check_missing_biodocs(biodocs_lst)


def citation_key_translate(refs_col):
    '''
    rename the keys of each reference from the yaml names to the names
    expected in mongo. references are modified in place; keys that are
    absent are left alone.
    returns the list of non empty references ([] when refs_col is empty
    or None).
    '''
    # mapping = {key in yaml : key expected in mongo }
    mapping = {'ID': 'ID',
               'idType': 'IDtype',
               'title': 'citation'}
    translated = []
    for ref in refs_col or []:
        if not ref:
            # same policy as format_references: empty entries are dropped
            continue
        for k, v in mapping.items():
            if k in ref:
                ref[v] = ref.pop(k)
        translated.append(ref)
    return translated


def info2package(info):
    '''
    format mongo package document from a parsed BIODOCS.yaml holder.
    returns a mongo package document
    '''
    pack_name = info['name']
    pack_id = _id_generator('pack', pack_name)
    pack_doc = {'_id': pack_id,
                'type': 'package',
                'name': info['name'],
                'description': info['description'],
                'home': info['home'],
                'source': info['sources'],
                'authors': info['authors'],
                'categories': info['operations'] + info['topics'],
                'topics': info['topics'],
                'operations': info['operations'],
                'collections': [info['origin']],
                'history': info['history'],
                'library': info['library'],
                'private': info['private'],
                'references': citation_key_translate(info['references']),
                }
    return pack_doc
def packversion2mongo(pack, version, module_info):
    '''
    format mongo package version document from BIODOCS package
    informations, complemented with module informations.
    returns a mongo packageVersion document
    '''
    #---- get infos from BIODOCS parsed holder
    pack_name = pack['name']
    pack_version, pack_default = version_extract(version)
    pack_manpages = pack['manpages']
    #---- complement with module info
    pack_htmldocs = merge_htmldocs(pack['htmldocs'], module_info['htmldocs'])
    pack_docs = module_info['docs']
    pack_depends = module_info['depends']
    # id of the owning package document, then id of this version document
    pack_id = "pack%s%s" % (ID_SEPARATOR, pack_name)
    version_id = "pack%s%s%s%s" % (ID_SEPARATOR, pack_name,
                                   ID_SEPARATOR, pack_version)
    versionpack = {'_id': version_id,
                   'type': 'packageVersion',
                   'package': pack_id,
                   'version': pack_version,
                   'default': pack_default,
                   'doc': {'html': pack_htmldocs,
                           'docs': pack_docs,
                           'man': pack_manpages,
                           },
                   'depends': pack_depends,
                   }
    return versionpack


def info2pack_version(info):
    '''
    format mongo package version document from a parsed BIODOCS.yaml holder.
    returns a mongo packageVersion document
    '''
    pack_name = info['name']
    pack_id = _id_generator("pack", pack_name)
    pack_version = info['version']
    pack_version_id = _id_generator("pack", pack_name, pack_version)
    pack_version_doc = {'_id': pack_version_id,
                        'type': 'packageVersion',
                        'package': pack_id,
                        'default': info['default'],
                        'version': info['version'],
                        'depends': info['depends'],
                        'doc': {'docs': [],
                                'html': info['htmldocs'],
                                'man': [],
                                },
                        }
    return pack_version_doc
def info2progs(info):
    '''
    format mongo program documents from a parsed BIODOCS.yaml holder.
    one document is built per entry of info['programs']; a missing or
    empty 'description' becomes '', a missing or empty 'manpages' gives an
    empty man list.
    returns a list of mongo program documents
    '''
    progs = []
    pack_name = info['name']
    pack_version = info['version']
    # every program of the package refers to the same packageVersion document
    pack_version_id = _id_generator("pack", pack_name, pack_version)
    for elem in info['programs']:
        manpages = elem.get('manpages')
        progs.append({
            '_id': _id_generator("prog", pack_name, pack_version, elem['name']),
            'type': 'program',
            'name': elem['name'],
            'description': elem.get('description') or '',
            'packageVersion': pack_version_id,
            'categories': [],
            'doc': {'html': [],
                    # no [None] entry when the program has no man page
                    'man': [manpages] if manpages else []},
        })
    return progs
def progs2mongo(pack_name, pack_version, prg_lst, module_info):
    '''
    format mongo program document from BIODOCS program informations
    programs documented in BIODOCS but absent from module_info['progs']
    are skipped with a warning; programs only known by the module get a
    minimal document.
    returns a list of programs mongo documents
    '''
    def mongoprog_creator(prog_id, pack_id, name, htmldocs=None, manpages=None,
                          categories=None, description=''):
        # fresh lists per document: list defaults in the signature would be
        # shared by every program created without documentation
        mongoprog = {'_id': prog_id,
                     'type': 'program',
                     'packageVersion': pack_id,
                     'name': name,
                     'doc': {'html': [] if htmldocs is None else htmldocs,
                             'man': [] if manpages is None else manpages,
                             },
                     'categories': [] if categories is None else categories,
                     'description': description,
                     }
        return mongoprog

    ret = []
    #---- generate corresponding module name to get uptodate program version
    #---- only default version package are documented, when they are ;-(
    pack_version, _ = version_extract(pack_version)
    provided_prgs = [prg.lower() for prg in module_info['progs'] if prg]
    pack_id = "pack%s%s%s%s" % (ID_SEPARATOR, pack_name,
                                ID_SEPARATOR, pack_version)

    #---- because BIODOCS only document one version, we need to perform 2 operations:
    #---- 1: remove programs documented in BIODOCS not available in module definitions
    #---- 2: add programs in module definitions not documented in BIODOCS
    for prog in prg_lst:
        # NOTE(review): 'name' / 'description' are read lower case while
        # 'HTMLDOCS' / 'MANPAGES' / 'CATEGORIES' are upper case -- confirm
        # against the parser output.
        prog_name = prog['name']
        # avoid typo documentation
        if prog_name.lower() not in provided_prgs:
            error(WARN, prog_name, 'not in module',
                  "%s/%s" % (pack_name, pack_version))
            continue
        provided_prgs.remove(prog_name.lower())
        prog_id = "prog%s%s%s%s%s%s" % (ID_SEPARATOR, pack_name,
                                        ID_SEPARATOR, pack_version,
                                        ID_SEPARATOR, prog_name)
        prog_description = prog['description'] if prog['description'] else ''
        mongoprog = mongoprog_creator(prog_id, pack_id, prog_name,
                                      prog['HTMLDOCS'], prog['MANPAGES'],
                                      prog['CATEGORIES'], prog_description)
        ret.append(mongoprog)

    #---- insert non documented programs in BIODOCS if some
    # provided_prgs holds lower cased names: recover the original spelling
    remaining = []
    for prg in provided_prgs:
        for prog_name in module_info['progs']:
            if prog_name.lower() != prg:
                continue
            remaining.append(prog_name)

    for prog_name in remaining:
        prog_id = "prog%s%s%s%s%s%s" % (ID_SEPARATOR, pack_name,
                                        ID_SEPARATOR, pack_version,
                                        ID_SEPARATOR, prog_name)
        mongoprog = mongoprog_creator(prog_id, pack_id, prog_name)
        ret.append(mongoprog)
    return ret
def
info2mobyle
(
info
):
# import mobyledefs
##---- should no longer used with Biodocs.yaml
# is the version used by mobyle ?
# if "%s/%s" %(pack_name, pack_version) not in mobyledefs.pack_info:
# return mobyle
def
getweb
(
web_info
):
mobyle
=
[]
pack_name
=
info
[
'name'
]
pack_version
=
info
[
'version'
]
pattern
=
'(?P<name>[\w-]+)\s*\[(?P<desc>.*)\]'
auto
=
re
.
compile
(
pattern
)
match
=
auto
.
match
(
web_info
)
if
match
is
None
:
error
(
WARN
,
'invalib web description'
,
web_info
)
interface_name
=
match
.
group
(
'name'
)
description
=
match
.
group
(
'desc'
)
mobyle_url
=
"%s#forms::%s"
%
(
MOBYLEURL
,
interface_name
)
return
interface_name
,
mobyle_url
,
description
def
mobyle2mongo
(
pack_name
,
pack_version
,
prg_lst
,
module_info
):
'''
format mongo mobyle document from BIODOCS program informations
returns a list of mobyle mongo documents
'''
ret
=
[]
#---- filter progs with WEB interface.
prg_lst
=
filter
(
lambda
n
:
'WEB'
in
n
,
prg_lst
)
if
not
prg_lst
:
return
ret
pack_version
,
_
=
version_extract
(
pack_version
)
interfaces
=
collections
.
defaultdict
(
list
)
for
prog
in
prg_lst
:
prog_name
=
prog
[
'NAME'
]
# one program may hold multiple interfaces
for
interface
in
prog
[
'WEB'
]:
if
not
interface
:
for
prg
in
info
[
'programs'
]:
prog_name
=
prg
[
'name'
]
# something to document ?
web
=
prg
.
get
(
'web'
,
False
)
if
not
web
:
continue
mobyle_interface_name
,
mobyle_url
,
mobyle_desc
=
getweb
(
interface
)
interfaces
[
mobyle_interface_name
].
append
((
prog_name
,
mobyle_url
,
mobyle_desc
))
#--- reduce and consistency check
for
interface
in
interfaces
:
programs
=
[]
urls
=
set
()
descs
=
[]
for
prg
,
url
,
desc
in
interfaces
[
interface
]:
programs
.
append
(
prg
)
urls
.
add
(
url
)
descs
.
append
(
desc
)
if
len
(
urls
)
!=
1
:
error
(
FATAL
,
pack_name
,
pack_version
,
"PROG.WEB conflict"
)
interfaces
[
interface
]
=
(
list
(
urls
),
programs
,
descs
)
for
interface_name
in
interfaces
:
url
,
prog_names
,
desc
=
interfaces
[
interface_name
]
url
=
url
[
0
]
mobyle_desc
=
' '
.
join
(
desc
).
strip
()
mobyle_id
=
"mobyle%s%s%s%s%s%s"
%
\
(
ID_SEPARATOR
,
pack_name
,
ID_SEPARATOR
,
pack_version
,
ID_SEPARATOR
,
interface_name
)
pack_id
=
"pack%s%s%s%s"
%
\
(
ID_SEPARATOR
,
pack_name
,
ID_SEPARATOR
,
pack_version
)
prog_ids
=
[]
for
prog_name
in
prog_names
:
prog_id
=
"prog%s%s%s%s%s%s"
%
\
(
ID_SEPARATOR
,
pack_name
,
ID_SEPARATOR
,
pack_version
,
ID_SEPARATOR
,
prog_name
)
prog_ids
.
append
(
prog_id
)
mongomobyle
=
{
'_id'
:
mobyle_id
,
'type'
:
'mobyle'
,
'programs'
:
prog_ids
,
'package'
:
pack_id
,
'name'
:
interface_name
,
'url'
:
url
,
'description'
:
mobyle_desc
}
ret
.
append
(
mongomobyle
)
return
ret
def
biodocs2mongo
(
biodocs_pack_info
,
biodocs_progs_info
,
mobyle_progs_info
):
versions_lst
=
[
biodocs_pack_info
[
'version'
]]
pack_name
=
biodocs_pack_info
[
'name'