Commit d62f53c6 authored by Eric  DEVEAUD's avatar Eric DEVEAUD
Browse files

port to BIODOCS.yaml

parent 060e575d
# Build / install rules for the $(NAM) script.
NAM=biodocs2mongo
PREFIX=/local/gensoft2/adm
BIN=$(PREFIX)/bin
PYMODULEDIR=$(PREFIX)/share/gensoft/pymodules
DAT=$(PREFIX)/share/gensoft/$(NAM)

# These targets are commands, not files: declare them phony so that a stray
# file named "build", "install", "clean" or "uninstall" cannot disable them.
.PHONY: build install clean uninstall

# build: rewrite the shebang of every *.py in place, then generate $(NAM)
# from $(NAM).py with the DAT= and PYMODULEDIR= assignments substituted.
build:
	sed -i -e 's,^\#!.*python,\#! /local/gensoft2/adm/bin/python,' *.py
	sed -e 's|^DAT=.*|DAT="$(DAT)"|' \
	    -e 's|^PYMODULEDIR=.*|PYMODULEDIR="$(PYMODULEDIR)"|' $(NAM).py > $(NAM)

# install: copy the generated script to $(BIN) and config.cfg to $(DAT),
# then remove the build products.
install: build
	test -d $(DAT) || mkdir -p $(DAT)
	# without an existing $(BIN) directory, install(1) would create a
	# regular file named "bin" instead of $(BIN)/$(NAM)
	test -d $(BIN) || mkdir -p $(BIN)
	install -m 0775 $(NAM) $(BIN)
	install -m 0664 config.cfg $(DAT)
	$(MAKE) clean

# clean: remove the generated script and compiled python files.
clean:
	rm -f $(NAM)
	rm -f *.pyc

# uninstall: remove the installed script and its data directory.
uninstall:
	rm -f $(BIN)/$(NAM) $(DAT)/*
	rm -rf $(DAT)
#!/usr/bin/env python2.7 #! /local/gensoft2/adm/bin/python
from __future__ import print_function
import argparse import argparse
import arrow # date manipulation made easy. import ConfigParser
import collections import os
import pymongo import pymongo
import re import re
import os
import string
import ssl import ssl
import string
import sys import sys
import BiodocParser from pprint import pprint
import mobyledefs
# tweak modulecmd in order to use the updated (not yet released) 3.3.a version
os.environ['PY_MODULECMD'] = '/local/gensoft2/adm/Modules/3.3.a/bin/modulecmd'
import module as M
#from biowebmongo import *
#from pymongo.errors import BulkWriteError
#--------------------------------------------------
# SOME GLOBALS
#--------------------------------------------------
LOGFH= sys.stdout
ERRFH= sys.stderr
FATAL= 1 #### TODO !!!!!
WARN= 0 #
# add config entry for use module.
# if set => check program listing from biodocs versus program listing from modules
VERBOSE= 1
MTRUE = True # "true" value to insert in mongodb
MFALSE = False # "false" value to insert in mongodb
ID_SEPARATOR = '@' #---- import homebrew modules
MOBYLEURL = 'http://mobyle.pasteur.fr' PYMODULEDIR="/local/gensoft2/adm/share/gensoft/pymodules"
sys.path.insert(0, PYMODULEDIR)
HOST='bioweb-prod.web.pasteur.fr' import BiodocParser
PORT=27017
JOURNALING=True # implies w=1
WRITECONCERN=1
DB_DEF='bioweb'
COL_DEF='catalog'
known_ops = ['install', 'remove', 'setdefault', 'unsetdefault', 'update']
mapper = {} #---- some hugly global variables
#-------------------------------------------------- DAT='.'
# INTERNAL USE ERRFH = sys.stderr
#-------------------------------------------------- LOGFH = sys.stdout
FATAL = 1
WARN = 0
def error(exit_val, *msg): def err(exit_val, *msg):
head=['Warning', 'Error'] head=['Warning', 'Error']
print >> ERRFH, "%s: %s" % (head[exit_val], " - ".join(map(str, msg))) print ("%s: %s" % (head[exit_val], " - ".join(map(str, msg))), file=ERRFH)
if exit_val: if exit_val:
sys.exit(exit_val) sys.exit(exit_val)
return None return None
def log(*msg): def log(*msg):
if VERBOSE: if VERBOSE:
print >> LOGFH, "%s" %(' '.join(map(str, msg))) print ("%s" %(' '.join(map(str, msg))), file=LOGFH)
LOGFH.flush() LOGFH.flush()
def get_DB(host, port, db_name=DB_DEF, j=JOURNALING, w=WRITECONCERN ):
    '''
    connect to a mongodb server and return a database handle
    host    : server name
    port    : server port
    db_name : name of the database to return
    j       : journaling option handed to pymongo.MongoClient
    w       : write concern option handed to pymongo.MongoClient
    returns client[db_name]
    a pymongo ConnectionFailure is reported through error() with
    the FATAL level.
    '''
    log('connect to', host, 'on port:', port)
    try:
        # honor the caller supplied j / w values instead of the module
        # level defaults (they were silently ignored before).
        # NOTE(review): ssl.CERT_NONE disables server certificate
        # verification -- confirm this is intended.
        client = pymongo.MongoClient(host, port, j=j, w=w, ssl=True, ssl_cert_reqs=ssl.CERT_NONE)
    except pymongo.errors.ConnectionFailure as err:
        error(FATAL, "mongodb %s/%s" %(host, port), err)
    return client[db_name]
def getmodule_infos(package):
    '''
    query the module help text of a package and extract from it:
      programs provided by the package//module
      documentation (html and others) provided by the package//module
      package dependencies
    information is more uptodate on programs than the
    one provided by BIODOCS.
    information may vary given the pack version, while
    BIODOCS only documents the default version
    returns a dict with keys 'progs', 'htmldocs', 'docs' and 'depends',
    each holding a list of str.
    raises M.ModuleError when the module help can not be obtained.
    '''
    m = M.Module()

    def get_progs(info):
        '''
        get programs provided by installed module version
        info: help text as a list of lines
        returns a list of program_names as str
        '''
        progs = []
        keep = False
        for elem in info:
            elem = elem.strip()
            if not elem:
                continue
            if keep:
                progs.append(elem)
            # every non empty line following this marker is a command name
            if elem == "package provides following commands:":
                keep = True
        return progs

    def get_docs(info):
        '''
        get local documentation available for given package
        info: help text as a list of lines
        returns a tuple (htmldocs, txtdocs) of list as str
        '''
        htmldocs = []
        txtdocs = []
        keep = False
        for elem in info:
            elem = elem.strip()
            if not elem:
                continue
            if elem == "local documentation available:":
                keep = True
                continue
            elif elem == "package provides following commands:":
                # documentation section stops where the commands one starts
                break
            if keep:
                if 'bioweb2' in elem:
                    htmldocs.append(elem)
                else:
                    txtdocs.append(elem)
        return htmldocs, txtdocs

    def get_dependencies(info):
        '''
        get the package/version that a given module depends or relies on.
        only the first line containing 'WARNING' is considered; names are
        read between '<' and '>'.
        info: help text as a list of lines
        returns a list of packages_names, without duplicates, in order of
        first appearance
        '''
        for elem in info:
            elem = elem.strip()
            if 'WARNING' not in elem:
                continue
            fields = elem[elem.find('<')+1:elem.find('>')].split()
            deps = []
            for field in fields:
                # drop the 'or' / 'and' connectors independently of each
                # other, and keep a single occurrence of each name.
                if field in ('or', 'and') or field in deps:
                    continue
                deps.append(field)
            return deps
        return []

    if testmodule:
        package = 'test/%s' %(package)
        # NOTE(review): debug trace; the flattened source does not show
        # whether it belongs inside this branch -- confirm.
        sys.stdout.write(">>>>>>>>> %s\n" % (package,))
    try:
        help_text = m.help(package)
    except M.ModuleError:
        raise M.ModuleError("no modulefile %s" %(package))
    # discard everything from the first '*****' on
    garbage = help_text.find('*****')
    if garbage != -1:
        help_text = help_text[:garbage]
    lines = help_text.split('\n')
    ret = {}
    ret['progs'] = get_progs(lines)
    ret['htmldocs'], ret['docs'] = get_docs(lines)
    ret['depends'] = get_dependencies(lines)
    return ret
def format_references(ref_lst): def _id_generator(*arg):
''' return ID_SEPARATOR.join(arg)
format reference information as a dictionary
suitable to be embeded in mongo package document
'''
def extract_doi(item): def check_missing_biodocs(biodocs_lst):
'''
extract publication unique id (doi, pmid or pmcid) if
available on the reference description from BIODOCS.
return a tuble:
citation curated from uinique id
name of unique identifier ressource or None
unique id or None
'''
ret = [] ret = []
for ref in ref_lst: for biodoc in biodocs_lst:
if not ref: if not os.path.isfile(biodoc):
err(WARN, biodoc, 'no such file')
continue continue
ret.append({ 'ID': ref['ID'] ret.append(biodoc)
, 'IDtype': ref['idType']
, 'citation' : ref['title']
})
return ret return ret
def version_extract(version): def get_biodocs(packdir):
''' # filter to keep only pack_version
extract version from BIODOCS version name pack_name = os.path.basename(packdir)
returns a tuple. pack_version_lst = [ os.path.join(packdir, d) for d in os.listdir(packdir) if d.startswith(pack_name) ]
version biodocs_lst = [ os.path.join(d, 'BIODOCS.yaml') for d in pack_version_lst ]
boolean if default version return check_missing_biodocs(biodocs_lst)
'''
def citation_key_translate(refs_col):
default=False # mapping = {key in yaml : key expected in mongo }
if '(default)' in version: mapping = {'ID': 'ID'
version, default = version.split('(') ,'idType' : 'IDtype'
default = MTRUE if default else MFALSE ,'title': 'citation'
return version, default }
for ref in refs_col:
def history_maker(history_info): for k, v in mapping.items():
''' ref[v] = ref.pop(k)
format history in a mongo compatible way return refs_col
returns list of dict
{ 'date' : datetime.datetime object
, 'operation' : keyword from 'install', 'remove', 'setdefault', 'unsetdefault', 'update def info2package(info):
, 'message' : free text as str } pack_name = info['name']
''' pack_id = _id_generator('pack', pack_name)
for info in history_info: pack_doc = {
if not info: '_id': pack_id
continue , 'type' : 'package'
try: , 'name' : info['name']
info['date'] = arrow.get(info['date'], 'YYYY/MM/DD').naive , 'description' : info['description']
except arrow.parser.ParserError as msg: , 'home' : info['home']
error(FATAL, date, msg) , 'source' : info['sources']
return history_info , 'authors' : info['authors']
, 'categories' : info['operations'] + info['topics']
def merge_htmldocs (biodocs_lst, module_lst): , 'topics' : info['topics']
''' , 'operations': info['operations']
merge BIODOCS documentation with the one provided by module , 'collections' : [info['origin']]
WARNING: currently discard BIODOCS info , 'history' : info['history']
''' , 'library' : info['library']
return module_lst , 'private' : info['private']
, 'references' : citation_key_translate(info['references'])
}
def getMapping(fileName): return pack_doc
mapper = {}
with open(fileName, 'r') as fh:
for line in fh: def info2pack_version(info):
line = line.strip() pack_name = info['name']
if line.startswith('#'): pack_id = _id_generator("pack", pack_name)
continue pack_version = info['version']
old, new = line.split() pack_version_id = _id_generator("pack", pack_name, pack_version)
mapper[old] = new pack_version_doc = {
return mapper '_id' : pack_version_id
, 'type' : 'packageVersion'
, 'package' : pack_id
def updateCategories(lst, mapper=mapper): , 'default' : info['default']
ret = [] , 'version' : info['version']
for elem in lst: , 'depends' : info['depends']
if elem in mapper: , 'doc' : {'docs' : []
ret.append(mapper[elem]) , 'html' : info['htmldocs']
elif elem in EDAM_READY: , 'man' : []
ret.append(elem)
else:
error(WARN, elem, "non matching edam operationn or topic")
return ret
def getOrigin(pack):
    '''
    split the 'ORIGIN' entry of a package on whitespace
    returns a list of str
    '''
    origin = pack['ORIGIN']
    return origin.split()
#--------------------------------------------------
# DO THE JOB
#--------------------------------------------------
def pack2mongo(pack):
    '''
    build the mongo 'package' document of a package
    pack: package informations as parsed from BIODOCS
    returns the document as a dict
    '''
    name = pack['name']
    authors = []
    for author in pack['authors']:
        if author:
            authors.append(author.strip())
    references = format_references(pack['references'])
    #---- booleans are stored with the values chosen for mongodb
    is_library = MTRUE if pack['library'] else MFALSE
    is_private = MTRUE if pack['private'] else MFALSE
    history = history_maker(pack['history'])
    if history is None:
        error(WARN, name, "invalid history")
        history = []
    doc_id = "pack%s%s" %(ID_SEPARATOR, name)
    origins = [pack['origin']]

    document = {}
    document['_id'] = doc_id
    document['type'] = 'package'
    document['name'] = name
    document['description'] = pack['description']
    document['home'] = pack['home']
    document['source'] = pack['sources']
    #---- categories are the operations followed by the topics
    document['categories'] = pack['operations'] + pack['topics']
    document['authors'] = authors
    document['references'] = references
    document['library'] = is_library
    document['private'] = is_private
    document['history'] = history
    document['collections'] = origins
    return document
def packversion2mongo(pack, version, module_info):
    '''
    build the mongo 'packageVersion' document of a package version
    pack        : package informations as parsed from BIODOCS
    version     : version name, handed to version_extract
    module_info : dict providing 'htmldocs', 'docs' and 'depends'
    returns the document as a dict
    '''
    name = pack['name']
    number, is_default = version_extract(version)
    biodocs_html = pack['htmldocs']
    manpages = pack['manpages']
    #---- documentation and dependencies come from the module informations
    html = merge_htmldocs(biodocs_html, module_info['htmldocs'])
    docs = module_info['docs']
    depends = module_info['depends']
    #---- the version id is the package id extended with the version
    package_id = "pack%s%s" %(ID_SEPARATOR, name)
    version_id = "pack%s%s%s%s" %(ID_SEPARATOR, name, ID_SEPARATOR, number)

    documentation = {}
    documentation['html'] = html
    documentation['docs'] = docs
    documentation['man'] = manpages

    document = {}
    document['_id'] = version_id
    document['type'] = 'packageVersion'
    document['package'] = package_id
    document['version'] = number
    document['default'] = is_default
    document['doc'] = documentation
    document['depends'] = depends
    return document
def progs2mongo(pack_name, pack_version, prg_lst, module_info):
'''
format mongo program document from BIODOCS program informations
returns a list of programs mongo documents
'''
def mongoprog_creator(prog_id, pack_id, name, htmldocs=[], manpages=[],categories=[], description=''):
mongoprog= { '_id' : prog_id
, 'type' : 'program'
, 'packageVersion' : pack_id
, 'name' : prog_name
, 'doc' : { 'html': htmldocs
, 'man' : manpages
}
, 'categories' : categories
, 'description' : description
} }
return mongoprog }
return pack_version_doc
ret = []
#---- generate corresponding module name to get uptodate program version
#---- only default version package are documented, when they are ;-( def info2progs(info):
pack_version, _ = version_extract(pack_version) progs = []
pack_name = info['name']
provided_prgs = [prg.lower() for prg in module_info['progs'] if prg] pack_version = info['version']
pack_id = "pack%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version)
#---- because BIDOCS only document one version, we need to perform 2 operations: for elem in info['programs']:
#---- 1: remove programs documented in BIODOCS not available in module definitions new = {}
#---- 2: add programs in module definitions not documented in BIODOCS new['_id'] = _id_generator("prog", pack_name, pack_version, elem['name'])
new['type'] = 'program'
for prog in prg_lst: new['name'] = elem['name']
prog_name = prog['name'] new['description'] = elem['description'] if elem['description'] else ''
# avoid typo documentation new['packageVersion'] = _id_generator("pack", pack_name, pack_version)
if prog_name.lower() not in provided_prgs: new['categories'] = []
error(WARN, prog_name, 'not in module', "%s/%s" %(pack_name, pack_version)) new['doc'] = {'html': [], 'man': [elem['manpages']]}
continue progs.append(new)
provided_prgs.remove(prog_name.lower()) return progs
prog_id = "prog%s%s%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version, ID_SEPARATOR, prog_name)
prog_description = prog['description'] if prog['description'] else ''
mongoprog = mongoprog_creator(prog_id, pack_id, prog_name, prog['HTMLDOCS'], prog['MANPAGES'], prog['CATEGORIES'], prog_description) def info2mobyle(info):
ret.append(mongoprog) # import mobyledefs
#---- insert non documented programs in BIODOCS if some ##---- should no longer used with Biodocs.yaml
remaining = [] # is the version used by mobyle ?
# if "%s/%s" %(pack_name, pack_version) not in mobyledefs.pack_info:
for prg in provided_prgs: # return mobyle
for prog_name in module_info['progs']:
if prog_name.lower() != prg: mobyle = []
continue pack_name = info['name']
remaining.append(prog_name) pack_version = info['version']
for prog_name in remaining:
prog_id = "prog%s%s%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version, ID_SEPARATOR, prog_name)
mongoprog = mongoprog_creator(prog_id, pack_id, prog_name)
ret.append(mongoprog)
return ret
def getweb(web_info):
pattern= '(?P<name>[\w-]+)\s*\[(?P<desc>.*)\]' pattern= '(?P<name>[\w-]+)\s*\[(?P<desc>.*)\]'
auto = re.compile(pattern) auto = re.compile(pattern)
match = auto.match(web_info)
if match is None:
error(WARN, 'invalib web description', web_info)
interface_name = match.group('name')
description = match.group('desc')
mobyle_url = "%s#forms::%s" % (MOBYLEURL, interface_name) for prg in info['programs']:
return interface_name, mobyle_url, description prog_name = prg['name']
# something to document ?
def mobyle2mongo(pack_name, pack_version, prg_lst, module_info): web = prg.get('web', False)
''' if not web:
format mongo mobyle document from BIODOCS program informations continue
returns a list of mobyle mongo documents for web_info in web:
''' interface = {}
match = auto.match(web_info)
ret = [] if match is None:
#---- filter progs with WEB interface. err(WARN, 'invalib web description', web_info)
prg_lst = filter(lambda n: 'WEB' in n, prg_lst) i_name = match.group('name')
if not prg_lst: i_desc = match.group('desc')
return ret interface['_id'] = _id_generator("mobyle", pack_name, pack_version, i_name)
interface['type'] = 'mobyle'
pack_version, _ = version_extract(pack_version) interface['package'] = _id_generator("pack", pack_name, pack_version)
interface['programs'] = [_id_generator("prog", pack_name, pack_version, prog_name)]
interfaces = collections.defaultdict(list) interface['name'] = i_name
interface['description'] = i_desc
interface['url'] = "%s#forms::%s" % (MOBYLEURL, interface['name'])
mobyle.append(interface)
return mobyle
def mongo_connect(host, port, db, col):
log('connect to', host, 'on port', port)
for prog in prg_lst: try:
prog_name=prog['NAME'] client = pymongo.MongoClient(host, port, j=True, ssl=True, ssl_cert_reqs=ssl.CERT_NONE)
# one program may hold multiple interfaces except pymongo.errors.ConnectionFailure as msg:
for interface in prog['WEB']: err(FATAL, "mongodb %s/%s" %(host, port), msg)
if not interface : db = client[db]
continue col = db[col]
mobyle_interface_name, mobyle_url, mobyle_desc = getweb(interface) return col
interfaces[mobyle_interface_name].append((prog_name, mobyle_url, mobyle_desc))
#--- reduce and consistency check
for interface in interfaces:
programs = []
urls = set()
descs = []
for prg, url, desc in interfaces[interface]: