Commit d62f53c6 authored by Eric DEVEAUD

port to BIODOCS.yaml

parent 060e575d
# Build / install helper for the $(NAM) script.
NAM=biodocs2mongo
PREFIX=/local/gensoft2/adm
BIN=$(PREFIX)/bin
PYMODULEDIR=$(PREFIX)/share/gensoft/pymodules
DAT=$(PREFIX)/share/gensoft/$(NAM)

# These targets are commands, not files: without this declaration a file
# named "build" or "clean" in the directory would silently disable them.
.PHONY: build install clean uninstall

# Rewrite the interpreter line of every *.py in place, then generate $(NAM)
# from $(NAM).py with the DAT= and PYMODULEDIR= lines set to the install paths.
build:
	sed -i -e 's,^\#!.*python,\#! /local/gensoft2/adm/bin/python,' *.py
	sed -e 's|^DAT=.*|DAT="$(DAT)"|' \
	    -e 's|^PYMODULEDIR=.*|PYMODULEDIR="$(PYMODULEDIR)"|' $(NAM).py > $(NAM)

# Install the generated script and its configuration file, then remove
# the build products from the source directory.
install: build
	mkdir -p $(BIN) $(DAT)
	install -m 0775 $(NAM) $(BIN)
	install -m 0664 config.cfg $(DAT)
	$(MAKE) clean

# Remove the generated script and compiled python files.
clean:
	rm -f $(NAM)
	rm -f *.pyc

# Remove the installed script and its data directory.
uninstall:
	rm -f $(BIN)/$(NAM) $(DAT)/*
	rm -rf $(DAT)
#!/usr/bin/env python2.7
#! /local/gensoft2/adm/bin/python
from __future__ import print_function
import argparse
import arrow # date manipulation made easy.
import collections
import ConfigParser
import os
import pymongo
import re
import os
import string
import ssl
import string
import sys
import BiodocParser
import mobyledefs
# tweak modulecmd in order to use 3.3.a updated (not yet released)
os.environ['PY_MODULECMD'] = '/local/gensoft2/adm/Modules/3.3.a/bin/modulecmd'
import module as M
#from biowebmongo import *
#from pymongo.errors import BulkWriteError
#--------------------------------------------------
# SOME GLOBALS
#--------------------------------------------------
# NOTE(review): LOGFH, ERRFH, FATAL and WARN are assigned twice in this
# section (here and under "INTERNAL USE" below) with identical values.
# streams used by log() and by the error reporting helper
LOGFH= sys.stdout
ERRFH= sys.stderr
from pprint import pprint
# severity levels: the value is also used as index in the message prefix
# list and as exit status by the error reporting helper
FATAL= 1
WARN= 0
#### TODO !!!!!
#
# add config entry for use module.
# if set => check program listing from biodocs versus program listing from modules
# log() only prints when VERBOSE is true
VERBOSE= 1
MTRUE = True # "true" value to insert in mongodb
MFALSE = False # "false" value to insert in mongodb
# separator placed between the parts of a document _id
ID_SEPARATOR = '@'
#---- import homebrew modules
# base url used to build mobyle form urls
MOBYLEURL = 'http://mobyle.pasteur.fr'
# mongodb connection settings
HOST='bioweb-prod.web.pasteur.fr'
PORT=27017
JOURNALING=True # implies w=1
WRITECONCERN=1
# default database and collection names
DB_DEF='bioweb'
COL_DEF='catalog'
# operation keywords listed in the history_maker docstring
known_ops = ['install', 'remove', 'setdefault', 'unsetdefault', 'update']
# the Makefile "build" target rewrites the PYMODULEDIR= line with the install path
PYMODULEDIR="/local/gensoft2/adm/share/gensoft/pymodules"
sys.path.insert(0, PYMODULEDIR)
# NOTE(review): BiodocParser is also imported in the import list at the top
# of the file, before PYMODULEDIR is put first on sys.path.
import BiodocParser
# default category translation table used by updateCategories()
mapper = {}
#---- some ugly global variables
#--------------------------------------------------
# INTERNAL USE
#--------------------------------------------------
# the Makefile "build" target rewrites the DAT= line with the install path
DAT='.'
ERRFH = sys.stderr
LOGFH = sys.stdout
FATAL = 1
WARN = 0
def err(exit_val, *msg):
    '''
    report a diagnostic on ERRFH and stop the program when it is fatal.
    exit_val: WARN (0) or FATAL (1). it selects the message prefix
              ('Warning' or 'Error') and, when non zero, is used as
              process exit status.
    msg:      message parts, converted with str() and joined with " - ".
    returns None for a warning; does not return for a fatal error
    (sys.exit is called).
    '''
    head = ['Warning', 'Error']
    # print function form only: the file imports print_function from
    # __future__, which makes the "print >> ERRFH, ..." statement form fail.
    print("%s: %s" % (head[exit_val], " - ".join(map(str, msg))), file=ERRFH)
    if exit_val:
        sys.exit(exit_val)
    return None


# both spellings are called in this file, keep them equivalent
error = err
def log(*msg):
    '''
    write the message parts, converted with str() and separated by a
    space, on LOGFH and flush it. does nothing unless VERBOSE is true.
    '''
    if VERBOSE:
        # print function form only: the file imports print_function from
        # __future__, which makes the "print >> LOGFH, ..." statement form fail.
        print("%s" % (' '.join(map(str, msg))), file=LOGFH)
        LOGFH.flush()
def get_DB(host, port, db_name=DB_DEF, j=JOURNALING, w=WRITECONCERN ):
    '''
    connect to a mongodb server over ssl (certificate not verified)
    and return a database handle.
    host, port: server location
    db_name:    name of the database to return
    j, w:       journaling and write concern options given to the client
    returns client[db_name]
    a pymongo ConnectionFailure is reported as a FATAL error.
    '''
    log('connect to', host, 'on port:', port)
    try:
        # use the j and w arguments: the caller's values were previously
        # ignored in favour of the JOURNALING global.
        client = pymongo.MongoClient(host, port, j=j, w=w, ssl=True, ssl_cert_reqs=ssl.CERT_NONE)
    except pymongo.errors.ConnectionFailure as exc:
        # "exc" and not "err": avoid hiding the err() helper inside this function
        error(FATAL, "mongodb %s/%s" %(host, port), exc)
    return client[db_name]
def getmodule_infos(package):
    '''
    read the help text of a package//module and extract from it:
        programs provided by the package//module
        htmldocs and other docs provided by the package//module
        pack dependencies
    information is more uptodate on programs than the
    one provided by BIODOCS.
    information may vary given the pack version, while
    BIODOCS only document default version

    package: module name given to M.Module().help()
    returns a dict with keys 'progs', 'htmldocs', 'docs' and 'depends',
            each one holding a list of str.
    raises M.ModuleError when the help text can not be obtained.
    '''
    commands_header = "package provides following commands:"
    docs_header = "local documentation available:"
    m = M.Module()

    def get_progs(info):
        '''
        get programs provided by installed module version
        info: help text as a list of lines
        returns a list of program_names as str
        '''
        progs = []
        keep = False
        for elem in info:
            elem = elem.strip()
            if not elem:
                continue
            # every non empty line after the commands header is a program name
            if keep:
                progs.append(elem)
            if elem == commands_header:
                keep = True
        return progs

    def get_docs(info):
        '''
        get local documentation available for given package
        info: help text as a list of lines
        returns a tuple (htmldocs, txtdocs) of list as str
        '''
        htmldocs = []
        txtdocs = []
        keep = False
        for elem in info:
            elem = elem.strip()
            if not elem:
                continue
            if elem == docs_header:
                keep = True
                continue
            elif elem == commands_header:
                # documentation section ends where the commands section starts
                break
            if keep:
                if 'bioweb2' in elem:
                    htmldocs.append(elem)
                else:
                    txtdocs.append(elem)
        return htmldocs, txtdocs

    def get_dependencies(info):
        '''
        get the package/version that a given module depends or relies on.
        only the first line holding 'WARNING' is used: names are read
        between '<' and '>', duplicates and the 'or' / 'and' tokens
        are dropped.
        info: help text as a list of lines
        returns a list of packages_names
        '''
        for elem in info:
            elem = elem.strip()
            if 'WARNING' not in elem:
                continue
            fields = elem[elem.find('<')+1:elem.find('>')].split()
            deps = []
            for field in fields:
                # each token is filtered on its own: a missing 'or' no
                # longer prevents 'and' from being removed.
                if field in ('or', 'and') or field in deps:
                    continue
                deps.append(field)
            return deps
        return []

    # NOTE(review): testmodule is not defined in this function; presumably a
    # module level flag set elsewhere -- confirm.
    if testmodule:
        package = 'test/%s' %(package)
        print(">>>>>>>>>", package)
    try:
        infos = m.help(package)
    except M.ModuleError:
        raise M.ModuleError("no modulefile %s" %(package))
    # drop everything from the first '*****' marker on
    garbage = infos.find('*****')
    if garbage != -1:
        infos = infos[:garbage]
    infos = infos.split('\n')
    ret = {}
    ret['progs'] = get_progs(infos)
    ret['htmldocs'], ret['docs'] = get_docs(infos)
    ret['depends'] = get_dependencies(infos)
    return ret
def _id_generator(*arg):
    '''
    build a document id: the given str parts joined with ID_SEPARATOR.
    '''
    return ID_SEPARATOR.join(arg)


def extract_doi(item):
    '''
    extract publication unique id (doi, pmid or pmcid) if
    available on the reference description from BIODOCS.
    return a tuple:
    citation curated from unique id
    name of unique identifier resource or None
    unique id or None
    '''
    # NOTE(review): only the docstring of this helper is present in this
    # file, so it currently returns None.


def format_references(ref_lst):
    '''
    format reference information as a list of dictionaries
    suitable to be embedded in mongo package document.
    ref_lst: list of dict with keys 'ID', 'idType' and 'title';
             empty entries are skipped.
    returns a list of dict with keys 'ID', 'IDtype' and 'citation'.
    '''
    ret = []
    for ref in ref_lst:
        if not ref:
            continue
        ret.append({ 'ID': ref['ID']
                   , 'IDtype': ref['idType']
                   , 'citation' : ref['title']
                   })
    return ret


def check_missing_biodocs(biodocs_lst):
    '''
    keep only the paths of biodocs_lst that are existing files.
    a warning is reported for every other path.
    returns a list of paths, in the order given.
    '''
    ret = []
    for biodoc in biodocs_lst:
        if not os.path.isfile(biodoc):
            err(WARN, biodoc, 'no such file')
            continue
        ret.append(biodoc)
    return ret
def version_extract(version):
    '''
    extract version from BIODOCS version name
    version: version name as str, flagged with '(default)' when it is
             the default one.
    returns a tuple.
    version: the text found before the '(default)' flag, or the name
             unchanged when there is no flag
    MTRUE if default version, MFALSE otherwise
    '''
    marker = '(default)'
    if marker not in version:
        return version, MFALSE
    # cut at the flag itself: unpacking version.split('(') raised
    # ValueError as soon as the name held another parenthesis.
    return version.partition(marker)[0], MTRUE
def history_maker(history_info):
    '''
    format history in a mongo compatible way
    the 'date' value of every non empty entry is parsed with the
    'YYYY/MM/DD' format and replaced, in place, by the naive datetime
    given by arrow.
    a date that can not be parsed is reported as a FATAL error.
    returns history_info, a list of dict
    { 'date' : datetime.datetime object
    , 'operation' : keyword from 'install', 'remove', 'setdefault', 'unsetdefault', 'update'
    , 'message' : free text as str }
    '''
    for info in history_info:
        if not info:
            continue
        try:
            info['date'] = arrow.get(info['date'], 'YYYY/MM/DD').naive
        except arrow.parser.ParserError as msg:
            # report the offending value: the name "date" used here
            # before was never defined.
            error(FATAL, info['date'], msg)
    return history_info
def merge_htmldocs (biodocs_lst, module_lst):
    '''
    combine the html documentation listed in BIODOCS with the
    one advertised by the module.
    WARNING: the BIODOCS list is currently ignored, the module
    list is handed back as is.
    '''
    merged = module_lst
    return merged
def getMapping(fileName):
    '''
    read a two columns, whitespace separated, translation file.
    blank lines and lines starting with '#' are ignored.
    fileName: path of the file to read
    returns a dict {first column: second column}
    raises ValueError when a line does not hold exactly two fields.
    '''
    mapper = {}
    with open(fileName, 'r') as fh:
        for line in fh:
            line = line.strip()
            # a blank line used to fail on the unpacking below
            if not line or line.startswith('#'):
                continue
            old, new = line.split()
            mapper[old] = new
    return mapper
def updateCategories(lst, mapper=mapper):
    '''
    translate a list of category names.
    a name found in mapper is replaced by its mapped value, a name
    found in EDAM_READY is kept as is, any other name is dropped
    after a warning.
    returns the translated list.
    '''
    translated = []
    for term in lst:
        if term in mapper:
            translated.append(mapper[term])
            continue
        if term in EDAM_READY:
            translated.append(term)
            continue
        error(WARN, term, "non matching edam operationn or topic")
    return translated
def getOrigin(pack):
    '''
    returns the whitespace separated words of pack['ORIGIN'] as a list.
    '''
    origin = pack['ORIGIN']
    return origin.split()
#--------------------------------------------------
# DO THE JOB
#--------------------------------------------------
def pack2mongo(pack):
    '''
    build the mongo 'package' document of a package.
    pack: BIODOCS parsed holder of the package
    returns the document as a dict
    '''
    #---- values read from the BIODOCS parsed holder
    name = pack['name']
    authors = []
    for author in pack['authors']:
        if author:
            authors.append(author.strip())
    references = format_references(pack['references'])
    is_library = MTRUE if pack['library'] else MFALSE
    is_private = MTRUE if pack['private'] else MFALSE
    history = history_maker(pack['history'])
    if history is None:
        error(WARN, name, "invalid history")
        history = []
    doc_id = "pack%s%s" %(ID_SEPARATOR, name)
    collections_lst = [pack['origin']]
    #---- categories are the edam operations followed by the edam topics
    return { '_id': doc_id
           , 'type': 'package'
           , 'name' : name
           , 'description' : pack['description']
           , 'home' : pack['home']
           , 'source' : pack['sources']
           , 'categories' : pack['operations'] + pack['topics']
           , 'authors' : authors
           , 'references' : references
           , 'library' : is_library
           , 'private' : is_private
           , 'history' : history
           , 'collections' : collections_lst
           }
def packversion2mongo(pack, version, module_info):
    '''
    format mongo packageVersion document from BIODOCS package informations
    and from the informations read from the module.
    pack:        BIODOCS parsed holder of the package
    version:     BIODOCS version name, see version_extract
    module_info: dict with 'htmldocs', 'docs' and 'depends' entries
    returns a mongo packageVersion document
    '''
    #---- get infos from BIODOCS parsed holder
    pack_name = pack['name']
    pack_version, pack_default= version_extract(version)
    pack_htmldocs = pack['htmldocs']
    pack_manpages = pack['manpages']
    #---- complement with module info
    pack_htmldocs = merge_htmldocs(pack_htmldocs, module_info['htmldocs'])
    pack_docs = module_info['docs']
    pack_depends = module_info['depends']
    #---- the document references its package by id and is itself
    #---- identified by package name and version
    pack_id = "pack%s%s" %(ID_SEPARATOR, pack_name)
    version_id = "pack%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version)
    versionpack = { '_id': version_id
                  , 'type' : 'packageVersion'
                  , 'package': pack_id
                  , 'version' : pack_version
                  , 'default' : pack_default
                  , 'doc' : { 'html': pack_htmldocs
                            , 'docs': pack_docs
                            , 'man' : pack_manpages
                            }
                  , 'depends' : pack_depends
                  }
    return versionpack
# NOTE(review): only the beginning of progs2mongo is found here: the
# dictionary below is not closed before the next "def" line, and statements
# using mongoprog_creator, prg_lst and module_info show up further down,
# after other definitions -- check the file layout.
def progs2mongo(pack_name, pack_version, prg_lst, module_info):
'''
format mongo program document from BIODOCS program informations
returns a list of programs mongo documents
'''
# helper building one 'program' document.
# NOTE(review): htmldocs, manpages and categories default to [] literals,
# which are shared between all calls relying on the defaults.
def mongoprog_creator(prog_id, pack_id, name, htmldocs=[], manpages=[],categories=[], description=''):
mongoprog= { '_id' : prog_id
, 'type' : 'program'
, 'packageVersion' : pack_id
# NOTE(review): prog_name is not a parameter of mongoprog_creator (the
# parameter is "name"); the value comes from an enclosing scope.
, 'name' : prog_name
, 'doc' : { 'html': htmldocs
, 'man' : manpages
}
, 'categories' : categories
, 'description' : description
def get_biodocs(packdir):
    '''
    list the BIODOCS.yaml files of a package directory.
    packdir: package directory; its last path component is the package
             name and every entry starting with that name is taken as
             a version directory.
    returns the existing '<version directory>/BIODOCS.yaml' paths, as
    filtered by check_missing_biodocs.
    '''
    # normalize first: with a trailing '/' basename is '' and every
    # directory entry would match.
    pack_name = os.path.basename(os.path.normpath(packdir))
    # filter to keep only pack_version
    pack_version_lst = [ os.path.join(packdir, d) for d in os.listdir(packdir) if d.startswith(pack_name) ]
    biodocs_lst = [ os.path.join(d, 'BIODOCS.yaml') for d in pack_version_lst ]
    return check_missing_biodocs(biodocs_lst)
def citation_key_translate(refs_col):
    '''
    rename, in place, the keys of every reference from the names used
    in yaml to the names expected in mongo.
    refs_col: list of dict
    returns refs_col
    raises KeyError when a reference holds neither the yaml key nor
    its mongo counterpart.
    '''
    # mapping = {key in yaml : key expected in mongo }
    mapping = {'ID': 'ID'
              ,'idType' : 'IDtype'
              ,'title': 'citation'
              }
    for ref in refs_col:
        for k, v in mapping.items():
            # references are modified in place: skip a key that has
            # already been renamed so that a second call does not fail.
            if k not in ref and v in ref:
                continue
            ref[v] = ref.pop(k)
    return refs_col
def info2package(info):
    '''
    build the mongo 'package' document from a parsed BIODOCS.yaml.
    info: dict holding the package description
    returns the document as a dict
    '''
    doc = {}
    doc['_id'] = _id_generator('pack', info['name'])
    doc['type'] = 'package'
    doc['name'] = info['name']
    doc['description'] = info['description']
    doc['home'] = info['home']
    doc['source'] = info['sources']
    doc['authors'] = info['authors']
    # categories: operations first, then topics
    doc['categories'] = info['operations'] + info['topics']
    doc['topics'] = info['topics']
    doc['operations'] = info['operations']
    doc['collections'] = [info['origin']]
    doc['history'] = info['history']
    doc['library'] = info['library']
    doc['private'] = info['private']
    doc['references'] = citation_key_translate(info['references'])
    return doc
# NOTE(review): two definitions are interleaved in this span: the
# dictionary opened by info2pack_version is only closed, and returned, by
# the last two lines, while the statements in between use names
# (mongoprog_creator, prg_lst, module_info) that belong to progs2mongo.
# build the 'packageVersion' document of info['name'] / info['version']
def info2pack_version(info):
pack_name = info['name']
pack_id = _id_generator("pack", pack_name)
pack_version = info['version']
pack_version_id = _id_generator("pack", pack_name, pack_version)
pack_version_doc = {
'_id' : pack_version_id
, 'type' : 'packageVersion'
, 'package' : pack_id
, 'default' : info['default']
, 'version' : info['version']
, 'depends' : info['depends']
, 'doc' : {'docs' : []
, 'html' : info['htmldocs']
, 'man' : []
}
# NOTE(review): from here to "return ret" the statements read as the
# remaining body of progs2mongo.
return mongoprog
ret = []
#---- generate corresponding module name to get uptodate program version
#---- only default version package are documented, when they are ;-(
pack_version, _ = version_extract(pack_version)
# lower case names of the programs advertised by the module
provided_prgs = [prg.lower() for prg in module_info['progs'] if prg]
pack_id = "pack%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version)
#---- because BIODOCS only document one version, we need to perform 2 operations:
#---- 1: remove programs documented in BIODOCS not available in module definitions
#---- 2: add programs in module definitions not documented in BIODOCS
for prog in prg_lst:
prog_name = prog['name']
# avoid typo documentation
if prog_name.lower() not in provided_prgs:
error(WARN, prog_name, 'not in module', "%s/%s" %(pack_name, pack_version))
continue
# a documented program is removed from the list of programs still to add
provided_prgs.remove(prog_name.lower())
prog_id = "prog%s%s%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version, ID_SEPARATOR, prog_name)
prog_description = prog['description'] if prog['description'] else ''
# NOTE(review): the name is read with a lower case key above while the
# documentation entries use upper case keys -- confirm the parser output.
mongoprog = mongoprog_creator(prog_id, pack_id, prog_name, prog['HTMLDOCS'], prog['MANPAGES'], prog['CATEGORIES'], prog_description)
ret.append(mongoprog)
#---- insert non documented programs in BIODOCS if some
# get back the original spelling of the remaining lower case names
remaining = []
for prg in provided_prgs:
for prog_name in module_info['progs']:
if prog_name.lower() != prg:
continue
remaining.append(prog_name)
for prog_name in remaining:
prog_id = "prog%s%s%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version, ID_SEPARATOR, prog_name)
mongoprog = mongoprog_creator(prog_id, pack_id, prog_name)
ret.append(mongoprog)
return ret
# NOTE(review): no statement follows this header before the closing brace;
# statements using web_info are found further down in the file.
def getweb(web_info):
}
return pack_version_doc
def info2progs(info):
    '''
    build the mongo 'program' documents from a parsed BIODOCS.yaml.
    info: dict holding 'name', 'version' and a 'programs' list; each
          program entry gives 'name', 'description' and 'manpages'.
    returns a list of documents, one per program entry
    '''
    name, version = info['name'], info['version']
    docs = []
    for entry in info['programs']:
        docs.append({ '_id' : _id_generator("prog", name, version, entry['name'])
                    , 'type' : 'program'
                    , 'name' : entry['name']
                    # an empty or missing text is stored as ''
                    , 'description' : entry['description'] or ''
                    , 'packageVersion' : _id_generator("pack", name, version)
                    , 'categories' : []
                    , 'doc' : {'html': [], 'man': [entry['manpages']]}
                    })
    return docs
# NOTE(review): three definitions are interleaved in this span: the
# statements using web_info after the regular expression read as the body
# of getweb(web_info), the mobyle2mongo definition starts in the middle,
# and the loop over info['programs'] down to "return mobyle" reads as the
# remaining body of info2mobyle.
# build the 'mobyle' documents of the programs of info having a 'web' entry
def info2mobyle(info):
# import mobyledefs
##---- should no longer used with Biodocs.yaml
# is the version used by mobyle ?
# if "%s/%s" %(pack_name, pack_version) not in mobyledefs.pack_info:
# return mobyle
mobyle = []
pack_name = info['name']
pack_version = info['version']
# web description format: "<interface name> [<description>]"
pattern= '(?P<name>[\w-]+)\s*\[(?P<desc>.*)\]'
auto = re.compile(pattern)
match = auto.match(web_info)
if match is None:
# NOTE(review): WARN does not stop the program, the match.group() calls
# below are then made on None.
error(WARN, 'invalib web description', web_info)
interface_name = match.group('name')
description = match.group('desc')
mobyle_url = "%s#forms::%s" % (MOBYLEURL, interface_name)
return interface_name, mobyle_url, description
def mobyle2mongo(pack_name, pack_version, prg_lst, module_info):
'''
format mongo mobyle document from BIODOCS program informations
returns a list of mobyle mongo documents
'''
ret = []
#---- filter progs with WEB interface.
prg_lst = filter(lambda n: 'WEB' in n, prg_lst)
if not prg_lst:
return ret
pack_version, _ = version_extract(pack_version)
# interface name => list of values collected for that interface
interfaces = collections.defaultdict(list)
for prg in info['programs']:
prog_name = prg['name']
# something to document ?
web = prg.get('web', False)
if not web:
continue
# one 'mobyle' document per web description of the program
for web_info in web:
interface = {}
match = auto.match(web_info)
if match is None:
# NOTE(review): WARN does not stop the program, the match.group() calls
# below are then made on None.
err(WARN, 'invalib web description', web_info)
i_name = match.group('name')
i_desc = match.group('desc')
interface['_id'] = _id_generator("mobyle", pack_name, pack_version, i_name)
interface['type'] = 'mobyle'
interface['package'] = _id_generator("pack", pack_name, pack_version)
interface['programs'] = [_id_generator("prog", pack_name, pack_version, prog_name)]
interface['name'] = i_name
interface['description'] = i_desc
interface['url'] = "%s#forms::%s" % (MOBYLEURL, interface['name'])
mobyle.append(interface)
return mobyle
def mongo_connect(host, port, db, col):
log('connect to', host, 'on port', port)
for prog in prg_lst:
prog_name=prog['NAME']
# one program may hold multiple interfaces
for interface in prog['WEB']:
if not interface :
continue
mobyle_interface_name, mobyle_url, mobyle_desc = getweb(interface)
interfaces[mobyle_interface_name].append((prog_name, mobyle_url, mobyle_desc))
#--- reduce and consistency check
for interface in interfaces:
programs = []
urls = set()
descs = []
for prg, url, desc in interfaces[interface]:
programs.append(prg)
urls.add(url)
descs.append(desc)
if len(urls) != 1:
error(FATAL, pack_name, pack_version, "PROG.WEB conflict")
interfaces[interface] = (list(urls), programs, descs)
for interface_name in interfaces:
url, prog_names, desc = interfaces[interface_name]
url = url[0]
mobyle_desc = ' '.join(desc).strip()
mobyle_id = "mobyle%s%s%s%s%s%s" % \
(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version, ID_SEPARATOR, interface_name)
pack_id = "pack%s%s%s%s" % \
(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version)
prog_ids = []
for prog_name in prog_names:
prog_id = "prog%s%s%s%s%s%s" % \
(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version, ID_SEPARATOR, prog_name)
prog_ids.append(prog_id)
mongomobyle= { '_id' : mobyle_id
, 'type' : 'mobyle'
, 'programs' : prog_ids
, 'package' : pack_id
, 'name' : interface_name
, 'url' : url
, 'description' : mobyle_desc