Commit d62f53c6 authored by Eric  DEVEAUD's avatar Eric DEVEAUD
Browse files

port to BIODOCS.yaml

parent 060e575d
# Name of the generated command; also names its data directory below.
NAM := biodocs2mongo
# Root of the installation tree.
PREFIX := /local/gensoft2/adm
# Directory the generated command is installed into.
BIN := $(PREFIX)/bin
# Value written into the script's PYMODULEDIR= line by the build target.
PYMODULEDIR := $(PREFIX)/share/gensoft/pymodules
# Directory receiving config.cfg; also written into the script's DAT= line.
DAT := $(PREFIX)/share/gensoft/$(NAM)
# build: generate the installable script $(NAM) from $(NAM).py.
#  1. rewrite the shebang of every *.py in place so that it points at the
#     python interpreter living in $(BIN) (follows PREFIX overrides instead
#     of a hard-coded path);
#  2. substitute the DAT= and PYMODULEDIR= assignments of $(NAM).py with the
#     install-time locations and write the result to $(NAM).
# The target does not create a file called "build", hence .PHONY.
.PHONY: build
build:
	sed -i -e 's,^\#!.*python,\#! $(BIN)/python,' *.py
	sed -e 's|^DAT=.*|DAT="$(DAT)"|' \
	    -e 's|^PYMODULEDIR=.*|PYMODULEDIR="$(PYMODULEDIR)"|' $(NAM).py > $(NAM)
# install: copy the generated script into $(BIN) and config.cfg into $(DAT),
# then remove the build products from the working directory.
# Both destination directories are created first: without this, installing
# into a missing $(BIN) would create a regular file named "bin" instead.
.PHONY: install
install: build
	install -d "$(BIN)" "$(DAT)"
	install -m 0775 $(NAM) "$(BIN)/"
	install -m 0664 config.cfg "$(DAT)/"
	$(MAKE) clean
# clean: remove the generated script and compiled python byte-code.
# Declared phony so that a stray file named "clean" cannot disable it.
.PHONY: clean
clean:
	$(RM) $(NAM) *.pyc
# uninstall: remove the installed script and its whole data directory.
# The guards abort before anything is deleted when NAM or DAT expands to
# nothing: an empty NAM would turn $(DAT) into the shared gensoft directory.
.PHONY: uninstall
uninstall:
	$(if $(strip $(NAM)),,$(error NAM is empty; refusing to remove files))
	$(if $(strip $(DAT)),,$(error DAT is empty; refusing to remove files))
	rm -f "$(BIN)/$(NAM)"
	rm -rf "$(DAT)"
#!/usr/bin/env python2.7 #! /local/gensoft2/adm/bin/python
from __future__ import print_function
import argparse import argparse
import arrow # date manipulation made easy. import ConfigParser
import collections import os
import pymongo import pymongo
import re import re
import os
import string
import ssl import ssl
import string
import sys import sys
import BiodocParser from pprint import pprint
import mobyledefs
# tweak modulecmd in order to use 3.3.a updated (not yet released)
os.environ['PY_MODULECMD'] = '/local/gensoft2/adm/Modules/3.3.a/bin/modulecmd'
import module as M
#from biowebmongo import *
#from pymongo.errors import BulkWriteError
#--------------------------------------------------
# SOME GLOBALS
#--------------------------------------------------
LOGFH= sys.stdout
ERRFH= sys.stderr
FATAL= 1
WARN= 0
VERBOSE= 1
MTRUE = True # "true" value to insert in mongodb
MFALSE = False # "false" value to insert in mongodb
ID_SEPARATOR = '@' #### TODO !!!!!
#
# add config entry for use module.
# if set => check program listing from biodocs versus program listing from modules
MOBYLEURL = 'http://mobyle.pasteur.fr'
HOST='bioweb-prod.web.pasteur.fr'
PORT=27017
JOURNALING=True # implies w=1 #---- import homebrew modules
WRITECONCERN=1
DB_DEF='bioweb'
COL_DEF='catalog'
known_ops = ['install', 'remove', 'setdefault', 'unsetdefault', 'update'] PYMODULEDIR="/local/gensoft2/adm/share/gensoft/pymodules"
sys.path.insert(0, PYMODULEDIR)
import BiodocParser
mapper = {} #---- some hugly global variables
#-------------------------------------------------- DAT='.'
# INTERNAL USE ERRFH = sys.stderr
#-------------------------------------------------- LOGFH = sys.stdout
FATAL = 1
WARN = 0
def error(exit_val, *msg): def err(exit_val, *msg):
head=['Warning', 'Error'] head=['Warning', 'Error']
print >> ERRFH, "%s: %s" % (head[exit_val], " - ".join(map(str, msg))) print ("%s: %s" % (head[exit_val], " - ".join(map(str, msg))), file=ERRFH)
if exit_val: if exit_val:
sys.exit(exit_val) sys.exit(exit_val)
return None return None
def log(*msg): def log(*msg):
if VERBOSE: if VERBOSE:
print >> LOGFH, "%s" %(' '.join(map(str, msg))) print ("%s" %(' '.join(map(str, msg))), file=LOGFH)
LOGFH.flush() LOGFH.flush()
def get_DB(host, port, db_name=DB_DEF, j=JOURNALING, w=WRITECONCERN ):
    '''
    open a connection to a mongodb server and return a database handle.
    host:    server name
    port:    server port
    db_name: name of the database to return
    j:       journaling option handed to the client
    w:       write concern handed to the client
    returns client[db_name]
    a connection failure is reported through error(FATAL, ...), which
    terminates the program.
    '''
    log('connect to', host, 'on port:', port)
    try:
        # honour the caller supplied j and w instead of silently falling back
        # to the module level defaults.
        # NOTE(review): ssl.CERT_NONE disables server certificate checking;
        # confirm this is intended for the targeted server.
        client = pymongo.MongoClient(host, port, j=j, w=w, ssl=True, ssl_cert_reqs=ssl.CERT_NONE)
    except pymongo.errors.ConnectionFailure as err:
        error(FATAL, "mongodb %s/%s" %(host, port), err)
    return client[db_name]
def getmodule_infos(package): def _id_generator(*arg):
''' return ID_SEPARATOR.join(arg)
returns programs provided by a package//module
htmldocs provided by a package//module
pack dependencies
information is more uptodate on programs than the
one provided by BIODOCS.
information may vary given the pack version, while
BIDOCS only document default version
returns a list of names.
'''
m = M.Module()
def get_progs(info):
    '''
    get programs provided by installed module version
    info: list of lines (str) of the module help text
    returns a list of program_names as str: every non blank line found
    after the "package provides following commands:" marker line.
    '''
    progs = []
    keep = False
    # iterate over the argument itself, not over a name borrowed from the
    # enclosing scope.
    for elem in info:
        elem = elem.strip()
        if not elem:
            continue
        if keep:
            progs.append(elem)
        # the marker line itself is not collected: keep is raised after the test above
        if elem == "package provides following commands:":
            keep = True
    return progs
def get_docs(info): def check_missing_biodocs(biodocs_lst):
'''
get local documentation available for given package
returns a tuple (htmldocs, txtdocs) of list as str
'''
htmldocs = []
txtdocs = []
keep = False
for elem in infos:
elem = elem.strip()
if not elem:
continue
if elem == "local documentation available:":
keep = True
continue
elif elem == "package provides following commands:":
break
if keep:
if 'bioweb2' in elem:
htmldocs.append(elem)
else:
txtdocs.append(elem)
return htmldocs, txtdocs
def get_dependencies(info):
    '''
    get the package/version that a given module depends or relies on.
    info: list of lines (str) of the module help text
    only the first line containing 'WARNING' is considered; dependencies are
    the whitespace separated tokens found between '<' and '>' on that line,
    without the 'or' / 'and' connectors and without duplicates (first seen
    order is kept).
    returns a list of packages_names, empty when there is no such line or
    when the line holds no '<...>' section.
    '''
    for elem in info:
        elem = elem.strip()
        if 'WARNING' not in elem:
            continue
        start = elem.find('<')
        end = elem.find('>', start + 1)
        if start == -1 or end == -1:
            # no delimited dependency list: nothing reliable to extract
            return []
        deps = []
        for token in elem[start + 1:end].split():
            # drop connectors and duplicates; each one is handled on its own
            # so a missing 'or' can no longer leave 'and' in the result
            if token in ('or', 'and') or token in deps:
                continue
            deps.append(token)
        return deps
    return []
if testmodule:
package = 'test/%s' %(package)
print ">>>>>>>>>", package
try:
infos=m.help(package)
except M.ModuleError as err:
raise M.ModuleError("no modulefile %s" %(package))
garbage = infos.find('*****')
if garbage != -1:
infos = infos[:garbage]
infos = infos.split('\n')
ret = {}
ret['progs'] = get_progs(infos)
ret['htmldocs'], ret['docs'] = get_docs(infos)
ret['depends'] = get_dependencies(infos)
return ret
def format_references(ref_lst):
'''
format reference information as a dictionary
suitable to be embeded in mongo package document
'''
def extract_doi(item):
'''
extract publication unique id (doi, pmid or pmcid) if
available on the reference description from BIODOCS.
return a tuble:
citation curated from uinique id
name of unique identifier ressource or None
unique id or None
'''
ret = [] ret = []
for ref in ref_lst: for biodoc in biodocs_lst:
if not ref: if not os.path.isfile(biodoc):
continue err(WARN, biodoc, 'no such file')
ret.append({ 'ID': ref['ID']
, 'IDtype': ref['idType']
, 'citation' : ref['title']
})
return ret
def version_extract(version):
    '''
    split a BIODOCS version name into its version part and default flag.
    returns a tuple:
       version as str (text located before the '(' when flagged as default)
       default flag: MTRUE / MFALSE when '(default)' is present, False otherwise
    '''
    if '(default)' not in version:
        return version, False
    number, marker = version.split('(')
    flag = MTRUE if marker else MFALSE
    return number, flag
def history_maker(history_info):
    '''
    format history in a mongo compatible way
    history_info: list of dict, each one holding a 'date' entry formatted
                  as 'YYYY/MM/DD'. empty entries are skipped.
    the 'date' value of every entry is replaced IN PLACE by the value of
    arrow.get(...).naive.
    returns the same list of dict
    { 'date'      : datetime.datetime object
    , 'operation' : keyword from 'install', 'remove', 'setdefault', 'unsetdefault', 'update'
    , 'message'   : free text as str }
    an unparsable date is reported through error(FATAL, ...).
    '''
    for info in history_info:
        if not info:
            continue
        try:
            info['date'] = arrow.get(info['date'], 'YYYY/MM/DD').naive
        except arrow.parser.ParserError as msg:
            # report the offending value (the former code referenced an
            # undefined name here and crashed with a NameError instead)
            error(FATAL, info['date'], msg)
    return history_info
def merge_htmldocs (biodocs_lst, module_lst):
    '''
    combine html documentation coming from BIODOCS and from the module.
    WARNING: the BIODOCS list is currently ignored, only the module
    provided list is handed back, untouched.
    '''
    merged = module_lst
    return merged
def getMapping(fileName):
mapper = {}
with open(fileName, 'r') as fh:
for line in fh:
line = line.strip()
if line.startswith('#'):
continue continue
old, new = line.split() ret.append(biodoc)
mapper[old] = new
return mapper
def updateCategories(lst, mapper=mapper):
ret = []
for elem in lst:
if elem in mapper:
ret.append(mapper[elem])
elif elem in EDAM_READY:
ret.append(elem)
else:
error(WARN, elem, "non matching edam operationn or topic")
return ret return ret
def getOrigin(pack): def get_biodocs(packdir):
return pack['ORIGIN'].split() # filter to keep only pack_version
pack_name = os.path.basename(packdir)
#-------------------------------------------------- pack_version_lst = [ os.path.join(packdir, d) for d in os.listdir(packdir) if d.startswith(pack_name) ]
# DO THE JOB biodocs_lst = [ os.path.join(d, 'BIODOCS.yaml') for d in pack_version_lst ]
#-------------------------------------------------- return check_missing_biodocs(biodocs_lst)
def pack2mongo(pack): def citation_key_translate(refs_col):
''' # mapping = {key in yaml : key expected in mongo }
format mongo package document from BIODOCS package informations mapping = {'ID': 'ID'
returns a mongo package document ,'idType' : 'IDtype'
''' ,'title': 'citation'
#---- get infos from BIODOCS parsed holder }
pack_name = pack['name'] for ref in refs_col:
pack_authors = [auth.strip() for auth in pack['authors'] if auth] for k, v in mapping.items():
pack_refs = format_references(pack['references']) ref[v] = ref.pop(k)
pack_library = MTRUE if pack['library'] else MFALSE return refs_col
pack_private = MTRUE if pack['private'] else MFALSE
hist = history_maker(pack['history'])
if hist is None: def info2package(info):
error(WARN, pack_name, "invalid history") pack_name = info['name']
hist = [] pack_id = _id_generator('pack', pack_name)
pack_history = hist pack_doc = {
pack_id = "pack%s%s" %(ID_SEPARATOR, pack_name) '_id': pack_id
pack_collections = [pack['origin']] , 'type' : 'package'
, 'name' : info['name']
#---- map gensoft Categories to edam relevant operation and topic terms , 'description' : info['description']
# categories = updateCategories(pack['categories']) , 'home' : info['home']
, 'source' : info['sources']
mongopack = { '_id': pack_id , 'authors' : info['authors']
, 'type': 'package' , 'categories' : info['operations'] + info['topics']
, 'name' : pack_name , 'topics' : info['topics']
, 'description' : pack['description'] #pack_description , 'operations': info['operations']
, 'home' : pack['home'] , 'collections' : [info['origin']]
, 'source' : pack['sources'] , 'history' : info['history']
, 'categories' : pack['operations'] + pack['topics'] , 'library' : info['library']
, 'authors' : pack_authors , 'private' : info['private']
, 'references' : pack_refs , 'references' : citation_key_translate(info['references'])
, 'library' : pack_library
, 'private' : pack_private
, 'history' : pack_history
, 'collections' : pack_collections
} }
return pack_doc
return mongopack
def packversion2mongo(pack, version, module_info): def info2pack_version(info):
''' pack_name = info['name']
format mongo package document from BIODOCS package informations pack_id = _id_generator("pack", pack_name)
returns a mongo package document pack_version = info['version']
''' pack_version_id = _id_generator("pack", pack_name, pack_version)
#---- get infos from BIODOCS parsed holder pack_version_doc = {
pack_name = pack['name'] '_id' : pack_version_id
pack_version, pack_default= version_extract(version)
pack_id = "pack%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version)
pack_htmldocs = pack['htmldocs']
pack_manpages = pack['manpages']
#---- complement with module info
pack_htmldocs = merge_htmldocs(pack_htmldocs, module_info['htmldocs'])
pack_docs = module_info['docs']
pack_depends = module_info['depends']
pack_id = "pack%s%s" %(ID_SEPARATOR, pack_name)
version_id = "pack%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version)
versionpack = { '_id': version_id
, 'type' : 'packageVersion' , 'type' : 'packageVersion'
, 'package': pack_id , 'package' : pack_id
, 'version' : pack_version , 'default' : info['default']
, 'default' : pack_default , 'version' : info['version']
, 'doc' : { 'html': pack_htmldocs , 'depends' : info['depends']
, 'docs': pack_docs , 'doc' : {'docs' : []
, 'man' : pack_manpages , 'html' : info['htmldocs']
, 'man' : []
} }
, 'depends' : pack_depends
} }
return pack_version_doc
return versionpack
def info2progs(info):
    '''
    build one 'program' document per entry of info['programs'].
    reads info['name'], info['version'] and, for every program entry, its
    'name', 'description' and 'manpages' values.
    returns a list of dict.
    '''
    pkg = info['name']
    ver = info['version']
    documents = []
    for entry in info['programs']:
        documents.append({
            '_id': _id_generator("prog", pkg, ver, entry['name']),
            'type': 'program',
            'name': entry['name'],
            # a false description is stored as an empty string
            'description': entry['description'] or '',
            'packageVersion': _id_generator("pack", pkg, ver),
            'categories': [],
            'doc': {'html': [], 'man': [entry['manpages']]},
        })
    return documents
def progs2mongo(pack_name, pack_version, prg_lst, module_info):
'''
format mongo program document from BIODOCS program informations
returns a list of programs mongo documents
'''
def mongoprog_creator(prog_id, pack_id, name, htmldocs=None, manpages=None, categories=None, description=''):
    '''
    build a 'program' document.
    prog_id:     value stored as '_id'
    pack_id:     value stored as 'packageVersion'
    name:        program name
    htmldocs:    list stored as doc.html   (new empty list when omitted)
    manpages:    list stored as doc.man    (new empty list when omitted)
    categories:  list stored as categories (new empty list when omitted)
    description: free text
    returns a dict
    '''
    # None defaults: every document gets its own lists instead of sharing
    # the same default list objects between calls.
    mongoprog= { '_id' : prog_id
               , 'type' : 'program'
               , 'packageVersion' : pack_id
               # use the argument, not a variable of the enclosing scope
               , 'name' : name
               , 'doc' : { 'html': [] if htmldocs is None else htmldocs
                         , 'man' : [] if manpages is None else manpages
                         }
               , 'categories' : [] if categories is None else categories
               , 'description' : description
               }
    return mongoprog
ret = []
#---- generate corresponding module name to get uptodate program version
#---- only default version package are documented, when they are ;-(
pack_version, _ = version_extract(pack_version)
provided_prgs = [prg.lower() for prg in module_info['progs'] if prg]
pack_id = "pack%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version)
#---- because BIODOCS only documents one version, we need to perform 2 operations:
#---- 1: remove programs documented in BIODOCS not available in module definitions
#---- 2: add programs in module definitions not documented in BIODOCS
for prog in prg_lst:
prog_name = prog['name']
# avoid typo documentation
if prog_name.lower() not in provided_prgs:
error(WARN, prog_name, 'not in module', "%s/%s" %(pack_name, pack_version))
continue
provided_prgs.remove(prog_name.lower())
prog_id = "prog%s%s%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version, ID_SEPARATOR, prog_name)
prog_description = prog['description'] if prog['description'] else ''
mongoprog = mongoprog_creator(prog_id, pack_id, prog_name, prog['HTMLDOCS'], prog['MANPAGES'], prog['CATEGORIES'], prog_description)
ret.append(mongoprog)
#---- insert non documented programs in BIODOCS if some
remaining = []
for prg in provided_prgs:
for prog_name in module_info['progs']:
if prog_name.lower() != prg:
continue
remaining.append(prog_name)
for prog_name in remaining:
prog_id = "prog%s%s%s%s%s%s" %(ID_SEPARATOR, pack_name, ID_SEPARATOR, pack_version, ID_SEPARATOR, prog_name)
mongoprog = mongoprog_creator(prog_id, pack_id, prog_name)
ret.append(mongoprog)
return ret def info2mobyle(info):
# import mobyledefs
##---- should no longer be used with Biodocs.yaml
# is the version used by mobyle ?
# if "%s/%s" %(pack_name, pack_version) not in mobyledefs.pack_info:
# return mobyle
def getweb(web_info): mobyle = []
pack_name = info['name']
pack_version = info['version']
pattern= '(?P<name>[\w-]+)\s*\[(?P<desc>.*)\]' pattern= '(?P<name>[\w-]+)\s*\[(?P<desc>.*)\]'
auto = re.compile(pattern) auto = re.compile(pattern)
match = auto.match(web_info)
if match is None:
error(WARN, 'invalib web description', web_info)
interface_name = match.group('name')
description = match.group('desc')
mobyle_url = "%s#forms::%s" % (MOBYLEURL, interface_name) for prg in info['programs']:
return interface_name, mobyle_url, description prog_name = prg['name']
# something to document ?
def mobyle2mongo(pack_name, pack_version, prg_lst, module_info): web = prg.get('web', False)
''' if not web:
format mongo mobyle document from BIODOCS program informations
returns a list of mobyle mongo documents
'''
ret = []
#---- filter progs with WEB interface.
prg_lst = filter(lambda n: 'WEB' in n, prg_lst)
if not prg_lst:
return ret
pack_version, _ = version_extract(pack_version)
interfaces = collections.defaultdict(list)
for prog in prg_lst:
prog_name=prog['NAME']
# one program may hold multiple interfaces
for interface in prog['WEB']:
if not interface :
continue continue
mobyle_interface_name, mobyle_url, mobyle_desc = getweb(interface) for web_info in web:
interfaces[mobyle_interface_name].append((prog_name, mobyle_url, mobyle_desc)) interface = {}
match = auto.match(web_info)
#--- reduce and consistency check if match is None:
for interface in interfaces: err(WARN, 'invalib web description', web_info)
programs = [] i_name = match.group('name')
urls = set() i_desc = match.group('desc')
descs = [] interface['_id'] = _id_generator("mobyle", pack_name, pack_version, i_name)
for prg, url, desc in interfaces[interface]: interface['type'] = 'mobyle'
programs.append(prg) interface['package'] = _id_generator("pack", pack_name, pack_version)
urls.add(url) interface['programs'] = [_id_generator("prog", pack_name, pack_version, prog_name)]
descs.append(desc) interface['name'] = i_name
if len(urls) != 1: interface['description'] = i_desc
error(FATAL, pack_name, pack_version, "PROG.WEB conflict") interface['url'] = "%s#forms::%s" % (MOBYLEURL, interface['name'])
interfaces[interface] = (list(urls), programs, descs) mobyle.append(interface)
return mobyle