diff --git a/README.rst b/README.rst index cbd0f95f532dbcee859f82b070fb3070540a1550..1c87a14b9d56a0c32d69b224aa83e4797c7cae13 100644 --- a/README.rst +++ b/README.rst @@ -3,17 +3,80 @@ Larva IO & QC **larva-io** features basic *load* and *save* routines for data files derived from larva experiment. -This includes *.spine* and *.outline* ascii files, *trx.mat* mat files, and many more to come. +This includes *.spine* and *.outline* ascii files, *trx.mat* mat files, and others to come. Installation ------------ Python >= 3.6 is required. -*pip* should work just fine: - -:: +The **larva-io** package is not available on PyPI yet. You can get a copy of it and install it locally:: + git clone git@gitlab.pasteur.fr:flaurent/larva-io.git + cd larva-io pip install . *pip install* will install some Python dependencies if missing, but you may still need to install the `HDF5 reference library <https://support.hdfgroup.org/downloads/index.html>`_. + +Basic usage +----------- + +For each supported format in the `larva.io` package, three functions are provided: + +* `load` that loads the entire data from a file into a format that depends on the backend for this file format, +* `save` that exports data to a file, +* `check` that runs a series of quality control checks on the data. + +For example, the `larva.io.chore` subpackage exports `load_spine`, `load_outline`, `save_spine`, etc. + +All such functions exhibit the same base interface: + +.. code:: python + + data = load_spine(input_filepath) + + save_spine(output_filepath, data) + + check_spine(filepath) + # or: + check_spine(data) + # or else: + check_spine(data, datasource=filepath) + +These functionalities are also available as type classes. For example, `larva.io.chore` exports the `Spine` and `Outline` classes: + +.. 
code:: python + + spine = Spine(input_filepath) + data = spine.load() + + spine = Spine(output_filepath) + spine.save(data) + + spine = Spine(filepath) + spine.check() + spine.check(data) + + spine = Spine() + spine.check(data) + +Data quality control +-------------------- + +The `check` function or method also accepts the keyword argument `policy`. +By default, `check` raises a `QCFailure` error on the first failure to comply with a quality control check (`policy='fail'`). + +Any value other than `'fail'` (or default `None`) lets all the checks run, and `check` returns comprehensive diagnosis information. + +The `policy` argument also accepts the value `'fix'`, which makes each check try to fix the data, possibly removing the data elements that do not comply. + +Not all the checks can fix the data. In this case, a check only adds diagnosis information and passes on the data to the next check. + +Not all the file formats or backends include quality control. + +Lazily loading data +------------------- + +In addition to the simple `load` and `save` functions and methods, a backend may feature extra functionalities, especially to lazily process data. + +For example, the `larva.io.trx` package provides the `Trx` class with a `list_varnames` method, and equivalently a `list_trx_varnames` function. diff --git a/larva/qc/check.py b/larva/qc/check.py index c4b861fb9eaa1727c831258004cf2cd0c8174a06..0cae8988999103ff19ac59c974277c516b5a0ae3 100644 --- a/larva/qc/check.py +++ b/larva/qc/check.py @@ -2,6 +2,20 @@ import logging from .exceptions import QCFailure class QCController: + """ + Controller for quality control checks. + + An instance is passed as first argument to each quality control function. + + The main features a quality control function can access are: + + * logging, with methods `debug`, `info`, `warn` and `error`, + * reporting of data items that do not comply (`report`), + * asking whether to fix the items that do not comply (`tryfix`). 
+ + Calling `report` may raise an error or return depending on the selected + policy. + """ def __init__(self, qccheck=None, datasource=None): self._logger = None @@ -27,6 +41,11 @@ class QCController: @property def logger(self): + """ + logging.Logger: + Logging utility, whose main methods are also exposed by the + `QCController` object + """ if self._logger is None: self._logger = logging.getLogger(__name__) if not self._logger.hasHandlers(): @@ -44,6 +63,14 @@ class QCController: return f"in file '{src}', check '{name}' failed with: {kwargs}" def report(self, **kwargs): + """ + Register the input key-value pairs for diagnosis and either raise a + `QCFailure` error to interrupt the quality control, or silently return + to allow for qc continuation. + + Every quality control function is free to define its own diagnosis + keys. + """ self.records.append(kwargs) if self.policy is None or self.policy == 'fail': raise QCFailure(self.format(kwargs)) @@ -51,28 +78,69 @@ class QCController: logger = self.logger.warn(self.format(kwargs)) def tryfix(self): + """ + Boolean method that returns :const:`True` if fixing the data items that + do not comply is desired, :const:`False` otherwise. + + A quality control function is not expected to call this method if no + fixing strategies are implemented. + """ return 'fix' in self.policy def diagnose(self): + """ + Return a compilation of all reported key-value pairs. + """ return list(self.records) @property def debug(self): + """ + method: Shortcut to the `logger.debug` method + """ return self.logger.debug @property def info(self): + """ + method: Shortcut to the `logger.info` method + """ return self.logger.info @property def warn(self): + """ + method: Shortcut to the `logger.warn` method + """ return self.logger.warn @property def error(self): + """ + method: Shortcut to the `logger.error` method + """ return self.logger.error class QCCheck: + """ + Wrapper for quality control functions. 
+ + A `QCCheck` object is callable, but the type signature differs from that of + the wrapped function. + + Especially, it supports two different signatures: + + * if the data are passed as a positional argument (more generally if any + positional argument is passed), the wrapped function is called with a + `QCController` instance as first input argument and the data (or + datasource) as second argument, plus additional keyword arguments. If no + `QCFailure` error is raised, the modified data are returned as first + output argument and a diagnosis `dict` as second output argument. + + * if only keyword arguments are passed, they are stored and passed to the + wrapped function on each subsequent call to this function. + + """ __slots__ = ( 'fun', 'args', @@ -120,7 +188,8 @@ class QCCheck: def __enter__(self): if self.active_policy is not None: - self._controller.logger.warn('active_policy already set; will be overwritten') + log = self._controller.logger.warn + log('active_policy already set; will be overwritten') return self def __exit__(self, *args): @@ -136,7 +205,11 @@ class QCCheck: input_data = datasource elif callable(input_data): input_data = input_data() - fun_kwargs = { kw: arg for kw, arg in self.args.items() if arg is not None } + fun_kwargs = { + kw: arg \ + for kw, arg in self.args.items() \ + if arg is not None + } fun_kwargs.update(kwargs) self.active_policy = fun_kwargs.pop('policy', None) ctrl = self.get_controller(datasource) @@ -163,6 +236,22 @@ class QCCheck: return getattr(self.fun, attr) class OrderedSet: + """ + Sequence of unique elements (or sequence-like set) that preserves insertion + order. + + An `OrderedSet` can also be viewed as an `OrderedDict` for element types + that can contain the keys. + + An element is translated into a key with the `get_elem_id` function. + Such a function should return an id value that is not an `int`, not to be + confused with indices, and should also differ from the element type. 
+ + Unlike `set` or other typical set types, the elements passed on + initialization are assumed to be unique. + + Uniqueness is checked only on insertion methods such as `insert` or `add`. + """ __slots__ = '_elems', '_get_elem_id', '_id_checked' @@ -180,6 +269,9 @@ class OrderedSet: @property def get_elem_id(self): + """ + callable: Function that takes an element and returns an id + """ return self._get_elem_id @get_elem_id.setter @@ -191,6 +283,11 @@ class OrderedSet: @property def elems(self): + """ + list: + Sequence of unique elements; + setting `elems` does not check for duplicate values + """ return self._elems @elems.setter @@ -237,6 +334,10 @@ class OrderedSet: return elem def index(self, elem): + """ + Return the index of the first occurrence of `elem`, or raise a + `ValueError` exception if not found. + """ prefix, suffix = self.split(elem) if suffix: return len(prefix) @@ -245,6 +346,12 @@ class OrderedSet: raise ValueError(f"cannot find elem '{_id}'") def split(self, elem): + """ + Split the sequence of elements at the first occurrence of `elem`. + + Two lists are returned so that the matching element is at the head of + the second list. + """ _id = self.get_elem_id(elem) prefix, suffix = [], [] for _elem in self.elems: @@ -261,9 +368,17 @@ class OrderedSet: return prefix, suffix def reverse(self): + """ + Reverse the sequence of elements. Works in place. + """ self._elems = self.elems[::-1] def move(self, index, elem): + """ + Move element `elem` to position `index` in the sequence. + + If `elem` is not found, a `ValueError` exception is raised. + """ if index < 0: self.reverse() try: @@ -294,6 +409,11 @@ class OrderedSet: raise def insert(self, index, elem, fail_if_exists=False): + """ + Insert element `elem` into the sequence at position `index`. + + Works in place. 
+ """ if elem in self: if fail_if_exists: _id = self.get_elem_id(elem) @@ -304,6 +424,14 @@ class OrderedSet: self.move(index, elem) def add(self, elem, fail_if_exists=False): + """ + Add or append an element. + + If `elem` is found in the sequence, nothing happens. `elem` is + appended to the right end of the sequence only if it is missing. + + Works in place. + """ if elem in self: if fail_if_exists: _id = self.get_elem_id(elem) @@ -331,6 +459,11 @@ class OrderedSet: return set1 class QCChecks(OrderedSet): + """ + Series of quality control checks. + + Each checking function should be wrapped into a `QCCheck` instance. + """ __slots__ = () @@ -354,6 +487,36 @@ class QCChecks(OrderedSet): raise AttributeError(e.msg) from None def qc_check(*args, default_policy=None, input_type=None): + """ + Wrapping function intended to be used as a decorator: + + .. code:: python + + @qc_check + def my_data_check(controller, data): + # check data + ... + + @qc_check(input_type='datasource') + def my_filepath_check(controller, datasource): + # check datasource + ... + + @qc_check(default_policy='fix') + def my_data_fix(controller, data): + # check data + ... + + # on failure to comply + controller.report(my_diagnosis_key=my_diagnosis_value, ...) + + if controller.tryfix(): + # fix data + ... + + return data + + """ if args: fun = args[0] if args[1:] or not callable(fun): diff --git a/larva/qc/file.py b/larva/qc/file.py index 916123fc407ed96db7c54799bc14c81f5d037d1f..4e027905fdf4477659face1e93526cb747f95734 100644 --- a/larva/qc/file.py +++ b/larva/qc/file.py @@ -75,23 +75,6 @@ class QCFile: else: raise AttributeError('not a list') - @classmethod - def reify(cls, *args, filepath=None, **kwargs): - """ - The first positional argument, if of type *str* or *Path*, - is interpreted as *filepath*, and a *TypeError* exception - is raised if keyworded *filepath* is also defined. 
- """ - if args and isinstance(args[0], (str, Path)) and args[0]: - if filepath is None: - filepath = args[0] - else: - raise TypeError('filepath is specified twice') - self = cls(filepath=filepath) - if not self.filepath: - raise ValueError('filepath is not defined') - return self - def load(self, *args, **kwargs): return self.backend.load(self.filepath, *args, **kwargs) @@ -122,8 +105,11 @@ def asfun(cls, met): _fun.__doc__ = met.__doc__ return _fun -load = lambda cls: asfun(cls, cls.load) -save = lambda cls: asfun(cls, cls.save) +def load(cls): + return asfun(cls, cls.load) + +def save(cls): + return asfun(cls, cls.save) def check(cls): def _check(filepath_or_data, policy=None, **kwargs):