diff --git a/README.rst b/README.rst index cbd0f95f532dbcee859f82b070fb3070540a1550..1c87a14b9d56a0c32d69b224aa83e4797c7cae13 100644 --- a/README.rst +++ b/README.rst @@ -3,17 +3,80 @@ Larva IO & QC **larva-io** features basic *load* and *save* routines for data files derived from larva experiment. -This includes *.spine* and *.outline* ascii files, *trx.mat* mat files, and many more to come. +This includes *.spine* and *.outline* ascii files, *trx.mat* mat files, and others to come. Installation ------------ Python >= 3.6 is required. -*pip* should work just fine: - -:: +The **larva-io** package is not available on PyPI yet. You can get a copy of it and install it locally:: + git clone git@gitlab.pasteur.fr:flaurent/larva-io.git + cd larva-io pip install . *pip install* will install some Python dependencies if missing, but you may still need to install the `HDF5 reference library <https://support.hdfgroup.org/downloads/index.html>`_. + +Basic usage +----------- + +For each supported format in the `larva.io` package, three functions are provided: + +* `load` that loads the entire data from a file into a format that depends on the backend for this file format, +* `save` that exports data to a file, +* `check` that runs a series of quality control checks on the data. + +For example, the `larva.io.chore` subpackage exports `load_spine`, `load_outline`, `save_spine`, etc. + +All such functions exhibit the same base interface: + +.. code:: python + + data = load_spine(input_filepath) + + save_spine(output_filepath, data) + + check_spine(filepath) + # or: + check_spine(data) + # or else: + check_spine(data, datasource=filepath) + +These functionalities are also available as type classes. For example, `larva.io.chore` exports the `Spine` and `Outline` classes: + +.. 
code:: python + + spine = Spine(input_filepath) + data = spine.load() + + spine = Spine(output_filepath) + spine.save(data) + + spine = Spine(filepath) + spine.check() + spine.check(data) + + spine = Spine() + spine.check(data) + +Data quality control +-------------------- + +The `check` function or method also accepts the keyword argument `policy`. +By default, `check` raises a `QCFailure` error on the first failure to comply with a quality control check (`policy='fail'`). + +Any value other than `'fail'` (or default `None`) lets all the checks run, and `check` returns comprehensive diagnosis information. + +The `policy` argument also accepts the value `'fix'`, which makes each check try to fix the data, possibly removing the data elements that do not comply. + +Not all the checks can fix the data. When a check cannot, it only adds diagnosis information and passes the data on to the next check. + +Not all the file formats or backends include quality control. + +Lazily loading data +------------------- + +In addition to the simple `load` and `save` functions and methods, a backend may feature extra functionalities, especially to lazily process data. + +For example, the `larva.io.trx` package provides the `Trx` class with a `list_varnames` method, and equivalently a `list_trx_varnames` function. diff --git a/larva/README.md b/larva/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0103498b3e7708cbfcda02a477c0fa79c6763046 --- /dev/null +++ b/larva/README.md @@ -0,0 +1,11 @@ +Package structure +================= + +*larva* contains two subpackages: + +* [*io*](io): + exposes subpackages for reading and writing files of different formats, + +* [*qc*](qc): + implements general quality control mechanics used by *io* subpackages. 
+ diff --git a/larva/io/README.md b/larva/io/README.md new file mode 100644 index 0000000000000000000000000000000000000000..513d6ed4c53754afe914ab458bb28720181396d5 --- /dev/null +++ b/larva/io/README.md @@ -0,0 +1,8 @@ +Supported file formats +====================== + ++---------+------------------------+ +| *chore* | _*.outline_, _*.spine_ | ++---------+------------------------+ +| *trx* | _trx.mat_ | ++---------+------------------------+ diff --git a/larva/qc/README.rst b/larva/qc/README.rst new file mode 100644 index 0000000000000000000000000000000000000000..0a8766f9ab01d0c4d0b241a41e373f1def340ab1 --- /dev/null +++ b/larva/qc/README.rst @@ -0,0 +1,32 @@ +Quality control for larva-related data +====================================== + +A quality control function tests for a single desired or undesired +feature of the data, and may report any lack of compliance. + +A full-featured qc function will be structured as follows: + +.. code:: python + + @qc_check + def my_check(controller, data, my_check_extra_argument0=None, ...): + + # crawl the data + for item in data: + + # check for compliance + ... + + # if `item` does not comply + controller.report(my_diagnostic_key0=my_diagnostic_value0, ...) + + if controller.tryfix(): + # modify `data` + ... + + return modified_data + +Such a function takes a *controller* object as input argument, and can query +this object for directions to follow; in particular, when the data can be +fixed, it can ask whether it should fix the data or not. + diff --git a/larva/qc/check.py b/larva/qc/check.py index c4b861fb9eaa1727c831258004cf2cd0c8174a06..7d2e655f5a7b2b428b6010f15de6fbaf073bee7e 100644 --- a/larva/qc/check.py +++ b/larva/qc/check.py @@ -2,6 +2,21 @@ import logging from .exceptions import QCFailure class QCController: + """ + Controller for quality control checks. + + An instance is passed as first argument to each quality control + function. 
+ + The main features a quality control function can access are: + + * logging, with methods `debug`, `info`, `warn` and `error`, + * reporting of data items that do not comply (`report`), + * asking whether to fix the items that do not comply (`tryfix`). + + Calling `report` may raise an error or return depending on the + selected policy. + """ def __init__(self, qccheck=None, datasource=None): self._logger = None @@ -27,6 +42,11 @@ class QCController: @property def logger(self): + """ + logging.Logger: + Logging utility, whose main methods are also exposed by the + `QCController` object + """ if self._logger is None: self._logger = logging.getLogger(__name__) if not self._logger.hasHandlers(): @@ -44,6 +64,14 @@ class QCController: return f"in file '{src}', check '{name}' failed with: {kwargs}" def report(self, **kwargs): + """ + Register the input key-value pairs for diagnosis and either + raise a `QCFailure` error to interrupt the quality control, or + silently return to allow for qc continuation. + + Every quality control function is free to define its own + diagnosis keys. + """ self.records.append(kwargs) if self.policy is None or self.policy == 'fail': raise QCFailure(self.format(kwargs)) @@ -51,28 +79,70 @@ class QCController: logger = self.logger.warn(self.format(kwargs)) def tryfix(self): + """ + Boolean method that returns :const:`True` if fixing the data + items that do not comply is desired, :const:`False` otherwise. + + A quality control function is not expected to call this method + if no fixing strategies are implemented. + """ return 'fix' in self.policy def diagnose(self): + """ + Return a compilation of all reported key-value pairs. 
+ """ return list(self.records) @property def debug(self): + """ + method: Shortcut to the `logger.debug` method + """ return self.logger.debug @property def info(self): + """ + method: Shortcut to the `logger.info` method + """ return self.logger.info @property def warn(self): + """ + method: Shortcut to the `logger.warn` method + """ return self.logger.warn @property def error(self): + """ + method: Shortcut to the `logger.error` method + """ return self.logger.error class QCCheck: + """ + Wrapper for quality control functions. + + A `QCCheck` object is callable, but the type signature differs from + that of the wrapped function. + + In particular, it supports two different signatures: + + * if the data are passed as a positional argument (more generally + if any positional argument is passed), the wrapped function is + called with a `QCController` instance as first input argument and + the data (or datasource) as second argument, plus additional + keyword arguments. If no `QCFailure` error is raised, the + modified data are returned as first output argument and a + diagnosis `dict` as second output argument. + + * if only keyword arguments are passed, they are stored and passed + to the wrapped function on each subsequent call to this function. 
+ + """ __slots__ = ( 'fun', 'args', @@ -120,7 +190,8 @@ class QCCheck: def __enter__(self): if self.active_policy is not None: - self._controller.logger.warn('active_policy already set; will be overwritten') + log = self._controller.logger.warn + log('active_policy already set; will be overwritten') return self def __exit__(self, *args): @@ -136,7 +207,11 @@ class QCCheck: input_data = datasource elif callable(input_data): input_data = input_data() - fun_kwargs = { kw: arg for kw, arg in self.args.items() if arg is not None } + fun_kwargs = { + kw: arg \ + for kw, arg in self.args.items() \ + if arg is not None + } fun_kwargs.update(kwargs) self.active_policy = fun_kwargs.pop('policy', None) ctrl = self.get_controller(datasource) @@ -163,6 +238,24 @@ class QCCheck: return getattr(self.fun, attr) class OrderedSet: + """ + Sequence of unique elements (or sequence-like set) that preserves + insertion order. + + An `OrderedSet` can also be viewed as an `OrderedDict` for element + types that can contain the keys. + + An element is translated into a key with the `get_elem_id` function. + Such a function should return an id value that is not an `int`, not + to be confused with indices, and should also differ from the element + type. + + Unlike `set` or other typical set types, the elements passed on + initialization are assumed to be unique. + + Uniqueness is checked only on insertion methods such as `insert` or + `add`. 
+ """ __slots__ = '_elems', '_get_elem_id', '_id_checked' @@ -180,6 +273,9 @@ class OrderedSet: @property def get_elem_id(self): + """ + callable: Function that takes an element and returns an id + """ return self._get_elem_id @get_elem_id.setter @@ -191,6 +287,11 @@ class OrderedSet: @property def elems(self): + """ + list: + Sequence of unique elements; + setting `elems` does not check for duplicate values + """ return self._elems @elems.setter @@ -237,6 +338,10 @@ class OrderedSet: return elem def index(self, elem): + """ + Return the index of the first occurrence of `elem`, or raise a + `ValueError` exception if not found. + """ prefix, suffix = self.split(elem) if suffix: return len(prefix) @@ -245,6 +350,12 @@ class OrderedSet: raise ValueError(f"cannot find elem '{_id}'") def split(self, elem): + """ + Split the sequence of elements at the first occurrence of `elem`. + + Two lists are returned so that the matching element is at the + head of the second list. + """ _id = self.get_elem_id(elem) prefix, suffix = [], [] for _elem in self.elems: @@ -261,9 +372,17 @@ class OrderedSet: return prefix, suffix def reverse(self): + """ + Reverse the sequence of elements. Works in place. + """ self._elems = self.elems[::-1] def move(self, index, elem): + """ + Move element `elem` to position `index` in the sequence. + + If `elem` is not found, a `ValueError` exception is raised. + """ if index < 0: self.reverse() try: @@ -294,6 +413,11 @@ class OrderedSet: raise def insert(self, index, elem, fail_if_exists=False): + """ + Insert element `elem` into the sequence at position `index`. + + Works in place. + """ if elem in self: if fail_if_exists: _id = self.get_elem_id(elem) @@ -304,6 +428,14 @@ class OrderedSet: self.move(index, elem) def add(self, elem, fail_if_exists=False): + """ + Add or append an element. + + If `elem` is found in the sequence, nothing happens. `elem` is + appended to the right end of the sequence only if it is missing. + + Works in place. 
+ """ if elem in self: if fail_if_exists: _id = self.get_elem_id(elem) @@ -331,6 +463,11 @@ class OrderedSet: return set1 class QCChecks(OrderedSet): + """ + Series of quality control checks. + + Each checking function should be wrapped into a `QCCheck` instance. + """ __slots__ = () @@ -354,6 +491,43 @@ class QCChecks(OrderedSet): raise AttributeError(e.msg) from None def qc_check(*args, default_policy=None, input_type=None): + """ + Wrapping function intended to be used as a decorator: + + .. code:: python + + @qc_check + def my_data_check(controller, data): + # check data + ... + + @qc_check(input_type='datasource') + def my_filepath_check(controller, datasource): + # check datasource + ... + + @qc_check(default_policy='fix') + def my_data_fix(controller, data): + # check data + ... + + # on failure to comply + controller.report(my_diagnosis_key=my_diagnosis_value, ...) + + if controller.tryfix(): + # fix data + ... + + return data + + By default, newly decorated quality control functions are NOT + associated with any file type. + In particular, they may be specific to a backend for the target file + type. + As a consequence, each backend defines its own quality control + checks. + + """ if args: fun = args[0] if args[1:] or not callable(fun): diff --git a/larva/qc/file.py b/larva/qc/file.py index 916123fc407ed96db7c54799bc14c81f5d037d1f..4e027905fdf4477659face1e93526cb747f95734 100644 --- a/larva/qc/file.py +++ b/larva/qc/file.py @@ -75,23 +75,6 @@ class QCFile: else: raise AttributeError('not a list') - @classmethod - def reify(cls, *args, filepath=None, **kwargs): - """ - The first positional argument, if of type *str* or *Path*, - is interpreted as *filepath*, and a *TypeError* exception - is raised if keyworded *filepath* is also defined. 
- """ - if args and isinstance(args[0], (str, Path)) and args[0]: - if filepath is None: - filepath = args[0] - else: - raise TypeError('filepath is specified twice') - self = cls(filepath=filepath) - if not self.filepath: - raise ValueError('filepath is not defined') - return self - def load(self, *args, **kwargs): return self.backend.load(self.filepath, *args, **kwargs) @@ -122,8 +105,11 @@ def asfun(cls, met): _fun.__doc__ = met.__doc__ return _fun -load = lambda cls: asfun(cls, cls.load) -save = lambda cls: asfun(cls, cls.save) +def load(cls): + return asfun(cls, cls.load) + +def save(cls): + return asfun(cls, cls.save) def check(cls): def _check(filepath_or_data, policy=None, **kwargs):