diff --git a/Project.toml b/Project.toml index 4fefa105e15dca5838e8e13597c110207b17085b..1aebd820e4bba1e480e1295ed6bb7aa2dcefe10b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TaggingBackends" uuid = "e551f703-3b82-4335-b341-d497b48d519b" authors = ["François Laurent", "Institut Pasteur"] -version = "0.7.2" +version = "0.8" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/README.md b/README.md index d6b1a6d328470016749f6ad279d302bd9a813638..e372ac9ca40b4a2e26cfad5ca4deed7cc22ab277 100644 --- a/README.md +++ b/README.md @@ -193,3 +193,4 @@ JULIA_PROJECT=<path> scripts/larvatagger.jl train ``` with `<path>` the path of the Julia project with TaggingBackends installed. +Note however that the last command above will not work if Julia was installed using juliaup. Prefer [jill](https://pypi.org/project/jill/). diff --git a/pyproject.toml b/pyproject.toml index 84e30fe3132801c1d9a3d26c05b4f9a0f8e9f24e..cb7e01260aeb61f0c67ab38d5f9704ac8a81c8f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "TaggingBackends" -version = "0.7.2" +version = "0.8" description = "Backbone for LarvaTagger.jl tagging backends" authors = ["François Laurent"] diff --git a/src/taggingbackends/data/convert.py b/src/taggingbackends/data/convert.py new file mode 100644 index 0000000000000000000000000000000000000000..2a65318d2706fc45d90f1480c2e0a49f9581c866 --- /dev/null +++ b/src/taggingbackends/data/convert.py @@ -0,0 +1,35 @@ +from .trxmat import TrxMat +from .labels import Labels +import numpy as np +from pathlib import Path + +""" +Import behavior labels from a trx.mat file and return a Labels object. +""" +def import_labels_from_trxmat(trxmat_file, labels, decode=True): + if not isinstance(labels, list) or not labels: + raise ValueError("labels should be a non-empty list of strings") + if isinstance(trxmat_file, str): + trxmat_file = Path(trxmat_file) + trxmat = TrxMat(trxmat_file) + trxmat_labels = trxmat.read(['t'] + labels) + imported_labels = Labels(labelspec=labels, tracking=[trxmat_file]) + run = next(iter(trxmat_labels[labels[0]])) + larvae = list(trxmat_labels[labels[0]][run]) + for larva in larvae: + times = trxmat_labels['t'][run][larva] + for i, label in enumerate(labels): + indicator = (i+1) * (trxmat_labels[label][run][larva]==1) + if i==0: + encoded_labels = indicator + else: + encoded_labels += indicator + if np.any(len(labels) < encoded_labels): + raise NotImplementedError("overlapping labels") + if decode: + _labels = imported_labels.decode(encoded_labels) + else: + _labels = encoded_labels + imported_labels[(run, larva)] = {t: l for t, l in zip(times, _labels)} + return imported_labels + diff --git a/src/taggingbackends/data/labels.py b/src/taggingbackends/data/labels.py index d741a89ecbecd4960365a2532469729c828d73e0..f9402f502e3a8ee746d1bd54bd07cd57fea41206 100644 --- a/src/taggingbackends/data/labels.py +++ b/src/taggingbackends/data/labels.py @@ -13,6 +13,19 @@ Python interface to JSON *.label* files. It is a poor parent of Julia struct `Dataset` from `PlanarLarvae.jl`. +`Labels` can be used as a `dict`, with (runid, trackid) pairs as keys. +To iterate over its content and access timeseries of labels, you can do: + +``` +for run_larva in labels: + _, larva_id = run_larva + timeseries_data = labels[run_larva] + ... +``` + +A `Labels` object will typically describe a single run, but can technically +represent several runs. + The following example function opens such a *.label* file and loads the associated track data assumed to be available as a *trx.mat* file. It iterates over the tracks defined in the *.label* file. The same tracks are @@ -46,11 +59,6 @@ def retagged_trxmat_confusion_matrix(label_file): return cm ``` -Note that unlike the `TrxMat` class, keys in `Labels` objects are -(runid, trackid) pairs. -A `Labels` object will typically describe a single run, but can technically -represent several runs. - The `labelspec` attribute is assumed to be a list, which is valid for *.label* generated by automatic tagging. Manual tagging will store label names as `Labels.labelspec['names']`, because @@ -301,28 +309,57 @@ class Labels: for timestamp, label in zip(track["t"], track["labels"])} return self - def encode(self, labels): - if isinstance(self.labelspec, dict): - labelset = self.labelspec["names"] + """ + Encode the text labels as indices (`int` or `list` of `int`). + + Labels are 1-indexed. If shifted down, the indices apply to attribute + `labelspec`. + """ + def encode(self, label=None): + if label is None: + encoded = label = self + for run_larva in label: + label[run_larva] = self.encode(label[run_larva]) + elif isinstance(label, dict): + encoded = {t: self.encode(l) for t, l in label.items()} else: - labelset = self.labelspec - encoded = [] - for label in labels: + if isinstance(self.labelspec, dict): + labelset = self.labelspec['names'] + else: + labelset = self.labelspec if isinstance(label, str): - encoded.append(labelset.index(label)+1) + encoded = labelset.index(label) + 1 + elif isinstance(label, int): + encoded = label + logging.debug('label(s) already encoded') else: - encoded.append([labelset.index(label)+1 for label in label]) + encoded = [labelset.index(l) + 1 for l in label] return encoded - def decode(self, label): - if isinstance(self.labelspec, dict): - labelset = self.labelspec["names"] + """ + Decode the label indices as text (`str` or `list` of `str`). + + Text labels are picked in `labelspec`. + """ + def decode(self, label=None): + if label is None: + decoded = label = self + for run_larva in label: + label[run_larva] = self.decode(label[run_larva]) + elif isinstance(label, dict): + decoded = {t: self.decode(l) for t, l in label.items()} else: - labelset = self.labelspec - if isinstance(label, int): - decoded = labelset[label-1] - else: - decoded = [labelset[label-1] for label in label] + if isinstance(self.labelspec, dict): + labelset = self.labelspec['names'] + else: + labelset = self.labelspec + if isinstance(label, int): + decoded = labelset[label-1] + elif isinstance(label, str): + decoded = label + logging.debug('label(s) already decoded') + else: + decoded = [labelset[l-1] for l in label] return decoded class LabelEncoder(json.JSONEncoder): diff --git a/src/taggingbackends/explorer.py b/src/taggingbackends/explorer.py index c8d123adee3324458ca7e256225783ab820b6067..803eb72db4cdbced317f9cca00e1984db9ffc6e4 100644 --- a/src/taggingbackends/explorer.py +++ b/src/taggingbackends/explorer.py @@ -7,6 +7,7 @@ import fnmatch import importlib import logging import subprocess +import tempfile from collections import defaultdict JULIA_PROJECT = os.environ.get('JULIA_PROJECT', '') @@ -14,15 +15,15 @@ JULIA_PROJECT = os.environ.get('JULIA_PROJECT', '') try: from julia import Julia Julia(compiled_modules=False) -except ImportError: +except: logging.warning(f"PyCall not found in JULIA_PROJECT={JULIA_PROJECT}; \ - please see https://gitlab.pasteur.fr/nyx/TaggingBackends#recommended-installation") +please see https://gitlab.pasteur.fr/nyx/TaggingBackends#recommended-installation") else: try: from julia import TaggingBackends except ImportError: logging.warning(f"TaggingBackends not found in JULIA_PROJECT={JULIA_PROJECT}; \ - please see https://gitlab.pasteur.fr/nyx/TaggingBackends#recommended-installation") +please see https://gitlab.pasteur.fr/nyx/TaggingBackends#recommended-installation") def getlogger(name): logger = logging.getLogger(name) @@ -47,7 +48,8 @@ class BackendExplorer: Locator for paths to data, scripts, model instances, etc. """ - def __init__(self, project_dir=None, package_name=None, model_instance=None): + def __init__(self, project_dir=None, package_name=None, model_instance=None, + sandbox=None): self.project_dir = pathlib.Path(os.getcwd() if project_dir is None else project_dir) logging.debug(f"project directory: {self.project_dir}") self._package_name = package_name @@ -57,6 +59,8 @@ class BackendExplorer: self._build_features = None self._train_model = None self._predict_model = None + # + self._sandbox = sandbox @property def package_name(self): @@ -155,7 +159,10 @@ Cannot find any Python package in project root directory: logger.info("\n".join(lines)) log(line[len(prefix):].lstrip()) return [] - lines.append(line) + if line.startswith(" warn(") or line.startswith(" warnings.warn("): + pass + else: + lines.append(line) return lines def _parse_stderr(self, logger, lines, line): @@ -267,6 +274,15 @@ run `poetry add {pkg}` from directory: \n raise return pkg + @property + def sandbox(self): + if self._sandbox is False: + self._sandbox = None + elif self._sandbox is True: + self._sandbox = pathlib.Path(tempfile.mkdtemp(dir=self.project_dir / 'data' / 'raw')).name + logging.info(f"sandboxing in {self._sandbox}") + return self._sandbox + def _model_dir(self, parent_dir, model_instance=None, create_if_missing=True): if model_instance is None: model_instance = self.model_instance @@ -278,19 +294,19 @@ run `poetry add {pkg}` from directory: \n def raw_data_dir(self, model_instance=None, create_if_missing=True): return self._model_dir( self.project_dir / "data" / "raw", - model_instance, + self.sandbox if model_instance is None else model_instance, create_if_missing) def interim_data_dir(self, model_instance=None, create_if_missing=True): return self._model_dir( self.project_dir / "data" / "interim", - model_instance, + self.sandbox if model_instance is None else model_instance, create_if_missing) def processed_data_dir(self, model_instance=None, create_if_missing=True): return self._model_dir( self.project_dir / "data" / "processed", - model_instance, + self.sandbox if model_instance is None else model_instance, create_if_missing) def model_dir(self, model_instance=None, create_if_missing=True): @@ -477,6 +493,7 @@ run `poetry add {pkg}` from directory: \n interim *.h5* data files in data/interim/{instance}/ and generate a *larva_dataset hdf5* file similarly to `generate_dataset`. """ + logging.warning('BackendExplorer.compile_trxmat_database is deprecated and will soon be removed') input_dir = str(input_dir) # in the case input_dir is a pathlib.Path interim_dir = str(self.interim_data_dir()) if not reuse_h5files: @@ -503,7 +520,7 @@ run `poetry add {pkg}` from directory: \n met = dict(raw=self.raw_data_dir, interim=self.interim_data_dir, processed=self.processed_data_dir, - )[dir] + )[dir] shutil.rmtree(met(model_instance, False), ignore_errors=True) def reset_model(self, model_instance=None): @@ -520,7 +537,7 @@ class BackendExplorerEncoder(json.JSONEncoder): def default(self, explorer): if isinstance(explorer, BackendExplorer): data = {} - for attr in ("project_dir", "package_name", "model_instance"): + for attr in ("project_dir", "package_name", "model_instance", "sandbox"): try: val = getattr(explorer, attr) except AttributeError: diff --git a/src/taggingbackends/main.py b/src/taggingbackends/main.py index b167129c8c0cdf27ee87bce81fc3d0a90da8514f..e5c435ef2b97aa6c32183a700ea4c0d29a007d3b 100644 --- a/src/taggingbackends/main.py +++ b/src/taggingbackends/main.py @@ -10,7 +10,7 @@ Usage: tagging-backend [train|predict] --model-instance <name> tagging-backend train ... --sample-size <N> tagging-backend train ... --frame-interval <I> --window-length <T> tagging-backend train ... --pretrained-model-instance <name> - tagging-backend predict ... --skip-make-dataset + tagging-backend predict ... --skip-make-dataset --sandbox <token> `tagging-backend` typically is run using `poetry run`. A name must be provided to identify the trained model and its location within @@ -35,6 +35,11 @@ Note that an existing larva_dataset file in data/interim/<name> makes the the `make_dataset` module is loaded and this may take quite some time due to dependencies (e.g. Julia FFI). The `--skip-make-dataset` option makes `train` truly skip this step; the corresponding module is not loaded. + +`--sandbox <token>` makes `tagging-backend` use a token instead of <name> as +directory name in data/raw, data/interim and data/processed. +This is intended to prevent conflicts on running `predict` in parallel on +multiple data files with multiple calls. """ if _print: print(msg) @@ -56,6 +61,7 @@ def main(fun=None): trxmat_only = reuse_h5files = False skip_make_dataset = skip_build_features = False pretrained_model_instance = None + sandbox = False unknown_args = {} k = 2 while k < len(sys.argv): @@ -91,11 +97,15 @@ def main(fun=None): elif sys.argv[k] == "--pretrained-model-instance": k = k + 1 pretrained_model_instance = sys.argv[k] + elif sys.argv[k] == "--sandbox": + k = k + 1 + sandbox = sys.argv[k] else: unknown_args[sys.argv[k].lstrip('-').replace('-', '_')] = sys.argv[k+1] k = k + 1 k = k + 1 - backend = BackendExplorer(project_dir, model_instance=model_instance) + backend = BackendExplorer(project_dir, model_instance=model_instance, + sandbox=sandbox) backend.reset_data(spare_raw=True) sys.stderr.flush() sys.stdout.flush()