diff --git a/Project.toml b/Project.toml index b4870abebb8cb3a3277536a52d2b1e1e43c87872..269205b0387619bc07687a13b0a88f5d088a6edc 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TaggingBackends" uuid = "e551f703-3b82-4335-b341-d497b48d519b" authors = ["François Laurent", "Institut Pasteur"] -version = "0.16" +version = "0.17" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/README.md b/README.md index 1c23a60cc30e2067e40a01a8ad32afeae6f6078e..5fbae2c1dc21b40b411e735ea7798072e588de37 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,14 @@ A tagging backend, called *e.g.* `TaggingBackend`, is a Python project with the │ │ and stores the retrained model as a new model │ │ instance in models/; optional. │ │ *Available since version 0.14*. -│ └── predict_model.py <- Loads the trained model and features from -│ data/interim, and moves the resulting -│ labels in data/processed. +│ ├── predict_model.py <- Loads the trained model, and data or features +│ │ from data/interim, predicts labels, and moves +│ │ the resulting labels into data/processed. +│ └── embed_model.py <- Loads the trained model, or the encoder part of it, +│ and data or features from data/interim, projects +│ the data, and moves the resulting embeddings into +│ data/processed; optional. +│ *Available since version 0.17*. └── test ├── __init__.py <- Empty file. └── test_taggingbackend.py <- Automated tests; optional. @@ -68,12 +73,12 @@ As the Python package is installed, this custom module will be loadable from any with `import <package_name>.models.mymodel`. On the other hand, the `make_dataset.py`, `build_features.py`, `predict_model.py`, -`train_model.py` and `finetune_model.py` are Python scripts, with a main program. +`train_model.py`, `finetune_model.py` and `embed_model.py` are Python scripts, with a main program. These scripts are run using Poetry, from the project root. 
More exactly, although the Nyx tagging UI does not expect the backend to be a Python project, the backend should be set a Poetry-managed virtual environment with the `taggingbackends` package installed as a dependency, so that the backend can be operated -calling `poetry run tagging-backend [train|predict|finetune]`, which in turn +calling `poetry run tagging-backend [train|predict|finetune|embed]`, which in turn calls the above-mentioned Python scripts. *New in version 0.14*, fine-tuning: `finetune_model.py` differs from `train_model.py` as @@ -91,6 +96,12 @@ It is not expected or checked for by the TaggingBackends logic, unlike all the o directories and scripts mentioned above. The `pretrained_models` directory was introduced by MaggotUBA-adapter. +New in version 0.17, embeddings: `embed_model.py` differs from `predict_model.py` as +it loads the encoder part of a trained model and generates the intermediate latent +representations of the input data, or embeddings. This may not be relevant to every model. +The data format for the projected data is currently defined by the backend. +See for example [MaggotUBA-adapter's *embeddings.h5* files](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter#embeddings). + See example scripts in the `examplebackend` directory. Only `predict_model.py` is required by the Nyx tagging UI. @@ -181,7 +192,7 @@ julia -e 'using Pkg; Pkg.add(url="https://gitlab.pasteur.fr/nyx/planarlarvae.jl" This is enough for Python-side *taggingbackends* to find its Julia counterpart. -Another approach we recommend, so that your main Julia environment is not populated by packages you do not need in every circumtances, consists in installing *TaggingBackends* in an existing Julia environment, *e.g.* the one that accommodates the *LarvaTagger* package.
+Another approach we recommend, so that your main Julia environment is not populated by packages you do not need in every circumstances, consists in installing *TaggingBackends* in an existing Julia environment, *e.g.* the one that accommodates the *LarvaTagger* package. As a major inconvenience of this approach, the `JULIA_PROJECT` environment variable will have to be set whenever `tagging-backend train` is called. The `JULIA_PROJECT` variable should be the absolute path to the directory associated with the environment that accommodates the *TaggingBackends* package. @@ -210,7 +221,7 @@ JULIA_PROJECT=$(pwd) poetry run python -c 'import julia; julia.install()' Note that, if PyCall is not found, the above command will install it. However, TaggingBackends still needs to be installed for Python-side *taggingbackends* to successfully call Julia-side *TaggingBackends*. -So again, the `JULIA_PROJECT` environment variable must be set accordingly everytime the `train` command is called, which can also be done assigning the adequate absolute path to the variable on the same line as the command, immediately before the command. +So again, the `JULIA_PROJECT` environment variable must be set accordingly every time the `train` command is called, which can also be done assigning the adequate absolute path to the variable on the same line as the command, immediately before the command. For example, from the backend directory tree, on Unix-like OSes: ``` JULIA_PROJECT=<path> poetry run tagging-backend train @@ -219,10 +230,10 @@ or, from LarvaTagger.jl root directory: ``` JULIA_PROJECT=<path> scripts/larvatagger.jl train ``` -with `<path>` the absolute path to the Julia project/environement with TaggingBackends installed. +with `<path>` the absolute path to the Julia project/environment with TaggingBackends installed. There is a known issue with `JULIA_PROJECT` not being properly propagated on calling `larvatagger.jl`, in the case Julia was installed using juliaup. 
-Prefer [jill](https://pypi.org/project/jill/), instead of juliaup. +Prefer [jill](https://pypi.org/project/jill/) instead of juliaup. Note also that, on Linux, or macOS with coreutils installed, a relative path can be conveniently turned into an absolute path using the `realpath` command: ``` diff --git a/pyproject.toml b/pyproject.toml index 7a6d08727061a101ce40d9717c880c7dc65083be..9107283dbe586a75370f259d43ba83ae7eb57fbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "TaggingBackends" -version = "0.16" +version = "0.17" description = "Backbone for LarvaTagger.jl tagging backends" authors = ["François Laurent"] diff --git a/src/taggingbackends/__init__.py b/src/taggingbackends/__init__.py index b794fd409a5e3b3b65ad76a43d6a01a318877640..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/src/taggingbackends/__init__.py +++ b/src/taggingbackends/__init__.py @@ -1 +0,0 @@ -__version__ = '0.1.0' diff --git a/src/taggingbackends/explorer.py b/src/taggingbackends/explorer.py index caefe5ee00ee7f4d4d8b821af3ba1ec6018927a3..5ee4a0b3260528359c7a52ab9a98ac9cd47863b0 100644 --- a/src/taggingbackends/explorer.py +++ b/src/taggingbackends/explorer.py @@ -63,6 +63,7 @@ class BackendExplorer: self._train_model = None self._predict_model = None self._finetune_model = None + self._embed_model = None # self._sandbox = sandbox @@ -141,6 +142,13 @@ Cannot find any Python package in project root directory: if self._finetune_model is not False: return self._finetune_model + @property + def embed_model(self): + if self._embed_model is None: + self._embed_model = self._locate_script("models", "embed_model") + if self._embed_model is not False: + return self._embed_model + def _locate_script(self, subpkg, basename): basename = basename + ".py" in_root_dir = self.project_dir / basename diff --git a/src/taggingbackends/main.py b/src/taggingbackends/main.py index b552f261c7274a5402610dd8d6436aa8e8756b5e..a3e96a12ccca94d2f8dbdd03e557f2b9faca4045 
100644 --- a/src/taggingbackends/main.py +++ b/src/taggingbackends/main.py @@ -5,7 +5,7 @@ from taggingbackends.explorer import BackendExplorer, BackendExplorerDecoder, ge def help(_print=False): msg = """ -Usage: tagging-backend [train|predict|finetune] --model-instance <name> +Usage: tagging-backend [train|predict|finetune|embed] --model-instance <name> tagging-backend [train|finetune] ... --sample-size <N> tagging-backend [train|finetune] ... --balancing-strategy <S> tagging-backend [train|finetune] ... --include-all <secondary-label> @@ -94,6 +94,9 @@ Fine-tuning is typically resorted to when the (re-)training dataset is small classes may be underrepresented. While totally missing classes are properly ignored, the data points of the underrepresented classes should be explicitly unlabelled to be similarly excluded from the (re-)training dataset. + +New in version 0.17: the `embed` switch loads a trained model, or the encoder +part of it, and generates latent representations of the input data. """ if _print: print(msg) @@ -244,6 +247,8 @@ def main(fun=None): if seed is not None: finetune_kwargs['seed'] = seed backend._run_script(backend.finetune_model, trailing=unknown_args, **finetune_kwargs) + elif task == 'embed': + backend._run_script(backend.embed_model, trailing=unknown_args) else: # called by make_dataset, build_features, train_model and predict_model backend = BackendExplorerDecoder().decode(sys.argv[1])