From bc6f3649a77eb9da559a5f8223b804dc39c9710b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Laurent?= <francois.laurent@posteo.net>
Date: Tue, 2 Jan 2024 21:13:32 +0100
Subject: [PATCH] embed command

---
 Project.toml                    |  2 +-
 README.md                       | 29 ++++++++++++++++++++---------
 pyproject.toml                  |  2 +-
 src/taggingbackends/__init__.py |  1 -
 src/taggingbackends/explorer.py |  8 ++++++++
 src/taggingbackends/main.py     |  7 ++++++-
 6 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/Project.toml b/Project.toml
index b4870ab..269205b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TaggingBackends"
 uuid = "e551f703-3b82-4335-b341-d497b48d519b"
 authors = ["François Laurent", "Institut Pasteur"]
-version = "0.16"
+version = "0.17"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
diff --git a/README.md b/README.md
index 1c23a60..5fbae2c 100644
--- a/README.md
+++ b/README.md
@@ -47,9 +47,14 @@ A tagging backend, called *e.g.* `TaggingBackend`, is a Python project with the
 │           │                           and stores the retrained model as a new model
 │           │                           instance in models/; optional.
 │           │                           *Available since version 0.14*.
-│           └── predict_model.py     <- Loads the trained model and features from
-│                                       data/interim, and moves the resulting
-│                                       labels in data/processed.
+│           ├── predict_model.py     <- Loads the trained model, and data or features
+│           │                           from data/interim, predicts labels, and moves
+│           │                           the resulting labels into data/processed.
+│           └── embed_model.py       <- Loads the trained model, or the encoder part of it,
+│                                       and data or features from data/interim, projects
+│                                       the data, and moves the resulting embeddings into
+│                                       data/processed; optional.
+│                                       *Available since version 0.17*.
 └── test
     ├── __init__.py                  <- Empty file.
     └── test_taggingbackend.py       <- Automated tests; optional.
@@ -68,12 +73,12 @@ As the Python package is installed, this custom module will be loadable from any
 with `import <package_name>.models.mymodel`.
 
 On the other hand, the `make_dataset.py`, `build_features.py`, `predict_model.py`,
-`train_model.py` and `finetune_model.py` are Python scripts, with a main program.
+`train_model.py`, `finetune_model.py` and `embed_model.py` are Python scripts, each with a main program.
 These scripts are run using Poetry, from the project root.
 More exactly, although the Nyx tagging UI does not expect the backend to be a Python
 project, the backend should be set up as a Poetry-managed virtual environment with the
 `taggingbackends` package installed as a dependency, so that the backend can be operated
-calling `poetry run tagging-backend [train|predict|finetune]`, which in turn
+by calling `poetry run tagging-backend [train|predict|finetune|embed]`, which in turn
 calls the above-mentioned Python scripts.
 
 *New in version 0.14*, fine-tuning: `finetune_model.py` differs from `train_model.py` as
@@ -91,6 +96,12 @@ It is not expected or checked for by the TaggingBackends logic, unlike all the o
 directories and scripts mentioned above. The `pretrained_models` directory was introduced
 by MaggotUBA-adapter.
 
+*New in version 0.17*, embeddings: `embed_model.py` differs from `predict_model.py` in that
+it loads the encoder part of a trained model and generates the intermediate latent
+representations, or embeddings, of the input data. This may not be relevant to every model.
+The data format for the projected data is currently defined by the backend.
+See for example [MaggotUBA-adapter's *embeddings.h5* files](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter#embeddings).
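+
+Assuming a trained model instance named `mymodel` is available in the backend's
+`models` directory (the instance name here is purely illustrative), the corresponding
+embeddings can be generated from the project root with:
+```
+poetry run tagging-backend embed --model-instance mymodel
+```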
+
 See example scripts in the `examplebackend` directory.
 
 Only `predict_model.py` is required by the Nyx tagging UI.
@@ -181,7 +192,7 @@ julia -e 'using Pkg; Pkg.add(url="https://gitlab.pasteur.fr/nyx/planarlarvae.jl"
 
 This is enough for Python-side *taggingbackends* to find its Julia counterpart.
 
-Another approach we recommend, so that your main Julia environment is not populated by packages you do not need in every circumtances, consists in installing *TaggingBackends* in an existing Julia environment, *e.g.* the one that accommodates the *LarvaTagger* package.
+Another approach we recommend, so that your main Julia environment is not populated with packages you do not need in every circumstance, consists in installing *TaggingBackends* in an existing Julia environment, *e.g.* the one that accommodates the *LarvaTagger* package.
 
 As a major inconvenience of this approach, the `JULIA_PROJECT` environment variable will have to be set whenever `tagging-backend train` is called.
 The `JULIA_PROJECT` variable should be the absolute path to the directory associated with the environment that accommodates the *TaggingBackends* package.
@@ -210,7 +221,7 @@ JULIA_PROJECT=$(pwd) poetry run python -c 'import julia; julia.install()'
 Note that, if PyCall is not found, the above command will install it.
 However, TaggingBackends still needs to be installed for Python-side *taggingbackends* to successfully call Julia-side *TaggingBackends*.
 
-So again, the `JULIA_PROJECT` environment variable must be set accordingly everytime the `train` command is called, which can also be done assigning the adequate absolute path to the variable on the same line as the command, immediately before the command.
+So again, the `JULIA_PROJECT` environment variable must be set accordingly every time the `train` command is called, which can also be done by assigning the appropriate absolute path to the variable on the same line as the command, immediately before the command.
 For example, from the backend directory tree, on Unix-like OSes:
 ```
 JULIA_PROJECT=<path> poetry run tagging-backend train
@@ -219,10 +230,10 @@ or, from LarvaTagger.jl root directory:
 ```
 JULIA_PROJECT=<path> scripts/larvatagger.jl train
 ```
-with `<path>` the absolute path to the Julia project/environement with TaggingBackends installed.
+with `<path>` the absolute path to the Julia project/environment with TaggingBackends installed.
 
 There is a known issue with `JULIA_PROJECT` not being properly propagated when calling `larvatagger.jl`, in the case where Julia was installed using juliaup.
-Prefer [jill](https://pypi.org/project/jill/), instead of juliaup.
+Prefer [jill](https://pypi.org/project/jill/) instead of juliaup.
 
 Note also that, on Linux, or macOS with coreutils installed, a relative path can be conveniently turned into an absolute path using the `realpath` command:
 ```
diff --git a/pyproject.toml b/pyproject.toml
index 7a6d087..9107283 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "TaggingBackends"
-version = "0.16"
+version = "0.17"
 description = "Backbone for LarvaTagger.jl tagging backends"
 authors = ["François Laurent"]
 
diff --git a/src/taggingbackends/__init__.py b/src/taggingbackends/__init__.py
index b794fd4..e69de29 100644
--- a/src/taggingbackends/__init__.py
+++ b/src/taggingbackends/__init__.py
@@ -1 +0,0 @@
-__version__ = '0.1.0'
diff --git a/src/taggingbackends/explorer.py b/src/taggingbackends/explorer.py
index caefe5e..5ee4a0b 100644
--- a/src/taggingbackends/explorer.py
+++ b/src/taggingbackends/explorer.py
@@ -63,6 +63,7 @@ class BackendExplorer:
         self._train_model = None
         self._predict_model = None
         self._finetune_model = None
+        self._embed_model = None
         #
         self._sandbox = sandbox
 
@@ -141,6 +142,13 @@ Cannot find any Python package in project root directory:
         if self._finetune_model is not False:
             return self._finetune_model
 
+    @property
+    def embed_model(self):
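+        # a None cache means the script has not been looked up yet; _locate_script
+        # is expected to return False when the backend does not ship a
+        # models/embed_model.py script, in which case this property returns None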
+        if self._embed_model is None:
+            self._embed_model = self._locate_script("models", "embed_model")
+        if self._embed_model is not False:
+            return self._embed_model
+
     def _locate_script(self, subpkg, basename):
         basename = basename + ".py"
         in_root_dir = self.project_dir / basename
diff --git a/src/taggingbackends/main.py b/src/taggingbackends/main.py
index b552f26..a3e96a1 100644
--- a/src/taggingbackends/main.py
+++ b/src/taggingbackends/main.py
@@ -5,7 +5,7 @@ from taggingbackends.explorer import BackendExplorer, BackendExplorerDecoder, ge
 
 def help(_print=False):
     msg = """
-Usage:  tagging-backend [train|predict|finetune] --model-instance <name>
+Usage:  tagging-backend [train|predict|finetune|embed] --model-instance <name>
         tagging-backend [train|finetune] ... --sample-size <N>
         tagging-backend [train|finetune] ... --balancing-strategy <S>
         tagging-backend [train|finetune] ... --include-all <secondary-label>
@@ -94,6 +94,9 @@ Fine-tuning is typically resorted to when the (re-)training dataset is small
 classes may be underrepresented. While totally missing classes are properly
 ignored, the data points of the underrepresented classes should be explicitly
 unlabelled to be similarly excluded from the (re-)training dataset.
+
+New in version 0.17: the `embed` switch loads a trained model, or the encoder
+part of it, and generates latent representations of the input data.
 """
     if _print:
         print(msg)
@@ -244,6 +247,8 @@ def main(fun=None):
             if seed is not None:
                 finetune_kwargs['seed'] = seed
             backend._run_script(backend.finetune_model, trailing=unknown_args, **finetune_kwargs)
+        elif task == 'embed':
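+            # unlike train and finetune, embed takes no sampling or balancing
+            # options; any unrecognized arguments are passed on to embed_model.py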
+            backend._run_script(backend.embed_model, trailing=unknown_args)
     else:
         # called by make_dataset, build_features, train_model and predict_model
         backend = BackendExplorerDecoder().decode(sys.argv[1])
-- 
GitLab