From 52c7016ec4f9404427866541f4332b52f0ac1730 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Laurent?= <francois.laurent@posteo.net>
Date: Wed, 2 Apr 2025 13:26:27 +0200
Subject: [PATCH] feat: Taggers.apply_make_dataset, make_dataset metadata
 entry, version increment

---
 Project.toml            |  2 +-
 doc/develop.md          | 38 ++++++++++++++++++++++++-
 recipes/Dockerfile      |  1 +
 recipes/patch.sh        | 11 ++++++--
 scripts/install.sh      |  4 +--
 src/REST/Client.jl      |  7 +++--
 src/REST/Model.jl       | 42 +++++-----------------------
 src/Taggers.jl          | 61 +++++++++++++++++++++++++++++++++++++++--
 src/backends.jl         |  4 +--
 test/deploy_and_test.sh |  4 +--
 10 files changed, 124 insertions(+), 50 deletions(-)

diff --git a/Project.toml b/Project.toml
index 4707dab..82ad0ca 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "LarvaTagger"
 uuid = "8b3b36f1-dfed-446e-8561-ea19fe966a4d"
 authors = ["François Laurent", "Institut Pasteur"]
-version = "0.19.1"
+version = "0.20.0"
 
 [deps]
 Bonito = "824d6782-a2ef-11e9-3a09-e5662e0c26f8"
diff --git a/doc/develop.md b/doc/develop.md
index ec368bc..6648bcc 100644
--- a/doc/develop.md
+++ b/doc/develop.md
@@ -22,4 +22,40 @@ On the *Julia* side, the lower-level functionalities are provided by the *Planar
 Similarly, the *TidyObservables.jl* project has unit tests and a GitLab workflow to run these tests on every commit.
 
 For the remaining part of the *LarvaTagger* project, high-level functional tests only are available.
-These tests are available in the *LarvaTagger.jl* project, in the test directory, file *scenarii.sh*. They depend on [*shUnit2*](https://github.com/kward/shunit2).
+These tests are available in the *LarvaTagger.jl* project, as file `test/deploy_and_test.sh`. They depend on [*shUnit2*](https://github.com/kward/shunit2).
+
+The `test/deploy_and_test.sh` script implicitly tests the `scripts/install.sh` script, and then runs the `test/scenarii.sh` script.
+The `test/scenarii.sh` script requires some test data that are downloaded by the `test/deploy_and_test.sh` script.
+If these test data cannot be fetched, please contact François Laurent so that the data are made available again (the download link periodically expires).
+
+## REST
+
+The REST API can be tested running the `test/rest_server.sh` and `test/rest_client.sh` scripts.
+
+The `test/rest_server.sh` script is fully automatic and completes with a stack trace that results from killing the backend after the last test.
+
+The `test/rest_client.sh` script does not perform any actual test.
+It launches a backend and a frontend, and the user is expected to manually operate the frontend to test the communication between the backend and the frontend.
+
+## Docker images
+
+The most complete test image can be built as follows:
+
+```
+docker=docker LARVATAGGER_IMAGE=flaur/larvatagger:latest scripts/larvatagger.sh build --dev
+docker build -t larvatagger:bigfat -f recipes/Dockerfile.pasteurjanelia .
+```
+
+Subsequent tests will run the backend using `larvatagger:bigfat` image:
+```
+docker=docker LARVATAGGER_IMAGE="larvatagger:bigfat" scripts/larvatagger.sh backend
+```
+
+The frontend is more conveniently run from the source tree:
+```
+scripts/larvatagger-gui.jl http://localhost:9285
+```
+
+Setting `docker=docker` in the environment is required if the `podman` command is available.
+The `scripts/larvatagger.sh` script falls back on using `podman` instead of `docker`, if `podman` is available, but it is recommended to perform tests using Docker.
+In addition, at present, building the image works with Docker buildx only.
diff --git a/recipes/Dockerfile b/recipes/Dockerfile
index b83d356..c4d6989 100644
--- a/recipes/Dockerfile
+++ b/recipes/Dockerfile
@@ -83,6 +83,7 @@ RUN if [ -z $TAGGINGBACKENDS_BRANCH ]; then \
  && if [ "$(echo $BACKEND | cut -d/ -f2)" = "main" ] || [ "$(echo $BACKEND | cut -d/ -f2)" = "dev" ]; then \
     julia -e 'using Pkg; Pkg.add("JSON3")' \
  && scripts/make_models.jl default \
+ && cd $PROJECT_DIR \
  && recipes/patch.sh; \
     fi \
  && rm -rf ~/.cache; \
diff --git a/recipes/patch.sh b/recipes/patch.sh
index b32d491..42da07a 100755
--- a/recipes/patch.sh
+++ b/recipes/patch.sh
@@ -1,6 +1,7 @@
 #!/bin/sh
 
-# patch the taggers in a Docker image so that they include metadata.json files
+# patch the taggers in a Docker image so that they include metadata.json files;
+# ultimately the taggers should manage their metadata.json files themselves.
 
 if [ -d "MaggotUBA" ]; then
   if ! [ -f "MaggotUBA/metadata.json" ]; then
@@ -37,13 +38,19 @@ EOF
 
 fi
 
+# note about make_dataset:
+#  * default is equivalent to "train, finetune"
+#  * can be "always" or a comma-separated list of processing steps
+#  * valid steps are "train", "finetune", "predict", "embed"
+
 if [ -d "PasteurJanelia" ]; then
   if ! [ -f "PasteurJanelia/metadata.json" ]; then
     cat <<"EOF" >PasteurJanelia/metadata.json
 {
   "name": "PasteurJanelia",
   "homepage": "https://gitlab.pasteur.fr/nyx/PasteurJanelia-adapter",
-  "description": "Action classifiers initially designed by JBM at Janelia"
+  "description": "Action classifiers initially designed by JBM at Janelia",
+  "make_dataset": "always"
 }
 EOF
   fi
diff --git a/scripts/install.sh b/scripts/install.sh
index d649e46..30f8023 100755
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -180,7 +180,7 @@ else
 
 if [ -z "$JULIA_VERSION" ]; then
   JULIA_VERSION=1.10
-  JULIA_CHANNEL=lts
+  JULIA_CHANNEL=1.10
 else
   echo "Using environment variable: JULIA_VERSION= $JULIA_VERSION"
   if [ -z "$JULIA_CHANNEL" ]; then
@@ -369,7 +369,7 @@ else
 
 activate() {
   # pyenv activation is necessary on WSL
-  command -v pyenv &>/dev/null && pyenv local $PYTHON_VERSION
+  command -v pyenv &>/dev/null && [ -n "$(pyenv versions | grep "  $PYTHON_VERSION")" ] && pyenv local $PYTHON_VERSION
   poetry env use $PYTHON_VERSION
 }
 
diff --git a/src/REST/Client.jl b/src/REST/Client.jl
index 611608e..93ca3c3 100644
--- a/src/REST/Client.jl
+++ b/src/REST/Client.jl
@@ -236,9 +236,10 @@ function Taggers.predict(back::LTBackend, file::String; metadata=nothing)
     connected(tagger) || connect!(tagger)
     Taggers.push(tagger, file, metadata)
     Taggers.predict(tagger)
-    labelfile = Taggers.pull(tagger, dirname(file))
-    @assert length(labelfile) == 1
-    return labelfile[1]
+    outputfiles = Taggers.pull(tagger, dirname(file))
+    @assert !isempty(outputfiles)
+    length(outputfiles) == 1 || @warn "Multiple output files" outputfiles
+    return outputfiles[1]
 end
 
 end
diff --git a/src/REST/Model.jl b/src/REST/Model.jl
index 9effac9..d712235 100644
--- a/src/REST/Model.jl
+++ b/src/REST/Model.jl
@@ -1,6 +1,6 @@
 module Model
 
-import ..Taggers: Taggers, Tagger
+import ..Taggers: Taggers, Tagger, loadmetadata, apply_make_dataset
 import HTTP: HTTP
 import JSON3
 using OrderedCollections: OrderedDict
@@ -187,45 +187,15 @@ function pullfile(lt_backend, backend_dir, model_instance, token, filename)
     return HTTP.Response(200, header, body)
 end
 
-function loadmodel(dir, hasinstances=false)
-    metadata = nothing
-    for filename in ("metadata", "metadata.json")
-        if isfile(joinpath(dir, filename))
-            metadata = JSON3.read(joinpath(dir, filename))
-            break
-        end
-    end
-    name = basename(dir)
-    T = if hasinstances
-        Union{AbstractString, Vector{OrderedDict{AbstractString, AbstractString}}}
-    else
-        AbstractString
-    end
-    model = OrderedDict{AbstractString, T}(
-        "name" => name,
-        "description" => "",
-        "homepage" => "",
-    )
-    if !isnothing(metadata)
-        for key in keys(model)
-            key′ = Symbol(key)
-            if haskey(metadata, key′)
-                model[key] = metadata[key′]
-            end
-        end
-    end
-    return model
-end
-
 function listtaggers(lt_backend)
     inventory = Vector{OrderedDict{String, Any}}()
     backends_dir = lt_backend.root[]
     for tagging_backend_path in readdir(backends_dir; join=true)
         Taggers.isbackend(tagging_backend_path) || continue
         models_dir = joinpath(tagging_backend_path, "models")
-        models = loadmodel.(readdir(models_dir; join=true))
+        models = loadmetadata.(readdir(models_dir; join=true))
         isempty(models) && continue
-        tagging_backend = loadmodel(tagging_backend_path, true)
+        tagging_backend = loadmetadata(tagging_backend_path, false)
         tagging_backend["models"] = models
         push!(inventory, tagging_backend)
     end
@@ -234,14 +204,16 @@ end
 
 function predict(lt_backend, backend_dir, model_instance, token)
     tagger = gettagger(lt_backend, backend_dir, model_instance, token)
+    make_dataset = apply_make_dataset(tagger, "predict")
     # blocking; should we run async and expose a token-specific status api call?
-    Taggers.predict(tagger; skip_make_dataset=true)
+    Taggers.predict(tagger; make_dataset=make_dataset)
 end
 
 function embed(lt_backend, backend_dir, model_instance, token)
     tagger = gettagger(lt_backend, backend_dir, model_instance, token)
+    make_dataset = apply_make_dataset(tagger, "embed")
     # blocking, like predict
-    Taggers.embed(tagger; skip_make_dataset=true)
+    Taggers.embed(tagger; make_dataset=make_dataset)
 end
 
 end
diff --git a/src/Taggers.jl b/src/Taggers.jl
index e7864cc..6dad6a1 100644
--- a/src/Taggers.jl
+++ b/src/Taggers.jl
@@ -1,8 +1,11 @@
 module Taggers
 
 import PlanarLarvae.Formats, PlanarLarvae.Dataloaders
+using OrderedCollections: OrderedDict
+using JSON3
 
-export Tagger, isbackend, resetmodel, resetdata, train, predict, finetune, embed
+export Tagger, isbackend, resetmodel, resetdata, train, predict, finetune, embed,
+       loadmetadata, apply_make_dataset
 
 struct Tagger
     backend_dir::String
@@ -300,7 +303,9 @@ function run(tagger, switch, kwargs)
     args = Any[]
     parsekwargs!(args, kwargs)
     cmd = tagging_backend_command(tagger)
-    Base.run(Cmd(`$cmd $switch $args`; dir=tagger.backend_dir))
+    cmd = Cmd(`$cmd $switch $args`; dir=tagger.backend_dir)
+    @info "Running command" cmd
+    Base.run(cmd)
 end
 
 function train(tagger::Tagger; pretrained_instance=None, kwargs...)
@@ -323,4 +328,56 @@ end
 
 embed(tagger::Tagger; kwargs...) = run(tagger, "embed", kwargs)
 
+function loadmetadata(dir, instance=true)
+    metadata = nothing
+    for filename in ("metadata", "metadata.json")
+        if isfile(joinpath(dir, filename))
+            metadata = JSON3.read(joinpath(dir, filename))
+            break
+        end
+    end
+    name = basename(dir)
+    T = if instance
+        AbstractString
+    else
+        Union{AbstractString, Vector{OrderedDict{AbstractString, AbstractString}}}
+    end
+    model = OrderedDict{AbstractString, T}(
+        "name" => name,
+        "description" => "",
+        "homepage" => "",
+        "make_dataset" => "",
+    )
+    if !isnothing(metadata)
+        for key in keys(model)
+            key′ = Symbol(key)
+            if haskey(metadata, key′)
+                model[key] = metadata[key′]
+            end
+        end
+    end
+    return model
+end
+
+function loadmetadata(tagger::Tagger, instance=true)
+    if instance
+        loadmetadata(Taggers.modeldir(tagger.backend_dir))
+    else
+        loadmetadata(tagger.backend_dir, true)
+    end
+end
+
+function apply_make_dataset(tagger::Tagger, step)
+    # see recipes/patch.sh for a note about the make_dataset entry
+    @assert step in ("train", "finetune", "predict", "embed")
+    metadata = loadmetadata(tagger, false)
+    make_dataset = metadata["make_dataset"]
+    apply_make_dataset = step in ("train", "finetune")
+    if !isempty(make_dataset)
+        apply_make_dataset = make_dataset == "always" || occursin(step, make_dataset)
+        @debug "apply_make_dataset" metadata apply_make_dataset
+    end
+    return apply_make_dataset
+end
+
 end # module
diff --git a/src/backends.jl b/src/backends.jl
index d734ad1..a3ad720 100644
--- a/src/backends.jl
+++ b/src/backends.jl
@@ -98,8 +98,8 @@ function Taggers.predict(model::Backends, file::String; metadata=nothing)
     tagger = Tagger(backend_dir, model_instance)
     #
     Taggers.push(model, file; metadata=metadata)
-    # TODO: make the skip_make_dataset option discoverable in the backend
-    predict(tagger; skip_make_dataset=true)
+    make_dataset = apply_make_dataset(tagger, "predict")
+    predict(tagger; make_dataset=make_dataset)
     labelfile = Taggers.pull(model, dirname(file))
     @assert length(labelfile) == 1
     return labelfile[1]
diff --git a/test/deploy_and_test.sh b/test/deploy_and_test.sh
index 942ffd2..b0c0113 100755
--- a/test/deploy_and_test.sh
+++ b/test/deploy_and_test.sh
@@ -23,7 +23,7 @@ if ! [ -f scripts/install.sh ]; then
 fi
 
 scripts/install.sh --uninstall
-scripts/install.sh --with-backend --experimental
+scripts/install.sh --with-default-backend
 
 #############
 ## Maestro ##
@@ -74,7 +74,7 @@ if [ -f LarvaTagger_test_data.tgz ]; then
 else
   # Not recommended; reproducibility is not guarantee across hosts or architectures yet
   (cd "$LTROOT" && \
-    wget -O- https://dl.pasteur.fr/fop/ppk8GBQf/241127_LarvaTagger_test_data.tgz | tar zxv)
+    wget -O- https://dl.pasteur.fr/fop/VVIMasjG/241127_LarvaTagger_test_data.tgz | tar zxv)
 fi
 
 if [ "$LOCAL_SCENARII" = "1" ]; then
-- 
GitLab