Commit 52c7016e authored by François Laurent

feat: Taggers.apply_make_dataset, make_dataset metadata entry, version increment

parent e77dd8a5
Merge request !24: Set of commits to be tagged v0.20
Pipeline #153742 passed
name = "LarvaTagger" name = "LarvaTagger"
uuid = "8b3b36f1-dfed-446e-8561-ea19fe966a4d" uuid = "8b3b36f1-dfed-446e-8561-ea19fe966a4d"
authors = ["François Laurent", "Institut Pasteur"] authors = ["François Laurent", "Institut Pasteur"]
version = "0.19.1" version = "0.20"
[deps] [deps]
Bonito = "824d6782-a2ef-11e9-3a09-e5662e0c26f8" Bonito = "824d6782-a2ef-11e9-3a09-e5662e0c26f8"
......
@@ -22,4 +22,40 @@ On the *Julia* side, the lower-level functionalities are provided by the *Planar
Similarly, the *TidyObservables.jl* project has unit tests and a GitLab workflow to run these tests on every commit.
For the remaining part of the *LarvaTagger* project, only high-level functional tests are available.
-These tests are available in the *LarvaTagger.jl* project, in the test directory, file *scenarii.sh*. They depend on [*shUnit2*](https://github.com/kward/shunit2).
+These tests are available in the *LarvaTagger.jl* project, as file `test/deploy_and_test.sh`. They depend on [*shUnit2*](https://github.com/kward/shunit2).
The `test/deploy_and_test.sh` script implicitly tests the `scripts/install.sh` script, and then runs the `test/scenarii.sh` script.
The `test/scenarii.sh` script requires some test data that are downloaded by the `test/deploy_and_test.sh` script.
If these test data cannot be fetched, please contact François Laurent so that the data are made available again (the download link periodically expires).
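For reference, the whole sequence can be run from the root of the *LarvaTagger.jl* source tree with a single command (assuming *shUnit2* is installed and the test data can be fetched):
```
# deploys LarvaTagger with scripts/install.sh, downloads the test data,
# and then runs the functional tests in test/scenarii.sh
sh test/deploy_and_test.sh
```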
## REST
The REST API can be tested by running the `test/rest_server.sh` and `test/rest_client.sh` scripts.
The `test/rest_server.sh` script is fully automatic and completes with a stack trace that results from killing the backend after the last test.
The `test/rest_client.sh` script does not perform any actual test.
It launches a backend and a frontend, and the user is expected to manually operate the frontend to test the communication between the backend and the frontend.
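For example, both scripts can be run from the source tree as follows (the exact output may vary):
```
# fully automatic; the final stack trace from killing the backend is expected
sh test/rest_server.sh
# interactive; launches a backend and a frontend to be operated manually
sh test/rest_client.sh
```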
## Docker images
The most complete test image can be built as follows:
```
docker=docker LARVATAGGER_IMAGE=flaur/larvatagger:latest scripts/larvatagger.sh build --dev
docker build -t larvatagger:bigfat -f recipes/Dockerfile.pasteurjanelia .
```
Subsequent tests will run the backend using the `larvatagger:bigfat` image:
```
docker=docker LARVATAGGER_IMAGE="larvatagger:bigfat" scripts/larvatagger.sh backend
```
The frontend is more conveniently run from the source tree:
```
scripts/larvatagger-gui.jl http://localhost:9285
```
The `docker=docker` environment variable assignment is required if the `podman` command is available: the `scripts/larvatagger.sh` script falls back on `podman` instead of `docker` whenever `podman` is available, but it is recommended to run the tests with Docker.
In addition, at present, building the image works with Docker buildx only.
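If the `docker build` invocation above does not select the buildx builder by default, buildx can be invoked explicitly (assuming the buildx plugin is installed):
```
docker buildx build -t larvatagger:bigfat -f recipes/Dockerfile.pasteurjanelia .
```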
@@ -83,6 +83,7 @@ RUN if [ -z $TAGGINGBACKENDS_BRANCH ]; then \
&& if [ "$(echo $BACKEND | cut -d/ -f2)" = "main" ] || [ "$(echo $BACKEND | cut -d/ -f2)" = "dev" ]; then \
julia -e 'using Pkg; Pkg.add("JSON3")' \
&& scripts/make_models.jl default \
+&& cd $PROJECT_DIR \
&& recipes/patch.sh; \
fi \
&& rm -rf ~/.cache; \
...
#!/bin/sh
-# patch the taggers in a Docker image so that they include metadata.json files
+# patch the taggers in a Docker image so that they include metadata.json files;
+# ultimately the taggers should manage their metadata.json files themselves.
if [ -d "MaggotUBA" ]; then
if ! [ -f "MaggotUBA/metadata.json" ]; then
@@ -37,13 +38,19 @@ EOF
fi
# note about make_dataset:
# * default is equivalent to "train, finetune"
# * can be "always" or a comma-separated list of processing steps
# * valid steps are "train", "finetune", "predict", "embed"
if [ -d "PasteurJanelia" ]; then if [ -d "PasteurJanelia" ]; then
if ! [ -f "PasteurJanelia/metadata.json" ]; then if ! [ -f "PasteurJanelia/metadata.json" ]; then
cat <<"EOF" >PasteurJanelia/metadata.json cat <<"EOF" >PasteurJanelia/metadata.json
{ {
"name": "PasteurJanelia", "name": "PasteurJanelia",
"homepage": "https://gitlab.pasteur.fr/nyx/PasteurJanelia-adapter", "homepage": "https://gitlab.pasteur.fr/nyx/PasteurJanelia-adapter",
"description": "Action classifiers initially designed by JBM at Janelia" "description": "Action classifiers initially designed by JBM at Janelia",
"make_dataset": "always"
} }
EOF EOF
fi fi
......
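As an illustration of the note above, a hypothetical tagger's metadata.json could restrict dataset generation to the train and predict steps as follows (all field values below are placeholders):
```
{
"name": "ExampleTagger",
"homepage": "https://gitlab.pasteur.fr/nyx/ExampleTagger-adapter",
"description": "Example tagger with an explicit list of make_dataset steps",
"make_dataset": "train, predict"
}
```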
@@ -180,7 +180,7 @@ else
if [ -z "$JULIA_VERSION" ]; then
JULIA_VERSION=1.10
-JULIA_CHANNEL=lts
+JULIA_CHANNEL=1.10
else
echo "Using environment variable: JULIA_VERSION= $JULIA_VERSION"
if [ -z "$JULIA_CHANNEL" ]; then
@@ -369,7 +369,7 @@ else
activate() {
# pyenv activation is necessary on WSL
-command -v pyenv &>/dev/null && pyenv local $PYTHON_VERSION
+command -v pyenv &>/dev/null && [ -n "`pyenv versions | grep ' $PYTHON_VERSION'`" ] && pyenv local $PYTHON_VERSION
poetry env use $PYTHON_VERSION
}
...
@@ -236,9 +236,10 @@ function Taggers.predict(back::LTBackend, file::String; metadata=nothing)
    connected(tagger) || connect!(tagger)
    Taggers.push(tagger, file, metadata)
    Taggers.predict(tagger)
-    labelfile = Taggers.pull(tagger, dirname(file))
-    @assert length(labelfile) == 1
-    return labelfile[1]
+    outputfiles = Taggers.pull(tagger, dirname(file))
+    @assert !isempty(outputfiles)
+    length(outputfiles) == 1 || @warn "Multiple output files" outputfiles
+    return outputfiles[1]
end
end
module Model
-import ..Taggers: Taggers, Tagger
+import ..Taggers: Taggers, Tagger, loadmetadata, apply_make_dataset
import HTTP: HTTP
-import JSON3
using OrderedCollections: OrderedDict
@@ -187,45 +187,15 @@ function pullfile(lt_backend, backend_dir, model_instance, token, filename)
    return HTTP.Response(200, header, body)
end
-function loadmodel(dir, hasinstances=false)
-    metadata = nothing
-    for filename in ("metadata", "metadata.json")
-        if isfile(joinpath(dir, filename))
-            metadata = JSON3.read(joinpath(dir, filename))
-            break
-        end
-    end
-    name = basename(dir)
-    T = if hasinstances
-        Union{AbstractString, Vector{OrderedDict{AbstractString, AbstractString}}}
-    else
-        AbstractString
-    end
-    model = OrderedDict{AbstractString, T}(
-        "name" => name,
-        "description" => "",
-        "homepage" => "",
-    )
-    if !isnothing(metadata)
-        for key in keys(model)
-            key′ = Symbol(key)
-            if haskey(metadata, key′)
-                model[key] = metadata[key′]
-            end
-        end
-    end
-    return model
-end
function listtaggers(lt_backend)
    inventory = Vector{OrderedDict{String, Any}}()
    backends_dir = lt_backend.root[]
    for tagging_backend_path in readdir(backends_dir; join=true)
        Taggers.isbackend(tagging_backend_path) || continue
        models_dir = joinpath(tagging_backend_path, "models")
-        models = loadmodel.(readdir(models_dir; join=true))
+        models = loadmetadata.(readdir(models_dir; join=true))
        isempty(models) && continue
-        tagging_backend = loadmodel(tagging_backend_path, true)
+        tagging_backend = loadmetadata(tagging_backend_path, false)
        tagging_backend["models"] = models
        push!(inventory, tagging_backend)
    end
@@ -234,14 +204,16 @@ end
function predict(lt_backend, backend_dir, model_instance, token)
    tagger = gettagger(lt_backend, backend_dir, model_instance, token)
+    make_dataset = apply_make_dataset(tagger, "predict")
    # blocking; should we run async and expose a token-specific status api call?
-    Taggers.predict(tagger; skip_make_dataset=true)
+    Taggers.predict(tagger; make_dataset=make_dataset)
end
function embed(lt_backend, backend_dir, model_instance, token)
    tagger = gettagger(lt_backend, backend_dir, model_instance, token)
+    make_dataset = apply_make_dataset(tagger, "embed")
    # blocking, like predict
-    Taggers.embed(tagger; skip_make_dataset=true)
+    Taggers.embed(tagger; make_dataset=make_dataset)
end
end
module Taggers
import PlanarLarvae.Formats, PlanarLarvae.Dataloaders
+using OrderedCollections: OrderedDict
+using JSON3
-export Tagger, isbackend, resetmodel, resetdata, train, predict, finetune, embed
+export Tagger, isbackend, resetmodel, resetdata, train, predict, finetune, embed,
+    loadmetadata, apply_make_dataset
struct Tagger
    backend_dir::String
@@ -300,7 +303,9 @@ function run(tagger, switch, kwargs)
    args = Any[]
    parsekwargs!(args, kwargs)
    cmd = tagging_backend_command(tagger)
-    Base.run(Cmd(`$cmd $switch $args`; dir=tagger.backend_dir))
+    cmd = Cmd(`$cmd $switch $args`; dir=tagger.backend_dir)
+    @info "Running command" cmd
+    Base.run(cmd)
end
function train(tagger::Tagger; pretrained_instance=None, kwargs...)
@@ -323,4 +328,56 @@ end
embed(tagger::Tagger; kwargs...) = run(tagger, "embed", kwargs)
function loadmetadata(dir, instance=true)
    metadata = nothing
    for filename in ("metadata", "metadata.json")
        if isfile(joinpath(dir, filename))
            metadata = JSON3.read(joinpath(dir, filename))
            break
        end
    end
    name = basename(dir)
    T = if instance
        AbstractString
    else
        Union{AbstractString, Vector{OrderedDict{AbstractString, AbstractString}}}
    end
    model = OrderedDict{AbstractString, T}(
        "name" => name,
        "description" => "",
        "homepage" => "",
        "make_dataset" => "",
    )
    if !isnothing(metadata)
        for key in keys(model)
            key′ = Symbol(key)
            if haskey(metadata, key′)
                model[key] = metadata[key′]
            end
        end
    end
    return model
end

function loadmetadata(tagger::Tagger, instance=true)
    if instance
        loadmetadata(Taggers.modeldir(tagger.backend_dir))
    else
        loadmetadata(tagger.backend_dir, true)
    end
end

function apply_make_dataset(tagger::Tagger, step)
    # see recipes/patch.sh for a note about the make_dataset entry
    @assert step in ("train", "finetune", "predict", "embed")
    metadata = loadmetadata(tagger, false)
    make_dataset = metadata["make_dataset"]
    apply_make_dataset = step in ("train", "finetune")
    if !isempty(make_dataset)
        apply_make_dataset = make_dataset == "always" || occursin(step, make_dataset)
        @debug "apply_make_dataset" metadata apply_make_dataset
    end
    return apply_make_dataset
end
end # module
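To make the behavior of `apply_make_dataset` concrete, the following standalone sketch mirrors its decision rule on a few example `make_dataset` values (`resolve` is an illustrative name, not part of the API):
```
# resolve(make_dataset, step): should the backend (re)generate its dataset for this step?
function resolve(make_dataset::AbstractString, step::AbstractString)
    # empty entry: default behavior, i.e. make_dataset for train and finetune only
    isempty(make_dataset) && return step in ("train", "finetune")
    # otherwise: "always", or a comma-separated list of steps mentioning `step`
    return make_dataset == "always" || occursin(step, make_dataset)
end

resolve("", "predict")                 # false: default applies
resolve("always", "embed")             # true
resolve("train, predict", "predict")   # true
resolve("train, finetune", "predict")  # false
```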
@@ -98,8 +98,8 @@ function Taggers.predict(model::Backends, file::String; metadata=nothing)
    tagger = Tagger(backend_dir, model_instance)
    #
    Taggers.push(model, file; metadata=metadata)
-    # TODO: make the skip_make_dataset option discoverable in the backend
-    predict(tagger; skip_make_dataset=true)
+    make_dataset = apply_make_dataset(tagger, "predict")
+    predict(tagger; make_dataset=make_dataset)
    labelfile = Taggers.pull(model, dirname(file))
    @assert length(labelfile) == 1
    return labelfile[1]
...
@@ -23,7 +23,7 @@ if ! [ -f scripts/install.sh ]; then
fi
scripts/install.sh --uninstall
-scripts/install.sh --with-backend --experimental
+scripts/install.sh --with-default-backend
#############
## Maestro ##
@@ -74,7 +74,7 @@ if [ -f LarvaTagger_test_data.tgz ]; then
else
# Not recommended; reproducibility is not guaranteed across hosts or architectures yet
(cd "$LTROOT" && \
-wget -O- https://dl.pasteur.fr/fop/ppk8GBQf/241127_LarvaTagger_test_data.tgz | tar zxv)
+wget -O- https://dl.pasteur.fr/fop/VVIMasjG/241127_LarvaTagger_test_data.tgz | tar zxv)
fi
if [ "$LOCAL_SCENARII" = "1" ]; then
...