From 52c7016ec4f9404427866541f4332b52f0ac1730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Laurent?= <francois.laurent@posteo.net> Date: Wed, 2 Apr 2025 13:26:27 +0200 Subject: [PATCH] feat: Taggers.apply_make_dataset, make_dataset metadata entry, version increment --- Project.toml | 2 +- doc/develop.md | 38 ++++++++++++++++++++++++- recipes/Dockerfile | 1 + recipes/patch.sh | 11 ++++++-- scripts/install.sh | 4 +-- src/REST/Client.jl | 7 +++-- src/REST/Model.jl | 42 +++++----------------------- src/Taggers.jl | 61 +++++++++++++++++++++++++++++++++++++++-- src/backends.jl | 4 +-- test/deploy_and_test.sh | 4 +-- 10 files changed, 124 insertions(+), 50 deletions(-) diff --git a/Project.toml b/Project.toml index 4707dab..82ad0ca 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LarvaTagger" uuid = "8b3b36f1-dfed-446e-8561-ea19fe966a4d" authors = ["François Laurent", "Institut Pasteur"] -version = "0.19.1" +version = "0.20" [deps] Bonito = "824d6782-a2ef-11e9-3a09-e5662e0c26f8" diff --git a/doc/develop.md b/doc/develop.md index ec368bc..6648bcc 100644 --- a/doc/develop.md +++ b/doc/develop.md @@ -22,4 +22,40 @@ On the *Julia* side, the lower-level functionalities are provided by the *Planar Similarly, the *TidyObservables.jl* project has unit tests and a GitLab workflow to run these tests on every commit. For the remaining part of the *LarvaTagger* project, high-level functional tests only are available. -These tests are available in the *LarvaTagger.jl* project, in the test directory, file *scenarii.sh*. They depend on [*shUnit2*](https://github.com/kward/shunit2). +These tests are available in the *LarvaTagger.jl* project, as file `test/deploy_and_test.sh`. They depend on [*shUnit2*](https://github.com/kward/shunit2). + +The `test/deploy_and_test.sh` script implicitly tests the `scripts/install.sh` script, and then runs the `test/scenarii.sh` script. 
+The `test/scenarii.sh` script requires some test data that are downloaded by the `test/deploy_and_test.sh` script. +If these test data cannot be fetched, please contact François Laurent so that the data are made available again (the download link periodically expires). + +## REST + +The REST API can be tested by running the `test/rest_server.sh` and `test/rest_client.sh` scripts. + +The `test/rest_server.sh` script is fully automatic and completes with a stack trace that results from killing the backend after the last test. + +The `test/rest_client.sh` script does not perform any actual test. +It launches a backend and a frontend, and the user is expected to manually operate the frontend to test the communication between the backend and the frontend. + +## Docker images + +The most complete test image can be built as follows: + +``` +docker=docker LARVATAGGER_IMAGE=flaur/larvatagger:latest scripts/larvatagger.sh build --dev +docker build -t larvatagger:bigfat -f recipes/Dockerfile.pasteurjanelia . +``` + +Subsequent tests will run the backend using the `larvatagger:bigfat` image: +``` +docker=docker LARVATAGGER_IMAGE="larvatagger:bigfat" scripts/larvatagger.sh backend +``` + +The frontend is more conveniently run from the source tree: +``` +scripts/larvatagger-gui.jl http://localhost:9285 +``` + +The `docker=docker` environment variable is required if the `podman` command is available. +The `scripts/larvatagger.sh` script falls back on using `podman` instead of `docker`, if `podman` is available, but it is recommended to perform tests using Docker. +In addition, at present, building the image works with Docker buildx only. 
diff --git a/recipes/Dockerfile b/recipes/Dockerfile index b83d356..c4d6989 100644 --- a/recipes/Dockerfile +++ b/recipes/Dockerfile @@ -83,6 +83,7 @@ RUN if [ -z $TAGGINGBACKENDS_BRANCH ]; then \ && if [ "$(echo $BACKEND | cut -d/ -f2)" = "main" ] || [ "$(echo $BACKEND | cut -d/ -f2)" = "dev" ]; then \ julia -e 'using Pkg; Pkg.add("JSON3")' \ && scripts/make_models.jl default \ + && cd $PROJECT_DIR \ && recipes/patch.sh; \ fi \ && rm -rf ~/.cache; \ diff --git a/recipes/patch.sh b/recipes/patch.sh index b32d491..42da07a 100755 --- a/recipes/patch.sh +++ b/recipes/patch.sh @@ -1,6 +1,7 @@ #!/bin/sh -# patch the taggers in a Docker image so that they include metadata.json files +# patch the taggers in a Docker image so that they include metadata.json files; +# ultimately the taggers should manage their metadata.json files themselves. if [ -d "MaggotUBA" ]; then if ! [ -f "MaggotUBA/metadata.json" ]; then @@ -37,13 +38,19 @@ EOF fi +# note about make_dataset: +# * default is equivalent to "train, finetune" +# * can be "always" or a comma-separated list of processing steps +# * valid steps are "train", "finetune", "predict", "embed" + if [ -d "PasteurJanelia" ]; then if ! 
[ -f "PasteurJanelia/metadata.json" ]; then cat <<"EOF" >PasteurJanelia/metadata.json { "name": "PasteurJanelia", "homepage": "https://gitlab.pasteur.fr/nyx/PasteurJanelia-adapter", - "description": "Action classifiers initially designed by JBM at Janelia" + "description": "Action classifiers initially designed by JBM at Janelia", + "make_dataset": "always" } EOF fi diff --git a/scripts/install.sh b/scripts/install.sh index d649e46..30f8023 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -180,7 +180,7 @@ else if [ -z "$JULIA_VERSION" ]; then JULIA_VERSION=1.10 - JULIA_CHANNEL=lts + JULIA_CHANNEL=1.10 else echo "Using environment variable: JULIA_VERSION= $JULIA_VERSION" if [ -z "$JULIA_CHANNEL" ]; then @@ -369,7 +369,7 @@ else activate() { # pyenv activation is necessary on WSL - command -v pyenv &>/dev/null && pyenv local $PYTHON_VERSION + command -v pyenv &>/dev/null && pyenv versions | grep -qF " $PYTHON_VERSION" && pyenv local $PYTHON_VERSION poetry env use $PYTHON_VERSION } diff --git a/src/REST/Client.jl b/src/REST/Client.jl index 611608e..93ca3c3 100644 --- a/src/REST/Client.jl +++ b/src/REST/Client.jl @@ -236,9 +236,10 @@ function Taggers.predict(back::LTBackend, file::String; metadata=nothing) connected(tagger) || connect!(tagger) Taggers.push(tagger, file, metadata) Taggers.predict(tagger) - labelfile = Taggers.pull(tagger, dirname(file)) - @assert length(labelfile) == 1 - return labelfile[1] + outputfiles = Taggers.pull(tagger, dirname(file)) + @assert !isempty(outputfiles) + length(outputfiles) == 1 || @warn "Multiple output files" outputfiles + return outputfiles[1] end end diff --git a/src/REST/Model.jl b/src/REST/Model.jl index 9effac9..d712235 100644 --- a/src/REST/Model.jl +++ b/src/REST/Model.jl @@ -1,6 +1,6 @@ module Model -import ..Taggers: Taggers, Tagger +import ..Taggers: Taggers, Tagger, loadmetadata, apply_make_dataset import HTTP: HTTP import JSON3 using OrderedCollections: OrderedDict @@ -187,45 +187,15 @@ function 
pullfile(lt_backend, backend_dir, model_instance, token, filename) return HTTP.Response(200, header, body) end -function loadmodel(dir, hasinstances=false) - metadata = nothing - for filename in ("metadata", "metadata.json") - if isfile(joinpath(dir, filename)) - metadata = JSON3.read(joinpath(dir, filename)) - break - end - end - name = basename(dir) - T = if hasinstances - Union{AbstractString, Vector{OrderedDict{AbstractString, AbstractString}}} - else - AbstractString - end - model = OrderedDict{AbstractString, T}( - "name" => name, - "description" => "", - "homepage" => "", - ) - if !isnothing(metadata) - for key in keys(model) - key′ = Symbol(key) - if haskey(metadata, key′) - model[key] = metadata[key′] - end - end - end - return model -end - function listtaggers(lt_backend) inventory = Vector{OrderedDict{String, Any}}() backends_dir = lt_backend.root[] for tagging_backend_path in readdir(backends_dir; join=true) Taggers.isbackend(tagging_backend_path) || continue models_dir = joinpath(tagging_backend_path, "models") - models = loadmodel.(readdir(models_dir; join=true)) + models = loadmetadata.(readdir(models_dir; join=true)) isempty(models) && continue - tagging_backend = loadmodel(tagging_backend_path, true) + tagging_backend = loadmetadata(tagging_backend_path, false) tagging_backend["models"] = models push!(inventory, tagging_backend) end @@ -234,14 +204,16 @@ end function predict(lt_backend, backend_dir, model_instance, token) tagger = gettagger(lt_backend, backend_dir, model_instance, token) + make_dataset = apply_make_dataset(tagger, "predict") # blocking; should we run async and expose a token-specific status api call? 
- Taggers.predict(tagger; skip_make_dataset=true) + Taggers.predict(tagger; make_dataset=make_dataset) end function embed(lt_backend, backend_dir, model_instance, token) tagger = gettagger(lt_backend, backend_dir, model_instance, token) + make_dataset = apply_make_dataset(tagger, "embed") # blocking, like predict - Taggers.embed(tagger; skip_make_dataset=true) + Taggers.embed(tagger; make_dataset=make_dataset) end end diff --git a/src/Taggers.jl b/src/Taggers.jl index e7864cc..6dad6a1 100644 --- a/src/Taggers.jl +++ b/src/Taggers.jl @@ -1,8 +1,11 @@ module Taggers import PlanarLarvae.Formats, PlanarLarvae.Dataloaders +using OrderedCollections: OrderedDict +using JSON3 -export Tagger, isbackend, resetmodel, resetdata, train, predict, finetune, embed +export Tagger, isbackend, resetmodel, resetdata, train, predict, finetune, embed, + loadmetadata, apply_make_dataset struct Tagger backend_dir::String @@ -300,7 +303,9 @@ function run(tagger, switch, kwargs) args = Any[] parsekwargs!(args, kwargs) cmd = tagging_backend_command(tagger) - Base.run(Cmd(`$cmd $switch $args`; dir=tagger.backend_dir)) + cmd = Cmd(`$cmd $switch $args`; dir=tagger.backend_dir) + @info "Running command" cmd + Base.run(cmd) end function train(tagger::Tagger; pretrained_instance=None, kwargs...) @@ -323,4 +328,56 @@ end embed(tagger::Tagger; kwargs...) 
= run(tagger, "embed", kwargs) +function loadmetadata(dir, instance=true) + metadata = nothing + for filename in ("metadata", "metadata.json") + if isfile(joinpath(dir, filename)) + metadata = JSON3.read(joinpath(dir, filename)) + break + end + end + name = basename(dir) + T = if instance + AbstractString + else + Union{AbstractString, Vector{OrderedDict{AbstractString, AbstractString}}} + end + model = OrderedDict{AbstractString, T}( + "name" => name, + "description" => "", + "homepage" => "", + "make_dataset" => "", + ) + if !isnothing(metadata) + for key in keys(model) + key′ = Symbol(key) + if haskey(metadata, key′) + model[key] = metadata[key′] + end + end + end + return model +end + +function loadmetadata(tagger::Tagger, instance=true) + if instance + loadmetadata(Taggers.modeldir(tagger.backend_dir)) + else + loadmetadata(tagger.backend_dir, true) + end +end + +function apply_make_dataset(tagger::Tagger, step) + # see recipes/patch.sh for a note about the make_dataset entry; listed steps are matched as whole tokens, not substrings + @assert step in ("train", "finetune", "predict", "embed") + metadata = loadmetadata(tagger, false) + make_dataset = metadata["make_dataset"] + apply_make_dataset = step in ("train", "finetune") + if !isempty(make_dataset) + apply_make_dataset = make_dataset == "always" || step in split(make_dataset, r"[,;\s]+"; keepempty=false) + @debug "apply_make_dataset" metadata apply_make_dataset + end + return apply_make_dataset +end + end # module diff --git a/src/backends.jl b/src/backends.jl index d734ad1..a3ad720 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -98,8 +98,8 @@ function Taggers.predict(model::Backends, file::String; metadata=nothing) tagger = Tagger(backend_dir, model_instance) # Taggers.push(model, file; metadata=metadata) - # TODO: make the skip_make_dataset option discoverable in the backend - predict(tagger; skip_make_dataset=true) + make_dataset = apply_make_dataset(tagger, "predict") + predict(tagger; make_dataset=make_dataset) labelfile = Taggers.pull(model, dirname(file)) @assert 
length(labelfile) == 1 return labelfile[1] diff --git a/test/deploy_and_test.sh b/test/deploy_and_test.sh index 942ffd2..b0c0113 100755 --- a/test/deploy_and_test.sh +++ b/test/deploy_and_test.sh @@ -23,7 +23,7 @@ if ! [ -f scripts/install.sh ]; then fi scripts/install.sh --uninstall -scripts/install.sh --with-backend --experimental +scripts/install.sh --with-default-backend ############# ## Maestro ## @@ -74,7 +74,7 @@ if [ -f LarvaTagger_test_data.tgz ]; then else # Not recommended; reproducibility is not guarantee across hosts or architectures yet (cd "$LTROOT" && \ - wget -O- https://dl.pasteur.fr/fop/ppk8GBQf/241127_LarvaTagger_test_data.tgz | tar zxv) + wget -O- https://dl.pasteur.fr/fop/VVIMasjG/241127_LarvaTagger_test_data.tgz | tar zxv) fi if [ "$LOCAL_SCENARII" = "1" ]; then -- GitLab