diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c65471d9fd04e51593e0f5fc829e2f8cc45f448f..8d36e3be68f696db797915902f9ea258839196b7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,7 +3,7 @@ - | julia --project=@. -e ' using Pkg - Pkg.build() + Pkg.instantiate() Pkg.test(coverage=true)' .coverage: coverage: /Test coverage (\d+\.\d+%)/ @@ -16,8 +16,8 @@ c, t = get_summary(process_folder()) using Printf @printf "Test coverage %.2f%%\n" 100c / t' -Julia 1.7: - image: julia:1.7 +Julia 1.9: + image: julia:1.9 extends: - .script - .coverage diff --git a/Manifest.toml b/Manifest.toml index 6d1d8c959c5e60c463a82ed16638feb85709be47..0810bd952b6a32a0a615582d4febadd8531d394d 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -312,11 +312,11 @@ version = "1.9.0" [[deps.PlanarLarvae]] deps = ["DelimitedFiles", "HDF5", "JSON3", "LinearAlgebra", "MAT", "Meshes", "OrderedCollections", "Random", "SHA", "StaticArrays", "Statistics", "StatsBase", "StructTypes"] -git-tree-sha1 = "0b711a20abae67f235e9f8f26fae41eddd04a766" +git-tree-sha1 = "ef6169e9f8705569925bef897704c7514b4d5f18" repo-rev = "main" repo-url = "https://gitlab.pasteur.fr/nyx/planarlarvae.jl" uuid = "c2615984-ef14-4d40-b148-916c85b43307" -version = "0.11.1" +version = "0.12.0" [[deps.PrecompileTools]] deps = ["Preferences"] diff --git a/Project.toml b/Project.toml index 616a9cf044b042ebc51e17b056564d6346f0c0d4..cc1dc08bbf812dfec729a13be24119c7069939a2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TaggingBackends" uuid = "e551f703-3b82-4335-b341-d497b48d519b" authors = ["François Laurent", "Institut Pasteur"] -version = "0.13.1" +version = "0.14" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/README.md b/README.md index 4f08624aa46f221c4ee637f4398f0e8c3b646c1b..1c23a60cc30e2067e40a01a8ad32afeae6f6078e 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,11 @@ A tagging backend, called *e.g.* `TaggingBackend`, is a Python project with the │ │ can be stored in this directory. 
│ └── processed <- Predicted labels from predict_model.py are │ expected in this directory. +│ ├── models <- Hyperparameters and weights of trained │ classifiers can be stored here. +├── pretrained_models <- Partially trained models the training procedure +│ starts from; optional. │ ├── pyproject.toml <- Project definition file for Poetry. ├── src @@ -40,6 +43,10 @@ A tagging backend, called *e.g.* `TaggingBackend`, is a Python project with the │ ├── train_model.py <- Trains the behavior tagging algorithm and │ │ stores the trained model in models/; │ │ optional. +│ ├── finetune_model.py <- Further trains the behavior tagging algorithm +│ │ and stores the retrained model as a new model +│ │ instance in models/; optional. +│ │ *Available since version 0.14*. │ └── predict_model.py <- Loads the trained model and features from │ data/interim, and moves the resulting │ labels in data/processed. @@ -51,7 +58,8 @@ A tagging backend, called *e.g.* `TaggingBackend`, is a Python project with the The above structure borrows elements from the [Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/) project template, adapted for use with [Poetry](https://python-poetry.org/). -The `src/<package_name>/{data,features,models}` directories can accommodate Python modules (in subpackages `<package_name>.{data,features,models}` respectively). +The `src/<package_name>/{data,features,models}` directories can accommodate Python modules +(in subpackages `<package_name>.{data,features,models}` respectively). For example, the model can be implemented as a Python class in an additional file in `src/<package_name>/models`, *e.g.* `mymodel.py`. In this case, an empty `__init__.py` file should be created in the same directory. @@ -59,8 +67,29 @@ In this case, an empty `__init__.py` file should be created in the same director As the Python package is installed, this custom module will be loadable from anywhere with `import <package_name>.models.mymodel`. 
-On the other hand, the `make_dataset.py`, `build_features.py`, `predict_model.py` and `train_model.py` are Python scripts, with a main program. -These scripts will be run using Poetry, from the project root. +On the other hand, the `make_dataset.py`, `build_features.py`, `predict_model.py`, +`train_model.py` and `finetune_model.py` are Python scripts, with a main program. +These scripts are run using Poetry, from the project root. +More exactly, although the Nyx tagging UI does not expect the backend to be a Python +project, the backend should be set up as a Poetry-managed virtual environment with the +`taggingbackends` package installed as a dependency, so that the backend can be operated +calling `poetry run tagging-backend [train|predict|finetune]`, which in turn +calls the above-mentioned Python scripts. + +*New in version 0.14*, fine-tuning: `finetune_model.py` differs from `train_model.py` as +it takes an existing trained model and further trains it. In contrast, `train_model.py` +trains a model from data only or a so-called *pretrained model*. + +For example, MaggotUBA-adapter trains a classifier on top of a pretrained encoder. +In this particular backend, `train_model.py` picks a pretrained encoder in the +`pretrained_models` directory and saves the resulting model (encoder+classifier) in the +`models` directory. `finetune_model.py` instead picks a model from the `models` directory +and saves the retrained model in `models` as well, under a different name (subdirectory). + +Note that the `pretrained_models` directory is included more for explanatory purposes. +It is not expected or checked for by the TaggingBackends logic, unlike all the other +directories and scripts mentioned above. The `pretrained_models` directory was introduced +by MaggotUBA-adapter. See example scripts in the `examplebackend` directory. @@ -82,8 +111,6 @@ as these subdirectories in `models` are looked for by the Nyx tagger UI. 
The `data` directory is automatically created by the `BackendExplorer` object, together with its `raw` and `processed` subdirectories, therefore there is no need to include these directories in the backend. -Although the Nyx tagger UI does not expect the project to include a Python package, a Poetry-managed virtual environment should be set up with the `taggingbackends` package installed, so that the command `poetry run tagging-backend` is available at the project root directory. - The `tests` directory is renamed `test` for compatibility with Julia projects. Python/Poetry do not need additional configuration to properly handle the tests. @@ -171,7 +198,7 @@ Note also that, with the above `export` expression, the `JULIA_PROJECT` environm To install a backend, taking MaggotUBA-adapter as an example: ``` -git clone --depth 1 --single-branch -b 20230129 https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter MaggotUBA +git clone --depth 1 --single-branch -b 20230311 https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter MaggotUBA cd MaggotUBA JULIA_PROJECT=$(pwd) poetry install ``` diff --git a/src/LarvaDatasets.jl b/src/LarvaDatasets.jl index 2f6f45c6d3e9aeaf4718d97e8bf3062c187366a9..babf42dfc47e7372f46addd22287ae8566120ebb 100644 --- a/src/LarvaDatasets.jl +++ b/src/LarvaDatasets.jl @@ -617,7 +617,13 @@ function new_write_larva_dataset_hdf5(output_dir, input_data; ratiobasedsampling(selectors, min_max_ratio, prioritylabel(includeall); seed=seed) end loader = DataLoader(repo, window, index) - buildindex(loader; unload=true) + try + buildindex(loader; unload=true) + catch err + # most likely error message: "collection must be non-empty" + err isa ArgumentError && @error "Most likely cause: no time segments could be isolated" + rethrow() + end total_sample_size = length(loader.index) classcounts, _ = Dataloaders.groupby(selectors, loader.index.targetcounts) # @@ -680,7 +686,7 @@ function new_write_larva_dataset_hdf5(output_dir, input_data; # ensure labels are ordered as provided in input; # see
https://gitlab.pasteur.fr/nyx/TaggingBackends/-/issues/24 h5["labels"] = labels - h5["label_counts"] = [classcounts[Symbol(label)] for label in labels] + h5["label_counts"] = [get(classcounts, Symbol(label), 0) for label in labels] end if !isnothing(frameinterval) attributes(g)["frame_interval"] = frameinterval diff --git a/src/taggingbackends/explorer.py b/src/taggingbackends/explorer.py index 955d71b38388a0d106c490da59cc96269810057c..286dae451ff1189f260eb44670bf7b3684c79807 100644 --- a/src/taggingbackends/explorer.py +++ b/src/taggingbackends/explorer.py @@ -62,6 +62,7 @@ class BackendExplorer: self._build_features = None self._train_model = None self._predict_model = None + self._finetune_model = None # self._sandbox = sandbox @@ -133,6 +134,13 @@ Cannot find any Python package in project root directory: if self._predict_model is not False: return self._predict_model + @property + def finetune_model(self): + if self._finetune_model is None: + self._finetune_model = self._locate_script("models", "finetune_model") + if self._finetune_model is not False: + return self._finetune_model + def _locate_script(self, subpkg, basename): basename = basename + ".py" in_root_dir = self.project_dir / basename diff --git a/src/taggingbackends/main.py b/src/taggingbackends/main.py index 931c471091176230421f3fe30acc8d0cba03b7da..166bc9c9d3012cae7ed355cff8c09e20b47e0a60 100644 --- a/src/taggingbackends/main.py +++ b/src/taggingbackends/main.py @@ -5,14 +5,18 @@ from taggingbackends.explorer import BackendExplorer, BackendExplorerDecoder, ge def help(_print=False): msg = """ -Usage: tagging-backend [train|predict] --model-instance <name> +Usage: tagging-backend [train|predict|finetune] --model-instance <name> + tagging-backend [train|finetune] ... --sample-size <N> + tagging-backend [train|finetune] ... --balancing-strategy <S> + tagging-backend [train|finetune] ... --include-all <secondary-label> + tagging-backend [train|finetune] ... 
--skip-make-dataset + tagging-backend [train|finetune] ... --skip-build-features + tagging-backend [train|finetune] ... --iterations <N> + tagging-backend [train|finetune] ... --seed <seed> tagging-backend train ... --labels <labels> --class-weights <weights> - tagging-backend train ... --sample-size <N> --balancing-strategy <S> tagging-backend train ... --frame-interval <I> --window-length <T> tagging-backend train ... --pretrained-model-instance <name> - tagging-backend train ... --include-all <secondary-label> - tagging-backend train ... --skip-make-dataset --skip-build-features - tagging-backend train ... --seed <seed> + tagging-backend finetune ... --original-model-instance <name> tagging-backend predict ... --make-dataset --build-features tagging-backend predict ... --sandbox <token> tagging-backend --help @@ -75,6 +79,17 @@ truly skip this step; the corresponding module is not loaded. Since version 0.10, `predict` makes `--skip-make-dataset` and `--skip-build-features` the default behavior. As a counterpart, it admits arguments `--make-dataset` and `--build-features`. + +New in version 0.14: the `finetune` switch loads a trained model and further +trains it on a similar dataset. The class labels and weights are inherited from +the trained model. The backend is responsible for storing the information but, +for example, MaggotUBA does not store the class weights. + +Fine-tuning is typically resorted to when the (re-)training dataset is small +(and similar enough to the original training data). As a consequence, some +classes may be underrepresented. While totally missing classes are properly +ignored, the data points of the underrepresented classes should be explicitly +unlabelled to be similarly excluded from the (re-)training dataset. 
""" if _print: print(msg) @@ -89,7 +104,7 @@ def main(fun=None): help(True) #sys.exit("too few input arguments; subcommand expected: 'train' or 'predict'") return - train_or_predict = sys.argv[1] + task = sys.argv[1] project_dir = model_instance = None input_files, labels = [], [] sample_size = window_length = frame_interval = None @@ -140,6 +155,9 @@ def main(fun=None): elif sys.argv[k] == "--pretrained-model-instance": k = k + 1 pretrained_model_instance = sys.argv[k] + elif sys.argv[k] == "--original-model-instance": + k = k + 1 + original_model_instance = sys.argv[k] elif sys.argv[k] == "--sandbox": k = k + 1 sandbox = sys.argv[k] @@ -167,12 +185,12 @@ def main(fun=None): if input_files: for file in input_files: backend.move_to_raw(file) - if make_dataset is None and train_or_predict == 'train': + if make_dataset is None and task in ('train', 'finetune'): make_dataset = True - if build_features is None and train_or_predict == 'train': + if build_features is None and task in ('train', 'finetune'): build_features = True if make_dataset: - make_dataset_kwargs = dict(labels_expected=train_or_predict == "train", + make_dataset_kwargs = dict(labels_expected=task in ('train', 'finetune'), balancing_strategy=balancing_strategy) if labels: make_dataset_kwargs["labels"] = labels @@ -192,6 +210,8 @@ def main(fun=None): logging.info("option --reuse-h5files is ignored in the absence of --trxmat-only") if pretrained_model_instance is not None: make_dataset_kwargs["pretrained_model_instance"] = pretrained_model_instance + if original_model_instance is not None: + make_dataset_kwargs["original_model_instance"] = original_model_instance if include_all: make_dataset_kwargs["include_all"] = include_all if seed is not None: @@ -199,9 +219,9 @@ def main(fun=None): backend._run_script(backend.make_dataset, **make_dataset_kwargs) if build_features: backend._run_script(backend.build_features) - if train_or_predict == "predict": + if task == "predict": 
backend._run_script(backend.predict_model, trailing=unknown_args) - else: + elif task == 'train': train_kwargs = dict(balancing_strategy=balancing_strategy) if pretrained_model_instance: train_kwargs["pretrained_model_instance"] = pretrained_model_instance @@ -210,6 +230,13 @@ def main(fun=None): if seed is not None: train_kwargs['seed'] = seed backend._run_script(backend.train_model, trailing=unknown_args, **train_kwargs) + elif task == 'finetune': + finetune_kwargs = dict(balancing_strategy=balancing_strategy) + if original_model_instance: + finetune_kwargs['original_model_instance'] = original_model_instance + if seed is not None: + finetune_kwargs['seed'] = seed + backend._run_script(backend.finetune_model, trailing=unknown_args, **finetune_kwargs) else: # called by make_dataset, build_features, train_model and predict_model backend = BackendExplorerDecoder().decode(sys.argv[1]) @@ -234,6 +261,8 @@ def main(fun=None): val_ = val.split(',') if val_[1:]: val = val_ + elif key == 'original_model_instance': + pass # do not try to convert to number elif isinstance(val, str): try: val = int(val)