diff --git a/README.md b/README.md
index f44928fe9c3a84e7f49eb76bbb12d285fee0f354..2e52430e3aec78f0e3632fd3147f1a9b01b15868 100644
--- a/README.md
+++ b/README.md
@@ -9,16 +9,24 @@ This project heavily depends on the [`TaggingBackends`](https://gitlab.pasteur.f
 MaggotUBA is an autoencoder trained on randomly sampled 20-time-step time segments drawn from the t5 and t15 databases, with a computational budget of 1000 training epochs.
 In its original "unsupervised" or self-supervised form, it reconstructs series of spines from a compressed latent representation.
 
-For the automatic tagging, the encoder is extracted and a classifier is stacked atop the encoder.
-On the same dataset, the combined encoder and classifier are (re-)trained to predict discrete behaviors.
+For automatic tagging, the encoder is combined with a classifier.
+On the same dataset, the combined encoder-classifier is (re-)trained to predict discrete behaviors.
 
-## Prototypes and validated taggers
+## Taggers
+
+### 6-behavior classification task
+
+*6-behavior* refers to *run*, *cast*, *back*, *hunch*, *roll* and *stop_large*.
+
+#### `20220418`
 
 As a first prototype, the [`20220418`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/tree/20220418) trained model is based on a simple random forest classifier, and only the classifier was trained; the encoder was not retrained.
 See module [`maggotuba.models.randomforest`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/20220418/src/maggotuba/models/randomforest.py).
 It was trained on the entire t5+t15 database.
 No interpolation was performed and the prototype does not properly handle data with different frame rates.
 
+#### `20221005` and `20221005-1`
+
 A second tagger called [`20221005`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/tree/20221005) involves a classifier with dense layers, and the encoder was fine-tuned while training the combined encoder+classifier.
 See modules [`maggotuba.models.trainers`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/20221005/src/maggotuba/models/trainers.py) and [`maggotuba.models.modules`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/20221005/src/maggotuba/models/modules.py).
 
@@ -26,6 +34,28 @@ This second tagger was dimensioned following a [parametric exploration for the 6
 It was trained on a subset of 5000 files from the t5 and t15 databases.
 Spines were/are linearly interpolated at 10 Hz in each time segment individually.
 
+The `20221005-1` tagger is identical. It was initially thought to implement a post-prediction correction step referred to as *ABA -> AAA*, but actually did not.
+
+#### `20221228`
+
+This tagger combines 25 taggers based on a 1.5-s time window and 25 taggers based on a 3-s time window, and resolves their predictions by voting (a bagging approach).
+Each of these simple taggers features 25 latent dimensions and a single dense layer as classifier, following another [parametric exploration for the 6-behavior classification task](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/design/notebooks/parametric_exploration_6-behavior_classification_2.ipynb).
+
+This complex tagger was not distributed in any `latest` Docker image. See experimental image [0.7.2-20221228](https://hub.docker.com/layers/flaur/larvatagger/0.7.2-20221228/images/sha256-bac83d7ec499da9e868af893b58f8dd7b75317e0b16d99b1dcc1d456aac3c8a0?context=explore).
+
+It runs 50 times slower and does not solve any of the issues of `20221005`.
+
+### 7-behavior classification task
+
+*7-behavior* refers to *run_large*, *cast_large*, *back_large*, *hunch_large*, *roll_large*, *stop_large* and *small_motion*.
+
+#### `20230111`
+
+To obtain a stronger default tagger, the `small_motion` action was reintroduced to lower the detection rate of hunches and rolls.
+
+The `20230111` tagger uses a 2-s time window and features 25 latent dimensions and a single dense layer as classifier.
+It applies a post-prediction rule referred to as *ABC -> AAC* that consists of replacing every single-step action with the previous action.
+
 ## Usage
 
 For installation, see [TaggingBackends' README](https://gitlab.pasteur.fr/nyx/TaggingBackends/-/tree/dev#recommended-installation).
@@ -36,25 +66,30 @@ All the [command arguments supported by `TaggingBackends`](https://gitlab.pasteu
 
 ### Automatic tagging
 
-Using the [`20221005`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/tree/20221005) branch, the `20221005` tagger can be called on a supported tracking data file with:
+Using the [`20230111`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/tree/20230111) branch, the `20230111` tagger can be called on a supported tracking data file with:
 ```
-poetry run tagging-backend predict --model-instance 20221005 --skip-make-dataset
+poetry run tagging-backend predict --model-instance 20230111 --skip-make-dataset
 ```
 
 The `--skip-make-dataset` option is optional.
 It only makes *tagging-backend* slightly faster.
 
-For the above command to work, the track data file must be placed (*e.g.* copied) in the `data/raw/20221005` directory, to be first created or cleared.
+For the above command to work, the track data file must be placed (*e.g.* copied) in the `data/raw/20230111` directory, which must first be created or cleared.
 
-The resulting label file can be found as *data/processed/20221005/predicted.label*.
+The resulting label file can be found as *data/processed/20230111/predicted.label*.
 Like all *.label* files, this file should be stored as a sibling of the corresponding track data file (in the same directory).
 
-Similarly, with an arbitrary tagger named, say *mytagger*, in the above explanation all occurences of `20221005` or *20221005* must be replaced by the tagger's name.
+Similarly, for an arbitrary tagger named, say, *mytagger*, all occurrences of `20230111` or *20230111* in the above explanation must be replaced by the tagger's name.
 For example, the input data file would go into *data/raw/mytagger*.
 
+#### On HPC clusters
+
+Simultaneously tagging multiple tracking data files results in file conflicts because the same data directories are used internally.
+Use the *larvatagger.jl* script of the [LarvaTagger.jl](https://gitlab.pasteur.fr/nyx/larvatagger.jl) project instead, with the `--data-isolation` argument.
+
 ### Retraining a tagger
 
-A new model instance can be trained on a data repository, using the `main` or `dev` branch of `MaggotUBA-adapter` (the `20221005` branch is also suitable) with:
+A new model instance can be trained on a data repository with:
 
 ```
 poetry run tagging-backend train --model-instance <tagger-name>
@@ -64,7 +99,11 @@ Similarly to the *predict* command, for this one to work, the data repository mu
 
 The above command will first load a pretrained model (`pretrained_models/default` in `MaggotUBA-adapter`) to determine additional parameters, such as whether to interpolate the spines or not and at which frequency, or the window length.
 
-The current default pretrained model involves linearly interpolating the spines at 10 Hz, and relies on a 20-time-step window (2 seconds). The dimensionality of the latent space is 100.
+Beware that the default pretrained model may depend on the branch you are on.
+
+The default pretrained model in the *20221005* branch involves linearly interpolating the spines at 10 Hz, and relies on a 20-time-step window (2 seconds). The dimensionality of the latent space is 100.
+
+The default pretrained model in the *20230111* branch similarly interpolates spines at 10 Hz and relies on a 20-time-step window (2 seconds), but features 25 latent dimensions only.
 
 Alternative pretrained models can be specified using the `--pretrained-model-instance` option.
 
@@ -72,9 +111,9 @@ The data files are discovered in the repository (more exactly in *data/raw/\<tag
 
 A subset of tags can be specified using the `--labels` option followed by a list of comma-separated tags.
 A two-level balancing rule is followed to randomly select time segments and thus form a training dataset in the shape of a *larva_dataset* hdf5 file.
-See also the [`make_dataset.py`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/20221005/src/maggotuba/data/make_dataset.py) script.
+See also the [`make_dataset.py`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/dev/src/maggotuba/data/make_dataset.py) script.
 
 Training operates in two steps, first pretraining the dense-layer classifier, second simultaneously fine-tuning the encoder and classifier.
-See also the [`train_model.py`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/20221005/src/maggotuba/models/train_model.py) script.
+See also the [`train_model.py`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/dev/src/maggotuba/models/train_model.py) script.
 
 This generates a new sub-directory in the `models` directory of the `MaggotUBA-adapter` project, which makes the trained model discoverable for automatic tagging (*predict* command).
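The README changes above describe linear interpolation of the spines at 10 Hz over a 20-time-step (2-second) window. As a rough, standalone illustration of what such resampling amounts to (this is not the project's preprocessing code; the `numpy`-based approach, variable names and toy data are assumptions), one might write:

```python
import numpy as np

# Toy spine time series: irregularly sampled timestamps (in seconds) with
# 10 features per time point.
rng = np.random.default_rng(0)
timestamps = np.sort(rng.uniform(0.0, 3.0, size=40))
spines = rng.random((40, 10))

# A 2-second window resampled at 10 Hz gives 20 time steps.
window_start = 0.5
target_times = window_start + np.arange(20) / 10.0  # 10 Hz sampling grid

# Linear interpolation, one spine coordinate at a time.
resampled = np.column_stack([
    np.interp(target_times, timestamps, spines[:, i])
    for i in range(spines.shape[1])
])
print(resampled.shape)  # (20, 10)
```

The resulting 20-by-10 array matches the `len_traj` and `n_features` values in `pretrained_models/default/autoencoder_config.json` below.
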
diff --git a/pretrained_models/default/autoencoder_config.json b/pretrained_models/default/autoencoder_config.json
index 2b55fb459c9b2f87c33c78442f86d6cd311efe5a..c91c44c77c558c21237a770fc95890387b944cf9 100644
--- a/pretrained_models/default/autoencoder_config.json
+++ b/pretrained_models/default/autoencoder_config.json
@@ -1,17 +1,17 @@
 {
-    "project_dir": "subset2_interp10_len20",
+    "project_dir": "",
     "seed": 100,
-    "exp_name": "latent-100",
-    "data_dir": "subset2_interp10_len20/larva_dataset_2022_10_05_20_20_100000.hdf5",
-    "raw_data_dir": "larva_dataset/t5-t15-subset2",
-    "log_dir": "subset2_interp10_len20/training_log/latent-100",
-    "exp_folder": "subset2_interp10_len20/training_log/latent-100",
-    "config": "subset2_interp10_len20/config-100.json",
+    "exp_name": "20230111",
+    "data_dir": "/pasteur/appa/scratch/flaurent/MaggotUBA-adapter/data/20230111/pretrain/20/1/larva_dataset_2023_01_12_20_20_100000.hdf5",
+    "raw_data_dir": "/pasteur/zeus/projets/p02/hecatonchire/screens",
+    "log_dir": "",
+    "exp_folder": "",
+    "config": "models/20230111/autoencoder_config.json",
     "num_workers": 4,
     "n_features": 10,
     "len_traj": 20,
     "len_pred": 20,
-    "dim_latent": 100,
+    "dim_latent": 25,
     "activation": "relu",
     "enc_filters": [
         128,
diff --git a/pretrained_models/default/best_validated_encoder.pt b/pretrained_models/default/best_validated_encoder.pt
index bd684ae54a3e7dfc504cfc709609d6bec80fe9ac..2eeea93eb9f5c2329726db457e6182260d9bd66a 100644
Binary files a/pretrained_models/default/best_validated_encoder.pt and b/pretrained_models/default/best_validated_encoder.pt differ
diff --git a/pyproject.toml b/pyproject.toml
index 63190e6f33740abb6cc0ca42c9669bf16aece0e6..53777e20537530cd2cc236d0077940209e2fd3a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "MaggotUBA-adapter"
-version = "0.8"
+version = "0.9"
 description = "Interface between MaggotUBA and the Nyx tagging UI"
 authors = ["François Laurent"]
 license = "MIT"
diff --git a/src/maggotuba/models/predict_model.py b/src/maggotuba/models/predict_model.py
index c1abd548e048cd4b73d07b6d9c9f2a1f45a15da0..12c5507ea7326a2906bf5de3b0666b9c392487ab 100644
--- a/src/maggotuba/models/predict_model.py
+++ b/src/maggotuba/models/predict_model.py
@@ -175,8 +175,13 @@ def apply_filters(labels, post_filters):
             # modify sequentially
             for k in range(1, len(labels)-1):
                 label = labels[k-1]
-                if labels[k-1] != label and label == labels[k+1]:
+                if labels[k] != label and label == labels[k+1]:
                     labels[k] = label
+        elif post_filter == 'ABC->AAC':
+            # modify sequentially
+            for k in range(1, len(labels)-1):
+                if labels[k-1] != labels[k] and labels[k] != labels[k+1]:
+                    labels[k] = labels[k-1]
         else:
             raise ValueError(f"filter not supported: {post_filter}")
     return labels
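For reference, the post-prediction filters added to `apply_filters` above can be exercised in isolation on a toy label sequence. The sketch below mirrors the *ABA -> AAA* and *ABC -> AAC* rules in a standalone function; the wrapper function and the example labels are illustrative, not part of the code base:

```python
def apply_post_filter(labels, post_filter):
    # Standalone mirror of the filter logic in apply_filters (see diff above).
    labels = list(labels)
    if post_filter == 'ABA->AAA':
        # A single-step action framed by the same action is overwritten.
        for k in range(1, len(labels) - 1):
            label = labels[k - 1]
            if labels[k] != label and label == labels[k + 1]:
                labels[k] = label
    elif post_filter == 'ABC->AAC':
        # Any single-step action is replaced by the previous action.
        for k in range(1, len(labels) - 1):
            if labels[k - 1] != labels[k] and labels[k] != labels[k + 1]:
                labels[k] = labels[k - 1]
    else:
        raise ValueError(f"filter not supported: {post_filter}")
    return labels

seq = ["run", "cast", "run", "run", "hunch", "roll", "roll"]
print(apply_post_filter(seq, 'ABA->AAA'))
# ['run', 'run', 'run', 'run', 'hunch', 'roll', 'roll']
print(apply_post_filter(seq, 'ABC->AAC'))
# ['run', 'run', 'run', 'run', 'run', 'roll', 'roll']
```

Both rules scan the sequence from left to right and modify it as they go, so earlier corrections can influence later ones, as in the `# modify sequentially` loops above.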