diff --git a/README.md b/README.md
index 2e52430e3aec78f0e3632fd3147f1a9b01b15868..991dcab918725606d955eafff7e2ec0266a99667 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ This project heavily depends on the [`TaggingBackends`](https://gitlab.pasteur.f

 ## Principle

-MaggotUBA is an autoencoder trained on randomly sampled 20-time-step time segments drawn from the t5 and t15 databases, with a computational budget of 1000 training epochs.
+MaggotUBA is an autoencoder trained on randomly sampled 20-time-step time segments drawn from the t5 and t15 databases (t15 only for the current default), with a computational budget of 1,000 training epochs (10,000 for the current default).
 In its original "unsupervised" or self-supervised form, it reconstructs series of spines from a compressed latent representation.

 For the automatic tagging, the encoder is combined with a classifier.
@@ -56,6 +56,19 @@ As a stronger default tagger, the `small_motion` was reintroduced to lower the d
 The `20230111` tagger uses a 2-s time window, features 25 latent dimensions and a single dense layer as classifier.
 It applies a post-prediction rule referred to as *ABC -> AAC* that consists in correcting all single-step actions with the previous action.

+#### `20230129`
+
+The previous tagger, `20230111`, revealed a [temporal leakage issue](https://gitlab.pasteur.fr/nyx/larvatagger.jl/-/issues/88) that might have affected all previous taggers.
+
+A similar tagger called `20230129` has been proposed to mitigate this issue.
+This tagger shares the same characteristics as `20230111` but differs in three important aspects:
+
+* the number of training epochs was raised from 1,000 to 10,000 to let the original features be largely forgotten,
+* the training stage involved more data: 1,200,235 time segments were used instead of 100,000; these data were unbalanced, and training was performed with class weighting as per the newly introduced balancing strategy `auto` (see https://gitlab.pasteur.fr/nyx/larvatagger.jl/-/issues/92),
+* pretraining and training data were drawn from t15 only (unlike previous taggers, which were pretrained and trained with data from both t15 and t5).
+
+Note that the last difference was not meant to improve performance. The `20230129` tagger was trained this way to study its performance on t5, and was kept as is after it showed better properties on t5 data (less temporal leakage, and fewer hunches and rolls except at stimulus onset).
+
 ## Usage

 For installation, see [TaggingBackends' README](https://gitlab.pasteur.fr/nyx/TaggingBackends/-/tree/dev#recommended-installation).
@@ -66,20 +79,20 @@ All the [command arguments supported by `TaggingBackends`](https://gitlab.pasteu

 ### Automatic tagging

-Using the [`20230111`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/tree/20230111) branch, the `20230111` tagger can be called on a supported tracking data file with:
+Using the [`20230129`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/tree/20230129) branch, the `20230129` tagger can be called on a supported tracking data file with:
 ```
-poetry run tagging-backend predict --model-instance 20230111 --skip-make-dataset
+poetry run tagging-backend predict --model-instance 20230129
 ```
-The `--skip-make-dataset` option is optional. It only makes *tagging-backend* slightly faster.
+Note: since `TaggingBackends==0.10`, `--skip-make-dataset` is the default behavior. Pass `--make-dataset` instead to restore the former default.

-For the above command to work, the track data file must be placed (*e.g.* copied) in the `data/raw/20230111` directory, to be first created or cleared.
+For the above command to work, the track data file must be placed (*e.g.* copied) in the `data/raw/20230129` directory, which must first be created or cleared.

-The resulting label file can be found as *data/processed/20230111/predicted.label*.
+The resulting label file can be found as *data/processed/20230129/predicted.label*.
 Like all *.label* files, this file should be stored as a sibling of the corresponding track data file (in the same directory).

-Similarly, with an arbitrary tagger named, say *mytagger*, in the above explanation all occurences of `20230111` or *20230111* must be replaced by the tagger's name.
+Similarly, for an arbitrary tagger named, say, *mytagger*, all occurrences of `20230129` or *20230129* in the above explanation must be replaced by the tagger's name.
 For example, the input data file would go into *data/raw/mytagger*.

 #### On HPC clusters
@@ -103,7 +116,7 @@ Beware that the default pretrained model may depend on the branch you are on.

 The default pretrained model in the *20221005* branch involves linearly interpolating the spines at 10 Hz, and relies on a 20-time-step window (2 seconds).
 The dimensionality of the latent space is 100.

-The default pretrained model in the *20230111* branch similarly interpolates spines at 10 Hz and relies on a 20-time-step window (2 seconds), but features 25 latent dimensions only.
+The default pretrained models in the *20230111* and *20230129* branches similarly interpolate spines at 10 Hz and rely on a 20-time-step window (2 seconds), but feature only 25 latent dimensions.

 Alternative pretrained models can be specified using the `--pretrained-model-instance` option.
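The *ABC -> AAC* rule mentioned above replaces every action that lasts a single time step with the action that precedes it; the filter itself ships with the backend and, in this changeset, is merely registered by name through the `post_filters` entry added to `train_model.py` below. The following is a rough, standalone sketch of the rule only — the `abc_to_aac` helper is hypothetical and not part of the code base:

```python
from typing import List

def abc_to_aac(labels: List[str]) -> List[str]:
    """Replace single-time-step actions with the previous action (ABC -> AAC)."""
    corrected = list(labels)
    for i in range(1, len(corrected) - 1):
        # a label that differs from both of its neighbours lasts a single time step
        if corrected[i] != corrected[i - 1] and corrected[i] != corrected[i + 1]:
            corrected[i] = corrected[i - 1]
    return corrected

print(abc_to_aac(["run", "cast", "run", "run", "hunch", "hunch"]))
# ['run', 'run', 'run', 'run', 'hunch', 'hunch']
```

The actual implementation may treat track boundaries and per-larva series differently; the sketch only illustrates the correction described above.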
diff --git a/pretrained_models/default/autoencoder_config.json b/pretrained_models/default/autoencoder_config.json
index c91c44c77c558c21237a770fc95890387b944cf9..bfdf0ffc77df41f08a6d97f28f0550bef2a63435 100644
--- a/pretrained_models/default/autoencoder_config.json
+++ b/pretrained_models/default/autoencoder_config.json
@@ -1,12 +1,12 @@
 {
     "project_dir": "",
     "seed": 100,
-    "exp_name": "20230111",
-    "data_dir": "/pasteur/appa/scratch/flaurent/MaggotUBA-adapter/data/20230111/pretrain/20/1/larva_dataset_2023_01_12_20_20_100000.hdf5",
-    "raw_data_dir": "/pasteur/zeus/projets/p02/hecatonchire/screens",
+    "exp_name": "20230129 -- see https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/blob/design/scripts/maestro/make_20230129_datasets.py and related scripts",
+    "data_dir": "/pasteur/appa/scratch/flaurent/MaggotUBA-adapter/data/20230129/pretrain/20/1/larva_dataset_2023_01_29_20_20_108440.hdf5",
+    "raw_data_dir": "/pasteur/zeus/projets/p02/hecatonchire/screens/t15",
     "log_dir": "",
     "exp_folder": "",
-    "config": "models/20230111/autoencoder_config.json",
+    "config": "models/20230129/autoencoder_config.json",
     "num_workers": 4,
     "n_features": 10,
     "len_traj": 20,
@@ -87,7 +87,7 @@
     "init": "kaiming",
     "n_clusters": 2,
     "dim_reduc": "UMAP",
-    "optim_iter": 1000,
+    "optim_iter": 10000,
     "pseudo_epoch": 100,
     "batch_size": 128,
     "lr": 0.005,
diff --git a/pretrained_models/default/best_validated_encoder.pt b/pretrained_models/default/best_validated_encoder.pt
index 2eeea93eb9f5c2329726db457e6182260d9bd66a..7041ba99f0859d1fc85826541580f5432acf0ad4 100644
Binary files a/pretrained_models/default/best_validated_encoder.pt and b/pretrained_models/default/best_validated_encoder.pt differ
diff --git a/pyproject.toml b/pyproject.toml
index 7043db41154cf4c0114f944e4556a55454242454..d37052234b84699c0c6a6f92c1b1e200799b602f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "MaggotUBA-adapter"
-version = "0.9.1"
+version = "0.11.0"
 description = "Interface between MaggotUBA and the Nyx tagging UI"
 authors = ["François Laurent"]
 license = "MIT"
diff --git a/src/maggotuba/models/modules.py b/src/maggotuba/models/modules.py
index 9d197a3885a6719b3003ce793b84e5d0e4fa4a2f..78ec42e4bf03ba2b5d7859f1b41b8f3a5d7d2ae5 100644
--- a/src/maggotuba/models/modules.py
+++ b/src/maggotuba/models/modules.py
@@ -313,7 +313,8 @@ class DeepLinear(nn.Module):

 class MaggotClassifier(MaggotModule):
     def __init__(self, path, behavior_labels=[], n_latent_features=None,
-            n_layers=1, cfgfile=None, ptfile="trained_classifier.pt"):
+            n_layers=1, n_iterations=None, cfgfile=None,
+            ptfile="trained_classifier.pt"):
         super().__init__(path, cfgfile, ptfile)
         try: # try load config file, if any
             self.config
@@ -329,6 +330,16 @@ class MaggotClassifier(MaggotModule):
                     weight_init="xavier",
                     loss="cross-entropy",
                     optimizer="adam")
+        if n_iterations is not None:
+            if isinstance(n_iterations, str):
+                n_iterations = map(int, n_iterations.split(','))
+            if isinstance(n_iterations, int):
+                n_pretraining_iter = n_iterations // 2
+                n_finetuning_iter = n_iterations // 2
+            else:
+                n_pretraining_iter, n_finetuning_iter = n_iterations
+            self.config['pretraining_iter'] = n_pretraining_iter
+            self.config['finetuning_iter'] = n_finetuning_iter

     @classmethod
     def load_model(cls, config, path):
@@ -365,13 +376,22 @@ class MaggotClassifier(MaggotModule):
     def n_layers(self):
         return self.config["clf_depth"] + 1

+    @property
+    def n_pretraining_iter(self):
+        return self.config.get('pretraining_iter', None)
+
+    @property
+    def n_finetuning_iter(self):
+        return self.config.get('finetuning_iter', None)
+
 class SupervisedMaggot(nn.Module):
-    def __init__(self, cfgfilepath, behaviors=[], n_layers=1):
+    def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_epochs=None):
         super().__init__()
         if behaviors: # the model is only pre-trained
             self.encoder = PretrainedMaggotEncoder(cfgfilepath)
             self.clf = MaggotClassifier(self.encoder.path / "clf_config.json",
-                    behaviors, self.encoder.config["dim_latent"], n_layers)
+                    behaviors, self.encoder.config["dim_latent"], n_layers,
+                    n_epochs)
         else: # the model has been retrained
             self.clf = MaggotClassifier(cfgfilepath)
             self.encoder = MaggotEncoder(self.clf.config["autoencoder_config"],
@@ -398,15 +418,35 @@ class SupervisedMaggot(nn.Module):
         self.encoder.to(device)
         self.clf.to(device)

+    @property
+    def n_pretraining_iter(self):
+        n = self.clf.n_pretraining_iter
+        if n is None:
+            enc = self.encoder
+            n = enc.config['optim_iter']
+            if enc.was_pretrained():
+                n = n // 2
+        return n
+
+    @property
+    def n_finetuning_iter(self):
+        n = self.clf.n_finetuning_iter
+        if n is None:
+            enc = self.encoder
+            n = enc.config['optim_iter']
+            if enc.was_pretrained():
+                n = n // 2
+        return n
+
 class MultiscaleSupervisedMaggot(nn.Module):
-    def __init__(self, cfgfilepath, behaviors=[], n_layers=1):
+    def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_iterations=None):
         super().__init__()
         if behaviors: # the model is only pre-trained
             self.encoders = MaggotEncoders(cfgfilepath, cls=PretrainedMaggotEncoder)
             path = next(iter(self.encoders)).path.parent
             n_latent_features = sum(enc.config["dim_latent"] for enc in self.encoders)
             self.clf = MaggotClassifier(path / "clf_config.json",
-                    behaviors, n_latent_features, n_layers)
+                    behaviors, n_latent_features, n_layers, n_iterations)
         else: # the model has been retrained
             self.clf = MaggotClassifier(cfgfilepath)
             self.encoders = MaggotEncoders(self.clf.config["autoencoder_config"],
@@ -426,6 +466,26 @@ class MultiscaleSupervisedMaggot(nn.Module):
         self.clf.model # force parameter loading or initialization
         return super().parameters(self)

+    @property
+    def n_pretraining_iter(self):
+        n = self.clf.n_pretraining_iter
+        if n is None:
+            any_enc = self.encoders[0]
+            n = any_enc.config['optim_iter']
+            if any_enc.was_pretrained():
+                n = n // 2
+        return n
+
+    @property
+    def n_finetuning_iter(self):
+        n = self.clf.n_finetuning_iter
+        if n is None:
+            any_enc = self.encoders[0]
+            n = any_enc.config['optim_iter']
+            if any_enc.was_pretrained():
+                n = n // 2
+        return n
+
 """
 Bagging for `SupervisedMaggot`.

@@ -436,9 +496,10 @@ Bags of taggers are stored so that the models directory only contains
 subdirectories, each subdirectory specifying an individual tagger.
 """
 class MaggotBag(nn.Module):
-    def __init__(self, paths, behaviors=[], n_layers=1, cls=SupervisedMaggot):
+    def __init__(self, paths, behaviors=[], n_layers=1, n_iterations=None,
+            cls=SupervisedMaggot):
         super().__init__()
-        self.maggots = [cls(path, behaviors, n_layers) for path in paths]
+        self.maggots = [cls(path, behaviors, n_layers, n_iterations) for path in paths]
         self._lead_maggot = None

     def forward(self, x):
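The changes above thread a new iteration budget through `MaggotClassifier`, `SupervisedMaggot`, `MultiscaleSupervisedMaggot` and `MaggotBag`. The resolution logic can be summarized as follows; `resolve_iterations` is a free-standing illustration, not a function of the code base:

```python
def resolve_iterations(n_iterations, optim_iter=1000, was_pretrained=True):
    """Return (n_pretraining_iter, n_finetuning_iter) as resolved in modules.py."""
    if n_iterations is None:
        # legacy behaviour: fall back to the encoder's optim_iter,
        # halved between the two training phases when the encoder was pretrained
        n = optim_iter // 2 if was_pretrained else optim_iter
        return n, n
    if isinstance(n_iterations, str):
        n_iterations = tuple(map(int, n_iterations.split(',')))
    if isinstance(n_iterations, int):
        # a single budget is split evenly between the two phases
        return n_iterations // 2, n_iterations // 2
    n_pretraining_iter, n_finetuning_iter = n_iterations
    return n_pretraining_iter, n_finetuning_iter

print(resolve_iterations(10000))        # (5000, 5000)
print(resolve_iterations("1000,9000"))  # (1000, 9000)
print(resolve_iterations(None))         # (500, 500)
```

In `MaggotClassifier`, the two resolved values are persisted in the classifier config as `pretraining_iter` and `finetuning_iter`; the `n_pretraining_iter`/`n_finetuning_iter` properties fall back to halving `optim_iter` so that models trained before this change keep their previous schedule.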
""" class MaggotBag(nn.Module): - def __init__(self, paths, behaviors=[], n_layers=1, cls=SupervisedMaggot): + def __init__(self, paths, behaviors=[], n_layers=1, n_iterations=None, + cls=SupervisedMaggot): super().__init__() - self.maggots = [cls(path, behaviors, n_layers) for path in paths] + self.maggots = [cls(path, behaviors, n_layers, n_iterations) for path in paths] self._lead_maggot = None def forward(self, x): diff --git a/src/maggotuba/models/predict_model.py b/src/maggotuba/models/predict_model.py index 12c5507ea7326a2906bf5de3b0666b9c392487ab..d050e975de9a5351293d44efda3085b0291fe227 100644 --- a/src/maggotuba/models/predict_model.py +++ b/src/maggotuba/models/predict_model.py @@ -22,9 +22,11 @@ def predict_model(backend, **kwargs): if not input_files: input_files = backend.list_input_files(group_by_directories=True) assert 0 < len(input_files) + # initialize output labels input_files_and_labels = backend.prepare_labels(input_files) assert 0 < len(input_files_and_labels) + # load the model model_files = backend.list_model_files() config_files = [file @@ -46,14 +48,14 @@ def predict_model(backend, **kwargs): model = MultiscaleMaggotTrainer(config_file) else: model = MaggotBagging(config_files) - # + + # call the `predict` logic on the input data files if len(input_files) == 1: input_files = next(iter(input_files.values())) if len(input_files) == 1: file = input_files[0] if file.name.startswith("larva_dataset_") and file.name.endswith(".hdf5"): return predict_larva_dataset(backend, model, file, **kwargs) - # predict_individual_data_files(backend, model, input_files_and_labels) def predict_individual_data_files(backend, model, input_files_and_labels): diff --git a/src/maggotuba/models/train_model.py b/src/maggotuba/models/train_model.py index 28d80c23a96327d664a12b526cb12c9140d31936..a27f77b35a7a15374056119c8342b9f197980fd1 100644 --- a/src/maggotuba/models/train_model.py +++ b/src/maggotuba/models/train_model.py @@ -6,33 +6,55 @@ import json import glob def train_model(backend, layers=1, pretrained_model_instance="default", - subsets=(1, 0, 0), rng_seed=None, balancing_strategy='maggotuba', **kwargs): - # make_dataset generated or moved the larva_dataset file into data/interim/{instance}/ - #larva_dataset_file = backend.list_interim_files("larva_dataset_*.hdf5") # recursive - larva_dataset_file = glob.glob(str(backend.interim_data_dir() / "larva_dataset_*.hdf5")) # not recursive (faster) + subsets=(1, 0, 0), rng_seed=None, iterations=1000, **kwargs): + # list training data files; + # we actually expect a single larva_dataset file that make_dataset generated + # or moved into data/interim/{instance}/ + #larva_dataset_file = backend.list_interim_files("larva_dataset_*.hdf5") # this one is recursive + larva_dataset_file = glob.glob(str(backend.interim_data_dir() / "larva_dataset_*.hdf5")) # this other one is not recursive assert len(larva_dataset_file) == 1 - # subsets=(1, 0, 0) => all data are training data; no validation or test subsets + + # instanciate a LarvaDataset object, that is similar to a PyTorch DataLoader + # add can initialize a Labels object + # note: subsets=(1, 0, 0) => all data are training data; no validation or test subsets dataset = LarvaDataset(larva_dataset_file[0], new_generator(rng_seed), subsets=subsets, **kwargs) - dataset.weight_classes = isinstance(balancing_strategy, str) and (balancing_strategy.lower() == 'auto') + + # initialize a Labels object labels = dataset.labels assert 0 < len(labels) + + # the labels may be bytes objects; convert to str labels 
diff --git a/src/maggotuba/models/train_model.py b/src/maggotuba/models/train_model.py
index 28d80c23a96327d664a12b526cb12c9140d31936..a27f77b35a7a15374056119c8342b9f197980fd1 100644
--- a/src/maggotuba/models/train_model.py
+++ b/src/maggotuba/models/train_model.py
@@ -6,33 +6,55 @@ import json
 import glob

 def train_model(backend, layers=1, pretrained_model_instance="default",
-        subsets=(1, 0, 0), rng_seed=None, balancing_strategy='maggotuba', **kwargs):
-    # make_dataset generated or moved the larva_dataset file into data/interim/{instance}/
-    #larva_dataset_file = backend.list_interim_files("larva_dataset_*.hdf5") # recursive
-    larva_dataset_file = glob.glob(str(backend.interim_data_dir() / "larva_dataset_*.hdf5")) # not recursive (faster)
+        subsets=(1, 0, 0), rng_seed=None, iterations=1000, **kwargs):
+    # list the training data files;
+    # we actually expect a single larva_dataset file that make_dataset generated
+    # or moved into data/interim/{instance}/
+    #larva_dataset_file = backend.list_interim_files("larva_dataset_*.hdf5") # this one is recursive
+    larva_dataset_file = glob.glob(str(backend.interim_data_dir() / "larva_dataset_*.hdf5")) # this other one is not recursive
     assert len(larva_dataset_file) == 1
-    # subsets=(1, 0, 0) => all data are training data; no validation or test subsets
+
+    # instantiate a LarvaDataset object, which is similar to a PyTorch DataLoader
+    # and can initialize a Labels object
+    # note: subsets=(1, 0, 0) => all data are training data; no validation or test subsets
     dataset = LarvaDataset(larva_dataset_file[0], new_generator(rng_seed),
             subsets=subsets, **kwargs)
-    dataset.weight_classes = isinstance(balancing_strategy, str) and (balancing_strategy.lower() == 'auto')
+
+    # initialize a Labels object
     labels = dataset.labels
     assert 0 < len(labels)
+
+    # the labels may be bytes objects; convert to str
     labels = labels if isinstance(labels[0], str) else [s.decode() for s in labels]

+    # copy and load the pretrained model into the model instance directory
     if isinstance(pretrained_model_instance, str):
         config_file = import_pretrained_model(backend, pretrained_model_instance)
-        model = make_trainer(config_file, labels, layers)
+        model = make_trainer(config_file, labels, layers, iterations)
     else:
         pretrained_model_instances = pretrained_model_instance
         config_files = import_pretrained_models(backend, pretrained_model_instances)
-        model = make_trainer(config_files, labels, layers)
-    # fine-tune and save the model
+        model = make_trainer(config_files, labels, layers, iterations)
+
+    # fine-tune the pretrained model on the loaded dataset
     model.train(dataset)
+
+    # add post-prediction rule ABC -> AAC;
+    # see https://gitlab.pasteur.fr/nyx/larvatagger.jl/-/issues/62
+    model.clf_config['post_filters'] = ['ABC->AAC']
+
+    # save the model
     print(f"saving model \"{backend.model_instance}\"")
     model.save()

 # TODO: merge the below two functions

+"""
+    The files of the pretrained model are located in the `pretrained_models`
+    directory. Importing a pretrained model consists in creating a directory in
+    the `models` directory, named by the instance, and copying the model files.
+    The train step will make more files in the model instance directory.
+"""
 def import_pretrained_model(backend, pretrained_model_instance):
     pretrained_autoencoder_dir = backend.project_dir / "pretrained_models" / pretrained_model_instance
     config_file = None
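The class weighting behind the balancing strategy `auto` advertised in the README is not implemented in this file; the revision being replaced only toggled it through the `dataset.weight_classes` flag removed above. Purely as an illustration of the general idea — inverse-frequency weights passed to a cross-entropy loss — and with none of the names below taken from the code base:

```python
import torch
import torch.nn as nn

def inverse_frequency_weights(class_indices, n_classes):
    """Weight each class by the inverse of its frequency in the training set."""
    counts = torch.bincount(torch.as_tensor(class_indices), minlength=n_classes).float()
    return counts.sum() / (n_classes * counts.clamp(min=1.0))

# toy example: the rare class 2 gets the largest weight
weights = inverse_frequency_weights([0, 0, 0, 1, 1, 2], n_classes=3)
criterion = nn.CrossEntropyLoss(weight=weights)
print(weights)  # tensor([0.6667, 1.0000, 2.0000])
```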
""" class MaggotTrainer: - def __init__(self, cfgfilepath, behaviors=[], n_layers=1, + def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_iterations=None, average_body_length=1.0, device=device): - self.model = SupervisedMaggot(cfgfilepath, behaviors, n_layers) + self.model = SupervisedMaggot(cfgfilepath, behaviors, n_layers, n_iterations) self.average_body_length = average_body_length # usually set later self.device = device @@ -71,7 +71,7 @@ class MaggotTrainer: def pad(self, target_t, defined_t, data): if data.shape[0] == 1: - return data + return np.repeat(data, len(target_t), axis=0) else: head = searchsortedfirst(target_t, defined_t[0]) tail = len(target_t) - (searchsortedlast(target_t, defined_t[-1]) + 1) @@ -166,14 +166,12 @@ class MaggotTrainer: model.train() # this only sets the model in training mode (enables gradients) model.to(self.device) criterion = nn.CrossEntropyLoss(**kwargs) - nsteps = self.config['optim_iter'] grad_clip = self.config['grad_clip'] # pre-train the classifier with static encoder weights if model.encoder.was_pretrained(): - nsteps = nsteps // 2 optimizer = torch.optim.Adam(model.clf.parameters()) print("pre-training the classifier...") - for step in range(nsteps): + for step in range(self.model.n_pretraining_iter): optimizer.zero_grad() # TODO: add an option for renormalizing the input data, expected = self.draw(dataset) @@ -186,7 +184,7 @@ class MaggotTrainer: optimizer = torch.optim.Adam(model.parameters()) print(("fine-tuning" if model.encoder.was_pretrained() else "training") + \ " the encoder and classifier...") - for step in range(nsteps): + for step in range(self.model.n_finetuning_iter): optimizer.zero_grad() data, expected = self.draw(dataset) predicted = self.forward(data, train=True) @@ -246,20 +244,19 @@ def new_generator(seed=None): class MultiscaleMaggotTrainer(MaggotTrainer): - def __init__(self, cfgfilepath, behaviors=[], n_layers=1, + def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_iterations=None, average_body_length=1.0, device=device): - self.model = MultiscaleSupervisedMaggot(cfgfilepath, behaviors, n_layers) + self.model = MultiscaleSupervisedMaggot(cfgfilepath, behaviors, + n_layers, n_iterations) self.average_body_length = average_body_length # usually set later self.device = device self._default_encoder_config = None # check consistency - ref_config = self.config - for attr in ["batch_size", "optim_iter"]: - # TODO: add pretraining_iter and finetuning_iter parameters in - # clf_config to have a lever other than optim_iter, that - # could consequently be ignored - for enc in self.model.encoders: - assert enc.config[attr] == ref_config[attr] + if n_iterations is None: + ref_config = self.config + for attr in ["batch_size", "optim_iter"]: + for enc in self.model.encoders: + assert enc.config[attr] == ref_config[attr] @property def config(self):