diff --git a/pyproject.toml b/pyproject.toml
index 6277e00da4d8a931ec390674f533d2d06df5da62..d65a1537b4a9787e4a071781f4977d7c73cce3bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "MaggotUBA-adapter"
-version = "0.11.0"
+version = "0.12.0"
 description = "Interface between MaggotUBA and the Nyx tagging UI"
 authors = ["François Laurent"]
 license = "MIT"
diff --git a/src/maggotuba/models/modules.py b/src/maggotuba/models/modules.py
index 78ec42e4bf03ba2b5d7859f1b41b8f3a5d7d2ae5..24bcad119c8ce6313b766c1469d3bbe2b5aa99c8 100644
--- a/src/maggotuba/models/modules.py
+++ b/src/maggotuba/models/modules.py
@@ -343,9 +343,12 @@ class MaggotClassifier(MaggotModule):
 
     @classmethod
     def load_model(cls, config, path):
+        labels = config["behavior_labels"]
+        if isinstance(labels, dict):
+            labels = labels['names']
         model = DeepLinear(
                 n_input=config["dim_latent"],
-                n_output=len(config["behavior_labels"]),
+                n_output=len(labels),
                 n_hidden=config["clf_depth"]*[None],
                 batch_norm=config["batch_norm"],
                 weight_init=config["weight_init"],
@@ -358,7 +361,10 @@ class MaggotClassifier(MaggotModule):
 
     @property
     def behavior_labels(self):
-        return self.config["behavior_labels"]
+        labels = self.config["behavior_labels"]
+        if isinstance(labels, dict):
+            labels = labels['names']
+        return labels
 
     @behavior_labels.setter
     def behavior_labels(self, labels):
diff --git a/src/maggotuba/models/predict_model.py b/src/maggotuba/models/predict_model.py
index 12c5507ea7326a2906bf5de3b0666b9c392487ab..086e4b8d2a4be887c5aa6791fa0325268fe649ae 100644
--- a/src/maggotuba/models/predict_model.py
+++ b/src/maggotuba/models/predict_model.py
@@ -4,6 +4,7 @@ from maggotuba.models.trainers import MaggotTrainer, MultiscaleMaggotTrainer, Ma
 import numpy as np
 import logging
 
+
 def predict_model(backend, **kwargs):
     """
     This function generates predicted labels for all the input data.
@@ -22,9 +23,11 @@ def predict_model(backend, **kwargs):
     if not input_files:
         input_files = backend.list_input_files(group_by_directories=True)
     assert 0 < len(input_files)
 
+    # initialize output labels
     input_files_and_labels = backend.prepare_labels(input_files)
     assert 0 < len(input_files_and_labels)
 
+    # load the model
     model_files = backend.list_model_files()
     config_files = [file
@@ -46,25 +49,27 @@ def predict_model(backend, **kwargs):
         model = MultiscaleMaggotTrainer(config_file)
     else:
         model = MaggotBagging(config_files)
-    #
+
+    # call the `predict` logic on the input data files
     if len(input_files) == 1:
         input_files = next(iter(input_files.values()))
         if len(input_files) == 1:
             file = input_files[0]
             if file.name.startswith("larva_dataset_") and file.name.endswith(".hdf5"):
                 return predict_larva_dataset(backend, model, file, **kwargs)
 
-    #
     predict_individual_data_files(backend, model, input_files_and_labels)
 
+
 def predict_individual_data_files(backend, model, input_files_and_labels):
     from taggingbackends.data.trxmat import TrxMat
     from taggingbackends.data.chore import load_spine
     import taggingbackends.data.fimtrack as fimtrack
-    #
+
     for input_files, labels in input_files_and_labels.values():
-        labels.labelspec = model.clf_config["behavior_labels"]
+        labels.load_model_config(model.clf_config)
         done = False
         for file in input_files:
+            # load the input data (or features)
             if done:
                 logging.info(f"ignoring file: {file.name}")
@@ -99,15 +104,17 @@ def predict_individual_data_files(backend, model, input_files_and_labels):
                 # label files not processed; only their data dependencies are
                 logging.info(f"ignoring file: {file.name}")
                 continue
+
             # downsample the skeleton
             if isinstance(data, dict):
                 for larva in data:
                     data[larva] = get_5point_spines(data[larva])
             else:
                 data = get_5point_spines(data)
-            #
+
             post_filters = model.clf_config.get('post_filters', None)
-            # assign labels
+
+            # assign labels and apply post-prediction filters
             if isinstance(data, dict):
                 ref_length = np.median(np.concatenate([
                     model.body_length(spines) for spines in data.values()
@@ -133,11 +140,13 @@ def predict_individual_data_files(backend, model, input_files_and_labels):
                 else:
                     predictions = apply_filters(predictions, post_filters)
                     labels[run, larva] = dict(_zip(t[mask], predictions))
+
             # save the predicted labels to file
             labels.dump(get_output_filepath(backend, file))
-            #
+
             done = True
 
+
 def predict_larva_dataset(backend, model, file, subset="validation", subsets=(.8, .2, 0)):
     from taggingbackends.data.dataset import LarvaDataset
     dataset = LarvaDataset(file, new_generator(), subsets)
diff --git a/src/maggotuba/models/train_model.py b/src/maggotuba/models/train_model.py
index 18d2fdf0b64efc8f3dc4f2cf69801da0978a990c..a27f77b35a7a15374056119c8342b9f197980fd1 100644
--- a/src/maggotuba/models/train_model.py
+++ b/src/maggotuba/models/train_model.py
@@ -7,16 +7,26 @@ import glob
 
 def train_model(backend, layers=1, pretrained_model_instance="default",
         subsets=(1, 0, 0), rng_seed=None, iterations=1000, **kwargs):
-    # make_dataset generated or moved the larva_dataset file into data/interim/{instance}/
-    #larva_dataset_file = backend.list_interim_files("larva_dataset_*.hdf5") # recursive
-    larva_dataset_file = glob.glob(str(backend.interim_data_dir() / "larva_dataset_*.hdf5")) # not recursive (faster)
+    # list training data files;
+    # we actually expect a single larva_dataset file that make_dataset generated
+    # or moved into data/interim/{instance}/
+    #larva_dataset_file = backend.list_interim_files("larva_dataset_*.hdf5") # this one is recursive
+    larva_dataset_file = glob.glob(str(backend.interim_data_dir() / "larva_dataset_*.hdf5")) # this other one is not recursive
     assert len(larva_dataset_file) == 1
+
+    # instantiate a LarvaDataset object, which is similar to a PyTorch DataLoader
+    # and can initialize a Labels object
+    # note: subsets=(1, 0, 0) => all data are training data; no validation or test subsets
     dataset = LarvaDataset(larva_dataset_file[0], new_generator(rng_seed),
             subsets=subsets, **kwargs)
+
+    # initialize a Labels object
     labels = dataset.labels
     assert 0 < len(labels)
+
+    # the labels may be bytes objects; convert to str
     labels = labels if isinstance(labels[0], str) else [s.decode() for s in labels]
+
     # copy and load the pretrained model into the model instance directory
     if isinstance(pretrained_model_instance, str):
         config_file = import_pretrained_model(backend, pretrained_model_instance)
@@ -25,16 +35,26 @@ def train_model(backend, layers=1, pretrained_model_instance="default",
         pretrained_model_instances = pretrained_model_instance
         config_files = import_pretrained_models(backend, pretrained_model_instances)
         model = make_trainer(config_files, labels, layers, iterations)
-    # fine-tune the model
+
+    # fine-tune the pretrained model on the loaded dataset
     model.train(dataset)
-    # add post-prediction rule ABC -> AAC
+
+    # add post-prediction rule ABC -> AAC;
+    # see https://gitlab.pasteur.fr/nyx/larvatagger.jl/-/issues/62
     model.clf_config['post_filters'] = ['ABC->AAC']
+
     # save the model
     print(f"saving model \"{backend.model_instance}\"")
     model.save()
 
 
 # TODO: merge the below two functions
+"""
+    The files of the pretrained model are located in the `pretrained_models`
+    directory. Importing a pretrained model consists of creating a directory in
+    the `models` directory, named after the instance, and copying the model files.
+    The train step will create more files in the model instance directory.
+"""
 def import_pretrained_model(backend, pretrained_model_instance):
     pretrained_autoencoder_dir = backend.project_dir / "pretrained_models" / pretrained_model_instance
     config_file = None
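For reference, a minimal sketch (not part of the patch) of the normalization that the updated `MaggotClassifier.load_model` and `behavior_labels` accessors in modules.py now perform: the classifier config may store either the legacy plain list of labels or a dict carrying the label names under a 'names' key. The helper name and the label names below are illustrative only.

    def normalize_behavior_labels(config):
        # accept both the legacy list form and the newer dict form
        labels = config["behavior_labels"]
        if isinstance(labels, dict):
            labels = labels['names']
        return labels

    # hypothetical configs with illustrative label names
    legacy_config = {"behavior_labels": ["run", "bend", "back", "hunch", "roll"]}
    newer_config = {"behavior_labels": {"names": ["run", "bend", "back", "hunch", "roll"]}}
    assert normalize_behavior_labels(legacy_config) == normalize_behavior_labels(newer_config)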