Commit e1c2da4e authored by François Laurent

Merge branch 'dev' into 20230129

parents f7a203e5 5c9c32f7
@@ -64,10 +64,10 @@ A similar tagger called `20230129` has been proposed to moderate this issue.
This tagger shares the same characteristics as `20230111` and differs in three important aspects:
* the number of training epochs was increased from 1,000 to 10,000 to let the original features be largely forgotten,
* the training stage involved more data: 1,200,235 time segments were used instead of 100,000; these data were unbalanced and training was performed with the newly introduced balancing strategy `auto` (see https://gitlab.pasteur.fr/nyx/larvatagger.jl/-/issues/92),
* the training stage involved more data: 1,200,235 time segments were used instead of 100,000; these data were unbalanced and training was performed with class weighting as per the newly introduced balancing strategy `auto` (see https://gitlab.pasteur.fr/nyx/larvatagger.jl/-/issues/92),
* pretraining and training data were drawn from t15 only (as opposed to previous taggers that were pretrained and trained with data from t15 and t5).
Note that the last difference was not meant to improve performance at all. The `20230129` tagger was trained this way to study its performance on t5, and was kept as is after it showed better performance on t5 data than previous taggers trained with t5 data in addition to t15 data.
Note that the last difference was not meant to improve performance. The `20230129` tagger was trained this way to study its performance on t5, and was kept as is after it showed better properties on t5 data (less temporal leakage, fewer hunches and rolls except at stimulus onset).
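The `auto` balancing strategy mentioned in the list above relies on class weighting rather than subsampling. The sketch below is illustrative only: the per-class counts are made up (they merely sum to the 1,200,235 segments quoted above), and the exact weighting formula is the one defined by the data-loading code referenced in the linked issue, not necessarily this one.

```python
# Illustration of class weighting for an unbalanced label set (made-up counts).
import torch
import torch.nn as nn

# hypothetical number of time segments per behaviour (e.g. run, bend, back, hunch, roll, stop)
counts = torch.tensor([600000., 350000., 150000., 40000., 30000., 30235.])
weights = counts.sum() / (len(counts) * counts)  # inverse-frequency weights
criterion = nn.CrossEntropyLoss(weight=weights)  # rare classes contribute more to the loss
```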
## Usage
......
[tool.poetry]
name = "MaggotUBA-adapter"
version = "0.10"
version = "0.11.0"
description = "Interface between MaggotUBA and the Nyx tagging UI"
authors = ["François Laurent"]
license = "MIT"
......
@@ -313,7 +313,8 @@ class DeepLinear(nn.Module):
class MaggotClassifier(MaggotModule):
    def __init__(self, path, behavior_labels=[], n_latent_features=None,
            n_layers=1, cfgfile=None, ptfile="trained_classifier.pt"):
            n_layers=1, n_iterations=None, cfgfile=None,
            ptfile="trained_classifier.pt"):
        super().__init__(path, cfgfile, ptfile)
        try: # try load config file, if any
            self.config
@@ -329,6 +330,16 @@ class MaggotClassifier(MaggotModule):
                weight_init="xavier",
                loss="cross-entropy",
                optimizer="adam")
        if n_iterations is not None:
            if isinstance(n_iterations, str):
                n_iterations = map(int, n_iterations.split(','))
            if isinstance(n_iterations, int):
                n_pretraining_iter = n_iterations // 2
                n_finetuning_iter = n_iterations // 2
            else:
                n_pretraining_iter, n_finetuning_iter = n_iterations
            self.config['pretraining_iter'] = n_pretraining_iter
            self.config['finetuning_iter'] = n_finetuning_iter

    @classmethod
    def load_model(cls, config, path):
@@ -365,13 +376,22 @@ class MaggotClassifier(MaggotModule):
    def n_layers(self):
        return self.config["clf_depth"] + 1

    @property
    def n_pretraining_iter(self):
        return self.config.get('pretraining_iter', None)

    @property
    def n_finetuning_iter(self):
        return self.config.get('finetuning_iter', None)

class SupervisedMaggot(nn.Module):
    def __init__(self, cfgfilepath, behaviors=[], n_layers=1):
    def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_epochs=None):
        super().__init__()
        if behaviors: # the model is only pre-trained
            self.encoder = PretrainedMaggotEncoder(cfgfilepath)
            self.clf = MaggotClassifier(self.encoder.path / "clf_config.json",
                behaviors, self.encoder.config["dim_latent"], n_layers)
                behaviors, self.encoder.config["dim_latent"], n_layers,
                n_epochs)
        else: # the model has been retrained
            self.clf = MaggotClassifier(cfgfilepath)
            self.encoder = MaggotEncoder(self.clf.config["autoencoder_config"],
@@ -398,15 +418,35 @@ class SupervisedMaggot(nn.Module):
        self.encoder.to(device)
        self.clf.to(device)

    @property
    def n_pretraining_iter(self):
        n = self.clf.n_pretraining_iter
        if n is None:
            enc = self.encoder
            n = enc.config['optim_iter']
            if enc.was_pretrained():
                n = n // 2
        return n

    @property
    def n_finetuning_iter(self):
        n = self.clf.n_finetuning_iter
        if n is None:
            enc = self.encoder
            n = enc.config['optim_iter']
            if enc.was_pretrained():
                n = n // 2
        return n

class MultiscaleSupervisedMaggot(nn.Module):
    def __init__(self, cfgfilepath, behaviors=[], n_layers=1):
    def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_iterations=None):
        super().__init__()
        if behaviors: # the model is only pre-trained
            self.encoders = MaggotEncoders(cfgfilepath, cls=PretrainedMaggotEncoder)
            path = next(iter(self.encoders)).path.parent
            n_latent_features = sum(enc.config["dim_latent"] for enc in self.encoders)
            self.clf = MaggotClassifier(path / "clf_config.json",
                behaviors, n_latent_features, n_layers)
                behaviors, n_latent_features, n_layers, n_iterations)
        else: # the model has been retrained
            self.clf = MaggotClassifier(cfgfilepath)
            self.encoders = MaggotEncoders(self.clf.config["autoencoder_config"],
@@ -426,6 +466,26 @@ class MultiscaleSupervisedMaggot(nn.Module):
        self.clf.model # force parameter loading or initialization
        return super().parameters(self)

    @property
    def n_pretraining_iter(self):
        n = self.clf.n_pretraining_iter
        if n is None:
            any_enc = self.encoders[0]
            n = any_enc.config['optim_iter']
            if any_enc.was_pretrained():
                n = n // 2
        return n

    @property
    def n_finetuning_iter(self):
        n = self.clf.n_finetuning_iter
        if n is None:
            any_enc = self.encoders[0]
            n = any_enc.config['optim_iter']
            if any_enc.was_pretrained():
                n = n // 2
        return n

"""
Bagging for `SupervisedMaggot`.
@@ -436,9 +496,10 @@ Bags of taggers are stored so that the models directory only contains
subdirectories, each subdirectory specifying an individual tagger.
"""

class MaggotBag(nn.Module):
    def __init__(self, paths, behaviors=[], n_layers=1, cls=SupervisedMaggot):
    def __init__(self, paths, behaviors=[], n_layers=1, n_iterations=None,
            cls=SupervisedMaggot):
        super().__init__()
        self.maggots = [cls(path, behaviors, n_layers) for path in paths]
        self.maggots = [cls(path, behaviors, n_layers, n_iterations) for path in paths]
        self._lead_maggot = None

    def forward(self, x):
......
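In the models.py changes above, the new `n_iterations` argument (named `n_epochs` in `SupervisedMaggot`) is passed down to `MaggotClassifier`, which stores it as `pretraining_iter` and `finetuning_iter` in `clf_config.json`; the `n_pretraining_iter`/`n_finetuning_iter` properties fall back to the encoder's `optim_iter` (halved when the encoder was pretrained) if those keys are missing. The hypothetical helper below mirrors the accepted values, with an extra guard for a single count passed as a string, which the parsing above does not handle:

```python
# Hypothetical helper mirroring MaggotClassifier's n_iterations parsing.
def split_iterations(n_iterations):
    if isinstance(n_iterations, str):
        # "1000,9000" -> (1000, 9000); "10000" -> 10000 (extra guard, not in the diff)
        values = tuple(int(s) for s in n_iterations.split(','))
        n_iterations = values[0] if len(values) == 1 else values
    if isinstance(n_iterations, int):
        # a single count is split evenly between pretraining and fine-tuning
        return n_iterations // 2, n_iterations // 2
    n_pretraining_iter, n_finetuning_iter = n_iterations
    return n_pretraining_iter, n_finetuning_iter

print(split_iterations(10000))        # (5000, 5000)
print(split_iterations("1000,9000"))  # (1000, 9000)
```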
@@ -6,7 +6,7 @@ import json
import glob

def train_model(backend, layers=1, pretrained_model_instance="default",
        subsets=(1, 0, 0), rng_seed=None, balancing_strategy='maggotuba', **kwargs):
        subsets=(1, 0, 0), rng_seed=None, iterations=1000, **kwargs):
    # make_dataset generated or moved the larva_dataset file into data/interim/{instance}/
    #larva_dataset_file = backend.list_interim_files("larva_dataset_*.hdf5") # recursive
    larva_dataset_file = glob.glob(str(backend.interim_data_dir() / "larva_dataset_*.hdf5")) # not recursive (faster)
@@ -14,18 +14,17 @@ def train_model(backend, layers=1, pretrained_model_instance="default",
    # subsets=(1, 0, 0) => all data are training data; no validation or test subsets
    dataset = LarvaDataset(larva_dataset_file[0], new_generator(rng_seed),
            subsets=subsets, **kwargs)
    dataset.weight_classes = isinstance(balancing_strategy, str) and (balancing_strategy.lower() == 'auto')
    labels = dataset.labels
    assert 0 < len(labels)
    labels = labels if isinstance(labels[0], str) else [s.decode() for s in labels]
    # copy and load the pretrained model into the model instance directory
    if isinstance(pretrained_model_instance, str):
        config_file = import_pretrained_model(backend, pretrained_model_instance)
        model = make_trainer(config_file, labels, layers)
        model = make_trainer(config_file, labels, layers, iterations)
    else:
        pretrained_model_instances = pretrained_model_instance
        config_files = import_pretrained_models(backend, pretrained_model_instances)
        model = make_trainer(config_files, labels, layers)
        model = make_trainer(config_files, labels, layers, iterations)
    # fine-tune the model
    model.train(dataset)
    # add post-prediction rule ABC -> AAC
......
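With the train.py changes above, `train_model` exposes an `iterations` keyword (default 1000) that is forwarded to `make_trainer` and from there to the classifier. A hedged usage sketch; the `backend` object and the import path are assumptions, not taken from this diff:

```python
# Sketch only: `backend` is assumed to be a backend object with an interim_data_dir()
# method pointing at a directory that already contains a larva_dataset_*.hdf5 file.
from maggotuba.models.train_model import train_model  # assumed import path

train_model(backend,
            layers=1,
            pretrained_model_instance="default",
            iterations=10000)  # split evenly: 5,000 pretraining + 5,000 fine-tuning steps;
                               # a "1000,9000" string would set the two phases separately
```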
@@ -20,9 +20,9 @@ Training the model instead relies on the readily-preprocessed data of a
*larva_dataset hdf5* file.
"""

class MaggotTrainer:
    def __init__(self, cfgfilepath, behaviors=[], n_layers=1,
    def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_iterations=None,
            average_body_length=1.0, device=device):
        self.model = SupervisedMaggot(cfgfilepath, behaviors, n_layers)
        self.model = SupervisedMaggot(cfgfilepath, behaviors, n_layers, n_iterations)
        self.average_body_length = average_body_length # usually set later
        self.device = device
@@ -166,14 +166,12 @@ class MaggotTrainer:
        model.train() # this only sets the model in training mode (enables gradients)
        model.to(self.device)
        criterion = nn.CrossEntropyLoss(**kwargs)
        nsteps = self.config['optim_iter']
        grad_clip = self.config['grad_clip']
        # pre-train the classifier with static encoder weights
        if model.encoder.was_pretrained():
            nsteps = nsteps // 2
            optimizer = torch.optim.Adam(model.clf.parameters())
            print("pre-training the classifier...")
            for step in range(nsteps):
            for step in range(self.model.n_pretraining_iter):
                optimizer.zero_grad()
                # TODO: add an option for renormalizing the input
                data, expected = self.draw(dataset)
@@ -186,7 +184,7 @@ class MaggotTrainer:
        optimizer = torch.optim.Adam(model.parameters())
        print(("fine-tuning" if model.encoder.was_pretrained() else "training") + \
                " the encoder and classifier...")
        for step in range(nsteps):
        for step in range(self.model.n_finetuning_iter):
            optimizer.zero_grad()
            data, expected = self.draw(dataset)
            predicted = self.forward(data, train=True)
@@ -246,18 +244,17 @@ def new_generator(seed=None):

class MultiscaleMaggotTrainer(MaggotTrainer):
    def __init__(self, cfgfilepath, behaviors=[], n_layers=1,
    def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_iterations=None,
            average_body_length=1.0, device=device):
        self.model = MultiscaleSupervisedMaggot(cfgfilepath, behaviors, n_layers)
        self.model = MultiscaleSupervisedMaggot(cfgfilepath, behaviors,
                n_layers, n_iterations)
        self.average_body_length = average_body_length # usually set later
        self.device = device
        self._default_encoder_config = None
        # check consistency
        if n_iterations is None:
            ref_config = self.config
            for attr in ["batch_size", "optim_iter"]:
                # TODO: add pretraining_iter and finetuning_iter parameters in
                #       clf_config to have a lever other than optim_iter, that
                #       could consequently be ignored
                for enc in self.model.encoders:
                    assert enc.config[attr] == ref_config[attr]
......
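The trainers.py changes above replace the single `nsteps` counter with the model's `n_pretraining_iter` and `n_finetuning_iter` properties, and only enforce the `optim_iter` consistency check across encoders when `n_iterations` is left unset. The two-phase schedule those counters drive can be sketched as follows (stand-in modules and a fake batch sampler; not the repository's code):

```python
# Illustrative two-phase schedule: pretrain the classifier head with the encoder
# static, then fine-tune encoder and classifier jointly.
import torch
import torch.nn as nn

encoder = nn.Sequential(nn.Linear(20, 10), nn.ReLU())  # stand-in for the maggot encoder
clf = nn.Linear(10, 6)                                 # stand-in for the classifier head
criterion = nn.CrossEntropyLoss()
n_pretraining_iter, n_finetuning_iter = 500, 500       # e.g. a 1,000-step budget split evenly

def draw_batch():
    # placeholder for MaggotTrainer.draw(dataset)
    return torch.randn(32, 20), torch.randint(0, 6, (32,))

# phase 1: only the classifier's parameters are optimized; the encoder stays static
optimizer = torch.optim.Adam(clf.parameters())
for _ in range(n_pretraining_iter):
    optimizer.zero_grad()
    data, expected = draw_batch()
    loss = criterion(clf(encoder(data)), expected)
    loss.backward()
    optimizer.step()

# phase 2: encoder and classifier are fine-tuned together
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(clf.parameters()))
for _ in range(n_finetuning_iter):
    optimizer.zero_grad()
    data, expected = draw_batch()
    loss = criterion(clf(encoder(data)), expected)
    loss.backward()
    optimizer.step()
```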