diff --git a/Project.toml b/Project.toml
index 5f5beef3ad2f1c6a0912e4c8d93d0527006a0d8f..8ec9cffc854393a3634a1367c7b80dce6656851e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,3 +2,4 @@
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 LazyArtifacts = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
 PlanarLarvae = "c2615984-ef14-4d40-b148-916c85b43307"
+PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
diff --git a/README.md b/README.md
index 418d07e78019e72287a577499f56c1cff151c037..93e91218918c81f2c650acff01cce76cd5d292f5 100644
--- a/README.md
+++ b/README.md
@@ -87,20 +87,20 @@ All the [command arguments supported by `TaggingBackends`](https://gitlab.pasteu
 
 ### Automatic tagging
 
-Using the [`20230129`](https://gitlab.pasteur.fr/nyx/MaggotUBA-adapter/-/tree/20230129) branch, the `20230129` tagger can be called on a supported tracking data file with:
+For example, the `20230311` tagger can be called to generate labels with:
 
 ```
-poetry run tagging-backend predict --model-instance 20230129
+poetry run tagging-backend predict --model-instance 20230311
 ```
 
 Note: since `TaggingBackends==0.10`, the `--skip-make-dataset` argument is default behavior. Pass `--make-dataset` instead to enforce the former default.
 
-For the above command to work, the track data file must be placed (*e.g.* copied) in the `data/raw/20230129` directory, to be first created or cleared.
+For the above command to work, the tracking data file must be placed (*e.g.* copied) in the `data/raw/20230311` directory, to be first created or cleared.
 
-The resulting label file can be found as *data/processed/20230129/predicted.label*.
+The resulting label file can be found as *data/processed/20230311/predicted.label*.
 Like all *.label* files, this file should be stored as a sibling of the corresponding track data file (in the same directory).
 
-Similarly, with an arbitrary tagger named, say *mytagger*, in the above explanation all occurences of `20230129` or *20230129* must be replaced by the tagger's name.
+Similarly, with an arbitrary tagger named, say *mytagger*, in the above explanation all occurrences of `20230311` or *20230311* must be replaced by the tagger's name.
 For example, the input data file would go into *data/raw/mytagger*.
 
 #### On HPC clusters
@@ -126,7 +126,7 @@ Beware that the default pretrained model may depend on the branch you are on.
 
 The default pretrained model in the *20221005* branch involves linearly interpolating the spines at 10 Hz, and relies on a 20-time-step window (2 seconds). The dimensionality of the latent space is 100.
 
-The default pretrained models in the *20230111* and *20230129* branches similarly interpolate spines at 10 Hz and rely on a 20-time-step window (2 seconds), but feature 25 latent dimensions only.
+The default pretrained models in the *20230111*, *20230129*, *20230311*, *main* and *dev* branches similarly interpolate spines at 10 Hz and rely on a 20-time-step window (2 seconds), but feature only 25 latent dimensions.
 
 Alternative pretrained models can be specified using the `--pretrained-model-instance` option.
 
@@ -159,3 +159,40 @@ Indeed, `finetune` loads the pre-mapping label definition from the model files a
 This is relevant in cases where the `20230311` tagger is used to generate a first round of predictions that are manually corrected with the purpose of retraining (fine-tuning) a new tagger based on the corrected labelled data. The labels will not be compatible with the underlying classifier. Only `train` will apply.
 
 The `20230311-0` tagger may be considered instead, if `finetune` is to be used for the above-mentioned purpose.
+
+### Embeddings
+
+A trained encoder can be used to generate embeddings, *i.e.* intermediate vectorized representations of the input track segments. Embeddings are useful, for example, for comparing different experimental conditions in an annotation-agnostic fashion.
+
+Although in principle both pre-trained and fine-tuned encoders can be used, the `embed` command expects the model to be available in the `models` directory:
+```
+poetry run tagging-backend embed --model-instance 20230311
+```
+To apply the encoder of a model from the `pretrained_models` directory, copy the corresponding directory into `models`.
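+
+For example, for a hypothetical pretrained encoder named *mypretrained*, and assuming both directories sit at the root of the MaggotUBA-adapter project:
+```
+cp -r pretrained_models/mypretrained models/mypretrained
+```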
+
+As with the other commands, input data files are expected in the `data/raw` subdirectory that corresponds to the designated model instance (in the above example, `data/raw/20230311`).
+
+The above command produces an `embeddings.h5` file in the `data/processed/20230311` directory.
+
+The `embeddings.h5` file is an HDF5 file containing several arrays, each with as many elements or rows as embedded data points. It is structured as follows:
+```
+├── run_id         <- 1D array, typically of strings; id of the tracking data file or assay or run.
+├── track_id       <- 1D array of integers; id of the track or larva.
+├── time           <- 1D array of floats; timestamp of the time step or time segment center.
+└── embedding      <- 2D array of floats; coordinates in the latent space.
+```
+This format is not compatible with the `clustering.cache` file used by [MaggotUBA's ToMATo UI](https://github.com/DecBayComp/Detecting_subtle_behavioural_changes/blob/ee73f0dd294a991322a0eec8f6ce69488c7a1f9a/maggotuba/src/maggotuba/cli/cli_model_clustering.py#L129-L164).
+
+Track ids are not unique across runs. Similarly, times do not share a common time origin across runs.
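+
+As a minimal sketch (assuming `h5py` and `numpy` are available), the four datasets can be read back and the rows of a single run grouped per track as follows:
+```
+import h5py
+import numpy as np
+
+with h5py.File('embeddings.h5', 'r') as f:
+    run_id = f['run_id'][...]     # strings (possibly read back as bytes)
+    track_id = f['track_id'][...]
+    time = f['time'][...]
+    embedding = f['embedding'][...]
+
+# example: select the points of the first run and group them per track
+mask = run_id == run_id[0]
+tracks = {track: embedding[mask & (track_id == track)]
+          for track in np.unique(track_id[mask])}
+```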
+
+To visualize the embeddings, the `embedding` matrix can be loaded and transformed with methods like UMAP:
+```
+import h5py
+import umap
+
+with h5py.File('embeddings.h5', 'r') as f:
+    embedding = f['embedding'][...]
+
+embedding2d = umap.UMAP().fit_transform(embedding)
+```
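+
+Note that the `umap` module above is provided by the `umap-learn` package, which is not among MaggotUBA-adapter's dependencies and may need to be installed separately (*e.g.* `pip install umap-learn`).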
diff --git a/pyproject.toml b/pyproject.toml
index 01e9c3d3c407fff7624add08df7e33820c8bc4fd..e03ecd6e24d88dc9d88d605d5f6b326486203130 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "MaggotUBA-adapter"
-version = "0.17"
+version = "0.18"
 description = "Interface between MaggotUBA and the Nyx tagging UI"
 authors = ["François Laurent"]
 license = "MIT"
@@ -14,7 +14,7 @@ maggotuba-core = {git = "https://gitlab.pasteur.fr/nyx/MaggotUBA-core", tag = "v
 torch = "^1.11.0"
 numpy = "^1.19.3"
 protobuf = "3.9.2"
-taggingbackends = {git = "https://gitlab.pasteur.fr/nyx/TaggingBackends", tag = "v0.16"}
+taggingbackends = {git = "https://gitlab.pasteur.fr/nyx/TaggingBackends", rev = "v0.17"}
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
diff --git a/src/maggotuba/features/__init__.py b/src/maggotuba/features/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/maggotuba/features/preprocess.py b/src/maggotuba/features/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..52ca2cff79b6854a0a2ed2023a26484edd78b84d
--- /dev/null
+++ b/src/maggotuba/features/preprocess.py
@@ -0,0 +1,109 @@
+import logging
+import numpy as np
+from taggingbackends.features.skeleton import interpolate
+
+
+class Preprocessor:
+    """
+    Track data preprocessing: sliding window, spine resampling and
+    normalization. Mostly used at prediction and embedding time.
+
+    `configured` is any object that exposes the MaggotUBA configuration as a
+    `config` attribute, e.g. a trainer or an encoder.
+    """
+    def __init__(self, configured, average_body_length=1.0):
+        self.configured = configured
+        # usually set later
+        self.average_body_length = average_body_length
+
+    @property
+    def config(self):
+        return self.configured.config
+
+    @property
+    def swap_head_tail(self):
+        return self.config.get('swap_head_tail', True)
+
+    @swap_head_tail.setter
+    def swap_head_tail(self, b):
+        self.config['swap_head_tail'] = b
+
+    def window(self, t, data):
+        interpolation_args = {k: self.config[k]
+                              for k in ('spine_interpolation', 'frame_interval')
+                              if k in self.config}
+        winlen = self.config["len_traj"]
+        N = data.shape[0]+1
+        if interpolation_args:
+            for m in range(0, N-1):
+                win = interpolate(t, data, m, winlen, **interpolation_args)
+                if win is not None:
+                    assert win.shape[0] == winlen
+                    yield t[m], win
+        else:
+            for m in range(0, N-winlen):
+                n = m + winlen
+                yield t[(m + n) // 2], data[m:n]
+
+    def pad(self, target_t, defined_t, data):
+        if data.shape[0] == 1:
+            return np.repeat(data, len(target_t), axis=0)
+        else:
+            head = searchsortedfirst(target_t, defined_t[0])
+            tail = len(target_t) - (searchsortedlast(target_t, defined_t[-1]) + 1)
+            ind = np.r_[
+                    np.zeros(head, dtype=int),
+                    np.arange(data.shape[0]),
+                    (data.shape[0]-1) * np.ones(tail, dtype=int), # repeat the last window
+                    ]
+            if len(ind) != len(target_t):
+                raise RuntimeError('missing time steps')
+            return data[ind]
+
+    def body_length(self, data):
+        dx = np.diff(data[:,0::2], axis=1)
+        dy = np.diff(data[:,1::2], axis=1)
+        return np.sum(np.sqrt(dx*dx + dy*dy), axis=1)
+
+    def normalize(self, w):
+        # center coordinates
+        wc = np.mean(w[:,4:6], axis=0, keepdims=True)
+        w = w - np.tile(wc, (1, 5))
+        # rotate
+        v = np.mean(w[:,8:10] - w[:,0:2], axis=0)
+        vnorm = np.sqrt(np.dot(v, v))
+        if vnorm == 0:
+            logging.warning('null distance between head and tail')
+        else:
+            v = v / vnorm
+        c, s = v / self.average_body_length # scale using the rotation matrix
+        rot = np.array([[ c, s],
+                        [-s, c]]) # clockwise rotation
+        w = np.einsum("ij,jkl", rot, np.reshape(w.T, (2, 5, -1), order='F'))
+        return w
+
+    """
+    Preprocess a single track.
+
+    This includes running a sliding window, resampling the track in each window,
+    normalizing the spines, etc.
+    """
+    def preprocess(self, t, data):
+        defined_t = []
+        ws = []
+        for t_, w in self.window(t, data):
+            defined_t.append(t_)
+            ws.append(self.normalize(w))
+        if ws:
+            ret = self.pad(t, defined_t, np.stack(ws))
+            if self.swap_head_tail:
+                ret = ret[:,:,::-1,:]
+            return ret
+
+    def __call__(self, *args):
+        return self.preprocess(*args)
+
+
+# helper functions named after their Julia equivalents
+def searchsortedfirst(xs, x):
+    for i, x_ in enumerate(xs):
+        if x <= x_:
+            return i
+
+def searchsortedlast(xs, x):
+    for i in range(len(xs))[::-1]:
+        x_ = xs[i]
+        if x_ <= x:
+            return i
diff --git a/src/maggotuba/models/embed_model.py b/src/maggotuba/models/embed_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc814b75a581c5425572981cfcdf7b4984b4f267
--- /dev/null
+++ b/src/maggotuba/models/embed_model.py
@@ -0,0 +1,228 @@
+#from taggingbackends.data.labels import Labels
+from taggingbackends.features.skeleton import get_5point_spines
+from maggotuba.features.preprocess import Preprocessor
+from maggotuba.models.modules import MaggotEncoder
+from maggotuba.models.trainers import new_generator
+from behavior_model.models.neural_nets import device
+from taggingbackends.explorer import check_permissions
+from collections import defaultdict
+import numpy as np
+import logging
+import torch
+import h5py
+
+
+def embed_model(backend, **kwargs):
+    """
+    This function projects input data into MaggotUBA latent space (or embedding).
+
+    It supports single *larva_dataset* hdf5 files, or (possibly multiple) track
+    data files.
+
+    Input files are expected in `data/interim` or `data/raw`.
+
+    The embeddings are saved in `data/processed`, in `embeddings.h5` files,
+    following the same directory structure as in `data/interim` or `data/raw`.
+    """
+    if kwargs.pop('debug', False):
+        logging.root.setLevel(logging.DEBUG)
+
+    # we pick files in `data/interim` if any, otherwise in `data/raw`
+    input_files = backend.list_interim_files(group_by_directories=True)
+    if not input_files:
+        input_files = backend.list_input_files(group_by_directories=True)
+    assert 0 < len(input_files)
+
+    # load the model
+    model_files = backend.list_model_files()
+    config_files = [file
+                    for file in model_files
+                    if file.name.endswith('config.json')]
+    if len(config_files) == 0:
+        raise RuntimeError(f"no config files found for tagger: {backend.model_instance}")
+    config_files = [file
+                    for file in config_files
+                    if file.name != 'clf_config.json']
+    if len(config_files) == 0:
+        raise RuntimeError("no encoder config files found")
+    elif len(config_files) == 1:
+        config_file = config_files[0]
+        model = MaggotEncoder(config_file)
+    else:
+        raise RuntimeError("multiple encoder config files found; cannot choose one")
+
+    # run the embedding logic on the input data files
+    embed_individual_data_files(backend, model, input_files)
+
+def embed_individual_data_files(backend, encoder, input_files):
+    from taggingbackends.data.trxmat import TrxMat
+    from taggingbackends.data.chore import load_spine
+    import taggingbackends.data.fimtrack as fimtrack
+
+    encoder.eval()
+    encoder.to(device)
+
+    features = defaultdict(dict)
+    npoints = 0
+
+    for files in input_files.values():
+        done = False
+        for file in files:
+
+            # load the input data (or features)
+            if done:
+                logging.info(f"ignoring file: {file.name}")
+                continue
+            elif file.name.endswith(".outline"):
+                # skip to spine file
+                logging.info(f"ignoring file: {file.name}")
+                continue
+            elif file.name.endswith(".spine"):
+                spine = load_spine(file)
+                run = spine["date_time"].iloc[0]
+                larvae = spine["larva_id"].values
+                t = spine["time"].values
+                data = spine.iloc[:,3:].values
+            elif file.name.endswith(".mat"):
+                trx = TrxMat(file)
+                t = trx["t"]
+                data = trx["spine"]
+                run, data = next(iter(data.items()))
+                if run == "spine":
+                    run, data = next(iter(data.items()))
+                t = t[run]
+            elif file.name.endswith(".csv"):
+                # no label file is loaded here, hence the camera frame rate is
+                # unknown; assume the default 30 fps for FIMTrack data
+                logging.info("assuming 30-fps camera frame rate")
+                t, data = fimtrack.read_spines(file, fps=30)
+                run = "NA"
+            else:
+                # label files not processed; only their data dependencies are
+                logging.info(f"ignoring file: {file.name}")
+                continue
+
+            # downsample the skeleton
+            if isinstance(data, dict):
+                for larva in data:
+                    data[larva] = get_5point_spines(data[larva])
+            else:
+                data = get_5point_spines(data)
+
+            # compute the latent representations, track by track
+            preprocessor = Preprocessor(encoder)
+            if isinstance(data, dict):
+                ref_length = np.median(np.concatenate([
+                    preprocessor.body_length(spines) for spines in data.values()
+                    ]))
+                preprocessor.average_body_length = ref_length
+                logging.info(f"average body length: {ref_length}")
+                for larva, spines in data.items():
+                    latentfeatures = _embed(preprocessor, encoder, t[larva], spines)
+                    if latentfeatures is None:
+                        logging.info(f"failure to window track: {larva}")
+                    else:
+                        features[run][larva] = (t[larva], latentfeatures)
+                        npoints += len(latentfeatures)
+            else:
+                ref_length = np.median(preprocessor.body_length(data))
+                preprocessor.average_body_length = ref_length
+                logging.info(f"average body length: {ref_length}")
+                for larva in np.unique(larvae):
+                    mask = larvae == larva
+                    latentfeatures = _embed(preprocessor, encoder, t[mask], data[mask])
+                    if latentfeatures is None:
+                        logging.info(f"failure to window track: {larva}")
+                    else:
+                        features[run][larva] = (t[mask], latentfeatures)
+                        npoints += len(latentfeatures)
+
+            done = True
+
+    # format the latent features and related info as matrices
+    run_id = []
+    run_id_repeats = []
+    sample_run = next(iter(features.values()))
+    sample_track_id, (sample_times, sample_embedding) = next(iter(sample_run.items()))
+    nfeatures = sample_embedding.shape[1]
+    track_id = np.zeros(npoints, dtype=type(sample_track_id))
+    t = np.zeros(npoints, dtype=sample_times.dtype)
+    embedding = np.zeros((npoints, nfeatures), dtype=sample_embedding.dtype)
+    i = 0
+    for run, tracks in features.items():
+        run_id.append(run)
+        repeats = 0
+        for track, (timesteps, ftr) in tracks.items():
+            j = len(timesteps)
+            track_id[i:i+j] = track
+            t[i:i+j] = timesteps
+            embedding[i:i+j] = ftr
+            i += j
+            repeats += j
+        run_id_repeats.append(repeats)
+    # expand the run ids so that every row gets the id of its run
+    run_id = list(repeat(run_id, run_id_repeats))
+
+    # save the vectorized data to file
+    embeddings = get_output_filepath(backend, file)
+    with h5py.File(embeddings, 'w') as f:
+        # f['n_runs'] = len(features)
+        # for i, (run, tracks) in enumerate(features.items()):
+        #     g = f.create_group(f'run_{i}')
+        #     g['run_id'] = run
+        #     g['n_tracks'] = len(tracks)
+        #     for j, (track, (t, latent)) in enumerate(tracks.items()):
+        #         h = g.create_group(f'track_{j}')
+        #         h['track_id'] = track
+        #         h['n_steps'] = len(t)
+        #         for k, (t, latent) in enumerate(zip(t, latent)):
+        #             l = h.create_group(f'step_{k}')
+        #             l['time'] = t
+        #             l['embedding'] = latent
+        f['run_id'] = run_id
+        f['track_id'] = track_id
+        f['time'] = t
+        f['embedding'] = embedding
+    check_permissions(embeddings)
+
+@torch.no_grad()
+def _embed(preprocessor, encoder, t, data):
+    rawfeatures = preprocessor(t, data)
+    if rawfeatures is not None:
+        rawfeatures = torch.from_numpy(rawfeatures.astype(np.float32)).to(device)
+        latentfeatures = encoder(rawfeatures).cpu().numpy()
+        assert len(t) == len(latentfeatures)
+        return latentfeatures
+
+def get_output_filepath(backend, file):
+    #if file.is_relative_to(backend.interim_data_dir()): # Py>=3.9
+    if str(file).startswith(str(backend.interim_data_dir())):
+        subdir = file.parent.relative_to(backend.interim_data_dir())
+    else:
+        #assert file.is_relative_to(backend.raw_data_dir())
+        assert str(file).startswith(str(backend.raw_data_dir()))
+        subdir = file.parent.relative_to(backend.raw_data_dir())
+    parentdir = backend.processed_data_dir() / subdir
+    parentdir.mkdir(parents=True, exist_ok=True)
+    target = parentdir / "embeddings.h5"
+    if target.is_file():
+        logging.info(f"output file already exists: {target}")
+        i = 0
+        while True:
+            i += 1
+            target = parentdir / f"embeddings-{i}.h5"
+            if not target.is_file(): break
+    return target
+
+
+def repeat(items, n):
+    for item, n in zip(items, n):
+        for _ in range(n):
+            yield item
+
+
+from taggingbackends.main import main
+
+if __name__ == "__main__":
+    main(embed_model)
+
diff --git a/src/maggotuba/models/predict_model.py b/src/maggotuba/models/predict_model.py
index 086e4b8d2a4be887c5aa6791fa0325268fe649ae..c4cb3ac7c15e12174b7dd4713730999991afe7b2 100644
--- a/src/maggotuba/models/predict_model.py
+++ b/src/maggotuba/models/predict_model.py
@@ -4,7 +4,6 @@ from maggotuba.models.trainers import MaggotTrainer, MultiscaleMaggotTrainer, Ma
 import numpy as np
 import logging
 
-
 def predict_model(backend, **kwargs):
     """
     This function generates predicted labels for all the input data.
@@ -18,6 +17,9 @@ def predict_model(backend, **kwargs):
     files, following the same directory structure as in `data/interim` or
     `data/raw`.
     """
+    if kwargs.pop('debug', False):
+        logging.root.setLevel(logging.DEBUG)
+
     # we pick files in `data/interim` if any, otherwise in `data/raw`
     input_files = backend.list_interim_files(group_by_directories=True)
     if not input_files:
diff --git a/src/maggotuba/models/train_model.py b/src/maggotuba/models/train_model.py
index c9665388f06b0710f3f30e24e791ad4e8cc46654..7f0b294adf5de3b1c72e7db756e94602744a8d6f 100644
--- a/src/maggotuba/models/train_model.py
+++ b/src/maggotuba/models/train_model.py
@@ -2,9 +2,13 @@ from taggingbackends.data.labels import Labels
 from taggingbackends.data.dataset import LarvaDataset
 from maggotuba.models.trainers import make_trainer, new_generator, enforce_reproducibility
 import glob
+import logging
 
 def train_model(backend, layers=1, pretrained_model_instance="default",
                 subsets=(1, 0, 0), rng_seed=None, iterations=1000, **kwargs):
+    if kwargs.pop('debug', False):
+        logging.root.setLevel(logging.DEBUG)
+
     # list training data files;
     # we actually expect a single larva_dataset file that make_dataset generated
     # or moved into data/interim/{instance}/
@@ -48,9 +52,11 @@ def train_model(backend, layers=1, pretrained_model_instance="default",
     model.clf_config['post_filters'] = ['ABC->AAC']
 
     # save the model
-    print(f"saving model \"{backend.model_instance}\"")
+    logging.debug(f"saving model \"{backend.model_instance}\"")
     model.save()
 
+    return dataset, model # for debugging purposes
+
 
 from taggingbackends.main import main
 
diff --git a/src/maggotuba/models/trainers.py b/src/maggotuba/models/trainers.py
index 2603a8490dc3ec052ae15962592a7b85ff5bcab8..2d41bea9aa24d68ea4b455d0f3ad1c9e81a4fe2f 100644
--- a/src/maggotuba/models/trainers.py
+++ b/src/maggotuba/models/trainers.py
@@ -3,7 +3,7 @@ import torch
 import torch.nn as nn
 from behavior_model.models.neural_nets import device
 from maggotuba.models.modules import SupervisedMaggot, MultiscaleSupervisedMaggot, MaggotBag
-from taggingbackends.features.skeleton import interpolate
+from maggotuba.features.preprocess import Preprocessor
 from taggingbackends.explorer import BackendExplorer, check_permissions
 import logging
 import json
@@ -11,7 +11,7 @@ import re
 import os.path
 
 """
-This model borrows the pre-trained MaggotUBA encoder, substitute a dense layer
+This model borrows the pre-trained MaggotUBA encoder, substitutes a dense layer
 for the decoder, and (re-)trains the entire model.
 
 Attribute `config` refers to MaggotUBA autoencoder.
@@ -26,7 +26,7 @@ class MaggotTrainer:
     def __init__(self, cfgfilepath, behaviors=[], n_layers=1, n_iterations=None,
             average_body_length=1.0, device=device):
         self.model = SupervisedMaggot(cfgfilepath, behaviors, n_layers, n_iterations)
-        self.average_body_length = average_body_length # usually set later
+        self.preprocessor = Preprocessor(self, average_body_length)
         self.device = device
 
     @property
@@ -46,87 +46,15 @@ class MaggotTrainer:
         self.model.clf.behavior_labels = labels
 
     @property
-    def swap_head_tail(self):
-        return self.config.get('swap_head_tail', True)
-
-    @swap_head_tail.setter
-    def swap_head_tail(self, b):
-        self.config['swap_head_tail'] = b
-
-    ### TODO: move parts of the below code in a features module
-    # all the code in this section is called by `predict` only
-    def window(self, t, data):
-        interpolation_args = {k: self.config[k]
-                              for k in ('spine_interpolation', 'frame_interval')
-                              if k in self.config}
-        winlen = self.config["len_traj"]
-        N = data.shape[0]+1
-        if interpolation_args:
-            for m in range(0, N-1):
-                win = interpolate(t, data, m, winlen, **interpolation_args)
-                if win is not None:
-                    assert win.shape[0] == winlen
-                    yield t[m], win
-        else:
-            for m in range(0, N-winlen):
-                n = m + winlen
-                yield t[(m + n) // 2], data[m:n]
+    def average_body_length(self):
+        return self.preprocessor.average_body_length
 
-    def pad(self, target_t, defined_t, data):
-        if data.shape[0] == 1:
-            return np.repeat(data, len(target_t), axis=0)
-        else:
-            head = searchsortedfirst(target_t, defined_t[0])
-            tail = len(target_t) - (searchsortedlast(target_t, defined_t[-1]) + 1)
-            ind = np.r_[
-                    np.zeros(head, dtype=int),
-                    np.arange(data.shape[0]),
-                    (data.shape[1]-1) * np.ones(tail, dtype=int),
-                    ]
-            if len(ind) != len(target_t):
-                raise RuntimeError('missing time steps')
-            return data[ind]
+    @average_body_length.setter
+    def average_body_length(self, l):
+        self.preprocessor.average_body_length = l
 
     def body_length(self, data):
-        dx = np.diff(data[:,0::2], axis=1)
-        dy = np.diff(data[:,1::2], axis=1)
-        return np.sum(np.sqrt(dx*dx + dy*dy), axis=1)
-
-    def normalize(self, w):
-        # center coordinates
-        wc = np.mean(w[:,4:6], axis=0, keepdims=True)
-        w = w - np.tile(wc, (1, 5))
-        # rotate
-        v = np.mean(w[:,8:10] - w[:,0:2], axis=0)
-        vnorm = np.sqrt(np.dot(v, v))
-        if vnorm == 0:
-            logging.warning('null distance between head and tail')
-        else:
-            v = v / vnorm
-        c, s = v / self.average_body_length # scale using the rotation matrix
-        rot = np.array([[ c, s],
-                        [-s, c]]) # clockwise rotation
-        w = np.einsum("ij,jkl", rot, np.reshape(w.T, (2, 5, -1), order='F'))
-        return w
-
-    """
-    Preprocess a single track.
-
-    This includes running a sliding window, resampling the track in each window,
-    normalizing the spines, etc.
-    """
-    def preprocess(self, t, data):
-        defined_t = []
-        ws = []
-        for t_, w in self.window(t, data):
-            defined_t.append(t_)
-            ws.append(self.normalize(w))
-        if ws:
-            ret = self.pad(t, defined_t, np.stack(ws))
-            if self.swap_head_tail:
-                ret = ret[:,:,::-1,:]
-            return ret
-    ###
+        return self.preprocessor.body_length(data)
 
     def forward(self, x, train=False):
         if train:
@@ -233,7 +161,7 @@ class MaggotTrainer:
         model.to(self.device)
         if subset is None:
             # data is a (times, spines) couple
-            data = self.preprocess(*data)
+            data = self.preprocessor(*data)
             if data is None:
                 return
             output = self.forward(data)
@@ -323,7 +251,7 @@ class MultiscaleMaggotTrainer(MaggotTrainer):
             average_body_length=1.0, device=device):
         self.model = MultiscaleSupervisedMaggot(cfgfilepath, behaviors,
                                                 n_layers, n_iterations)
-        self.average_body_length = average_body_length # usually set later
+        self.preprocessor = Preprocessor(self, average_body_length)
         self.device = device
         self._default_encoder_config = None
         # check consistency
@@ -349,7 +277,7 @@ class MaggotBagging(MaggotTrainer):
     def __init__(self, cfgfilepaths, behaviors=[], n_layers=1,
             average_body_length=1.0, device=device):
         self.model = MaggotBag(cfgfilepaths, behaviors, n_layers)
-        self.average_body_length = average_body_length # usually set later
+        self.preprocessor = Preprocessor(self, average_body_length)
         self.device = device