diff --git a/README.md b/README.md
index 6590557ee61ee198b6c63b91803314c763e67086..93e91218918c81f2c650acff01cce76cd5d292f5 100644
--- a/README.md
+++ b/README.md
@@ -174,17 +174,55 @@ Similarly to the other commands, input data files are expected in the data/raw d
 
 The above command produces an `embeddings.h5` file in the data/processed/20230311 directory.
 
-The `embeddings.h5` file is an HDF5 file structured as follows:
+The `embeddings.h5` file is an HDF5 file containing several arrays that all have as many elements (or rows)
+as there are embedded/projected data points. The file is structured as follows:
 ```
-├── n_runs <- integer; number of files or assays or runs.
-├── run_0  <- dataset named run_<i> with <i> ranging from 0 to n_runs - 1.
-...  ├── run_id   <- string; run id, typically date and time in the yyyymmdd_HHMMSS format.
-     ├── n_tracks <- integer; number of tracks or larvae.
-     ├── track_0  <- dataset named track_<j> with <j> ranging from 0 to n_tracks - 1.
-     ...  ├── track_id  <- integer; track id as referred to as in the original tracking data file.
-          ├── n_steps   <- integer; number of projected time steps or segments.
-          ├── step_0    <- dataset named step_<k> with <k> ranging from 0 to n_steps - 1.
-          ...  ├── time      <- float; timestamp of the step or segment center.
-               └── embedding <- float array.
+├── run_id    <- 1D array, typically of strings; id of the tracking data file or assay or run.
+├── track_id  <- 1D array of integers; id of the track or larva.
+├── time      <- 1D array of floats; timestamp of the time step or time segment center.
+└── embedding <- 2D array of floats; coordinates in the latent space.
 ```
 This format is not compatible with the `clustering.cache` file used by [MaggotUBA's ToMATo UI](https://github.com/DecBayComp/Detecting_subtle_behavioural_changes/blob/ee73f0dd294a991322a0eec8f6ce69488c7a1f9a/maggotuba/src/maggotuba/cli/cli_model_clustering.py#L129-L164).
+
+Track ids are not unique across runs. Similarly, times do not share a common time origin across runs.
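+
+Since all arrays are aligned row-wise, a single track can be retrieved by filtering on both `run_id` and
+`track_id`. A minimal sketch, selecting as an example the track the first row belongs to (`asstr` requires
+h5py >= 3.0):
+```
+import h5py
+
+with h5py.File('embeddings.h5', 'r') as f:
+    run_id = f['run_id'].asstr()[...]   # decode variable-length strings
+    track_id = f['track_id'][...]
+    time = f['time'][...]
+    embedding = f['embedding'][...]
+
+# pair both ids, since track ids repeat across runs
+mask = (run_id == run_id[0]) & (track_id == track_id[0])
+track_times, track_embedding = time[mask], embedding[mask]
+```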
+
+To visualize the embeddings, the `embedding` matrix can be loaded and transformed with methods like UMAP:
+```
+import h5py
+import umap
+
+with h5py.File('embeddings.h5', 'r') as f:
+    embedding = f['embedding'][...]
+
+embedding2d = umap.UMAP().fit_transform(embedding)
+```
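+
+The `umap` module above is provided by the [umap-learn](https://umap-learn.readthedocs.io/) package. The
+resulting 2D coordinates can then be plotted, for instance colored by run; a sketch reusing the `run_id`
+array loaded in the earlier example:
+```
+import matplotlib.pyplot as plt
+
+for run in set(run_id):
+    sel = run_id == run
+    plt.scatter(*embedding2d[sel].T, s=1, label=run)
+plt.legend()
+plt.show()
+```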
diff --git a/src/maggotuba/models/embed_model.py b/src/maggotuba/models/embed_model.py
index 58eaa4975f553f2cc1d910d4cd5c55a717746c84..bc814b75a581c5425572981cfcdf7b4984b4f267 100644
--- a/src/maggotuba/models/embed_model.py
+++ b/src/maggotuba/models/embed_model.py
@@ -4,6 +4,7 @@ from maggotuba.features.preprocess import Preprocessor
 from maggotuba.models.modules import MaggotEncoder
 from maggotuba.models.trainers import new_generator
 from behavior_model.models.neural_nets import device
+from taggingbackends.explorer import check_permissions
 from collections import defaultdict
 import numpy as np
 import logging
@@ -61,6 +62,7 @@ def embed_individual_data_files(backend, encoder, input_files):
     encoder.to(device)
 
     features = defaultdict(dict)
+    npoints = 0
 
     for input_files in input_files.values():
         done = False
@@ -122,6 +124,7 @@ def embed_individual_data_files(backend, encoder, input_files):
                         logging.info(f"failure to window track: {larva}")
                     else:
                         features[run][larva] = (t[larva], latentfeatures)
+                        npoints += len(latentfeatures)
             else:
                 ref_length = np.median(preprocessor.body_length(data))
                 preprocessor.average_body_length = ref_length
@@ -133,24 +136,43 @@ def embed_individual_data_files(backend, encoder, input_files):
                         logging.info(f"failure to window track: {larva}")
                     else:
                         features[run][larva] = (t[mask], latentfeatures)
+                        npoints += len(latentfeatures)
 
             done = True
 
+    # format the latent features and related info as matrices
+    run_id = []
+    run_id_repeats = []
+    # peek at one track to determine the dtypes and the latent dimension
+    sample_run = next(iter(features.values()))
+    sample_track_id, (sample_times, sample_embedding) = next(iter(sample_run.items()))
+    nfeatures = sample_embedding.shape[1]
+    track_id = np.zeros(npoints, dtype=type(sample_track_id))
+    t = np.zeros(npoints, dtype=sample_times.dtype)
+    embedding = np.zeros((npoints, nfeatures), dtype=sample_embedding.dtype)
+    i = 0
+    for run, tracks in features.items():
+        run_id.append(run)
+        repeats = 0
+        for track, (timesteps, ftr) in tracks.items():
+            j = len(timesteps)
+            track_id[i:i+j] = track
+            t[i:i+j] = timesteps
+            embedding[i:i+j] = ftr
+            i += j
+            repeats += j
+        run_id_repeats.append(repeats)
+    # expand the run ids so that every row carries its run id
+    run_id = list(repeat(run_id, run_id_repeats))
+
     # save the vectorized data to file
-    with h5py.File(get_output_filepath(backend, file), 'w') as f:
-        f['n_runs'] = len(features)
-        for i, (run, tracks) in enumerate(features.items()):
-            g = f.create_group(f'run_{i}')
-            g['run_id'] = run
-            g['n_tracks'] = len(tracks)
-            for j, (track, (t, latent)) in enumerate(tracks.items()):
-                h = g.create_group(f'track_{j}')
-                h['track_id'] = track
-                h['n_steps'] = len(t)
-                for k, (t, latent) in enumerate(zip(t, latent)):
-                    l = h.create_group(f'step_{k}')
-                    l['time'] = t
-                    l['embedding'] = latent
+    embeddings = get_output_filepath(backend, file)
+    with h5py.File(embeddings, 'w') as f:
+        f['run_id'] = run_id
+        f['track_id'] = track_id
+        f['time'] = t
+        f['embedding'] = embedding
+    check_permissions(embeddings)
 
 @torch.no_grad()
 def _embed(preprocessor, encoder, t, data):
@@ -182,6 +204,13 @@ def get_output_filepath(backend, file):
 
     return target
 
+def repeat(items, counts):
+    # e.g. repeat(['a', 'b'], [2, 1]) yields 'a', 'a', 'b'
+    for item, n in zip(items, counts):
+        for _ in range(n):
+            yield item
+
+
 from taggingbackends.main import main
 
 if __name__ == "__main__":