diff --git a/scripts/maestro/make_20230311_datasets.py b/scripts/maestro/make_20230311_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..caa3da89c7acdc718526f76f8ef9fc4f7d2fd485
--- /dev/null
+++ b/scripts/maestro/make_20230311_datasets.py
@@ -0,0 +1,61 @@
+ntrials = 1
+freq = 10
+# alternative label set, superseded by the strong/weak set below:
+#labels = ['back_large', 'cast_large', 'hunch_large', 'roll_large', 'run_large', 'stop_large', 'small_motion']
+labels = ['back_strong', 'back_weak', 'cast_strong', 'cast_weak', 'hunch_strong', 'hunch_weak', 'roll_strong', 'roll_weak', 'run_strong', 'run_weak', 'stop_strong', 'stop_weak']
+
+
+import os
+pwd = os.path.abspath('..')
+print('JULIA_PROJECT=', os.environ['JULIA_PROJECT'])
+data_path = os.environ['DATA_REPOSITORY']
+
+import sys
+taskid = int(sys.argv[1])
+
+# map the SLURM array task id onto a (window, subset, trial) combination
+i = 0
+_break = False
+for window in (2,):
+    for subset in ('pretrain', 'train', 'test'):
+        for trial in range(1, ntrials + 1):
+            i += 1
+            if i == taskid:
+                _break = True
+                break
+        if _break: break
+    if _break: break
+if not _break:
+    sys.exit(f"task id {taskid} exceeds the number of combinations ({i})")
+print('window:', window, 'subset:', subset, 'trial:', trial, flush=True)
+
+from math import ceil
+from julia import TaggingBackends
+write_larva_dataset_hdf5 = TaggingBackends.LarvaDatasets.write_larva_dataset_hdf5
+
+def dataset_files(subset):
+    # the pretraining step draws from the same files as the training step
+    if subset == 'pretrain':
+        subset = 'train'
+    with open(os.path.join(pwd, f"TaggingBackends/data/t15-subset1/{subset}/filelist.txt"), 'r') as f:
+        return [os.path.join(data_path, line.rstrip()) for line in f]
+
+larva_dataset_files = {}
+
+window_length = int(window * freq)
+third_window_length = ceil(window_length / 3)
+
+# pretraining uses full-length windows and class-balanced sampling
+winlen = window_length if subset == 'pretrain' else third_window_length
+balance = subset == 'pretrain'
+_dir = os.path.join(pwd, f"MaggotUBA-adapter/data/20230311/{subset}/{window_length}/{trial}")
+os.makedirs(_dir, exist_ok=True)
+_file = write_larva_dataset_hdf5(_dir, dataset_files(subset), winlen, labels=labels, balance=balance, frameinterval=0.1)
+larva_dataset_files[(window, subset, trial)] = os.path.join(_dir, _file)
+
+print(larva_dataset_files)
+
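+# For reference only: with ntrials = 1 and window = 2 (seconds) at freq = 10 Hz,
+# the SLURM array task ids map to 1 -> 'pretrain' (winlen 20, balanced),
+# 2 -> 'train' (winlen ceil(20/3) = 7) and 3 -> 'test' (winlen 7), each writing
+# a data/20230311/<subset>/20/1/larva_dataset_*.hdf5 file under MaggotUBA-adapter.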
diff --git a/scripts/maestro/make_20230311_datasets.sh b/scripts/maestro/make_20230311_datasets.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0444211ece1284d1da6a351f2a5f4f8985a1a76b
--- /dev/null
+++ b/scripts/maestro/make_20230311_datasets.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH --job-name=Dataset20230311
+#SBATCH --partition=dbc_pmo
+#SBATCH --qos=dbc
+#SBATCH --array=1-3
+#SBATCH --mem-per-cpu=60G
+
+export DATA_REPOSITORY=/pasteur/zeus/projets/p02/hecatonchire/screens
+
+export JULIA_NUM_THREADS=1
+
+echo "running JULIA_PROJECT=$(realpath ../TaggingBackends) poetry run python scripts/maestro/make_20230311_datasets.py $SLURM_ARRAY_TASK_ID"
+JULIA_PROJECT=$(realpath ../TaggingBackends) poetry run python scripts/maestro/make_20230311_datasets.py $SLURM_ARRAY_TASK_ID
diff --git a/scripts/maestro/train_on_20230311_datasets.py b/scripts/maestro/train_on_20230311_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..fffe70143bae1f6a11c6ea02a4a86971ba578cb2
--- /dev/null
+++ b/scripts/maestro/train_on_20230311_datasets.py
@@ -0,0 +1,95 @@
+from taggingbackends.explorer import BackendExplorer
+from maggotuba.data.make_dataset import make_dataset
+from maggotuba.models.train_model import train_model
+from maggotuba.models.predict_model import predict_model
+from glob import glob
+import subprocess
+import sys
+import os
+taskid = int(sys.argv[1])
+
+ntrials = 1
+window_lengths = (20,)  # (15, 20, 25, 30)
+latent_dims = (25,)  # (25, 50, 100, 200)
+layer_counts = (1,)  # (1, 2, 3)
+iter_counts = (1000, 10000, 100000)
+
+ntasks = len(window_lengths) * len(latent_dims) * len(layer_counts) * len(iter_counts) * ntrials * ntrials
+print(f"task {taskid}/{ntasks}", flush=True)
+
+# enumerate the hyperparameter combinations; only the taskid-th one is run
+i = 0
+for window_length in window_lengths:
+    for training_trial in range(1, ntrials+1):
+        for latent_dim in latent_dims:
+            for pretraining_trial in range(1, ntrials+1):
+
+                for nlayers in layer_counts:
+                    for niter in iter_counts:
+                        i += 1
+                        if i < taskid:
+                            continue
+                        elif i > taskid:
+                            print('exiting', flush=True)
+                            sys.exit()
+
+                        pretrained_model = f"20230129_winlength_{window_length}_dimlatent_{latent_dim}_trial_{pretraining_trial}"
+                        model = f"20230311_winlength_{window_length}_trials_{pretraining_trial}-{training_trial}_dimlatent_{latent_dim}_layers_{nlayers}_iter_{niter}"
+                        print(f"model: {model}", flush=True)
+
+                        training_dataset = glob(f"data/20230311/train/{window_length}/{training_trial}/larva_dataset_*.hdf5")[0]
+                        print(f"training dataset: {training_dataset}", flush=True)
+
+                        # the `BackendExplorer` will find out we are running MaggotUBA-adapter, and set up the *models* and *data* directories
+                        backend = BackendExplorer(model_instance=model)
+                        backend.reset()
+
+                        # a model instance is allocated 3 data directories: *raw*, *interim* and *processed*;
+                        # input data always go in *raw*
+                        backend.move_to_raw(training_dataset, copy=False)
+                        # the following step simply makes the larva_dataset file available in *interim*, as expected by `train_model`;
+                        # `labels_expected=True` informs `make_dataset` that the data are shipped with the ground truth
+                        make_dataset(backend, labels_expected=True)
+
+                        # this populates the *models* directory with a pretrained model, and then runs the training/fine-tuning
+                        train_model(backend, nlayers, pretrained_model_instance=pretrained_model, subsets=(1,0,0), iterations=niter)
+
+                        for test_trial in range(1, ntrials+1):
+                            break  # scoring is temporarily disabled
+
+                            test_dataset = glob(f"data/20230311/test/{window_length}/{test_trial}/larva_dataset_*.hdf5")[0]
+                            print(f"test dataset: {test_dataset}", flush=True)
+
+                            # score using a fresh (trained) model instance
+                            backend.reset_data()
+                            backend.move_to_raw(test_dataset, copy=False)
+                            # again, we inform `make_dataset` about the availability of the ground truth; this makes `predict_model` return both the predicted and expected labels, so that we can compare
+                            make_dataset(backend, labels_expected=True)
+
+                            predictions = predict_model(backend, subset='test', subsets=(0,0,1))
+
+                            with open(f"data/20230311-results/winlength_{window_length}_iter_{niter}_trials_{pretraining_trial}-{training_trial}-{test_trial}.txt", 'w') as f:
+                                for a, b in zip(*predictions):
+                                    f.write(f"{a},{b}\n")
+
+                        if niter == 100000:
+                            # copy files into the directory for the upcoming 20230311 tagger
+                            dir_ = 'models/20230311'
+                            for cmd in (
+                                ['rm', '-rf', dir_],
+                                ['cp', '-ra', f"models/{model}", dir_],
+                                ['rm', '-f', os.path.join(dir_, 'best_validated_encoder.pt')],
+                                ['sed', '-i', '-e', f"s|\"log_dir\": \"models/{model}\"|\"log_dir\": \"\"|", '-e', f"s|\"config\": \".*\"|\"config\": \"{dir_}/autoencoder_config.json\"|", '-e', f"s|\"exp_name\": \"latent-{latent_dim}\"|\"exp_name\": \"20230311\"|", os.path.join(dir_, 'autoencoder_config.json')],
+                                ['sed', '-i', '-e', f"s|{backend.model_dir()}|{dir_}|", '-e', "s|\"clf_path\": \".*\"|\"clf_path\": \"trained_classifier.pt\"|", '-e', "s|\"enc_path\": \".*\"|\"enc_path\": \"retrained_encoder.pt\"|", os.path.join(dir_, 'clf_config.json')],
+                            ):
+                                ret = subprocess.run(cmd, capture_output=True)
+                                for out in (ret.stdout, ret.stderr):
+                                    if out:
+                                        print(out.decode('utf8'))
+
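+# A minimal scoring sketch (for reference; not executed by this script),
+# assuming each results line written above reads "predicted,expected":
+#
+#   with open("data/20230311-results/winlength_20_iter_1000_trials_1-1-1.txt") as f:
+#       pairs = [line.rstrip().split(',') for line in f]
+#   accuracy = sum(a == b for a, b in pairs) / len(pairs)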
diff --git a/scripts/maestro/train_on_20230311_datasets.sh b/scripts/maestro/train_on_20230311_datasets.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6cecc94b275f427bffac2e4ad29710731b61eb78
--- /dev/null
+++ b/scripts/maestro/train_on_20230311_datasets.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#SBATCH --job-name=Train20230311
+#SBATCH --partition=dbc_pmo
+#SBATCH --qos=dbc
+#SBATCH --array=1-3
+#SBATCH --cpus-per-task=4
+
+host=$(hostname)
+case ${host:0:8} in
+    maestro-)
+        # the #SBATCH directives at the top of this file should include:
+        #SBATCH --partition=dbc_pmo
+        #SBATCH --qos=dbc
+        echo "running on maestro"
+        if ! [ "$(pwd)" = "$MYSCRATCH/MaggotUBA-adapter" ]; then
+            echo "Please run from the MaggotUBA-adapter directory:"
+            echo "  sbatch scripts/maestro/train_on_20230311_datasets.sh"
+            exit 1
+        fi
+        export DATA_REPOSITORY=/pasteur/zeus/projets/p02/hecatonchire/screens
+        ;;
+    *)
+        echo "unknown host: $host"
+        exit 1
+        ;;
+esac
+
+echo "running JULIA_PROJECT=$(realpath ../TaggingBackends) poetry run python scripts/maestro/train_on_20230311_datasets.py $SLURM_ARRAY_TASK_ID"
+JULIA_PROJECT=$(realpath ../TaggingBackends) poetry run python scripts/maestro/train_on_20230311_datasets.py $SLURM_ARRAY_TASK_ID
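+
+# Usage sketch (assumes TaggingBackends is checked out next to MaggotUBA-adapter,
+# as implied by the realpath call above):
+#   cd $MYSCRATCH/MaggotUBA-adapter
+#   sbatch scripts/maestro/train_on_20230311_datasets.sh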