diff --git a/scripts/maestro/make_20230129_datasets.py b/scripts/maestro/make_20230129_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..df2246cda82d7976a4f71aa327a3cf600c9ef6be
--- /dev/null
+++ b/scripts/maestro/make_20230129_datasets.py
@@ -0,0 +1,50 @@
+ntrials = 1
+freq = 10
+labels = ['back_large', 'cast_large', 'hunch_large', 'roll_large', 'run_large', 'stop_large', 'small_motion']
+
+
+import os
+pwd = os.path.abspath('..')
+print('JULIA_PROJECT=', os.environ['JULIA_PROJECT'])
+data_path = os.environ['DATA_REPOSITORY']
+
+import sys
+taskid = int(sys.argv[1])
+
+i = 0
+_break = False
+for window in (2,):
+    for subset in ('pretrain', 'train', 'test'):
+        for trial in range(1, ntrials + 1):
+            i += 1
+            if i == taskid:
+                _break = True
+                break
+        if _break: break
+    if _break: break
+print('window:', window, 'subset:', subset, 'trial:', trial, flush=True)
+
+from math import ceil
+from julia import TaggingBackends
+write_larva_dataset_hdf5 = TaggingBackends.LarvaDatasets.write_larva_dataset_hdf5
+
+def dataset_files(subset):
+    if subset == 'pretrain':
+        subset = 'train'
+    with open(os.path.join(pwd, f"TaggingBackends/data/t15-subset1/{subset}/filelist.txt"), 'r') as f:
+        return [os.path.join(data_path, line.rstrip()) for line in f.readlines()]
+
+larva_dataset_files = {}
+
+window_length = int(window * freq)
+third_window_length = ceil(window_length / 3)
+
+winlen = window_length if subset == 'pretrain' else third_window_length
+balance = subset == 'pretrain'
+_dir = os.path.join(pwd, f"MaggotUBA-adapter/data/20230129/{subset}/{window_length}/{trial}")
+os.makedirs(_dir, exist_ok=True)
+_file = write_larva_dataset_hdf5(_dir, dataset_files(subset), winlen, labels=labels, balance=balance, frameinterval=0.1)
+larva_dataset_files[(window, subset, trial)] = os.path.join(_dir, _file)
+
+print(larva_dataset_files)
+
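For reference: the nested loops and `_break` flags above unravel the SLURM array task id into a (window, subset, trial) triple. The same mapping can be computed arithmetically; the sketch below is illustrative only and `decode_taskid` is a hypothetical helper, not defined anywhere in these scripts:

    def decode_taskid(taskid, windows=(2,), subsets=('pretrain', 'train', 'test'), ntrials=1):
        # SLURM array task ids start at 1; loop order is window, then subset, then trial
        i = taskid - 1
        trial = i % ntrials + 1
        i //= ntrials
        subset = subsets[i % len(subsets)]
        window = windows[i // len(subsets)]
        return window, subset, trial

    # e.g. decode_taskid(2) == (2, 'train', 1), matching task 2 of the --array=1-3 job above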
[ "$(pwd)" = "$MYSCRATCH/MaggotUBA-adapter" ]; then +echo "Please run from MaggotUBA-adapter directory:" +echo " sbatch scripts/maestro/make_20230129_datasets.sh" +exit 1 +fi +export DATA_REPOSITORY=/pasteur/zeus/projets/p02/hecatonchire/screens +;; + *) +echo "unknown host: $host" +exit 1 +esac + +export JULIA_NUM_THREADS=1 + +echo "running JULIA_PROJECT=$(realpath ../TaggingBackends) poetry run python scripts/maestro/make_20230129_datasets.py $SLURM_ARRAY_TASK_ID" +JULIA_PROJECT=$(realpath ../TaggingBackends) poetry run python scripts/maestro/make_20230129_datasets.py $SLURM_ARRAY_TASK_ID diff --git a/scripts/maestro/pretrain_on_20230129_datasets.py b/scripts/maestro/pretrain_on_20230129_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..39cd9da3c0d705e66fb39c2200c4069695863c71 --- /dev/null +++ b/scripts/maestro/pretrain_on_20230129_datasets.py @@ -0,0 +1,86 @@ +import os, sys +from math import ceil +import subprocess +import tempfile +from glob import glob + +taskid = int(sys.argv[1]) + +ntrials = 1 +freq = 10 +windows = (2, ) + +larva_dataset_files = {} + +todo = [] + +for window in windows: + window_length = int(window * freq) + third_window_length = ceil(window_length / 3) + for subset, winlen in (('pretrain', window_length), ):#('train', third_window_length), ('test', third_window_length)): + for trial in range(1, ntrials+1): + _dir = f"data/20230129/{subset}/{window_length}/{trial}" + _files = glob(os.path.join(_dir, '*.hdf5')) + if bool(_files): + larva_dataset_files[(window, subset, trial)] = _files[0] + +print(len(larva_dataset_files)) + +i = 0 +for window in windows: + window_length = int(window * freq) + for trial in range(1, ntrials+1): + i += 1 + if i != taskid: + continue + maggotuba_project_name = f"20230129-{window_length}-{trial}" + print(i, maggotuba_project_name) + if todo and maggotuba_project_name not in todo: + continue + maggotuba_dir = os.path.abspath('../MaggotUBA-core') + try: + larva_dataset = os.path.abspath(larva_dataset_files[(window, 'pretrain', trial)]) + except KeyError: + print('KeyError') + continue + pretrained_model_dir = os.path.abspath(f"../MaggotUBA-adapter/pretrained_models/20230129_winlength_{window_length}_dimlatent_${{dim}}_trial_{trial}") + pretrain = f""" +#!/bin/bash +mkdir -p 20230129 +cd 20230138 + +rm -rf {maggotuba_project_name} +echo "poetry run maggotuba setup 20230129 {maggotuba_project_name} --len_traj {window_length}" +poetry run maggotuba setup 20230129 {maggotuba_project_name} --len_traj {window_length} +cd {maggotuba_project_name} || exit 1 + +ln -s {larva_dataset} . 
+sed -e 's|"data_dir": ""|"data_dir": "{larva_dataset}"|' -e 's|"optim_iter": 1000|"optim_iter": 10000|' -i config.json +sed -e 's|^ ]$| ],|' -e 's|\\}}| "spine_interpolation": "linear",\\n "frame_interval": 0.1,\\n "swap_head_tail": false\\n\\}}|' -i config.json + +for dim in 25; do + +sed -e "s/config.json/config-$dim.json/" -e "s/\\"dim_latent\\": 10/\\"dim_latent\\": $dim/" config.json > "config-$dim.json" + +rm -rf "training_log/latent-$dim" +echo "poetry run maggotuba model train --name \\"latent-$dim\\" --config \\"config-$dim.json\\"" +poetry run maggotuba model train --name "latent-$dim" --config "config-$dim.json" +[ -f "training_log/latent-$dim/best_validated_encoder.pt" ] || exit 1 + +eval loc="{pretrained_model_dir}" +mkdir -p "$loc" +# TODO: make project_dir and exp_folder empty strings +sed -e "s|{maggotuba_dir}/||" -e "s|20230129/20230129|20230129|" "training_log/latent-$dim/config.json" > "$loc/autoencoder_config.json" +cp "training_log/latent-$dim/best_validated_encoder.pt" "$loc" + +done +""" + with tempfile.NamedTemporaryFile('w+') as fp: + fp.write(pretrain) + fp.flush() + ret = subprocess.run(['/bin/bash', fp.name], cwd=maggotuba_dir, capture_output=True) + if ret.stdout: + print(ret.stdout.decode('utf8'), flush=True) + if ret.stderr: + print(ret.stderr.decode('utf8')) + diff --git a/scripts/maestro/pretrain_on_20230129_datasets.sh b/scripts/maestro/pretrain_on_20230129_datasets.sh new file mode 100644 index 0000000000000000000000000000000000000000..2e1a410112be8d8003c5f262878da69cd10a50d2 --- /dev/null +++ b/scripts/maestro/pretrain_on_20230129_datasets.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --job-name=Pretrain20230129 +#SBATCH --partition=dbc_pmo +#SBATCH --qos=dbc +#SBATCH --array=1-1 +#SBATCH --mem-per-cpu=20G + +module load Python/3.8.3 + +echo "running python3 scripts/maestro/pretrain_on_20230129_datasets.py $SLURM_ARRAY_TASK_ID" +python3 scripts/maestro/pretrain_on_20230129_datasets.py $SLURM_ARRAY_TASK_ID diff --git a/scripts/maestro/train_on_20230129_datasets.py b/scripts/maestro/train_on_20230129_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..43589773cf7464b73f6a937f28cefbda88ccfdcb --- /dev/null +++ b/scripts/maestro/train_on_20230129_datasets.py @@ -0,0 +1,86 @@ +from taggingbackends.explorer import BackendExplorer +from maggotuba.data.make_dataset import make_dataset +from maggotuba.models.train_model import train_model +from maggotuba.models.predict_model import predict_model +from glob import glob +import subprocess +import logging +import sys +import os +taskid = int(sys.argv[1]) + +ntrials = 1 +window_lengths = (20,)#(15, 20, 25, 30) +latent_dims = (25,)#(25, 50, 100, 200) +layer_counts = (1,)#(1, 2, 3) + +ntasks = len(window_lengths) * len(latent_dims) * len(layer_counts) * ntrials * ntrials +print(f"task {taskid}/{ntasks}", flush=True) + +i = 0 +for window_length in window_lengths: + for training_trial in range(1, ntrials+1): + for latent_dim in latent_dims: + for pretraining_trial in range(1, ntrials+1): + + for nlayers in layer_counts: + i += 1 + if i < taskid: + continue + elif i > taskid: + print('exiting', flush=True) + sys.exit() + + pretrained_model = f"20230129_winlength_{window_length}_dimlatent_{latent_dim}_trial_{pretraining_trial}" + model = f"20230129_winlength_{window_length}_trials_{pretraining_trial}-{training_trial}_dimlatent_{latent_dim}_layers_{nlayers}" + print(f"model: {model}", flush=True) + + training_dataset = 
glob(f"data/20230129/train/{window_length}/{training_trial}/larva_dataset_*.hdf5")[0] + print(f"training dataset: {training_dataset}", flush=True) + + # the `BackendExplorer` will find out we are running MaggotUBA-adapter, and set up the *models* and *data* directories + backend = BackendExplorer(model_instance=model) + backend.reset() + + # a model instance is allocated 3 data directories: *raw*, *interim* and *processed*; + # input data always go in *raw* + backend.move_to_raw(training_dataset, copy=False) + # the following step simply makes the larva_dataset file available in *interim*, as expected by `train_model`; + # `labels_expected=True` informs `make_dataset` that the data are shipped with the ground truth + make_dataset(backend, labels_expected=True) + + # this populates the *models* directory with a pretrained model, and then runs the training/fine-tuning + train_model(backend, nlayers, pretrained_model_instance=pretrained_model, subsets=(1,0,0)) + + for test_trial in range(1, ntrials+1): + break # temporarily + + test_dataset = glob(f"data/20230129/test/{window_length}/{test_trial}/larva_dataset_*.hdf5")[0] + print(f"test dataset: {test_dataset}", flush=True) + + # score using a fresh (trained) model instance + backend.reset_data() + backend.move_to_raw(test_dataset, copy=False) + # again, we inform `make_dataset` about the availability of the ground truth; this makes `predict_model` return both the predicted and expected labels, so that we can compare + make_dataset(backend, labels_expected=True) + + predictions = predict_model(backend, subset='test', subsets=(0,0,1)) + + with open(f"data/20230129-results/winlength_{window_length}_trials_{pretraining_trial}-{training_trial}-{test_trial}.txt", 'w') as f: + for a, b in zip(*predictions): + f.write(f"{a},{b}\n") + + # copy files into the directory for upcoming 20230129 tagger + dir_ = 'models/20230129' + for cmd in ( + ['rm', '-rf', dir_], + ['cp', '-ra', f"models/{model}", dir_], + ['rm', '-f', os.path.join(dir_, 'best_validated_encoder.pt')], + ['sed', '-i', os.path.join(dir_, 'autoencoder_config.json'), '-e', f"s|\"log_dir\": \"models/{model}\"|\"log_dir\": \"\"|", '-e', f"s|\"config\": \".*\"|\"config\": \"{dir_}/autoencoder_config.json\"|", '-e', f"s|\"exp_name\": \"latent-{latent_dim}\"|\"exp_name\": \"20230129\"|"], + ['sed', '-i', os.path.join(dir_, 'clf_config.json'), '-e', f"s|{backend.model_dir()}|{dir_}|", '-e', "s|\"clf_path\": \".*\"|\"clf_path\": \"trained_classifier.pt\"|", '-e', "s|\"enc_path\": \".*\"|\"enc_path\": \"retrained_encoder.pt\"|", '-e', r's|"enc_path": "retrained_encoder.pt"|"enc_path": "retrained_encoder.pt",\n "post_filters": [\n "ABC->AAC"\n ]|'], + ): + ret = subprocess.run(cmd, capture_output=True) + for out in (ret.stdout, ret.stderr): + if out: + print(out.decode('utf8')) + diff --git a/scripts/maestro/train_on_20230129_datasets.sh b/scripts/maestro/train_on_20230129_datasets.sh new file mode 100644 index 0000000000000000000000000000000000000000..5132435f0409deacfa83fb69a9e52bb70f4bda23 --- /dev/null +++ b/scripts/maestro/train_on_20230129_datasets.sh @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH --job-name=Train20230129 +#SBATCH --partition=dbc_pmo +#SBATCH --qos=dbc +#SBATCH --array=1-1 +#SBATCH --cpus-per-task=4 + +host=$(hostname) +case ${host:0:8} in + maestro-) +# the shebangs at the top of the present file should include: +#SBATCH --partition=dbc_pmo +#SBATCH --qos=dbc +echo "running on maestro" +if ! 
[ "$(pwd)" = "$MYSCRATCH/MaggotUBA-adapter" ]; then +echo "Please run from MaggotUBA-adapter directory:" +echo " sbatch scripts/maestro/train_on_20230129_datasets.sh" +exit 1 +fi +export DATA_REPOSITORY=/pasteur/zeus/projets/p02/hecatonchire/screens +;; + *) +echo "unknown host: $host" +exit 1 +esac + +echo "running JULIA_PROJECT=$(realpath ../TaggingBackends) poetry run python scripts/maestro/train_on_20230129_datasets.py $SLURM_ARRAY_TASK_ID" +JULIA_PROJECT=$(realpath ../TaggingBackends) poetry run python scripts/maestro/train_on_20230129_datasets.py $SLURM_ARRAY_TASK_ID