Commit 60a308f2 authored by François LAURENT

building larva_dataset files

parent cab06b59
#!/bin/bash
# prerequisite in parent dir:
# git clone https://gitlab.pasteur.fr/nyx/TaggingBackends --branch dev
# cd TaggingBackends; julia --project=. -e 'using Pkg; Pkg.instantiate()'
backbone=../TaggingBackends
if ! [ -d "$backbone" ]; then
    echo "local repository not found: $backbone"
    exit 1
fi
if [ -z "$1" ]; then
    model=t5_t15_full
else
    model="$1"
fi
# choose one of the two label conventions below
convention="back,cast,hunch,roll,run,stop"
#convention="back_large,cast_large,hunch_large,roll_large,run_large,small_motion,stop_large"
echo "convention: $convention"
samplesize=100000
echo "sample size: $samplesize"
# the target total length is 100 time points, just under 3*34 = 102
winlen=34
echo "window length: $winlen (*3)"
# prerequisite in data/raw: have (parts of) t5 and/or t15 mounted or copied (symbolic links do not work)
mkdir -p "data/raw/$model"
prevloc=$(pwd)
cd "data/raw/$model"
[ -d t5 ] || echo "cannot find t5 in directory data/raw/$model"
[ -d t15 ] || echo "cannot find t15 in directory data/raw/$model"
cd "$prevloc"
# note: on first run, remove option --reuse-h5files
JULIA_PROJECT="$backbone" poetry run tagging-backend train --model-instance "$model" --labels "$convention" --trxmat-only --reuse-h5files --sample-size $samplesize --window-length $winlen
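# a first-run sketch of the same invocation (no new options; --reuse-h5files is simply
# dropped so that the intermediate h5 files are rebuilt rather than reused):
#   JULIA_PROJECT="$backbone" poetry run tagging-backend train --model-instance "$model" --labels "$convention" --trxmat-only --sample-size $samplesize --window-length $winlen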
# note: for the purpose of generating a larva_dataset file,
# the Julia part of the TaggingBackends package is enough:
#
# $ julia --project=. -e 'using TaggingBackends.Trxmat2HDF5; convert_trxmat_to_spineh5(; files="trxmat.list")'
# with the trxmat.list file generated on sftpcampus, from within the screens directory, with:
# $ find t{5,15} -mindepth 4 -name trx.mat > ~/trxmat.list
# and retrieved locally.
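# for instance (assuming scp access to sftpcampus; the exact transfer command is not
# prescribed here):
# $ scp sftpcampus:~/trxmat.list .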
#
# $ julia --project=. -e 'using TaggingBackends.LarvaDatasets, TaggingBackends.Trxmat2HDF5; write_larva_dataset_hdf5("data/processed", larvah5files("data/interim"); labels=["back","cast","hunch","roll","run","stop"], sample_size=100000)'
# TODO: relocate or remove the _larva.h5 files currently found in data/interim/$model,
# prior to calling `predict`
# TODO: suffix the name of the generated larva_dataset file with info about the chosen convention
echo "relocate the following file for future reuse:"
ls "data/interim/$model/larva_dataset_*.hdf5"
#!/bin/bash
# this short series of commands forces Poetry to re-resolve and reinstall its dependencies;
# useful e.g. with a local TaggingBackends checkout (pyproject.toml must be updated first).
rm -rf "$(poetry env info -p)"; rm -f poetry.lock; poetry install -vvv
 import glob
 import pathlib
-def make_dataset(backend, labels_expected=False, trxmat_only=False, labels=None, **kwargs):
+def make_dataset(backend, labels_expected=False, trxmat_only=False, **kwargs):
     if labels_expected:
         larva_dataset_file = glob.glob(str(backend.raw_data_dir() / "larva_dataset_*.hdf5"))
         if larva_dataset_file:
@@ -12,10 +12,6 @@ def make_dataset(backend, labels_expected=False, trxmat_only=False, labels=None, **kwargs):
             print(f"moving file to interim: {larva_dataset_file}")
             backend.move_to_interim(larva_dataset_file, copy=False)
         else:
-            if labels:
-                if isinstance(labels, str):
-                    labels = labels.split(',')
-                kwargs["labels"] = labels
             print("generating a larva_dataset file...")
             # generate a larva_dataset_*.hdf5 file in data/interim/{instance}/
             if trxmat_only:
...
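The four removed lines move the comma-splitting of the labels string out of make_dataset; labels can still be handed over through **kwargs. A minimal caller-side sketch, where the backend object and the call site are hypothetical and the splitting responsibility is an assumption:

# hypothetical call site; the caller now splits the convention string itself
labels = "back,cast,hunch,roll,run,stop"
if isinstance(labels, str):
    labels = labels.split(',')  # the parsing make_dataset no longer performs
make_dataset(backend, labels_expected=False, trxmat_only=True, labels=labels)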