diff --git a/src/taggingbackends/main.py b/src/taggingbackends/main.py index a6c6d9c19f3f25267b24b1cde85b8e83f035a8ec..31655f9231bbc94a2dc2c966c627a35eccfba72f 100644 --- a/src/taggingbackends/main.py +++ b/src/taggingbackends/main.py @@ -10,7 +10,8 @@ Usage: tagging-backend [train|predict] --model-instance <name> tagging-backend train ... --sample-size <N> --balancing-strategy <strategy> tagging-backend train ... --frame-interval <I> --window-length <T> tagging-backend train ... --pretrained-model-instance <name> - tagging-backend predict ... --skip-make-dataset --sandbox <token> + tagging-backend train ... --skip-make-dataset --skip-build-features + tagging-backend predict ... --make-dataset --build-features --sandbox <token> `tagging-backend` typically is run using `poetry run`. A name must be provided to identify the trained model and its location within @@ -36,6 +37,10 @@ the `make_dataset` module is loaded and this may take quite some time due to dependencies (e.g. Julia FFI). The `--skip-make-dataset` option makes `train` truly skip this step; the corresponding module is not loaded. +Since version 0.8, `predict` skips the `make_dataset` and `build_features` +steps by default, as if `--skip-make-dataset` and `--skip-build-features` were +passed. Use `--make-dataset` and `--build-features` to run these steps. + `--sandbox <token>` makes `tagging-backend` use a token instead of <name> as directory name in data/raw, data/interim and data/processed. 
This is intended to prevent conflicts on running `predict` in parallel on @@ -59,7 +64,7 @@ def main(fun=None): input_files, labels = [], [] sample_size = window_length = frame_interval = None trxmat_only = reuse_h5files = False - skip_make_dataset = skip_build_features = False + make_dataset = build_features = None pretrained_model_instance = None sandbox = False balancing_strategy = 'auto' @@ -92,9 +97,13 @@ def main(fun=None): elif sys.argv[k] == "--reuse-h5files": reuse_h5files = True elif sys.argv[k] == "--skip-make-dataset": - skip_make_dataset = True + make_dataset = False elif sys.argv[k] == "--skip-build-features": - skip_build_features = True + build_features = False + elif sys.argv[k] == '--make-dataset': + make_dataset = True + elif sys.argv[k] == '--build-features': + build_features = True elif sys.argv[k] == "--pretrained-model-instance": k = k + 1 pretrained_model_instance = sys.argv[k] @@ -116,7 +125,11 @@ def main(fun=None): if input_files: for file in input_files: backend.move_to_raw(file) - if not skip_make_dataset: + if make_dataset is None and train_or_predict == 'train': + make_dataset = True + if build_features is None and train_or_predict == 'train': + build_features = True + if make_dataset: make_dataset_kwargs = dict(labels_expected=train_or_predict == "train", balancing_strategy=balancing_strategy) if labels: @@ -134,7 +147,7 @@ def main(fun=None): elif reuse_h5files: logging.info("option --reuse-h5files is ignored in the absence of --trxmat-only") backend._run_script(backend.make_dataset, **make_dataset_kwargs) - if not skip_build_features: + if build_features: backend._run_script(backend.build_features) if train_or_predict == "predict": backend._run_script(backend.predict_model, trailing=unknown_args)