diff --git a/src/maggotuba/models/predict_model.py b/src/maggotuba/models/predict_model.py
index cabe96e2cdafba50fc119964339832f5d57a9353..b7919e9c41f08aa22b1731b1de7535db14a34167 100644
--- a/src/maggotuba/models/predict_model.py
+++ b/src/maggotuba/models/predict_model.py
@@ -2,34 +2,29 @@ from taggingbackends.data.labels import Labels
 from taggingbackends.features.skeleton import get_5point_spines
 from maggotuba.models.trainers import MaggotTrainer, MultiscaleMaggotTrainer, new_generator
 import numpy as np
+import logging
 
 def predict_model(backend, **kwargs):
     """
     This function generates predicted labels for all the input data.
 
-    It currently supports single files at a time only, either *larva_dataset*
-    hdf5 files or track data files of any time.
+    It supports single *larva_dataset* hdf5 files, or (possibly multiple) track
+    data files.
 
     Input files are expected in `data/interim` or `data/raw`.
 
-    The predicted labels are saved in `data/processed`, as file
-    `predicted.label`.
+    The predicted labels are saved in `data/processed`, in `predicted.label`
+    files, following the same directory structure as in `data/interim` or
+    `data/raw`.
     """
     # we pick files in `data/interim` if any, otherwise in `data/raw`
-    input_files = backend.list_interim_files()
+    input_files = backend.list_interim_files(group_by_directories=True)
     if not input_files:
-        input_files = backend.list_input_files()
+        input_files = backend.list_input_files(group_by_directories=True)
     assert 0 < len(input_files)
     # initialize output labels
-    input_files, labels = backend.prepare_labels(input_files)
-    assert 0 < len(input_files)
-    # 20221005 branch only:
-    if backend.model_instance == '20221005':
-        labels.metadata['software']['tagger']['changes'] = [
-                'MaggotUBA-adapter#2',
-                'LarvaTagger.jl#60',
-                'LarvaTagger.jl#65',
-                ]
+    input_files_and_labels = backend.prepare_labels(input_files)
+    assert 0 < len(input_files_and_labels)
     # load the model
     model_files = backend.list_model_files()
     config_file = [file for file in model_files if file.name.endswith("config.json")]
@@ -47,95 +42,107 @@ def predict_model(backend, **kwargs):
     else:
         model = MaggotTrainer(config_file)
     #
-    labels.labelspec = model.clf_config["behavior_labels"]
-    #
-    if len(input_files) == 1:
-        file = input_files[0]
-        if file.name.startswith("larva_dataset_") and file.name.endswith(".hdf5"):
-            ret = predict_larva_dataset(backend, model, file, labels, **kwargs)
-            return labels if ret is None else ret
+    input_files = next(iter(input_files.values()))
+    if len(input_files) == 1:
+        file = input_files[0]
+        if file.name.startswith("larva_dataset_") and file.name.endswith(".hdf5"):
+            return predict_larva_dataset(backend, model, file, **kwargs)
     #
-    ret = predict_individual_data_files(backend, model, input_files, labels)
-    return labels if ret is None else ret
+    predict_individual_data_files(backend, model, input_files_and_labels)
 
-def predict_individual_data_files(backend, model, input_files, labels):
+def predict_individual_data_files(backend, model, input_files_and_labels):
     from taggingbackends.data.trxmat import TrxMat
     from taggingbackends.data.chore import load_spine
     import taggingbackends.data.fimtrack as fimtrack
     #
-    _break = False # for now, a single file can be labelled at a time
-    for file in input_files:
-        # load the input data (or features)
-        if _break:
-            print(f"ignoring file: {file.name}")
-            continue
-        elif file.name.endswith(".spine"):
-            spine = load_spine(file)
-            run = spine["date_time"].iloc[0]
-            larvae = spine["larva_id"].values
-            t = spine["time"].values
-            data = spine.iloc[:,3:].values
-        elif file.name.endswith(".mat"):
-            trx = TrxMat(file)
-            t = trx["t"]
-            data = trx["spine"]
-            run, data = next(iter(data.items()))
-            if run == "spine":
-                run, data = next(iter(data.items()))
-            t = t[run]
-        elif file.name.endswith(".csv"):
-            if labels.camera_framerate:
-                print(f"camera frame rate: {labels.camera_framerate}fps")
-            else:
-                print("assuming 30-fps camera frame rate")
-                labels.camera_framerate = 30
-            t, data = fimtrack.read_spines(file, fps=labels.camera_framerate)
-            run = "NA"
-        else:
-            print(f"ignoring file: {file.name}")
-            continue
-        # downsample the skeleton
-        if isinstance(data, dict):
-            for larva in data:
-                data[larva] = get_5point_spines(data[larva])
-        else:
-            data = get_5point_spines(data)
+    for input_files, labels in input_files_and_labels.values():
+        labels.labelspec = model.clf_config["behavior_labels"]
+        # 20221005 branch only:
+        if backend.model_instance == '20221005':
+            labels.metadata['software']['tagger']['changes'] = [
+                    'MaggotUBA-adapter#2',
+                    'LarvaTagger.jl#60',
+                    'LarvaTagger.jl#65',
+                    ]
         #
-        post_filters = model.clf_config.get('post_filters', None)
-        # assign labels
-        if isinstance(data, dict):
-            ref_length = np.median(np.concatenate([
-                model.body_length(spines) for spines in data.values()
-                ]))
-            model.average_body_length = ref_length
-            print(f"average body length: {ref_length}")
-            for larva, spines in data.items():
-                predictions = model.predict((t[larva], spines))
-                if predictions is None:
-                    print(f"failure to window track: {larva}")
-                else:
-                    predictions = apply_filters(predictions, post_filters)
-                    labels[run, larva] = dict(_zip(t[larva], predictions))
-        else:
-            ref_length = np.median(model.body_length(data))
-            model.average_body_length = ref_length
-            print(f"average body length: {ref_length}")
-            for larva in np.unique(larvae):
-                mask = larvae == larva
-                predictions = model.predict((t[mask], data[mask]))
-                if predictions is None:
-                    print(f"failure to window track: {larva}")
-                else:
-                    predictions = apply_filters(predictions, post_filters)
-                    labels[run, larva] = dict(_zip(t[mask], predictions))
-        # save the predicted labels to file
-        labels.dump(backend.processed_data_dir() / "predicted.label")
-        #
-        _break = True
+        done = False
+        for file in input_files:
+            # load the input data (or features)
+            if done:
+                logging.info(f"ignoring file: {file.name}")
+                continue
+            elif file.name.endswith(".outline"):
+                # skip to spine file
+                logging.info(f"ignoring file: {file.name}")
+                continue
+            elif file.name.endswith(".spine"):
+                spine = load_spine(file)
+                run = spine["date_time"].iloc[0]
+                larvae = spine["larva_id"].values
+                t = spine["time"].values
+                data = spine.iloc[:,3:].values
+            elif file.name.endswith(".mat"):
+                trx = TrxMat(file)
+                t = trx["t"]
+                data = trx["spine"]
+                run, data = next(iter(data.items()))
+                if run == "spine":
+                    run, data = next(iter(data.items()))
+                t = t[run]
+            elif file.name.endswith(".csv"):
+                if labels.camera_framerate:
+                    logging.info(f"camera frame rate: {labels.camera_framerate}fps")
+                else:
+                    logging.info("assuming 30-fps camera frame rate")
+                    labels.camera_framerate = 30
+                t, data = fimtrack.read_spines(file, fps=labels.camera_framerate)
+                run = "NA"
+            else:
+                # label files not processed; only their data dependencies are
+                logging.info(f"ignoring file: {file.name}")
+                continue
+            # downsample the skeleton
+            if isinstance(data, dict):
+                for larva in data:
+                    data[larva] = get_5point_spines(data[larva])
+            else:
+                data = get_5point_spines(data)
+            #
+            post_filters = model.clf_config.get('post_filters', None)
+            # assign labels
+            if isinstance(data, dict):
+                ref_length = np.median(np.concatenate([
+                    model.body_length(spines) for spines in data.values()
+                    ]))
+                model.average_body_length = ref_length
+                logging.info(f"average body length: {ref_length}")
+                for larva, spines in data.items():
+                    predictions = model.predict((t[larva], spines))
+                    if predictions is None:
+                        logging.info(f"failure to window track: {larva}")
+                    else:
+                        predictions = apply_filters(predictions, post_filters)
+                        labels[run, larva] = dict(_zip(t[larva], predictions))
+            else:
+                ref_length = np.median(model.body_length(data))
+                model.average_body_length = ref_length
+                logging.info(f"average body length: {ref_length}")
+                for larva in np.unique(larvae):
+                    mask = larvae == larva
+                    predictions = model.predict((t[mask], data[mask]))
+                    if predictions is None:
+                        logging.info(f"failure to window track: {larva}")
+                    else:
+                        predictions = apply_filters(predictions, post_filters)
+                        labels[run, larva] = dict(_zip(t[mask], predictions))
+            # save the predicted labels to file
+            labels.dump(get_output_filepath(backend, file))
+            #
+            done = True
 
-def predict_larva_dataset(backend, model, file, labels, subset="validation"):
+def predict_larva_dataset(backend, model, file, subset="validation"):
     from taggingbackends.data.dataset import LarvaDataset
-    #
     dataset = LarvaDataset(file, new_generator())
     return model.predict(dataset, subset)
 
@@ -144,6 +151,26 @@ def _zip(xs, ys):
     assert len(xs) == len(ys)
     return zip(xs, ys)
 
+def get_output_filepath(backend, file):
+    #if file.is_relative_to(backend.interim_data_dir()): # Py>=3.9
+    if str(file).startswith(str(backend.interim_data_dir())):
+        subdir = file.parent.relative_to(backend.interim_data_dir())
+    else:
+        #assert file.is_relative_to(backend.raw_data_dir())
+        assert str(file).startswith(str(backend.raw_data_dir()))
+        subdir = file.parent.relative_to(backend.raw_data_dir())
+    parentdir = backend.processed_data_dir() / subdir
+    parentdir.mkdir(parents=True, exist_ok=True)
+    target = parentdir / "predicted.label"
+    if target.is_file():
+        logging.info(f"output file already exists: {target}")
+        i = 0
+        while True:
+            i += 1
+            target = parentdir / f"predicted-{i}.label"
+            if not target.is_file(): break
+    return target
+
 def apply_filters(labels, post_filters):
     if post_filters:
         for post_filter in post_filters:
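For illustration (outside the patch): the new loop `for input_files, labels in input_files_and_labels.values()` implies that `backend.prepare_labels` now returns one entry per input directory, pairing that directory's data files with their shared `Labels` object. A minimal runnable sketch of the assumed shape; the directory and file names are hypothetical, and the exact return type of `prepare_labels` is inferred, not confirmed by the patch:

    from pathlib import Path

    # Stand-ins for the Labels objects that backend.prepare_labels() would build.
    labels_run1, labels_run2 = object(), object()

    # Assumed shape: {input_directory: (data files in that directory, Labels)}
    input_files_and_labels = {
        Path("data/interim/run1"): ([Path("data/interim/run1/20230101_120000.spine")], labels_run1),
        Path("data/interim/run2"): ([Path("data/interim/run2/trx.mat")], labels_run2),
    }

    for input_files, labels in input_files_and_labels.values():
        for file in input_files:
            # each directory yields its own predicted.label, mirrored under data/processed
            print(file.parent, "->", file.name)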
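Similarly, the fallback naming in `get_output_filepath` avoids overwriting an existing `predicted.label`. A standalone sketch of the same numbering scheme, with a hypothetical helper name and plain `pathlib` in place of the backend directory accessors:

    import tempfile
    from pathlib import Path

    def next_label_path(parentdir: Path) -> Path:
        # try predicted.label first, then predicted-1.label, predicted-2.label, ...
        target = parentdir / "predicted.label"
        i = 0
        while target.is_file():
            i += 1
            target = parentdir / f"predicted-{i}.label"
        return target

    with tempfile.TemporaryDirectory() as tmp:
        d = Path(tmp)
        print(next_label_path(d).name)  # predicted.label
        (d / "predicted.label").touch()
        print(next_label_path(d).name)  # predicted-1.label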