Commit 93340eab authored by François LAURENT

implements most of larvatagger.jl#94

parent c0ce7942
Pipeline #97533 failed
@@ -17,6 +17,7 @@ structure of *trx.mat* files, an alternative implementation is provided by module
 """
 using PlanarLarvae, PlanarLarvae.Formats, PlanarLarvae.Features, PlanarLarvae.MWT
 using PlanarLarvae.Datasets: coerce
+using Random
 using HDF5
 using Dates
@@ -264,14 +265,55 @@ function thresholdedcounts(counts; majorityweight=20)
 end
 
 function write_larva_dataset_hdf5(path, counts, files, refs, nsteps_before, nsteps_after;
-        fixmwt=false, frameinterval=nothing,
+        fixmwt=false, frameinterval=nothing, includeall=nothing,
         )
     fixmwt && @warn "`fixmwt=true` is no longer supported"
     # this method mutates argument `refs`
-    refs′ = Tuple{Int, Int, Int, eltype(keys(counts))}[]
-    for (label, count) in pairs(counts)
-        for (i, j, k) in shuffle(refs[label])[1:count]
-            push!(refs′, (i, j, k, label))
-        end
-    end
+    T = eltype(keys(counts))
+    refs′ = Tuple{Int, Int, Int, T}[]
+    if !isnothing(includeall)
+        includeall = coerce(T, includeall)
+        if haskey(counts, includeall)
+            count = counts[includeall]
+            T′ = Vector{Tuple{Int, Int, Int, T}}
+            specialrefs = Dict{T, T′}()
+            for (i, j, k) in refs[includeall]
+                for l in keys(refs)
+                    if l != includeall
+                        m = findfirst(==((i, j, k)), refs[l])
+                        if !isnothing(m)
+                            push!(get!(specialrefs, l, T′()), (i, j, k, l))
+                            deleteat!(refs[l], m)
+                        end
+                    end
+                end
+            end
+            if !isempty(specialrefs)
+                @info "Explicit inclusions based on label \"$(includeall)\":" [Symbol(label) => length(refs″) for (label, refs″) in pairs(specialrefs)]...
+                for (label, count) in pairs(counts)
+                    label == includeall && continue
+                    if label in keys(specialrefs)
+                        refs″ = specialrefs[label]
+                        if count < length(refs″)
+                            refs″ = shuffle(refs″)[1:count]
+                        end
+                        refs′ = vcat(refs′, refs″)
+                        count = count - length(refs″)
+                    end
+                    if 0 < count
+                        for (i, j, k) in shuffle(refs[label])[1:count]
+                            push!(refs′, (i, j, k, label))
+                        end
+                    end
+                end
+            end
+        end
+    end
+    if isempty(refs′)
+        for (label, count) in pairs(counts)
+            for (i, j, k) in shuffle(refs[label])[1:count]
+                push!(refs′, (i, j, k, label))
+            end
+        end
+    end
     empty!(refs) # free memory
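
The added block gives segments tagged with the secondary label priority over random sampling: each pointer listed under `includeall` is removed from the pool of its primary label and re-inserted unconditionally, and only the remaining quota is drawn with `shuffle`. A minimal Python sketch of this accounting, with made-up labels, pointers and counts (illustrative only, not part of the commit):

    import random

    refs = {
        "run":    [(1, 1, 1), (1, 1, 2), (1, 1, 3)],
        "cast":   [(1, 2, 1), (1, 1, 2)],
        "edited": [(1, 1, 2)],            # secondary label
    }
    counts = {"run": 2, "cast": 1}        # target sample sizes per primary label

    special = {}                          # primary label -> force-included pointers
    for ptr in refs["edited"]:
        for label in ("run", "cast"):
            if ptr in refs[label]:
                special.setdefault(label, []).append(ptr)
                refs[label].remove(ptr)

    sample = []
    for label, count in counts.items():
        forced = special.get(label, [])[:count]
        sample.extend((ptr, label) for ptr in forced)
        # draw the remaining quota at random from the depleted pool
        sample.extend((ptr, label) for ptr in random.sample(refs[label], count - len(forced)))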
@@ -474,6 +516,12 @@ Note that, if `input_data` lists all files, `labelledfiles` is not called and argument
 Similarly, if `labelpointers` is defined, `labelcounts` is not called and argument
 `timestep_filter` is not used.
 
+*New in version 0.10*: `includeall` specifies a secondary label for systematic inclusion in
+the dataset. Time segments bearing this secondary label are accounted for under the
+associated primary label, prior to applying the balancing rule. If the label specified by
+`includeall` is also found in `labels`, it is treated as primary and `includeall` is ignored.
+Generally speaking, `labels` should not include any secondary label.
+
 Known issue: ASCII-compatible string attributes are ASCII-encoded and consequently
 deserialized as `bytes` by the *h5py* Python library.
 """
@@ -489,7 +537,8 @@ function write_larva_dataset_hdf5(output_dir::String,
         shallow=false,
         balance=true,
         fixmwt=false,
-        frameinterval=nothing)
+        frameinterval=nothing,
+        includeall="edited")
     files = if input_data isa String
         repository = input_data
         labelledfiles(repository, chunks; selection_rule=file_filter, shallow=shallow)
@@ -515,14 +564,23 @@
         refs = labelpointers
         counts = Dict{String, Int}(label=>length(pointers) for (label, pointers) in pairs(labelpointers))
     end
-    if !isnothing(labels)
-        labels′ = p -> string(p[1]) in labels
-        missing_labels = [label for label in labels if label ∉ keys(counts)]
-        if !isempty(missing_labels)
-            @warn "No occurences found for labels: \"$(join(missing_labels, "\", \""))\""
-        end
+    if isnothing(labels)
+        labels = collect(keys(counts))
+        if !isnothing(includeall) && includeall ∈ labels
+            labels = [label for label in labels if label != includeall]
+        end
+    else
+        if !isnothing(includeall) && includeall ∈ labels
+            includeall = nothing
+        end
+        labels′ = if isnothing(includeall)
+            p -> string(p[1]) in labels
+        else
+            p -> string(p[1]) in labels || string(p[1]) == includeall
+        end
         filter!(labels′, counts)
         filter!(labels′, refs)
+        isempty(counts) && throw("None of specified labels were found")
     end
     if balance
         sample_sizes, total_sample_size = balancedcounts(counts, sample_size)
@@ -530,12 +588,13 @@
         isnothing(sample_size) || @error "Argument sample_size not supported for the specified balancing strategy"
         sample_sizes, total_sample_size = thresholdedcounts(counts)
     end
-    @info "Sample sizes (observed, selected):" [Symbol(label) => (count, get(sample_sizes, label, 0)) for (label, count) in pairs(counts)]...
+    @info "Sample sizes (observed, selected):" [Symbol(label) => (get(counts, label, 0), get(sample_sizes, label, 0)) for label in labels]...
     date = Dates.format(Dates.now(), "yyyy_mm_dd")
     output_file = joinpath(output_dir, "larva_dataset_$(date)_$(window_length)_$(window_length)_$(total_sample_size).hdf5")
     write_larva_dataset_hdf5(output_file,
         sample_sizes, files, refs, nsteps_before, nsteps_after;
-        fixmwt=fixmwt, frameinterval=frameinterval)
+        fixmwt=fixmwt, frameinterval=frameinterval,
+        includeall=includeall)
     h5open(output_file, "cw") do h5
         attributes(h5["samples"])["len_traj"] = window_length
@@ -474,7 +474,7 @@ run `poetry add {pkg}` from directory: \n
     def generate_dataset(self, input_files,
             labels=None, window_length=20, sample_size=None, balance=True,
-            frame_interval=None):
+            include_all=None, frame_interval=None):
         """
         Generate a *larva_dataset hdf5* file in data/interim/{instance}/
         """
@@ -485,6 +485,7 @@
             labels=labels,
             sample_size=sample_size,
             balance=balance,
+            includeall=include_all,
             frameinterval=frame_interval)
 
     def compile_trxmat_database(self, input_dir,
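
On the Python side, the new `include_all` argument is simply forwarded to the Julia backend as `includeall`. A hypothetical call, assuming an object `backend` that exposes the `generate_dataset` method shown above (argument values are examples):

    backend.generate_dataset(input_files,
                             labels=["run", "cast"],   # primary labels
                             window_length=20,
                             sample_size=10000,
                             balance=True,
                             include_all="edited",     # secondary label, always retained
                             frame_interval=0.1)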
@@ -10,6 +10,7 @@ Usage: tagging-backend [train|predict] --model-instance <name>
     tagging-backend train ... --sample-size <N> --balancing-strategy <strategy>
     tagging-backend train ... --frame-interval <I> --window-length <T>
     tagging-backend train ... --pretrained-model-instance <name>
+    tagging-backend train ... --include-all <secondary-label>
     tagging-backend train ... --skip-make-dataset --skip-build-features
     tagging-backend predict ... --make-dataset --build-features --sandbox <token>
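
For example, to train while always retaining the time segments tagged with the secondary label "edited" (instance name made up):

    tagging-backend train --model-instance mytagger --include-all edited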
@@ -37,7 +38,7 @@ the `make_dataset` module is loaded and this may take quite some time due to
 dependencies (e.g. Julia FFI). The `--skip-make-dataset` option makes `train`
 truly skip this step; the corresponding module is not loaded.
 
-Since version 0.8, `predict` makes `--skip-make-dataset` and
+Since version 0.10, `predict` makes `--skip-make-dataset` and
 `--skip-build-features` the default behavior. As a counterpart, it admits
 arguments `--make-dataset` and `--build-features`.
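
For example, a hypothetical `predict` run that opts back into both steps:

    tagging-backend predict --model-instance mytagger --make-dataset --build-features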
@@ -68,6 +69,7 @@ def main(fun=None):
     pretrained_model_instance = None
     sandbox = False
     balancing_strategy = 'auto'
+    include_all = None
     unknown_args = {}
     k = 2
     while k < len(sys.argv):
@@ -113,6 +115,9 @@
         elif sys.argv[k] == "--balancing-strategy":
             k = k + 1
             balancing_strategy = sys.argv[k]
+        elif sys.argv[k] == "--include-all":
+            k = k + 1
+            include_all = sys.argv[k]
         else:
             unknown_args[sys.argv[k].lstrip('-').replace('-', '_')] = sys.argv[k+1]
         k = k + 1
@@ -146,6 +151,8 @@
             make_dataset_kwargs["reuse_h5files"] = True
         elif reuse_h5files:
             logging.info("option --reuse-h5files is ignored in the absence of --trxmat-only")
+        if include_all:
+            make_dataset_kwargs["include_all"] = include_all
         backend._run_script(backend.make_dataset, **make_dataset_kwargs)
         if build_features:
             backend._run_script(backend.build_features)