Commit 93340eab authored by François LAURENT

implements most of larvatagger.jl#94

parent c0ce7942
Pipeline #97533 failed
@@ -17,6 +17,7 @@ structure of *trx.mat* files, an alternative implementation is provided by module
 """
 using PlanarLarvae, PlanarLarvae.Formats, PlanarLarvae.Features, PlanarLarvae.MWT
 using PlanarLarvae.Datasets: coerce
+using Random
 using HDF5
 using Dates
@@ -264,14 +265,55 @@ function thresholdedcounts(counts; majorityweight=20)
 end
 
 function write_larva_dataset_hdf5(path, counts, files, refs, nsteps_before, nsteps_after;
-        fixmwt=false, frameinterval=nothing,
+        fixmwt=false, frameinterval=nothing, includeall=nothing,
         )
     fixmwt && @warn "`fixmwt=true` is no longer supported"
     # this method mutates argument `refs`
-    refs′ = Tuple{Int, Int, Int, eltype(keys(counts))}[]
-    for (label, count) in pairs(counts)
-        for (i, j, k) in shuffle(refs[label])[1:count]
-            push!(refs′, (i, j, k, label))
-        end
-    end
+    T = eltype(keys(counts))
+    refs′ = Tuple{Int, Int, Int, T}[]
+    if !isnothing(includeall)
+        includeall = coerce(T, includeall)
+        if haskey(counts, includeall)
+            count = counts[includeall]
+            T′ = Vector{Tuple{Int, Int, Int, T}}
+            specialrefs = Dict{T, T′}()
+            for (i, j, k) in refs[includeall]
+                for l in keys(refs)
+                    if l != includeall
+                        m = findfirst(==((i, j, k)), refs[l])
+                        if !isnothing(m)
+                            push!(get!(specialrefs, l, T′()), (i, j, k, l))
+                            deleteat!(refs[l], m)
+                        end
+                    end
+                end
+            end
+            if !isempty(specialrefs)
+                @info "Explicit inclusions based on label \"$(includeall)\":" [Symbol(label) => length(refs″) for (label, refs″) in pairs(specialrefs)]...
+                for (label, count) in pairs(counts)
+                    label == includeall && continue
+                    if label in keys(specialrefs)
+                        refs″ = specialrefs[label]
+                        if count < length(refs″)
+                            refs″ = shuffle(refs″)[1:count]
+                        end
+                        refs′ = vcat(refs′, refs″)
+                        count = count - length(refs″)
+                    end
+                    if 0 < count
+                        for (i, j, k) in shuffle(refs[label])[1:count]
+                            push!(refs′, (i, j, k, label))
+                        end
+                    end
+                end
+            end
+        end
+    end
+    if isempty(refs′)
+        for (label, count) in pairs(counts)
+            for (i, j, k) in shuffle(refs[label])[1:count]
+                push!(refs′, (i, j, k, label))
+            end
+        end
+    end
     empty!(refs) # free memory
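
The added block gives segments tagged with the secondary label priority over random sampling: each pointer listed under `includeall` is removed from the pool of its primary label and re-inserted unconditionally, and only the remaining quota is drawn with `shuffle`. A minimal Python sketch of this accounting, with made-up labels, pointers and counts (illustrative only, not part of the commit):

    import random

    refs = {
        "run":    [(1, 1, 1), (1, 1, 2), (1, 1, 3)],
        "cast":   [(1, 2, 1), (1, 1, 2)],
        "edited": [(1, 1, 2)],            # secondary label
    }
    counts = {"run": 2, "cast": 1}        # target sample sizes per primary label

    special = {}                          # primary label -> force-included pointers
    for ptr in refs["edited"]:
        for label in ("run", "cast"):
            if ptr in refs[label]:
                special.setdefault(label, []).append(ptr)
                refs[label].remove(ptr)

    sample = []
    for label, count in counts.items():
        forced = special.get(label, [])[:count]
        sample.extend((ptr, label) for ptr in forced)
        # draw the remaining quota at random from the depleted pool
        sample.extend((ptr, label) for ptr in random.sample(refs[label], count - len(forced)))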
@@ -474,6 +516,12 @@ Note that, if `input_data` lists all files, `labelledfiles` is not called and argument
 Similarly, if `labelpointers` is defined, `labelcounts` is not called and argument
 `timestep_filter` is not used.
 
+*New in version 0.10*: `includeall` specifies a secondary label for systematic inclusion in
+the dataset. Time segments bearing this secondary label are accounted for under the
+associated primary label, prior to applying the balancing rule. If the label specified by
+`includeall` is also found in `labels`, it is treated as primary and `includeall` is ignored.
+Generally speaking, `labels` should not include any secondary label.
+
 Known issue: ASCII-compatible string attributes are ASCII-encoded and consequently
 deserialized as `bytes` by the *h5py* Python library.
 """
@@ -489,7 +537,8 @@ function write_larva_dataset_hdf5(output_dir::String,
         shallow=false,
         balance=true,
         fixmwt=false,
-        frameinterval=nothing)
+        frameinterval=nothing,
+        includeall="edited")
     files = if input_data isa String
         repository = input_data
         labelledfiles(repository, chunks; selection_rule=file_filter, shallow=shallow)
@@ -515,14 +564,23 @@
         refs = labelpointers
         counts = Dict{String, Int}(label=>length(pointers) for (label, pointers) in pairs(labelpointers))
     end
-    if !isnothing(labels)
-        labels′ = p -> string(p[1]) in labels
-        missing_labels = [label for label in labels if label ∉ keys(counts)]
-        if !isempty(missing_labels)
-            @warn "No occurences found for labels: \"$(join(missing_labels, "\", \""))\""
-        end
+    if isnothing(labels)
+        labels = collect(keys(counts))
+        if !isnothing(includeall) && includeall ∈ labels
+            labels = [label for label in labels if label != includeall]
+        end
+    else
+        if !isnothing(includeall) && includeall ∈ labels
+            includeall = nothing
+        end
+        labels′ = if isnothing(includeall)
+            p -> string(p[1]) in labels
+        else
+            p -> string(p[1]) in labels || string(p[1]) == includeall
+        end
         filter!(labels′, counts)
         filter!(labels′, refs)
+        isempty(counts) && throw("None of specified labels were found")
     end
     if balance
         sample_sizes, total_sample_size = balancedcounts(counts, sample_size)
@@ -530,12 +588,13 @@
         isnothing(sample_size) || @error "Argument sample_size not supported for the specified balancing strategy"
         sample_sizes, total_sample_size = thresholdedcounts(counts)
     end
-    @info "Sample sizes (observed, selected):" [Symbol(label) => (count, get(sample_sizes, label, 0)) for (label, count) in pairs(counts)]...
+    @info "Sample sizes (observed, selected):" [Symbol(label) => (get(counts, label, 0), get(sample_sizes, label, 0)) for label in labels]...
     date = Dates.format(Dates.now(), "yyyy_mm_dd")
     output_file = joinpath(output_dir, "larva_dataset_$(date)_$(window_length)_$(window_length)_$(total_sample_size).hdf5")
     write_larva_dataset_hdf5(output_file,
         sample_sizes, files, refs, nsteps_before, nsteps_after;
-        fixmwt=fixmwt, frameinterval=frameinterval)
+        fixmwt=fixmwt, frameinterval=frameinterval,
+        includeall=includeall)
     h5open(output_file, "cw") do h5
         attributes(h5["samples"])["len_traj"] = window_length
@@ -474,7 +474,7 @@ run `poetry add {pkg}` from directory: \n
     def generate_dataset(self, input_files,
             labels=None, window_length=20, sample_size=None, balance=True,
-            frame_interval=None):
+            include_all=None, frame_interval=None):
         """
         Generate a *larva_dataset hdf5* file in data/interim/{instance}/
         """
@@ -485,6 +485,7 @@
             labels=labels,
             sample_size=sample_size,
             balance=balance,
+            includeall=include_all,
             frameinterval=frame_interval)
 
     def compile_trxmat_database(self, input_dir,
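
On the Python side, the new `include_all` argument is simply forwarded to the Julia backend as `includeall`. A hypothetical call, assuming an object `backend` that exposes the `generate_dataset` method shown above (argument values are examples):

    backend.generate_dataset(input_files,
                             labels=["run", "cast"],   # primary labels
                             window_length=20,
                             sample_size=10000,
                             balance=True,
                             include_all="edited",     # secondary label, always retained
                             frame_interval=0.1)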
@@ -10,6 +10,7 @@ Usage: tagging-backend [train|predict] --model-instance <name>
     tagging-backend train ... --sample-size <N> --balancing-strategy <strategy>
     tagging-backend train ... --frame-interval <I> --window-length <T>
     tagging-backend train ... --pretrained-model-instance <name>
+    tagging-backend train ... --include-all <secondary-label>
     tagging-backend train ... --skip-make-dataset --skip-build-features
     tagging-backend predict ... --make-dataset --build-features --sandbox <token>
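
For example, to train while always retaining the time segments tagged with the secondary label "edited" (instance name made up):

    tagging-backend train --model-instance mytagger --include-all edited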
@@ -37,7 +38,7 @@ the `make_dataset` module is loaded and this may take quite some time due to
 dependencies (e.g. Julia FFI). The `--skip-make-dataset` option makes `train`
 truly skip this step; the corresponding module is not loaded.
 
-Since version 0.8, `predict` makes `--skip-make-dataset` and
+Since version 0.10, `predict` makes `--skip-make-dataset` and
 `--skip-build-features` the default behavior. As a counterpart, it admits
 arguments `--make-dataset` and `--build-features`.
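
For example, a hypothetical `predict` run that opts back into both steps:

    tagging-backend predict --model-instance mytagger --make-dataset --build-features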
@@ -68,6 +69,7 @@ def main(fun=None):
     pretrained_model_instance = None
     sandbox = False
     balancing_strategy = 'auto'
+    include_all = None
     unknown_args = {}
     k = 2
     while k < len(sys.argv):
@@ -113,6 +115,9 @@
         elif sys.argv[k] == "--balancing-strategy":
             k = k + 1
             balancing_strategy = sys.argv[k]
+        elif sys.argv[k] == "--include-all":
+            k = k + 1
+            include_all = sys.argv[k]
         else:
             unknown_args[sys.argv[k].lstrip('-').replace('-', '_')] = sys.argv[k+1]
         k = k + 1
@@ -146,6 +151,8 @@
             make_dataset_kwargs["reuse_h5files"] = True
         elif reuse_h5files:
             logging.info("option --reuse-h5files is ignored in the absence of --trxmat-only")
+        if include_all:
+            make_dataset_kwargs["include_all"] = include_all
         backend._run_script(backend.make_dataset, **make_dataset_kwargs)
         if build_features:
             backend._run_script(backend.build_features)