# Source code for cell_abm_pipeline.flows.process_sample

"""
Workflow for processing image samples.

Working location structure:

.. code-block:: bash

    (name)
    ├── plots
    │   └── plots.SAMPLE
    │       └── (name)_(key).SAMPLE.png
    └── samples
        ├── samples.PROCESSED
        │   └── (name)_(key).PROCESSED.csv
        └── samples.RAW
            └── (name)_(key).RAW.csv

Samples to be processed are loaded from **samples.RAW**. Resulting processed
sample(s) are placed into **samples.PROCESSED** and corresponding contact
sheet(s) are placed into **plots.SAMPLE**. Note that these contact sheet(s) will
overwrite existing contact sheets generated by the sample images task.
"""

from dataclasses import dataclass
from typing import Optional

from abm_initialization_collection.image import plot_contact_sheet
from abm_initialization_collection.sample import (
    exclude_selected_ids,
    include_selected_ids,
    remove_edge_regions,
    remove_unconnected_regions,
)
from io_collection.keys import make_key
from io_collection.load import load_dataframe
from io_collection.save import save_dataframe, save_figure
from prefect import flow

# Distance used by default when removing unconnected regions.
UNCONNECTED_THRESHOLD: float = 2.0

# Edge positions per axis required, by default, to assign an edge region.
EDGE_THRESHOLD: int = 1

# Distance from the axis limits used by default to assign edge positions.
EDGE_PADDING: float = 1.0


@dataclass
class ParametersConfig:
    """
    Parameter configuration for process sample flow.

    Only ``key`` is required; every other field has a default. The boolean
    fields toggle individual processing steps, and the id lists are applied
    only when they are not None.
    """

    key: str
    """Sample key to process."""

    remove_unconnected: bool = True
    """True to remove unconnected regions, False otherwise."""

    unconnected_threshold: float = UNCONNECTED_THRESHOLD
    """Distance for removing unconnected regions."""

    unconnected_filter: str = "connectivity"
    """Filter type for assigning unconnected coordinates."""

    remove_edges: bool = True
    """True to remove cells touching the edge of the bounds, False otherwise."""

    edge_threshold: int = EDGE_THRESHOLD
    """Number of edge positions per axis needed to assign edge region."""

    edge_padding: float = EDGE_PADDING
    """Distance from axis limits to assign edge positions."""

    include_ids: Optional[list[int]] = None
    """List of ids to include."""

    exclude_ids: Optional[list[int]] = None
    """List of ids to exclude."""

    contact_sheet: bool = True
    """True to save contact sheet of processed samples, False otherwise."""


@dataclass
class ContextConfig:
    """Context configuration for process sample flow."""

    working_location: str
    """Location for input and output files (local path or S3 bucket)."""


@dataclass
class SeriesConfig:
    """Series configuration for process sample flow."""

    name: str
    """Name of the simulation series."""


@flow(name="process-sample")
def run_flow(context: ContextConfig, series: SeriesConfig, parameters: ParametersConfig) -> None:
    """
    Main process sample flow.

    Loads the raw samples for the given series name and sample key from
    **samples.RAW**, applies each enabled processing step in order (remove
    unconnected regions, remove edge regions, include selected ids, exclude
    selected ids), and saves the result to **samples.PROCESSED**. If
    requested, a contact sheet is also saved to **plots.SAMPLE**.

    Parameters
    ----------
    context
        Context configuration providing the working location.
    series
        Series configuration providing the series name.
    parameters
        Parameter configuration selecting the sample key and processing steps.
    """

    # Shared file name stem: (name)_(key).
    item_key = f"{series.name}_{parameters.key}"
    sample_key = make_key(series.name, "samples", "samples.RAW", f"{item_key}.RAW.csv")

    raw_samples = load_dataframe(context.working_location, sample_key)

    # Work on a copy so the raw samples remain available for the contact sheet.
    processed_samples = raw_samples.copy()

    if parameters.remove_unconnected:
        processed_samples = remove_unconnected_regions(
            processed_samples, parameters.unconnected_threshold, parameters.unconnected_filter
        )

    if parameters.remove_edges:
        processed_samples = remove_edge_regions(
            processed_samples, parameters.edge_threshold, parameters.edge_padding
        )

    # Id filters are skipped only when None; an empty list is still applied.
    if parameters.include_ids is not None:
        processed_samples = include_selected_ids(processed_samples, parameters.include_ids)

    if parameters.exclude_ids is not None:
        processed_samples = exclude_selected_ids(processed_samples, parameters.exclude_ids)

    processed_key = make_key(
        series.name, "samples", "samples.PROCESSED", f"{item_key}.PROCESSED.csv"
    )
    save_dataframe(context.working_location, processed_key, processed_samples, index=False)

    if parameters.contact_sheet:
        # The contact sheet is built from both the processed and the raw samples.
        contact_sheet = plot_contact_sheet(processed_samples, raw_samples)
        plot_key = make_key(series.name, "plots", "plots.SAMPLE", f"{item_key}.SAMPLE.png")
        save_figure(context.working_location, plot_key, contact_sheet)