Source code for cell_abm_pipeline.flows.parse_physicell_simulations
"""
Workflow for parsing PhysiCell simulations into tidy data.
Working location structure:
.. code-block:: bash
(name)
├── data
│ └── (name)_(key)_(seed).tar.xz
└── results
└── (name)_(key)_(seed).csv
Data from **data** are parsed into **results**.
"""
from dataclasses import dataclass, field
from container_collection.manifest import filter_manifest_files
from io_collection.keys import make_key
from io_collection.load import load_dataframe, load_tar
from io_collection.save import save_dataframe
from prefect import flow
from cell_abm_pipeline.tasks.physicell import parse_mcds_file
[docs]@dataclass
class ParametersConfig:
"""Parameter configuration for parse physicell simulations flow."""
include_filters: list[str] = field(default_factory=lambda: ["*"])
"""List of Unix filename patterns for files to include in parsing."""
exclude_filters: list[str] = field(default_factory=lambda: [])
"""List of Unix filename patterns for files to exclude from parsing."""
[docs]@dataclass
class ContextConfig:
"""Context configuration for parse physicell simulations flow."""
working_location: str
"""Location for input and output files (local path or S3 bucket)."""
manifest_location: str
"""Location of manifest file (local path or S3 bucket)."""
[docs]@dataclass
class SeriesConfig:
"""Series configuration for parse physicell simulations flow."""
name: str
"""Name of the simulation series."""
manifest_key: str
"""Key for manifest file."""
extensions: list[str]
"""List of file extensions in complete run."""
[docs]@flow(name="parse-physicell-simulations")
def run_flow(context: ContextConfig, series: SeriesConfig, parameters: ParametersConfig) -> None:
"""Main parse physicell simulations flow."""
manifest = load_dataframe(context.manifest_location, series.manifest_key)
filtered_files = filter_manifest_files(
manifest, series.extensions, parameters.include_filters, parameters.exclude_filters
)
for key, files in filtered_files.items():
tar_file = load_tar(**files["tar.xz"])
results = parse_mcds_file(tar_file)
results_key = make_key(series.name, "{{timestamp}}", "results", f"{key}.csv")
save_dataframe(context.working_location, results_key, results, index=False)