Coverage for src/cell_abm_pipeline/flows/group_cell_shapes.py: 0%
537 statements
« prev ^ index » next coverage.py v7.1.0, created at 2024-06-05 19:14 +0000
1"""
2Workflow for grouping cell shapes.
4Working location structure:
6.. code-block:: bash
8 (name)
9 ├── analysis
10 │ ├── analysis.CELL_SHAPES_DATA
11 │ │ └── (name)_(key).CELL_SHAPES_DATA.csv
12 │ └── analysis.CELL_SHAPES_MODELS
13 │ └── (name)_(key).CELL_SHAPES_MODELS.pkl
14 ├── data
15 │ └── data.LOCATIONS
16 │ └── (name)_(key)_(seed).LOCATIONS.tar.xz
17 └── groups
18 └── groups.CELL_SHAPES
19 ├── (name).feature_components.csv
20 ├── (name).feature_correlations.(key).(region).csv
21 ├── (name).feature_correlations.(key).(mode).(property).(region).csv
22 ├── (name).feature_distributions.(feature).json
23 ├── (name).mode_correlations.csv
24 ├── (name).population_counts.(time).csv
25 ├── (name).population_stats.json
26 ├── (name).shape_average.(key).(projection).json
27 ├── (name).shape_contours.(key).(seed).(time).(region).(projection).json
28 ├── (name).shape_errors.json
29 ├── (name).shape_modes.(key).(region).(mode).(projection).json
30 ├── (name).shape_samples.json
31 └── (name).variance_explained.csv
33Different groups use inputs from **data.LOCATIONS**,
34**analysis.CELL_SHAPES_DATA**, and **analysis.CELL_SHAPES_MODELS**. Grouped data
35are saved to **groups.CELL_SHAPES**.
37Different groups can be visualized using the corresponding plotting workflow or
38loaded into alternative tools.
39"""
41from dataclasses import dataclass, field
42from datetime import timedelta
43from typing import Optional, Union
45import numpy as np
46import pandas as pd
47from abm_shape_collection import (
48 construct_mesh_from_array,
49 construct_mesh_from_coeffs,
50 extract_mesh_projections,
51 extract_mesh_wireframe,
52 extract_shape_modes,
53 extract_voxel_contours,
54 make_voxels_array,
55)
56from arcade_collection.output import extract_tick_json, get_location_voxels
57from arcade_collection.output.convert_model_units import (
58 estimate_spatial_resolution,
59 estimate_temporal_resolution,
60)
61from io_collection.keys import make_key
62from io_collection.load import load_dataframe, load_pickle, load_tar
63from io_collection.save import save_dataframe, save_json
64from prefect import flow, get_run_logger
65from prefect.tasks import task_input_hash
66from scipy.spatial import ConvexHull, KDTree
67from scipy.stats import pearsonr
68from sklearn.decomposition import PCA
70from cell_abm_pipeline.flows.analyze_cell_shapes import PCA_COMPONENTS
71from cell_abm_pipeline.flows.calculate_coefficients import COEFFICIENT_ORDER
72from cell_abm_pipeline.tasks import bin_to_hex, calculate_data_bins, check_data_bounds
# Shared Prefect task options: do not keep task results in memory, and cache
# task outputs by input hash for 12 hours so identical loads are not re-run.
OPTIONS = {
    "cache_result_in_memory": False,
    "cache_key_fn": task_input_hash,
    "cache_expiration": timedelta(hours=12),
}

# Names of all grouping subflows that run_flow can dispatch to.
GROUPS: list[str] = [
    "feature_components",
    "feature_correlations",
    "feature_distributions",
    "mode_correlations",
    "population_counts",
    "population_stats",
    "shape_average",
    "shape_contours",
    "shape_errors",
    "shape_modes",
    "shape_samples",
    "variance_explained",
]

# Default shape features used when fitting the feature-components PCA.
COMPONENT_FEATURES: list[str] = [
    "volume",
    "height",
    "area",
    "axis_major_length",
    "axis_minor_length",
    "eccentricity",
    "orientation",
    "perimeter",
    "extent",
    "solidity",
]

# Default shape properties correlated against PCA shape modes.
CORRELATION_PROPERTIES: list[str] = [
    "volume",
    "height",
    "area",
    "axis_major_length",
    "axis_minor_length",
    "eccentricity",
    "perimeter",
]

# Default shape properties used for feature distributions.
DISTRIBUTION_PROPERTIES: list[str] = [
    "volume",
    "height",
]

# Default projection names for shape averages and shape modes.
PROJECTIONS: list[str] = [
    "top",
    "side1",
    "side2",
]

# Axis limits ([min, max]) used to scale hex-binned feature correlations,
# keyed by "property.region" or shape mode (PC) name.
LIMITS: dict[str, list] = {
    "volume.DEFAULT": [500, 4000],
    "volume.NUCLEUS": [0, 1500],
    "height.DEFAULT": [0, 20],
    "height.NUCLEUS": [0, 20],
    "area.DEFAULT": [0, 1000],
    "area.NUCLEUS": [0, 250],
    "axis_major_length.DEFAULT": [0, 100],
    "axis_major_length.NUCLEUS": [0, 50],
    "axis_minor_length.DEFAULT": [0, 50],
    "axis_minor_length.NUCLEUS": [0, 20],
    "eccentricity.DEFAULT": [0, 1],
    "eccentricity.NUCLEUS": [0, 1],
    "perimeter.DEFAULT": [0, 250],
    "perimeter.NUCLEUS": [0, 100],
    "PC1": [-60, 60],
    "PC2": [-50, 50],
    "PC3": [-50, 50],
    "PC4": [-50, 50],
    "PC5": [-40, 40],
    "PC6": [-40, 40],
    "PC7": [-50, 50],
    "PC8": [-50, 50],
}

# Value bounds ([min, max]) for feature distribution binning, keyed by
# "property.region" or shape mode (PC) name.
BOUNDS: dict[str, list] = {
    "volume.DEFAULT": [0, 6000],
    "volume.NUCLEUS": [0, 2000],
    "height.DEFAULT": [0, 21],
    "height.NUCLEUS": [0, 21],
    "area.DEFAULT": [0, 2500],
    "area.NUCLEUS": [0, 1000],
    "perimeter.DEFAULT": [0, 2000],
    "perimeter.NUCLEUS": [0, 700],
    "axis_major_length.DEFAULT": [0, 300],
    "axis_major_length.NUCLEUS": [0, 150],
    "axis_minor_length.DEFAULT": [0, 150],
    "axis_minor_length.NUCLEUS": [0, 100],
    "eccentricity.DEFAULT": [0, 1],
    "eccentricity.NUCLEUS": [0, 1],
    "orientation.DEFAULT": [-2, 2],
    "orientation.NUCLEUS": [-2, 2],
    "extent.DEFAULT": [0, 1],
    "extent.NUCLEUS": [0, 1],
    "solidity.DEFAULT": [0, 1],
    "solidity.NUCLEUS": [0, 1],
    "PC1": [-50, 50],
    "PC2": [-50, 50],
    "PC3": [-50, 50],
    "PC4": [-50, 50],
    "PC5": [-50, 50],
    "PC6": [-50, 50],
    "PC7": [-50, 50],
    "PC8": [-50, 50],
}

# Bin bandwidths for feature distributions, keyed by "property.region" or
# shape mode (PC) name.
BANDWIDTH: dict[str, float] = {
    "volume.DEFAULT": 100,
    "volume.NUCLEUS": 50,
    "height.DEFAULT": 1,
    "height.NUCLEUS": 1,
    "area.DEFAULT": 50,
    "area.NUCLEUS": 10,
    "perimeter.DEFAULT": 50,
    "perimeter.NUCLEUS": 10,
    "axis_major_length.DEFAULT": 10,
    "axis_major_length.NUCLEUS": 5,
    "axis_minor_length.DEFAULT": 5,
    "axis_minor_length.NUCLEUS": 2,
    "eccentricity.DEFAULT": 0.01,
    "eccentricity.NUCLEUS": 0.01,
    "orientation.DEFAULT": 0.05,
    "orientation.NUCLEUS": 0.05,
    "extent.DEFAULT": 0.01,
    "extent.NUCLEUS": 0.01,
    "solidity.DEFAULT": 0.01,
    "solidity.NUCLEUS": 0.01,
    "PC1": 5,
    "PC2": 5,
    "PC3": 5,
    "PC4": 5,
    "PC5": 5,
    "PC6": 5,
    "PC7": 5,
    "PC8": 5,
}
@dataclass
class ParametersConfigFeatureComponents:
    """
    Parameter configuration for group cell shapes subflow - feature components.

    Defaults mirror the module-level ``COMPONENT_FEATURES`` and
    ``PCA_COMPONENTS`` constants. Reference keys only take effect when both
    are set (see :py:func:`run_flow_group_feature_components`).
    """

    features: list[str] = field(default_factory=lambda: COMPONENT_FEATURES)
    """List of shape features."""

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components."""

    reference_metrics: Optional[str] = None
    """Full key for reference metrics data."""

    reference_properties: Optional[str] = None
    """Full key for reference properties data."""
@dataclass
class ParametersConfigFeatureCorrelations:
    """
    Parameter configuration for group cell shapes subflow - feature correlations.

    Defaults mirror the module-level ``CORRELATION_PROPERTIES``,
    ``PCA_COMPONENTS``, and ``LIMITS`` constants.
    """

    properties: list[str] = field(default_factory=lambda: CORRELATION_PROPERTIES)
    """List of shape properties."""

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""

    include_bins: bool = False
    """True if correlations are binned, False otherwise."""

    limits: dict[str, list] = field(default_factory=lambda: LIMITS)
    """Limits for scaling feature correlations bins."""
@dataclass
class ParametersConfigFeatureDistributions:
    """
    Parameter configuration for group cell shapes subflow - feature distributions.

    Defaults mirror the module-level ``DISTRIBUTION_PROPERTIES``, ``BOUNDS``,
    and ``BANDWIDTH`` constants. Reference keys are optional; the PCA model
    and coefficients keys are only used together.
    """

    reference_metrics: Optional[str] = None
    """Full key for reference metrics data."""

    reference_properties: Optional[str] = None
    """Full key for reference properties data."""

    reference_coefficients: Optional[str] = None
    """Full key for reference coefficients data."""

    reference_model: Optional[str] = None
    """Full key for reference PCA model."""

    properties: list[str] = field(default_factory=lambda: DISTRIBUTION_PROPERTIES)
    """List of shape properties."""

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""

    bounds: dict[str, list] = field(default_factory=lambda: BOUNDS)
    """Bounds for feature distributions."""

    bandwidth: dict[str, float] = field(default_factory=lambda: BANDWIDTH)
    """Bandwidths for feature distributions."""
@dataclass
class ParametersConfigModeCorrelations:
    """
    Parameter configuration for group cell shapes subflow - mode correlations.

    Reference keys only take effect when both the model and data are set (see
    :py:func:`run_flow_group_mode_correlations`).
    """

    reference_model: Optional[str] = None
    """Full key for reference PCA model."""

    reference_data: Optional[str] = None
    """Full key for reference coefficients data."""

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""
@dataclass
class ParametersConfigPopulationCounts:
    """
    Parameter configuration for group cell shapes subflow - population counts.

    Counts are taken at a single time point across the selected seeds.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    seeds: list[int] = field(default_factory=lambda: [0])
    """Simulation seed(s) to use for grouping population counts."""

    time: int = 0
    """Simulation time (in hours) to use for grouping population counts."""
@dataclass
class ParametersConfigPopulationStats:
    """
    Parameter configuration for group cell shapes subflow - population stats.

    Only the list of subcellular regions is configurable; the region list
    selects which statistics file is loaded for each condition key.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""
@dataclass
class ParametersConfigShapeAverage:
    """
    Parameter configuration for group cell shapes subflow - shape average.

    Defaults mirror the module-level ``PCA_COMPONENTS``, ``COEFFICIENT_ORDER``,
    and ``PROJECTIONS`` constants.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""

    order: int = COEFFICIENT_ORDER
    """Order of the spherical harmonics coefficient parametrization."""

    scale: float = 1
    """Scaling for spherical harmonics reconstruction mesh."""

    projections: list[str] = field(default_factory=lambda: PROJECTIONS)
    """List of shape projections."""
@dataclass
class ParametersConfigShapeContours:
    """
    Parameter configuration for group cell shapes subflow - shape contours.

    Contours are grouped for a single seed and time point. When ``ds`` or
    ``dt`` is None, the resolution is presumably estimated from simulation
    output (the module imports ``estimate_spatial_resolution`` and
    ``estimate_temporal_resolution``) - confirm in the subflow.
    """

    regions: list[Optional[str]] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    seed: int = 0
    """Simulation random seed to use for grouping shape contours."""

    time: int = 0
    """Simulation time (in hours) to use for grouping shape contours."""

    ds: Optional[float] = None
    """Spatial scaling in units/um."""

    dt: Optional[float] = None
    """Temporal scaling in hours/tick."""

    projection: str = "top"
    """Selected shape projection."""

    box: tuple[int, int, int] = field(default_factory=lambda: (1, 1, 1))
    """Size of projection bounding box."""

    slice_index: Optional[int] = None
    """Slice index along the shape projection axis."""
@dataclass
class ParametersConfigShapeErrors:
    """
    Parameter configuration for group cell shapes subflow - shape errors.

    Only the list of subcellular regions is configurable.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""
@dataclass
class ParametersConfigShapeModes:
    """
    Parameter configuration for group cell shapes subflow - shape modes.

    Defaults mirror the module-level ``PCA_COMPONENTS``, ``COEFFICIENT_ORDER``,
    and ``PROJECTIONS`` constants.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""

    order: int = COEFFICIENT_ORDER
    """Order of the spherical harmonics coefficient parametrization."""

    delta: float = 0.5
    """Increment for shape mode map points."""

    projections: list[str] = field(default_factory=lambda: PROJECTIONS)
    """List of shape projections."""
@dataclass
class ParametersConfigShapeSamples:
    """
    Parameter configuration for group cell shapes subflow - shape samples.

    Samples are grouped for a single seed and simulation tick; ``indices``
    selects which cells are sampled.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    seed: int = 0
    """Simulation random seed to use for grouping shape samples."""

    tick: int = 0
    """Simulation tick to use for grouping shape samples."""

    indices: list[int] = field(default_factory=lambda: [0])
    """Cell indices for shape samples."""
@dataclass
class ParametersConfigVarianceExplained:
    """
    Parameter configuration for group cell shapes subflow - variance explained.

    Defaults mirror the module-level ``PCA_COMPONENTS`` constant.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""
@dataclass
class ParametersConfig:
    """
    Parameter configuration for group cell shapes flow.

    Each subflow configuration uses ``field(default_factory=...)`` rather than
    a class-level instance: an instance default would be shared by every
    ``ParametersConfig`` object (mutating one would mutate all), and because
    the config dataclasses are unhashable, Python 3.11+ rejects such instances
    as mutable defaults at class-creation time.
    """

    groups: list[str] = field(default_factory=lambda: GROUPS)
    """List of cell shapes groups."""

    feature_components: ParametersConfigFeatureComponents = field(
        default_factory=ParametersConfigFeatureComponents
    )
    """Parameters for group feature components subflow."""

    feature_correlations: ParametersConfigFeatureCorrelations = field(
        default_factory=ParametersConfigFeatureCorrelations
    )
    """Parameters for group feature correlations subflow."""

    feature_distributions: ParametersConfigFeatureDistributions = field(
        default_factory=ParametersConfigFeatureDistributions
    )
    """Parameters for group feature distributions subflow."""

    mode_correlations: ParametersConfigModeCorrelations = field(
        default_factory=ParametersConfigModeCorrelations
    )
    """Parameters for group mode correlations subflow."""

    population_counts: ParametersConfigPopulationCounts = field(
        default_factory=ParametersConfigPopulationCounts
    )
    """Parameters for group population counts subflow."""

    population_stats: ParametersConfigPopulationStats = field(
        default_factory=ParametersConfigPopulationStats
    )
    """Parameters for group population stats subflow."""

    shape_average: ParametersConfigShapeAverage = field(
        default_factory=ParametersConfigShapeAverage
    )
    """Parameters for group shape average subflow."""

    shape_contours: ParametersConfigShapeContours = field(
        default_factory=ParametersConfigShapeContours
    )
    """Parameters for group shape contours subflow."""

    shape_errors: ParametersConfigShapeErrors = field(default_factory=ParametersConfigShapeErrors)
    """Parameters for group shape errors subflow."""

    shape_modes: ParametersConfigShapeModes = field(default_factory=ParametersConfigShapeModes)
    """Parameters for group shape modes subflow."""

    shape_samples: ParametersConfigShapeSamples = field(
        default_factory=ParametersConfigShapeSamples
    )
    """Parameters for group shape samples subflow."""

    variance_explained: ParametersConfigVarianceExplained = field(
        default_factory=ParametersConfigVarianceExplained
    )
    """Parameters for group variance explained subflow."""
@dataclass
class ContextConfig:
    """
    Context configuration for group cell shapes flow.

    The working location is passed to all load/save tasks in this module.
    """

    working_location: str
    """Location for input and output files (local path or S3 bucket)."""
@dataclass
class SeriesConfig:
    """
    Series configuration for group cell shapes flow.

    Condition keys are combined with the series name to build input file keys
    throughout this module.
    """

    name: str
    """Name of the simulation series."""

    conditions: list[dict]
    """List of series condition dictionaries (must include unique condition "key")."""
@flow(name="group-cell-shapes")
def run_flow(context: ContextConfig, series: SeriesConfig, parameters: ParametersConfig) -> None:
    """
    Main group cell shapes flow.

    Calls the following subflows, if the group is specified:

    - :py:func:`run_flow_group_feature_components`
    - :py:func:`run_flow_group_feature_correlations`
    - :py:func:`run_flow_group_feature_distributions`
    - :py:func:`run_flow_group_mode_correlations`
    - :py:func:`run_flow_group_population_counts`
    - :py:func:`run_flow_group_population_stats`
    - :py:func:`run_flow_group_shape_average`
    - :py:func:`run_flow_group_shape_contours`
    - :py:func:`run_flow_group_shape_errors`
    - :py:func:`run_flow_group_shape_modes`
    - :py:func:`run_flow_group_shape_samples`
    - :py:func:`run_flow_group_variance_explained`
    """

    # Dispatch table pairing each group name with its subflow and parameter
    # set; entries run in the order documented above.
    dispatch = [
        ("feature_components", run_flow_group_feature_components, parameters.feature_components),
        ("feature_correlations", run_flow_group_feature_correlations, parameters.feature_correlations),
        ("feature_distributions", run_flow_group_feature_distributions, parameters.feature_distributions),
        ("mode_correlations", run_flow_group_mode_correlations, parameters.mode_correlations),
        ("population_counts", run_flow_group_population_counts, parameters.population_counts),
        ("population_stats", run_flow_group_population_stats, parameters.population_stats),
        ("shape_average", run_flow_group_shape_average, parameters.shape_average),
        ("shape_contours", run_flow_group_shape_contours, parameters.shape_contours),
        ("shape_errors", run_flow_group_shape_errors, parameters.shape_errors),
        ("shape_modes", run_flow_group_shape_modes, parameters.shape_modes),
        ("shape_samples", run_flow_group_shape_samples, parameters.shape_samples),
        ("variance_explained", run_flow_group_variance_explained, parameters.variance_explained),
    ]

    for group, subflow, group_parameters in dispatch:
        if group in parameters.groups:
            subflow(context, series, group_parameters)
@flow(name="group-cell-shapes_group-feature-components")
def run_flow_group_feature_components(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigFeatureComponents
) -> None:
    """
    Group cell shapes subflow for feature components.

    Z-scores the selected shape features, fits a PCA across all conditions,
    and saves the per-cell component transforms keyed by condition. If both
    reference keys are set, the reference data are projected through the same
    PCA (using the simulation mean/std) and the convex hull of the projection
    is saved as well.
    """

    analysis_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_DATA")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    # Get feature columns. Region "DEFAULT" columns carry no suffix.
    columns = [
        f"{feature}.{region}" if region != "DEFAULT" else feature
        for region in parameters.regions
        for feature in parameters.features
    ]

    # Load data.
    data_key = make_key(analysis_key, f"{series.name}.CELL_SHAPES_DATA.csv")
    data = load_dataframe.with_options(**OPTIONS)(context.working_location, data_key)

    # Fit model on z-scored features so each feature contributes equally.
    pca_data = data[columns]
    pca_data_mean = pca_data.mean(axis=0)
    pca_data_std = pca_data.std(axis=0)
    pca_data_zscore = (pca_data - pca_data_mean) / pca_data_std
    pca = PCA(n_components=parameters.components)
    pca = pca.fit(pca_data_zscore)
    transform = pca.transform(pca_data_zscore)

    # Create output data with one column per principal component.
    feature_components = data[["KEY"]].copy()
    feature_components.rename(columns={"KEY": "key"}, inplace=True)
    for comp in range(parameters.components):
        feature_components[f"component_{comp + 1}"] = transform[:, comp]

    # Save dataframe.
    save_dataframe(
        context.working_location,
        make_key(group_key, f"{series.name}.feature_components.csv"),
        feature_components,
        index=False,
    )

    # Get reference data convex hull (only when both reference keys are set).
    if parameters.reference_metrics is not None and parameters.reference_properties is not None:
        index_columns = ["KEY", "ID", "SEED", "TICK"]
        reference_metrics = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_metrics
        )
        reference_properties = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_properties
        )

        reference_metrics.set_index(index_columns, inplace=True)
        reference_properties.set_index(index_columns, inplace=True)

        # Z-score the reference data with the simulation mean/std so both
        # datasets live in the same PCA space.
        reference = reference_metrics.join(reference_properties, on=index_columns).reset_index()
        reference_zscore = (reference[columns] - pca_data_mean) / pca_data_std
        reference_transform = pca.transform(reference_zscore)

        # NOTE(review): columns ["x", "y"] assume a two-dimensional transform;
        # confirm parameters.components == 2 whenever references are used.
        hull = ConvexHull(reference_transform)
        points = pd.DataFrame(reference_transform[hull.vertices, :], columns=["x", "y"])

        save_dataframe(
            context.working_location,
            make_key(group_key, f"{series.name}.feature_components.REFERENCE.csv"),
            pd.DataFrame(points),
            index=False,
        )
@flow(name="group-cell-shapes_group-feature-correlations")
def run_flow_group_feature_correlations(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigFeatureCorrelations
) -> None:
    """
    Group cell shapes subflow for feature correlations.

    For each condition key and region, transforms coefficient data into shape
    mode space and saves Pearson correlations (plus a linear fit) between each
    shape mode and each shape property. Optionally also saves hex-binned
    scatter data for each mode/property pair.
    """

    # NOTE(review): this subflow reads analysis.SHAPES / analysis.PCA and
    # writes groups.SHAPES, unlike the CELL_SHAPES keys used by the other
    # subflows and the module docstring - confirm against the data layout.
    analysis_shapes_key = make_key(series.name, "analysis", "analysis.SHAPES")
    analysis_pca_key = make_key(series.name, "analysis", "analysis.PCA")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))
    keys = [condition["key"] for condition in series.conditions]

    for key in keys:
        feature_key = f"{series.name}.feature_correlations.{key}"
        series_key = f"{series.name}_{key}_{region_key}"

        # Load model.
        model_key = make_key(analysis_pca_key, f"{series_key}.PCA.pkl")
        model = load_pickle.with_options(**OPTIONS)(context.working_location, model_key)

        # Load dataframe.
        dataframe_key = make_key(analysis_shapes_key, f"{series_key}.SHAPES.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        # Transform data into shape mode space.
        columns = data.filter(like="shcoeffs").columns
        transform = model.transform(data[columns].values)

        for region in parameters.regions:
            correlations: list[dict[str, Union[str, float]]] = []

            for component in range(parameters.components):
                mode_key = f"PC{component + 1}"
                component_data = transform[:, component]

                for prop in parameters.properties:
                    prop_key = prop.upper()
                    # Region "DEFAULT" columns carry no suffix.
                    prop_data = data[f"{prop}.{region}".replace(".DEFAULT", "")]

                    # Linear fit of property against the shape mode.
                    slope, intercept = np.polyfit(component_data, prop_data, 1)

                    correlations.append(
                        {
                            "mode": mode_key,
                            "property": prop.upper(),
                            "correlation": pearsonr(prop_data, component_data).statistic,
                            # Correlation against the absolute mode value.
                            "correlation_symmetric": pearsonr(
                                prop_data, abs(component_data)
                            ).statistic,
                            "slope": slope,
                            "intercept": intercept,
                        }
                    )

                    if not parameters.include_bins:
                        continue

                    prop_limits = parameters.limits[f"{prop}.{region}"]
                    mode_limits = parameters.limits[mode_key]

                    # Hex-bin the (mode, property) scatter with unit weights;
                    # each bin value is the number of points in the bin.
                    bins = bin_to_hex(
                        component_data,
                        prop_data,
                        np.ones(len(prop_data)),
                        scale=0.025,
                        limits=(mode_limits[0], mode_limits[1], prop_limits[0], prop_limits[1]),
                    )
                    bins_df = pd.DataFrame(
                        [[x, y, np.sum(v)] for (x, y), v in bins.items()], columns=["x", "y", "v"]
                    )

                    save_dataframe(
                        context.working_location,
                        make_key(group_key, f"{feature_key}.{mode_key}.{prop_key}.{region}.csv"),
                        bins_df,
                        index=False,
                    )

            save_dataframe(
                context.working_location,
                make_key(group_key, f"{feature_key}.{region}.csv"),
                pd.DataFrame(correlations),
                index=False,
            )
@flow(name="group-cell-shapes_group-feature-distributions")
def run_flow_group_feature_distributions(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigFeatureDistributions
) -> None:
    """
    Group cell shapes subflow for feature distributions.

    For each condition superkey, calculates binned distributions (plus mean,
    standard deviation, min, and max) for each shape feature and saves one
    JSON per feature. Reference metrics/properties, if given, add "filtered"
    variants restricted to the reference value range; a reference PCA model
    plus coefficients add shape mode (PC) distributions.
    """

    analysis_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_DATA")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    keys = [condition["key"] for condition in series.conditions]
    superkeys = {key_group for key in keys for key_group in key.split("_")}

    # Base (unfiltered) features, one per property/region pair. The boolean
    # flag marks whether the entry is filtered against reference data.
    features = [
        (f"{prop}.{region}", False)
        for prop in parameters.properties
        for region in parameters.regions
    ]

    # Initialize all references so later checks are safe when only some (or
    # none) of the reference keys are configured. Previously, configuring
    # reference_properties without reference_metrics (or reference_model
    # without reference_coefficients) raised NameError on the unbound name.
    ref_metrics = None
    ref_props = None
    ref_model = None
    ref_coeffs = None

    if parameters.reference_metrics is not None:
        ref_metrics = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_metrics
        )

    if parameters.reference_properties is not None:
        ref_props = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_properties
        )

    # Add a filtered variant exactly once per base feature that appears in
    # either reference dataframe (region "DEFAULT" columns carry no suffix).
    features.extend(
        [
            (feature, True)
            for feature, filtered in features
            if not filtered
            and (
                (ref_metrics is not None and feature.replace(".DEFAULT", "") in ref_metrics.columns)
                or (ref_props is not None and feature.replace(".DEFAULT", "") in ref_props.columns)
            )
        ]
    )

    if parameters.reference_model is not None and parameters.reference_coefficients is not None:
        # Only the header row is needed to identify the coefficient columns.
        ref_coeffs = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_coefficients, nrows=1
        )
        ref_model = load_pickle.with_options(**OPTIONS)(
            context.working_location, parameters.reference_model
        )
        features.extend(
            [(f"PC{component + 1}", False) for component in range(parameters.components)]
        )

    distribution_bins: dict[tuple[str, bool], dict] = {feature: {} for feature in features}
    distribution_means: dict[tuple[str, bool], dict] = {feature: {} for feature in features}
    distribution_stdevs: dict[tuple[str, bool], dict] = {feature: {} for feature in features}
    distribution_mins: dict[tuple[str, bool], dict] = {feature: {} for feature in features}
    distribution_maxs: dict[tuple[str, bool], dict] = {feature: {} for feature in features}

    for key in superkeys:
        dataframe_key = make_key(analysis_key, f"{series.name}_{key}.CELL_SHAPES_DATA.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        # Guard on the loaded model (not just the parameter) so a model key
        # configured without a coefficients key cannot reach unbound names.
        if ref_model is not None:
            transform = ref_model.transform(data[ref_coeffs.filter(like="shcoeffs").columns].values)
            for component in range(parameters.components):
                data[f"PC{component + 1}"] = transform[:, component]

        for feature, filtered in features:
            feature_column = feature.replace(".DEFAULT", "")
            values = data[feature_column].values

            if filtered:
                # Restrict to the value range observed in the reference data.
                if ref_metrics is not None and feature_column in ref_metrics.columns:
                    ref_max = ref_metrics[feature_column].max()
                    ref_min = ref_metrics[feature_column].min()
                    values = values[(values >= ref_min) & (values <= ref_max)]

                if ref_props is not None and feature_column in ref_props.columns:
                    ref_max = ref_props[feature_column].max()
                    ref_min = ref_props[feature_column].min()
                    values = values[(values >= ref_min) & (values <= ref_max)]

            bounds = (parameters.bounds[feature][0], parameters.bounds[feature][1])
            bandwidth = parameters.bandwidth[feature]

            # Skip (with logging handled by the task) values outside bounds.
            valid = check_data_bounds(values, bounds, f"[ {key} ] feature [ {feature} ]")

            if not valid:
                continue

            distribution_means[(feature, filtered)][key] = np.mean(values)
            distribution_stdevs[(feature, filtered)][key] = np.std(values, ddof=1)
            distribution_bins[(feature, filtered)][key] = calculate_data_bins(
                values, bounds, bandwidth
            )
            distribution_mins[(feature, filtered)][key] = np.min(values)
            distribution_maxs[(feature, filtered)][key] = np.max(values)

    for (feature, filtered), distribution in distribution_bins.items():
        # Attach summary statistics under the "*" pseudo-key.
        distribution["*"] = {
            "bandwidth": parameters.bandwidth[feature],
            "means": distribution_means[(feature, filtered)],
            "stdevs": distribution_stdevs[(feature, filtered)],
            "mins": distribution_mins[(feature, filtered)],
            "maxs": distribution_maxs[(feature, filtered)],
        }

        feature_key = f"{feature.upper()}{'.FILTERED' if filtered else ''}"
        save_json(
            context.working_location,
            make_key(group_key, f"{series.name}.feature_distributions.{feature_key}.json"),
            distribution,
        )
@flow(name="group-cell-shapes_group-mode-correlations")
def run_flow_group_mode_correlations(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigModeCorrelations
) -> None:
    """
    Group cell shapes subflow for mode correlations.

    For each ordered pair of distinct condition keys (plus an optional
    external reference), transforms the pooled coefficient data through both
    PCA models and saves the pairwise Pearson correlations between shape
    modes as a single CSV.
    """

    analysis_shapes_key = make_key(series.name, "analysis", "analysis.SHAPES")
    analysis_pca_key = make_key(series.name, "analysis", "analysis.PCA")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))
    keys = [condition["key"] for condition in series.conditions]

    all_models = {}
    all_data = {}

    for key in keys:
        series_key = f"{series.name}_{key}_{region_key}"

        # Load model.
        model_key = make_key(analysis_pca_key, f"{series_key}.PCA.pkl")
        model = load_pickle.with_options(**OPTIONS)(context.working_location, model_key)
        all_models[key] = model

        # Load dataframe.
        dataframe_key = make_key(analysis_shapes_key, f"{series_key}.SHAPES.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)
        all_data[key] = data

    # Optionally add an external model/data pair as the pseudo-condition
    # "reference" (only when both keys are configured).
    if parameters.reference_model is not None and parameters.reference_data is not None:
        keys.append("reference")
        all_models["reference"] = load_pickle.with_options(**OPTIONS)(
            context.working_location, parameters.reference_model
        )
        all_data["reference"] = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_data
        )

    correlations: list[dict[str, Union[str, int, float]]] = []

    for source_key in keys:
        for target_key in keys:
            if source_key == target_key:
                continue

            # Select data sets.
            data_source = all_data[source_key]
            data_target = all_data[target_key]

            # Select models.
            model_source = all_models[source_key]
            model_target = all_models[target_key]

            # Get column order for model: each model is applied using the
            # coefficient column order of its own training dataframe.
            columns_source = data_source.filter(like="shcoeffs").columns
            columns_target = data_target.filter(like="shcoeffs").columns

            # Transform the pooled (source + target) rows with each model so
            # the two transforms cover the same rows in the same order.
            transform_source = model_source.transform(
                np.append(
                    data_source[columns_source].values,
                    data_target[columns_source].values,
                    axis=0,
                )
            )
            transform_target = model_target.transform(
                np.append(
                    data_source[columns_target].values,
                    data_target[columns_target].values,
                    axis=0,
                )
            )

            # Calculate correlations for every pair of shape modes.
            correlations = correlations + [
                {
                    "source_key": source_key,
                    "target_key": target_key,
                    "source_mode": f"PC{si + 1}",
                    "target_mode": f"PC{ti + 1}",
                    "correlation": pearsonr(
                        transform_source[:, si], transform_target[:, ti]
                    ).statistic,
                }
                for si in range(parameters.components)
                for ti in range(parameters.components)
            ]

    save_dataframe(
        context.working_location,
        make_key(group_key, f"{series.name}.mode_correlations.csv"),
        pd.DataFrame(correlations),
        index=False,
    )
@flow(name="group-cell-shapes_group-population-counts")
def run_flow_group_population_counts(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigPopulationCounts
) -> None:
    """
    Group cell shapes subflow for population counts.

    Counts cells per (key, seed) at the configured time point across all
    condition superkeys and saves the combined counts as a single CSV.
    """

    analysis_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_DATA")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    condition_keys = [condition["key"] for condition in series.conditions]
    superkeys = {part for condition_key in condition_keys for part in condition_key.split("_")}

    counts: list[dict] = []

    for superkey in superkeys:
        dataframe_key = make_key(analysis_key, f"{series.name}_{superkey}.CELL_SHAPES_DATA.csv")
        data = load_dataframe.with_options(**OPTIONS)(
            context.working_location, dataframe_key, usecols=["KEY", "SEED", "time"]
        )

        # Restrict to the requested seeds at the requested time point.
        selected = data[data["SEED"].isin(parameters.seeds) & (data["time"] == parameters.time)]

        # Count rows per (key, seed); size() places the count in column 0.
        grouped = selected.groupby(["KEY", "SEED"]).size().reset_index()
        for record in grouped.to_dict("records"):
            counts.append(
                {
                    "key": record["KEY"],
                    "seed": record["SEED"],
                    "count": record[0],
                }
            )

    save_dataframe(
        context.working_location,
        make_key(group_key, f"{series.name}.population_counts.{parameters.time:03d}.csv"),
        pd.DataFrame(counts).drop_duplicates(),
        index=False,
    )
@flow(name="group-cell-shapes_group-population-stats")
def run_flow_group_population_stats(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigPopulationStats
) -> None:
    """Group cell shapes subflow for population stats."""

    # NOTE(review): these keys ("analysis.STATISTICS" / "groups.SHAPES") do not
    # match the locations documented in the module docstring
    # ("analysis.CELL_SHAPES_DATA" / "groups.CELL_SHAPES") — confirm intended.
    analysis_key = make_key(series.name, "analysis", "analysis.STATISTICS")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))
    keys = [condition["key"] for condition in series.conditions]

    stats: dict[str, dict] = {key: {} for key in keys}

    for key in keys:
        dataframe_key = make_key(analysis_key, f"{series.name}_{key}_{region_key}.STATISTICS.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        for feature, group in data.groupby("FEATURE"):
            # Volume and height features are stored without a region suffix,
            # so tag them with the default region explicitly.
            if feature in ["VOLUME", "HEIGHT"]:
                feature_name = f"{feature}.DEFAULT"
            else:
                feature_name = feature

            ks_values = group["KS_STATISTIC"]
            stats[key][feature_name.upper()] = {
                "size": int(group["SIZE"].sum()),
                "replicates": len(group),
                "mean": ks_values.mean(),
                "std": ks_values.std(ddof=1),
            }

    save_json(
        context.working_location,
        make_key(group_key, f"{series.name}.population_stats.json"),
        stats,
    )
@flow(name="group-cell-shapes_group-shape-average")
def run_flow_group_shape_average(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeAverage
) -> None:
    """
    Group cell shapes subflow for shape average.

    Find the cell closest to the average shape. Extract original mesh slice and
    extent projections. Create reconstructed mesh and extract mesh slice and
    extent projections.
    """

    logger = get_run_logger()

    # NOTE(review): these keys ("analysis.SHAPES" / "analysis.PCA" /
    # "groups.SHAPES") differ from the locations documented in the module
    # docstring — confirm intended.
    analysis_shapes_key = make_key(series.name, "analysis", "analysis.SHAPES")
    analysis_pca_key = make_key(series.name, "analysis", "analysis.PCA")
    data_key = make_key(series.name, "data", "data.LOCATIONS")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))

    for condition in series.conditions:
        key = condition["key"]

        # Load fitted PCA model for this condition.
        model = load_pickle.with_options(**OPTIONS)(
            context.working_location,
            make_key(analysis_pca_key, f"{series.name}_{key}_{region_key}.PCA.pkl"),
        )

        # Load shape coefficient dataframe for this condition.
        data = load_dataframe.with_options(**OPTIONS)(
            context.working_location,
            make_key(analysis_shapes_key, f"{series.name}_{key}_{region_key}.SHAPES.csv"),
        )

        # Transform coefficient data into shape mode space.
        coeff_columns = data.filter(like="shcoeffs").columns
        transform = model.transform(data[coeff_columns].values)

        # The average shape sits at the origin of shape mode space; pick the
        # cell whose transform is nearest to it.
        distance, index = KDTree(transform).query([0] * parameters.components)
        selected = data.iloc[index, :]
        logger.info(
            "[ %s ] seed [ %d ] tick [ %d ] cell [ %d ] with distance [ %.2f ]",
            key,
            selected["SEED"],
            selected["TICK"],
            selected["ID"],
            distance,
        )

        # Load the location archive for the seed containing the selected cell.
        series_key = f"{series.name}_{key}_{selected['SEED']:04d}"
        tar = load_tar(
            context.working_location,
            make_key(data_key, f"{series_key}.LOCATIONS.tar.xz"),
        )

        # Extract the voxels for the selected cell at the selected tick.
        locations = extract_tick_json(tar, series_key, selected["TICK"], "LOCATIONS")
        location = next(item for item in locations if item["id"] == selected["ID"])
        array = make_voxels_array(get_location_voxels(location))

        # Build the original mesh (from voxels) and its projections.
        original_projections = extract_mesh_projections(construct_mesh_from_array(array, array))

        # Build the reconstructed mesh (from coefficients) and its projections.
        reconstructed_mesh = construct_mesh_from_coeffs(
            selected, parameters.order, scale=parameters.scale
        )
        reconstructed_projections = extract_mesh_projections(reconstructed_mesh)

        # Save one json per requested projection axis.
        for projection in parameters.projections:
            shape_average: dict[str, dict] = {
                "original_slice": original_projections[f"{projection}_slice"],
                "original_extent": original_projections[f"{projection}_extent"],
                "reconstructed_slice": reconstructed_projections[f"{projection}_slice"],
            }

            save_json(
                context.working_location,
                make_key(group_key, f"{series.name}.shape_average.{key}.{projection.upper()}.json"),
                shape_average,
            )
@flow(name="group-cell-shapes_group-shape-contours")
def run_flow_group_shape_contours(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeContours
) -> None:
    """Group cell shapes subflow for shape contours."""

    data_key = make_key(series.name, "data", "data.LOCATIONS")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    projection = parameters.projection
    # Index of the projection axis in voxel coordinates (PROJECTIONS reversed).
    projection_index = list(reversed(PROJECTIONS)).index(projection)

    for condition in series.conditions:
        key = condition["key"]

        series_key = f"{series.name}_{key}_{parameters.seed:04d}"
        tar = load_tar(
            context.working_location, make_key(data_key, f"{series_key}.LOCATIONS.tar.xz")
        )

        # Fall back to estimated resolutions when not given explicitly.
        ds = parameters.ds if parameters.ds is not None else estimate_spatial_resolution(key)
        dt = parameters.dt if parameters.dt is not None else estimate_temporal_resolution(key)

        # Convert requested time to a simulation tick and rescale the bounding
        # box extents (keeping a 1-voxel border on each side).
        tick = int(parameters.time / dt)
        box = tuple(int((extent - 2) / ds) + 2 for extent in parameters.box)

        locations = extract_tick_json(tar, series_key, tick, "LOCATIONS")

        for region in parameters.regions:
            all_contours = []

            for location in locations:
                voxels = get_location_voxels(location, None if region == "DEFAULT" else region)

                # Optionally restrict to a single slice along the projection axis.
                if parameters.slice_index is not None:
                    voxels = [
                        voxel
                        for voxel in voxels
                        if voxel[projection_index] == parameters.slice_index
                    ]

                if not voxels:
                    continue

                contours = [
                    (np.array(contour) * ds).astype("int").tolist()
                    for contour in extract_voxel_contours(voxels, projection, box)
                ]

                all_contours.append({"id": location["id"], "contours": contours})

            contour_key = f"{key}.{parameters.seed:04d}.{parameters.time:03d}.{region}"
            save_json(
                context.working_location,
                make_key(
                    group_key,
                    f"{series.name}.shape_contours.{contour_key}.{projection.upper()}.json",
                ),
                all_contours,
            )
@flow(name="group-cell-shapes_group-shape-errors")
def run_flow_group_shape_errors(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeErrors
) -> None:
    """Group cell shapes subflow for shape errors."""

    # NOTE(review): these keys ("analysis.SHAPES" / "groups.SHAPES") differ
    # from the locations documented in the module docstring — confirm intended.
    analysis_key = make_key(series.name, "analysis", "analysis.SHAPES")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))
    keys = [condition["key"] for condition in series.conditions]

    errors: dict[str, dict] = {key: {} for key in keys}

    for key in keys:
        dataframe_key = make_key(analysis_key, f"{series.name}_{key}_{region_key}.SHAPES.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        for region in parameters.regions:
            # Error columns are "mse" for the default region, "mse.<region>" otherwise.
            column = f"mse.{region}".replace(".DEFAULT", "")
            errors[key][region] = {
                "mean": data[column].mean(),
                "std": data[column].std(ddof=1),
            }

    save_json(
        context.working_location,
        make_key(group_key, f"{series.name}.shape_errors.json"),
        errors,
    )
@flow(name="group-cell-shapes_group-shape-modes")
def run_flow_group_shape_modes(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeModes
) -> None:
    """
    Group cell shapes subflow for shape modes.

    Extract shape modes from PCAs as dictionaries of svg paths for each map
    point and projection. Consolidate shape modes from keys into single json.
    """

    analysis_data_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_DATA")
    analysis_model_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_MODELS")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    keys = [condition["key"] for condition in series.conditions]
    # Split each condition key on underscores so each component is handled once.
    superkeys = {key_group for key in keys for key_group in key.split("_")}

    for superkey in superkeys:
        series_key = f"{series.name}_{superkey}"

        # Load model.
        model_key = make_key(analysis_model_key, f"{series_key}.CELL_SHAPES_MODELS.pkl")
        model = load_pickle.with_options(**OPTIONS)(context.working_location, model_key)

        # Load dataframe.
        dataframe_key = make_key(analysis_data_key, f"{series_key}.CELL_SHAPES_DATA.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        # Extract shape modes for each region, component, and map point.
        shape_modes = extract_shape_modes(
            model,
            data,
            parameters.components,
            parameters.regions,
            parameters.order,
            parameters.delta,
        )

        for region in parameters.regions:
            # Bucket the projections by "PC<mode>.<projection>" identifier.
            shape_mode_projections: dict[str, list] = {
                f"PC{component + 1}.{projection}": []
                for component in range(parameters.components)
                for projection in parameters.projections
            }

            for shape_mode in shape_modes[region]:
                for projection in parameters.projections:
                    shape_mode_projections[f"PC{shape_mode['mode']}.{projection}"].append(
                        {
                            "point": shape_mode["point"],
                            "projection": shape_mode["projections"][f"{projection}_slice"],
                        }
                    )

            # Save one json per bucket. The trailing replace collapses the
            # doubled dot produced when region is empty in the formatted key.
            for proj_key, proj_entries in shape_mode_projections.items():
                save_json(
                    context.working_location,
                    make_key(
                        group_key,
                        f"{series.name}.shape_modes.{superkey}.{region}.{proj_key.upper()}.json",
                    ).replace("..", "."),
                    proj_entries,
                )
@flow(name="group-cell-shapes_group-shape-samples")
def run_flow_group_shape_samples(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeSamples
) -> None:
    """
    Group cell shapes subflow for shape samples.

    Extract sample cell shapes from specified simulations. Construct wireframes
    from the cell shape mesh.
    """

    # NOTE(review): this key ("groups.SHAPES") differs from the location
    # documented in the module docstring ("groups.CELL_SHAPES") — confirm intended.
    data_key = make_key(series.name, "data", "data.LOCATIONS")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    keys = [condition["key"] for condition in series.conditions]

    shape_samples: dict[str, dict] = {}

    for key in keys:
        shape_samples[key] = {region: [] for region in parameters.regions}

        # Load location data.
        series_key = f"{series.name}_{key}_{parameters.seed:04d}"
        tar_key = make_key(data_key, f"{series_key}.LOCATIONS.tar.xz")
        tar = load_tar(context.working_location, tar_key)
        locations = extract_tick_json(tar, series_key, parameters.tick, "LOCATIONS")

        for index in parameters.indices:
            location = locations[index]

            # Full-cell voxels do not depend on the region; compute once per cell.
            voxels = get_location_voxels(location)
            array = make_voxels_array(voxels)

            for region in parameters.regions:
                if region != "DEFAULT":
                    # Region mesh is built against the full-cell reference voxels.
                    region_voxels = get_location_voxels(location, region)
                    region_array = make_voxels_array(region_voxels, reference=voxels)
                    mesh = construct_mesh_from_array(region_array, array)
                else:
                    mesh = construct_mesh_from_array(array, array)

                shape_samples[key][region].append(extract_mesh_wireframe(mesh))

    save_json(
        context.working_location,
        make_key(group_key, f"{series.name}.shape_samples.json"),
        shape_samples,
    )
@flow(name="group-cell-shapes_group-variance-explained")
def run_flow_group_variance_explained(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigVarianceExplained
) -> None:
    """Group cell shapes subflow for variance explained."""

    analysis_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_MODELS")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    keys = [condition["key"] for condition in series.conditions]
    # Split each condition key on underscores so each model is loaded once.
    superkeys = {key_group for key in keys for key_group in key.split("_")}

    variance = []

    for superkey in superkeys:
        model_key = make_key(analysis_key, f"{series.name}_{superkey}.CELL_SHAPES_MODELS.pkl")
        model = load_pickle.with_options(**OPTIONS)(context.working_location, model_key)

        # Slice the variance ratios to the requested number of components so
        # all columns have the same length even when the fitted model retains
        # more components than requested (unequal lengths raise ValueError).
        variance.append(
            pd.DataFrame(
                {
                    "key": [superkey] * parameters.components,
                    "mode": [f"PC{i}" for i in range(1, parameters.components + 1)],
                    "variance": model.explained_variance_ratio_[: parameters.components],
                }
            )
        )

    save_dataframe(
        context.working_location,
        make_key(group_key, f"{series.name}.variance_explained.csv"),
        pd.concat(variance),
        index=False,
    )