Coverage for src/cell_abm_pipeline/flows/group_cell_shapes.py: 0%
537 statements
« prev ^ index » next coverage.py v7.1.0, created at 2024-06-05 19:14 +0000
1"""
2Workflow for grouping cell shapes.
4Working location structure:
6.. code-block:: bash
8 (name)
9 ├── analysis
10 │ ├── analysis.CELL_SHAPES_DATA
11 │ │ └── (name)_(key).CELL_SHAPES_DATA.csv
12 │ └── analysis.CELL_SHAPES_MODELS
13 │ └── (name)_(key).CELL_SHAPES_MODELS.pkl
14 ├── data
15 │ └── data.LOCATIONS
16 │ └── (name)_(key)_(seed).LOCATIONS.tar.xz
17 └── groups
18 └── groups.CELL_SHAPES
19 ├── (name).feature_components.csv
20 ├── (name).feature_correlations.(key).(region).csv
21 ├── (name).feature_correlations.(key).(mode).(property).(region).csv
22 ├── (name).feature_distributions.(feature).json
23 ├── (name).mode_correlations.csv
24 ├── (name).population_counts.(time).csv
25 ├── (name).population_stats.json
26 ├── (name).shape_average.(key).(projection).json
27 ├── (name).shape_contours.(key).(seed).(time).(region).(projection).json
28 ├── (name).shape_errors.json
29 ├── (name).shape_modes.(key).(region).(mode).(projection).json
30 ├── (name).shape_samples.json
31 └── (name).variance_explained.csv
33Different groups use inputs from **data.LOCATIONS**,
34**analysis.CELL_SHAPES_DATA**, and **analysis.CELL_SHAPES_MODELS**. Grouped data
35are saved to **groups.CELL_SHAPES**.
37Different groups can be visualized using the corresponding plotting workflow or
38loaded into alternative tools.
39"""
41from dataclasses import dataclass, field
42from datetime import timedelta
43from typing import Optional, Union
45import numpy as np
46import pandas as pd
47from abm_shape_collection import (
48 construct_mesh_from_array,
49 construct_mesh_from_coeffs,
50 extract_mesh_projections,
51 extract_mesh_wireframe,
52 extract_shape_modes,
53 extract_voxel_contours,
54 make_voxels_array,
55)
56from arcade_collection.output import extract_tick_json, get_location_voxels
57from arcade_collection.output.convert_model_units import (
58 estimate_spatial_resolution,
59 estimate_temporal_resolution,
60)
61from io_collection.keys import make_key
62from io_collection.load import load_dataframe, load_pickle, load_tar
63from io_collection.save import save_dataframe, save_json
64from prefect import flow, get_run_logger
65from prefect.tasks import task_input_hash
66from scipy.spatial import ConvexHull, KDTree
67from scipy.stats import pearsonr
68from sklearn.decomposition import PCA
70from cell_abm_pipeline.flows.analyze_cell_shapes import PCA_COMPONENTS
71from cell_abm_pipeline.flows.calculate_coefficients import COEFFICIENT_ORDER
72from cell_abm_pipeline.tasks import bin_to_hex, calculate_data_bins, check_data_bounds
# Shared Prefect task options: do not keep task results in memory, and cache
# task outputs by input hash for 12 hours so identical loads are not re-run.
OPTIONS = {
    "cache_result_in_memory": False,
    "cache_key_fn": task_input_hash,
    "cache_expiration": timedelta(hours=12),
}

# Names of all grouping subflows that run_flow can dispatch to.
GROUPS: list[str] = [
    "feature_components",
    "feature_correlations",
    "feature_distributions",
    "mode_correlations",
    "population_counts",
    "population_stats",
    "shape_average",
    "shape_contours",
    "shape_errors",
    "shape_modes",
    "shape_samples",
    "variance_explained",
]

# Default shape features used when fitting the feature-components PCA.
COMPONENT_FEATURES: list[str] = [
    "volume",
    "height",
    "area",
    "axis_major_length",
    "axis_minor_length",
    "eccentricity",
    "orientation",
    "perimeter",
    "extent",
    "solidity",
]

# Default shape properties correlated against PCA shape modes.
CORRELATION_PROPERTIES: list[str] = [
    "volume",
    "height",
    "area",
    "axis_major_length",
    "axis_minor_length",
    "eccentricity",
    "perimeter",
]

# Default shape properties used for feature distributions.
DISTRIBUTION_PROPERTIES: list[str] = [
    "volume",
    "height",
]

# Default projection names for shape averages and shape modes.
PROJECTIONS: list[str] = [
    "top",
    "side1",
    "side2",
]

# Axis limits ([min, max]) used to scale hex-binned feature correlations,
# keyed by "property.region" or shape mode (PC) name.
LIMITS: dict[str, list] = {
    "volume.DEFAULT": [500, 4000],
    "volume.NUCLEUS": [0, 1500],
    "height.DEFAULT": [0, 20],
    "height.NUCLEUS": [0, 20],
    "area.DEFAULT": [0, 1000],
    "area.NUCLEUS": [0, 250],
    "axis_major_length.DEFAULT": [0, 100],
    "axis_major_length.NUCLEUS": [0, 50],
    "axis_minor_length.DEFAULT": [0, 50],
    "axis_minor_length.NUCLEUS": [0, 20],
    "eccentricity.DEFAULT": [0, 1],
    "eccentricity.NUCLEUS": [0, 1],
    "perimeter.DEFAULT": [0, 250],
    "perimeter.NUCLEUS": [0, 100],
    "PC1": [-60, 60],
    "PC2": [-50, 50],
    "PC3": [-50, 50],
    "PC4": [-50, 50],
    "PC5": [-40, 40],
    "PC6": [-40, 40],
    "PC7": [-50, 50],
    "PC8": [-50, 50],
}

# Value bounds ([min, max]) for feature distribution binning, keyed by
# "property.region" or shape mode (PC) name.
BOUNDS: dict[str, list] = {
    "volume.DEFAULT": [0, 6000],
    "volume.NUCLEUS": [0, 2000],
    "height.DEFAULT": [0, 21],
    "height.NUCLEUS": [0, 21],
    "area.DEFAULT": [0, 2500],
    "area.NUCLEUS": [0, 1000],
    "perimeter.DEFAULT": [0, 2000],
    "perimeter.NUCLEUS": [0, 700],
    "axis_major_length.DEFAULT": [0, 300],
    "axis_major_length.NUCLEUS": [0, 150],
    "axis_minor_length.DEFAULT": [0, 150],
    "axis_minor_length.NUCLEUS": [0, 100],
    "eccentricity.DEFAULT": [0, 1],
    "eccentricity.NUCLEUS": [0, 1],
    "orientation.DEFAULT": [-2, 2],
    "orientation.NUCLEUS": [-2, 2],
    "extent.DEFAULT": [0, 1],
    "extent.NUCLEUS": [0, 1],
    "solidity.DEFAULT": [0, 1],
    "solidity.NUCLEUS": [0, 1],
    "PC1": [-50, 50],
    "PC2": [-50, 50],
    "PC3": [-50, 50],
    "PC4": [-50, 50],
    "PC5": [-50, 50],
    "PC6": [-50, 50],
    "PC7": [-50, 50],
    "PC8": [-50, 50],
}

# Bin bandwidths for feature distributions, keyed by "property.region" or
# shape mode (PC) name.
BANDWIDTH: dict[str, float] = {
    "volume.DEFAULT": 100,
    "volume.NUCLEUS": 50,
    "height.DEFAULT": 1,
    "height.NUCLEUS": 1,
    "area.DEFAULT": 50,
    "area.NUCLEUS": 10,
    "perimeter.DEFAULT": 50,
    "perimeter.NUCLEUS": 10,
    "axis_major_length.DEFAULT": 10,
    "axis_major_length.NUCLEUS": 5,
    "axis_minor_length.DEFAULT": 5,
    "axis_minor_length.NUCLEUS": 2,
    "eccentricity.DEFAULT": 0.01,
    "eccentricity.NUCLEUS": 0.01,
    "orientation.DEFAULT": 0.05,
    "orientation.NUCLEUS": 0.05,
    "extent.DEFAULT": 0.01,
    "extent.NUCLEUS": 0.01,
    "solidity.DEFAULT": 0.01,
    "solidity.NUCLEUS": 0.01,
    "PC1": 5,
    "PC2": 5,
    "PC3": 5,
    "PC4": 5,
    "PC5": 5,
    "PC6": 5,
    "PC7": 5,
    "PC8": 5,
}
@dataclass
class ParametersConfigFeatureComponents:
    """
    Parameter configuration for group cell shapes subflow - feature components.

    Defaults mirror the module-level ``COMPONENT_FEATURES`` and
    ``PCA_COMPONENTS`` constants. Reference keys only take effect when both
    are set (see :py:func:`run_flow_group_feature_components`).
    """

    features: list[str] = field(default_factory=lambda: COMPONENT_FEATURES)
    """List of shape features."""

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components."""

    reference_metrics: Optional[str] = None
    """Full key for reference metrics data."""

    reference_properties: Optional[str] = None
    """Full key for reference properties data."""
@dataclass
class ParametersConfigFeatureCorrelations:
    """
    Parameter configuration for group cell shapes subflow - feature correlations.

    Defaults mirror the module-level ``CORRELATION_PROPERTIES``,
    ``PCA_COMPONENTS``, and ``LIMITS`` constants.
    """

    properties: list[str] = field(default_factory=lambda: CORRELATION_PROPERTIES)
    """List of shape properties."""

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""

    include_bins: bool = False
    """True if correlations are binned, False otherwise."""

    limits: dict[str, list] = field(default_factory=lambda: LIMITS)
    """Limits for scaling feature correlations bins."""
@dataclass
class ParametersConfigFeatureDistributions:
    """
    Parameter configuration for group cell shapes subflow - feature distributions.

    Defaults mirror the module-level ``DISTRIBUTION_PROPERTIES``, ``BOUNDS``,
    and ``BANDWIDTH`` constants. Reference keys are optional; the PCA model
    and coefficients keys are only used together.
    """

    reference_metrics: Optional[str] = None
    """Full key for reference metrics data."""

    reference_properties: Optional[str] = None
    """Full key for reference properties data."""

    reference_coefficients: Optional[str] = None
    """Full key for reference coefficients data."""

    reference_model: Optional[str] = None
    """Full key for reference PCA model."""

    properties: list[str] = field(default_factory=lambda: DISTRIBUTION_PROPERTIES)
    """List of shape properties."""

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""

    bounds: dict[str, list] = field(default_factory=lambda: BOUNDS)
    """Bounds for feature distributions."""

    bandwidth: dict[str, float] = field(default_factory=lambda: BANDWIDTH)
    """Bandwidths for feature distributions."""
@dataclass
class ParametersConfigModeCorrelations:
    """
    Parameter configuration for group cell shapes subflow - mode correlations.

    Reference keys only take effect when both the model and data are set (see
    :py:func:`run_flow_group_mode_correlations`).
    """

    reference_model: Optional[str] = None
    """Full key for reference PCA model."""

    reference_data: Optional[str] = None
    """Full key for reference coefficients data."""

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""
@dataclass
class ParametersConfigPopulationCounts:
    """
    Parameter configuration for group cell shapes subflow - population counts.

    Counts are taken at a single time point across the selected seeds.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    seeds: list[int] = field(default_factory=lambda: [0])
    """Simulation seed(s) to use for grouping population counts."""

    time: int = 0
    """Simulation time (in hours) to use for grouping population counts."""
@dataclass
class ParametersConfigPopulationStats:
    """
    Parameter configuration for group cell shapes subflow - population stats.

    Only the list of subcellular regions is configurable; the region list
    selects which statistics file is loaded for each condition key.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""
@dataclass
class ParametersConfigShapeAverage:
    """
    Parameter configuration for group cell shapes subflow - shape average.

    Defaults mirror the module-level ``PCA_COMPONENTS``, ``COEFFICIENT_ORDER``,
    and ``PROJECTIONS`` constants.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""

    order: int = COEFFICIENT_ORDER
    """Order of the spherical harmonics coefficient parametrization."""

    scale: float = 1
    """Scaling for spherical harmonics reconstruction mesh."""

    projections: list[str] = field(default_factory=lambda: PROJECTIONS)
    """List of shape projections."""
@dataclass
class ParametersConfigShapeContours:
    """
    Parameter configuration for group cell shapes subflow - shape contours.

    Contours are grouped for a single seed and time point. When ``ds`` or
    ``dt`` is None, the resolution is presumably estimated from simulation
    output (the module imports ``estimate_spatial_resolution`` and
    ``estimate_temporal_resolution``) - confirm in the subflow.
    """

    regions: list[Optional[str]] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    seed: int = 0
    """Simulation random seed to use for grouping shape contours."""

    time: int = 0
    """Simulation time (in hours) to use for grouping shape contours."""

    ds: Optional[float] = None
    """Spatial scaling in units/um."""

    dt: Optional[float] = None
    """Temporal scaling in hours/tick."""

    projection: str = "top"
    """Selected shape projection."""

    box: tuple[int, int, int] = field(default_factory=lambda: (1, 1, 1))
    """Size of projection bounding box."""

    slice_index: Optional[int] = None
    """Slice index along the shape projection axis."""
@dataclass
class ParametersConfigShapeErrors:
    """
    Parameter configuration for group cell shapes subflow - shape errors.

    Only the list of subcellular regions is configurable.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""
@dataclass
class ParametersConfigShapeModes:
    """
    Parameter configuration for group cell shapes subflow - shape modes.

    Defaults mirror the module-level ``PCA_COMPONENTS``, ``COEFFICIENT_ORDER``,
    and ``PROJECTIONS`` constants.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""

    order: int = COEFFICIENT_ORDER
    """Order of the spherical harmonics coefficient parametrization."""

    delta: float = 0.5
    """Increment for shape mode map points."""

    projections: list[str] = field(default_factory=lambda: PROJECTIONS)
    """List of shape projections."""
@dataclass
class ParametersConfigShapeSamples:
    """
    Parameter configuration for group cell shapes subflow - shape samples.

    Samples are grouped for a single seed and simulation tick; ``indices``
    selects which cells are sampled.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    seed: int = 0
    """Simulation random seed to use for grouping shape samples."""

    tick: int = 0
    """Simulation tick to use for grouping shape samples."""

    indices: list[int] = field(default_factory=lambda: [0])
    """Cell indices for shape samples."""
@dataclass
class ParametersConfigVarianceExplained:
    """
    Parameter configuration for group cell shapes subflow - variance explained.

    Defaults mirror the module-level ``PCA_COMPONENTS`` constant.
    """

    regions: list[str] = field(default_factory=lambda: ["DEFAULT"])
    """List of subcellular regions."""

    components: int = PCA_COMPONENTS
    """Number of principal components (i.e. shape modes)."""
@dataclass
class ParametersConfig:
    """
    Parameter configuration for group cell shapes flow.

    Each subflow configuration uses ``field(default_factory=...)`` rather than
    a class-level instance: an instance default would be shared by every
    ``ParametersConfig`` object (mutating one would mutate all), and because
    the config dataclasses are unhashable, Python 3.11+ rejects such instances
    as mutable defaults at class-creation time.
    """

    groups: list[str] = field(default_factory=lambda: GROUPS)
    """List of cell shapes groups."""

    feature_components: ParametersConfigFeatureComponents = field(
        default_factory=ParametersConfigFeatureComponents
    )
    """Parameters for group feature components subflow."""

    feature_correlations: ParametersConfigFeatureCorrelations = field(
        default_factory=ParametersConfigFeatureCorrelations
    )
    """Parameters for group feature correlations subflow."""

    feature_distributions: ParametersConfigFeatureDistributions = field(
        default_factory=ParametersConfigFeatureDistributions
    )
    """Parameters for group feature distributions subflow."""

    mode_correlations: ParametersConfigModeCorrelations = field(
        default_factory=ParametersConfigModeCorrelations
    )
    """Parameters for group mode correlations subflow."""

    population_counts: ParametersConfigPopulationCounts = field(
        default_factory=ParametersConfigPopulationCounts
    )
    """Parameters for group population counts subflow."""

    population_stats: ParametersConfigPopulationStats = field(
        default_factory=ParametersConfigPopulationStats
    )
    """Parameters for group population stats subflow."""

    shape_average: ParametersConfigShapeAverage = field(
        default_factory=ParametersConfigShapeAverage
    )
    """Parameters for group shape average subflow."""

    shape_contours: ParametersConfigShapeContours = field(
        default_factory=ParametersConfigShapeContours
    )
    """Parameters for group shape contours subflow."""

    shape_errors: ParametersConfigShapeErrors = field(default_factory=ParametersConfigShapeErrors)
    """Parameters for group shape errors subflow."""

    shape_modes: ParametersConfigShapeModes = field(default_factory=ParametersConfigShapeModes)
    """Parameters for group shape modes subflow."""

    shape_samples: ParametersConfigShapeSamples = field(
        default_factory=ParametersConfigShapeSamples
    )
    """Parameters for group shape samples subflow."""

    variance_explained: ParametersConfigVarianceExplained = field(
        default_factory=ParametersConfigVarianceExplained
    )
    """Parameters for group variance explained subflow."""
@dataclass
class ContextConfig:
    """
    Context configuration for group cell shapes flow.

    The working location is passed to all load/save tasks in this module.
    """

    working_location: str
    """Location for input and output files (local path or S3 bucket)."""
@dataclass
class SeriesConfig:
    """
    Series configuration for group cell shapes flow.

    Condition keys are combined with the series name to build input file keys
    throughout this module.
    """

    name: str
    """Name of the simulation series."""

    conditions: list[dict]
    """List of series condition dictionaries (must include unique condition "key")."""
@flow(name="group-cell-shapes")
def run_flow(context: ContextConfig, series: SeriesConfig, parameters: ParametersConfig) -> None:
    """
    Main group cell shapes flow.

    Calls the following subflows, if the group is specified:

    - :py:func:`run_flow_group_feature_components`
    - :py:func:`run_flow_group_feature_correlations`
    - :py:func:`run_flow_group_feature_distributions`
    - :py:func:`run_flow_group_mode_correlations`
    - :py:func:`run_flow_group_population_counts`
    - :py:func:`run_flow_group_population_stats`
    - :py:func:`run_flow_group_shape_average`
    - :py:func:`run_flow_group_shape_contours`
    - :py:func:`run_flow_group_shape_errors`
    - :py:func:`run_flow_group_shape_modes`
    - :py:func:`run_flow_group_shape_samples`
    - :py:func:`run_flow_group_variance_explained`
    """

    # Dispatch table pairing each group name with its subflow and parameter
    # set; entries run in the order documented above.
    dispatch = [
        ("feature_components", run_flow_group_feature_components, parameters.feature_components),
        ("feature_correlations", run_flow_group_feature_correlations, parameters.feature_correlations),
        ("feature_distributions", run_flow_group_feature_distributions, parameters.feature_distributions),
        ("mode_correlations", run_flow_group_mode_correlations, parameters.mode_correlations),
        ("population_counts", run_flow_group_population_counts, parameters.population_counts),
        ("population_stats", run_flow_group_population_stats, parameters.population_stats),
        ("shape_average", run_flow_group_shape_average, parameters.shape_average),
        ("shape_contours", run_flow_group_shape_contours, parameters.shape_contours),
        ("shape_errors", run_flow_group_shape_errors, parameters.shape_errors),
        ("shape_modes", run_flow_group_shape_modes, parameters.shape_modes),
        ("shape_samples", run_flow_group_shape_samples, parameters.shape_samples),
        ("variance_explained", run_flow_group_variance_explained, parameters.variance_explained),
    ]

    for group, subflow, group_parameters in dispatch:
        if group in parameters.groups:
            subflow(context, series, group_parameters)
@flow(name="group-cell-shapes_group-feature-components")
def run_flow_group_feature_components(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigFeatureComponents
) -> None:
    """
    Group cell shapes subflow for feature components.

    Z-scores the selected shape features, fits a PCA across all conditions,
    and saves the per-cell component transforms keyed by condition. If both
    reference keys are set, the reference data are projected through the same
    PCA (using the simulation mean/std) and the convex hull of the projection
    is saved as well.
    """

    analysis_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_DATA")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    # Get feature columns. Region "DEFAULT" columns carry no suffix.
    columns = [
        f"{feature}.{region}" if region != "DEFAULT" else feature
        for region in parameters.regions
        for feature in parameters.features
    ]

    # Load data.
    data_key = make_key(analysis_key, f"{series.name}.CELL_SHAPES_DATA.csv")
    data = load_dataframe.with_options(**OPTIONS)(context.working_location, data_key)

    # Fit model on z-scored features so each feature contributes equally.
    pca_data = data[columns]
    pca_data_mean = pca_data.mean(axis=0)
    pca_data_std = pca_data.std(axis=0)
    pca_data_zscore = (pca_data - pca_data_mean) / pca_data_std
    pca = PCA(n_components=parameters.components)
    pca = pca.fit(pca_data_zscore)
    transform = pca.transform(pca_data_zscore)

    # Create output data with one column per principal component.
    feature_components = data[["KEY"]].copy()
    feature_components.rename(columns={"KEY": "key"}, inplace=True)
    for comp in range(parameters.components):
        feature_components[f"component_{comp + 1}"] = transform[:, comp]

    # Save dataframe.
    save_dataframe(
        context.working_location,
        make_key(group_key, f"{series.name}.feature_components.csv"),
        feature_components,
        index=False,
    )

    # Get reference data convex hull (only when both reference keys are set).
    if parameters.reference_metrics is not None and parameters.reference_properties is not None:
        index_columns = ["KEY", "ID", "SEED", "TICK"]
        reference_metrics = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_metrics
        )
        reference_properties = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_properties
        )

        reference_metrics.set_index(index_columns, inplace=True)
        reference_properties.set_index(index_columns, inplace=True)

        # Z-score the reference data with the simulation mean/std so both
        # datasets live in the same PCA space.
        reference = reference_metrics.join(reference_properties, on=index_columns).reset_index()
        reference_zscore = (reference[columns] - pca_data_mean) / pca_data_std
        reference_transform = pca.transform(reference_zscore)

        # NOTE(review): columns ["x", "y"] assume a two-dimensional transform;
        # confirm parameters.components == 2 whenever references are used.
        hull = ConvexHull(reference_transform)
        points = pd.DataFrame(reference_transform[hull.vertices, :], columns=["x", "y"])

        save_dataframe(
            context.working_location,
            make_key(group_key, f"{series.name}.feature_components.REFERENCE.csv"),
            pd.DataFrame(points),
            index=False,
        )
@flow(name="group-cell-shapes_group-feature-correlations")
def run_flow_group_feature_correlations(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigFeatureCorrelations
) -> None:
    """
    Group cell shapes subflow for feature correlations.

    For each condition key and region, transforms coefficient data into shape
    mode space and saves Pearson correlations (plus a linear fit) between each
    shape mode and each shape property. Optionally also saves hex-binned
    scatter data for each mode/property pair.
    """

    # NOTE(review): this subflow reads analysis.SHAPES / analysis.PCA and
    # writes groups.SHAPES, unlike the CELL_SHAPES keys used by the other
    # subflows and the module docstring - confirm against the data layout.
    analysis_shapes_key = make_key(series.name, "analysis", "analysis.SHAPES")
    analysis_pca_key = make_key(series.name, "analysis", "analysis.PCA")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))
    keys = [condition["key"] for condition in series.conditions]

    for key in keys:
        feature_key = f"{series.name}.feature_correlations.{key}"
        series_key = f"{series.name}_{key}_{region_key}"

        # Load model.
        model_key = make_key(analysis_pca_key, f"{series_key}.PCA.pkl")
        model = load_pickle.with_options(**OPTIONS)(context.working_location, model_key)

        # Load dataframe.
        dataframe_key = make_key(analysis_shapes_key, f"{series_key}.SHAPES.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        # Transform data into shape mode space.
        columns = data.filter(like="shcoeffs").columns
        transform = model.transform(data[columns].values)

        for region in parameters.regions:
            correlations: list[dict[str, Union[str, float]]] = []

            for component in range(parameters.components):
                mode_key = f"PC{component + 1}"
                component_data = transform[:, component]

                for prop in parameters.properties:
                    prop_key = prop.upper()
                    # Region "DEFAULT" columns carry no suffix.
                    prop_data = data[f"{prop}.{region}".replace(".DEFAULT", "")]

                    # Linear fit of property against the shape mode.
                    slope, intercept = np.polyfit(component_data, prop_data, 1)

                    correlations.append(
                        {
                            "mode": mode_key,
                            "property": prop.upper(),
                            "correlation": pearsonr(prop_data, component_data).statistic,
                            # Correlation against the absolute mode value.
                            "correlation_symmetric": pearsonr(
                                prop_data, abs(component_data)
                            ).statistic,
                            "slope": slope,
                            "intercept": intercept,
                        }
                    )

                    if not parameters.include_bins:
                        continue

                    prop_limits = parameters.limits[f"{prop}.{region}"]
                    mode_limits = parameters.limits[mode_key]

                    # Hex-bin the (mode, property) scatter with unit weights;
                    # each bin value is the number of points in the bin.
                    bins = bin_to_hex(
                        component_data,
                        prop_data,
                        np.ones(len(prop_data)),
                        scale=0.025,
                        limits=(mode_limits[0], mode_limits[1], prop_limits[0], prop_limits[1]),
                    )
                    bins_df = pd.DataFrame(
                        [[x, y, np.sum(v)] for (x, y), v in bins.items()], columns=["x", "y", "v"]
                    )

                    save_dataframe(
                        context.working_location,
                        make_key(group_key, f"{feature_key}.{mode_key}.{prop_key}.{region}.csv"),
                        bins_df,
                        index=False,
                    )

            save_dataframe(
                context.working_location,
                make_key(group_key, f"{feature_key}.{region}.csv"),
                pd.DataFrame(correlations),
                index=False,
            )
@flow(name="group-cell-shapes_group-feature-distributions")
def run_flow_group_feature_distributions(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigFeatureDistributions
) -> None:
    """
    Group cell shapes subflow for feature distributions.

    For each condition superkey, calculates binned distributions (plus mean,
    standard deviation, min, and max) for each shape feature and saves one
    JSON per feature. Reference metrics/properties, if given, add "filtered"
    variants restricted to the reference value range; a reference PCA model
    plus coefficients add shape mode (PC) distributions.
    """

    analysis_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_DATA")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    keys = [condition["key"] for condition in series.conditions]
    superkeys = {key_group for key in keys for key_group in key.split("_")}

    # Base (unfiltered) features, one per property/region pair. The boolean
    # flag marks whether the entry is filtered against reference data.
    features = [
        (f"{prop}.{region}", False)
        for prop in parameters.properties
        for region in parameters.regions
    ]

    # Initialize all references so later checks are safe when only some (or
    # none) of the reference keys are configured. Previously, configuring
    # reference_properties without reference_metrics (or reference_model
    # without reference_coefficients) raised NameError on the unbound name.
    ref_metrics = None
    ref_props = None
    ref_model = None
    ref_coeffs = None

    if parameters.reference_metrics is not None:
        ref_metrics = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_metrics
        )

    if parameters.reference_properties is not None:
        ref_props = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_properties
        )

    # Add a filtered variant exactly once per base feature that appears in
    # either reference dataframe (region "DEFAULT" columns carry no suffix).
    features.extend(
        [
            (feature, True)
            for feature, filtered in features
            if not filtered
            and (
                (ref_metrics is not None and feature.replace(".DEFAULT", "") in ref_metrics.columns)
                or (ref_props is not None and feature.replace(".DEFAULT", "") in ref_props.columns)
            )
        ]
    )

    if parameters.reference_model is not None and parameters.reference_coefficients is not None:
        # Only the header row is needed to identify the coefficient columns.
        ref_coeffs = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_coefficients, nrows=1
        )
        ref_model = load_pickle.with_options(**OPTIONS)(
            context.working_location, parameters.reference_model
        )
        features.extend(
            [(f"PC{component + 1}", False) for component in range(parameters.components)]
        )

    distribution_bins: dict[tuple[str, bool], dict] = {feature: {} for feature in features}
    distribution_means: dict[tuple[str, bool], dict] = {feature: {} for feature in features}
    distribution_stdevs: dict[tuple[str, bool], dict] = {feature: {} for feature in features}
    distribution_mins: dict[tuple[str, bool], dict] = {feature: {} for feature in features}
    distribution_maxs: dict[tuple[str, bool], dict] = {feature: {} for feature in features}

    for key in superkeys:
        dataframe_key = make_key(analysis_key, f"{series.name}_{key}.CELL_SHAPES_DATA.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        # Guard on the loaded model (not just the parameter) so a model key
        # configured without a coefficients key cannot reach unbound names.
        if ref_model is not None:
            transform = ref_model.transform(data[ref_coeffs.filter(like="shcoeffs").columns].values)
            for component in range(parameters.components):
                data[f"PC{component + 1}"] = transform[:, component]

        for feature, filtered in features:
            feature_column = feature.replace(".DEFAULT", "")
            values = data[feature_column].values

            if filtered:
                # Restrict to the value range observed in the reference data.
                if ref_metrics is not None and feature_column in ref_metrics.columns:
                    ref_max = ref_metrics[feature_column].max()
                    ref_min = ref_metrics[feature_column].min()
                    values = values[(values >= ref_min) & (values <= ref_max)]

                if ref_props is not None and feature_column in ref_props.columns:
                    ref_max = ref_props[feature_column].max()
                    ref_min = ref_props[feature_column].min()
                    values = values[(values >= ref_min) & (values <= ref_max)]

            bounds = (parameters.bounds[feature][0], parameters.bounds[feature][1])
            bandwidth = parameters.bandwidth[feature]

            # Skip (with logging handled by the task) values outside bounds.
            valid = check_data_bounds(values, bounds, f"[ {key} ] feature [ {feature} ]")

            if not valid:
                continue

            distribution_means[(feature, filtered)][key] = np.mean(values)
            distribution_stdevs[(feature, filtered)][key] = np.std(values, ddof=1)
            distribution_bins[(feature, filtered)][key] = calculate_data_bins(
                values, bounds, bandwidth
            )
            distribution_mins[(feature, filtered)][key] = np.min(values)
            distribution_maxs[(feature, filtered)][key] = np.max(values)

    for (feature, filtered), distribution in distribution_bins.items():
        # Attach summary statistics under the "*" pseudo-key.
        distribution["*"] = {
            "bandwidth": parameters.bandwidth[feature],
            "means": distribution_means[(feature, filtered)],
            "stdevs": distribution_stdevs[(feature, filtered)],
            "mins": distribution_mins[(feature, filtered)],
            "maxs": distribution_maxs[(feature, filtered)],
        }

        feature_key = f"{feature.upper()}{'.FILTERED' if filtered else ''}"
        save_json(
            context.working_location,
            make_key(group_key, f"{series.name}.feature_distributions.{feature_key}.json"),
            distribution,
        )
@flow(name="group-cell-shapes_group-mode-correlations")
def run_flow_group_mode_correlations(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigModeCorrelations
) -> None:
    """
    Group cell shapes subflow for mode correlations.

    For each ordered pair of distinct condition keys (plus an optional
    external reference), transforms the pooled coefficient data through both
    PCA models and saves the pairwise Pearson correlations between shape
    modes as a single CSV.
    """

    analysis_shapes_key = make_key(series.name, "analysis", "analysis.SHAPES")
    analysis_pca_key = make_key(series.name, "analysis", "analysis.PCA")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))
    keys = [condition["key"] for condition in series.conditions]

    all_models = {}
    all_data = {}

    for key in keys:
        series_key = f"{series.name}_{key}_{region_key}"

        # Load model.
        model_key = make_key(analysis_pca_key, f"{series_key}.PCA.pkl")
        model = load_pickle.with_options(**OPTIONS)(context.working_location, model_key)
        all_models[key] = model

        # Load dataframe.
        dataframe_key = make_key(analysis_shapes_key, f"{series_key}.SHAPES.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)
        all_data[key] = data

    # Optionally add an external model/data pair as the pseudo-condition
    # "reference" (only when both keys are configured).
    if parameters.reference_model is not None and parameters.reference_data is not None:
        keys.append("reference")
        all_models["reference"] = load_pickle.with_options(**OPTIONS)(
            context.working_location, parameters.reference_model
        )
        all_data["reference"] = load_dataframe.with_options(**OPTIONS)(
            context.working_location, parameters.reference_data
        )

    correlations: list[dict[str, Union[str, int, float]]] = []

    for source_key in keys:
        for target_key in keys:
            if source_key == target_key:
                continue

            # Select data sets.
            data_source = all_data[source_key]
            data_target = all_data[target_key]

            # Select models.
            model_source = all_models[source_key]
            model_target = all_models[target_key]

            # Get column order for model: each model is applied using the
            # coefficient column order of its own training dataframe.
            columns_source = data_source.filter(like="shcoeffs").columns
            columns_target = data_target.filter(like="shcoeffs").columns

            # Transform the pooled (source + target) rows with each model so
            # the two transforms cover the same rows in the same order.
            transform_source = model_source.transform(
                np.append(
                    data_source[columns_source].values,
                    data_target[columns_source].values,
                    axis=0,
                )
            )
            transform_target = model_target.transform(
                np.append(
                    data_source[columns_target].values,
                    data_target[columns_target].values,
                    axis=0,
                )
            )

            # Calculate correlations for every pair of shape modes.
            correlations = correlations + [
                {
                    "source_key": source_key,
                    "target_key": target_key,
                    "source_mode": f"PC{si + 1}",
                    "target_mode": f"PC{ti + 1}",
                    "correlation": pearsonr(
                        transform_source[:, si], transform_target[:, ti]
                    ).statistic,
                }
                for si in range(parameters.components)
                for ti in range(parameters.components)
            ]

    save_dataframe(
        context.working_location,
        make_key(group_key, f"{series.name}.mode_correlations.csv"),
        pd.DataFrame(correlations),
        index=False,
    )
@flow(name="group-cell-shapes_group-population-counts")
def run_flow_group_population_counts(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigPopulationCounts
) -> None:
    """
    Group cell shapes subflow for population counts.

    Counts cells per (key, seed) at the configured time point across all
    condition superkeys and saves the combined counts as a single CSV.
    """

    analysis_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_DATA")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    condition_keys = [condition["key"] for condition in series.conditions]
    superkeys = {part for condition_key in condition_keys for part in condition_key.split("_")}

    counts: list[dict] = []

    for superkey in superkeys:
        dataframe_key = make_key(analysis_key, f"{series.name}_{superkey}.CELL_SHAPES_DATA.csv")
        data = load_dataframe.with_options(**OPTIONS)(
            context.working_location, dataframe_key, usecols=["KEY", "SEED", "time"]
        )

        # Restrict to the requested seeds at the requested time point.
        selected = data[data["SEED"].isin(parameters.seeds) & (data["time"] == parameters.time)]

        # Count rows per (key, seed); size() places the count in column 0.
        grouped = selected.groupby(["KEY", "SEED"]).size().reset_index()
        for record in grouped.to_dict("records"):
            counts.append(
                {
                    "key": record["KEY"],
                    "seed": record["SEED"],
                    "count": record[0],
                }
            )

    save_dataframe(
        context.working_location,
        make_key(group_key, f"{series.name}.population_counts.{parameters.time:03d}.csv"),
        pd.DataFrame(counts).drop_duplicates(),
        index=False,
    )
@flow(name="group-cell-shapes_group-population-stats")
def run_flow_group_population_stats(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigPopulationStats
) -> None:
    """Group cell shapes subflow for population stats."""

    # NOTE(review): these keys ("analysis.STATISTICS" / "groups.SHAPES") do not
    # match the locations documented in the module docstring
    # ("analysis.CELL_SHAPES_DATA" / "groups.CELL_SHAPES") — confirm intended.
    analysis_key = make_key(series.name, "analysis", "analysis.STATISTICS")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))
    keys = [condition["key"] for condition in series.conditions]

    stats: dict[str, dict] = {key: {} for key in keys}

    for key in keys:
        dataframe_key = make_key(analysis_key, f"{series.name}_{key}_{region_key}.STATISTICS.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        for feature, group in data.groupby("FEATURE"):
            # Volume and height features are stored without a region suffix,
            # so tag them with the default region explicitly.
            if feature in ["VOLUME", "HEIGHT"]:
                feature_name = f"{feature}.DEFAULT"
            else:
                feature_name = feature

            ks_values = group["KS_STATISTIC"]
            stats[key][feature_name.upper()] = {
                "size": int(group["SIZE"].sum()),
                "replicates": len(group),
                "mean": ks_values.mean(),
                "std": ks_values.std(ddof=1),
            }

    save_json(
        context.working_location,
        make_key(group_key, f"{series.name}.population_stats.json"),
        stats,
    )
@flow(name="group-cell-shapes_group-shape-average")
def run_flow_group_shape_average(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeAverage
) -> None:
    """
    Group cell shapes subflow for shape average.

    Find the cell closest to the average shape. Extract original mesh slice and
    extent projections. Create reconstructed mesh and extract mesh slice and
    extent projections.
    """

    logger = get_run_logger()

    # NOTE(review): these keys ("analysis.SHAPES" / "analysis.PCA" /
    # "groups.SHAPES") differ from the locations documented in the module
    # docstring — confirm intended.
    analysis_shapes_key = make_key(series.name, "analysis", "analysis.SHAPES")
    analysis_pca_key = make_key(series.name, "analysis", "analysis.PCA")
    data_key = make_key(series.name, "data", "data.LOCATIONS")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))

    for condition in series.conditions:
        key = condition["key"]

        # Load fitted PCA model for this condition.
        model = load_pickle.with_options(**OPTIONS)(
            context.working_location,
            make_key(analysis_pca_key, f"{series.name}_{key}_{region_key}.PCA.pkl"),
        )

        # Load shape coefficient dataframe for this condition.
        data = load_dataframe.with_options(**OPTIONS)(
            context.working_location,
            make_key(analysis_shapes_key, f"{series.name}_{key}_{region_key}.SHAPES.csv"),
        )

        # Transform coefficient data into shape mode space.
        coeff_columns = data.filter(like="shcoeffs").columns
        transform = model.transform(data[coeff_columns].values)

        # The average shape sits at the origin of shape mode space; pick the
        # cell whose transform is nearest to it.
        distance, index = KDTree(transform).query([0] * parameters.components)
        selected = data.iloc[index, :]
        logger.info(
            "[ %s ] seed [ %d ] tick [ %d ] cell [ %d ] with distance [ %.2f ]",
            key,
            selected["SEED"],
            selected["TICK"],
            selected["ID"],
            distance,
        )

        # Load the location archive for the seed containing the selected cell.
        series_key = f"{series.name}_{key}_{selected['SEED']:04d}"
        tar = load_tar(
            context.working_location,
            make_key(data_key, f"{series_key}.LOCATIONS.tar.xz"),
        )

        # Extract the voxels for the selected cell at the selected tick.
        locations = extract_tick_json(tar, series_key, selected["TICK"], "LOCATIONS")
        location = next(item for item in locations if item["id"] == selected["ID"])
        array = make_voxels_array(get_location_voxels(location))

        # Build the original mesh (from voxels) and its projections.
        original_projections = extract_mesh_projections(construct_mesh_from_array(array, array))

        # Build the reconstructed mesh (from coefficients) and its projections.
        reconstructed_mesh = construct_mesh_from_coeffs(
            selected, parameters.order, scale=parameters.scale
        )
        reconstructed_projections = extract_mesh_projections(reconstructed_mesh)

        # Save one json per requested projection axis.
        for projection in parameters.projections:
            shape_average: dict[str, dict] = {
                "original_slice": original_projections[f"{projection}_slice"],
                "original_extent": original_projections[f"{projection}_extent"],
                "reconstructed_slice": reconstructed_projections[f"{projection}_slice"],
            }

            save_json(
                context.working_location,
                make_key(group_key, f"{series.name}.shape_average.{key}.{projection.upper()}.json"),
                shape_average,
            )
@flow(name="group-cell-shapes_group-shape-contours")
def run_flow_group_shape_contours(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeContours
) -> None:
    """Group cell shapes subflow for shape contours."""

    data_key = make_key(series.name, "data", "data.LOCATIONS")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    projection = parameters.projection
    # Index of the projection axis in voxel coordinates (PROJECTIONS reversed).
    projection_index = list(reversed(PROJECTIONS)).index(projection)

    for condition in series.conditions:
        key = condition["key"]

        series_key = f"{series.name}_{key}_{parameters.seed:04d}"
        tar = load_tar(
            context.working_location, make_key(data_key, f"{series_key}.LOCATIONS.tar.xz")
        )

        # Fall back to estimated resolutions when not given explicitly.
        ds = parameters.ds if parameters.ds is not None else estimate_spatial_resolution(key)
        dt = parameters.dt if parameters.dt is not None else estimate_temporal_resolution(key)

        # Convert requested time to a simulation tick and rescale the bounding
        # box extents (keeping a 1-voxel border on each side).
        tick = int(parameters.time / dt)
        box = tuple(int((extent - 2) / ds) + 2 for extent in parameters.box)

        locations = extract_tick_json(tar, series_key, tick, "LOCATIONS")

        for region in parameters.regions:
            all_contours = []

            for location in locations:
                voxels = get_location_voxels(location, None if region == "DEFAULT" else region)

                # Optionally restrict to a single slice along the projection axis.
                if parameters.slice_index is not None:
                    voxels = [
                        voxel
                        for voxel in voxels
                        if voxel[projection_index] == parameters.slice_index
                    ]

                if not voxels:
                    continue

                contours = [
                    (np.array(contour) * ds).astype("int").tolist()
                    for contour in extract_voxel_contours(voxels, projection, box)
                ]

                all_contours.append({"id": location["id"], "contours": contours})

            contour_key = f"{key}.{parameters.seed:04d}.{parameters.time:03d}.{region}"
            save_json(
                context.working_location,
                make_key(
                    group_key,
                    f"{series.name}.shape_contours.{contour_key}.{projection.upper()}.json",
                ),
                all_contours,
            )
@flow(name="group-cell-shapes_group-shape-errors")
def run_flow_group_shape_errors(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeErrors
) -> None:
    """Group cell shapes subflow for shape errors."""

    # NOTE(review): these keys ("analysis.SHAPES" / "groups.SHAPES") differ
    # from the locations documented in the module docstring — confirm intended.
    analysis_key = make_key(series.name, "analysis", "analysis.SHAPES")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    region_key = "_".join(sorted(parameters.regions))
    keys = [condition["key"] for condition in series.conditions]

    errors: dict[str, dict] = {key: {} for key in keys}

    for key in keys:
        dataframe_key = make_key(analysis_key, f"{series.name}_{key}_{region_key}.SHAPES.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        for region in parameters.regions:
            # Error columns are "mse" for the default region, "mse.<region>" otherwise.
            column = f"mse.{region}".replace(".DEFAULT", "")
            errors[key][region] = {
                "mean": data[column].mean(),
                "std": data[column].std(ddof=1),
            }

    save_json(
        context.working_location,
        make_key(group_key, f"{series.name}.shape_errors.json"),
        errors,
    )
@flow(name="group-cell-shapes_group-shape-modes")
def run_flow_group_shape_modes(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeModes
) -> None:
    """
    Group cell shapes subflow for shape modes.

    Extract shape modes from PCAs as dictionaries of svg paths for each map
    point and projection. Consolidate shape modes from keys into single json.
    """

    analysis_data_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_DATA")
    analysis_model_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_MODELS")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    keys = [condition["key"] for condition in series.conditions]
    # Split each condition key on underscores so each component is handled once.
    superkeys = {key_group for key in keys for key_group in key.split("_")}

    for superkey in superkeys:
        series_key = f"{series.name}_{superkey}"

        # Load model.
        model_key = make_key(analysis_model_key, f"{series_key}.CELL_SHAPES_MODELS.pkl")
        model = load_pickle.with_options(**OPTIONS)(context.working_location, model_key)

        # Load dataframe.
        dataframe_key = make_key(analysis_data_key, f"{series_key}.CELL_SHAPES_DATA.csv")
        data = load_dataframe.with_options(**OPTIONS)(context.working_location, dataframe_key)

        # Extract shape modes for each region, component, and map point.
        shape_modes = extract_shape_modes(
            model,
            data,
            parameters.components,
            parameters.regions,
            parameters.order,
            parameters.delta,
        )

        for region in parameters.regions:
            # Bucket the projections by "PC<mode>.<projection>" identifier.
            shape_mode_projections: dict[str, list] = {
                f"PC{component + 1}.{projection}": []
                for component in range(parameters.components)
                for projection in parameters.projections
            }

            for shape_mode in shape_modes[region]:
                for projection in parameters.projections:
                    shape_mode_projections[f"PC{shape_mode['mode']}.{projection}"].append(
                        {
                            "point": shape_mode["point"],
                            "projection": shape_mode["projections"][f"{projection}_slice"],
                        }
                    )

            # Save one json per bucket. The trailing replace collapses the
            # doubled dot produced when region is empty in the formatted key.
            for proj_key, proj_entries in shape_mode_projections.items():
                save_json(
                    context.working_location,
                    make_key(
                        group_key,
                        f"{series.name}.shape_modes.{superkey}.{region}.{proj_key.upper()}.json",
                    ).replace("..", "."),
                    proj_entries,
                )
@flow(name="group-cell-shapes_group-shape-samples")
def run_flow_group_shape_samples(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigShapeSamples
) -> None:
    """
    Group cell shapes subflow for shape samples.

    Extract sample cell shapes from specified simulations. Construct wireframes
    from the cell shape mesh.
    """

    # NOTE(review): this key ("groups.SHAPES") differs from the location
    # documented in the module docstring ("groups.CELL_SHAPES") — confirm intended.
    data_key = make_key(series.name, "data", "data.LOCATIONS")
    group_key = make_key(series.name, "groups", "groups.SHAPES")
    keys = [condition["key"] for condition in series.conditions]

    shape_samples: dict[str, dict] = {}

    for key in keys:
        shape_samples[key] = {region: [] for region in parameters.regions}

        # Load location data.
        series_key = f"{series.name}_{key}_{parameters.seed:04d}"
        tar_key = make_key(data_key, f"{series_key}.LOCATIONS.tar.xz")
        tar = load_tar(context.working_location, tar_key)
        locations = extract_tick_json(tar, series_key, parameters.tick, "LOCATIONS")

        for index in parameters.indices:
            location = locations[index]

            # Full-cell voxels do not depend on the region; compute once per cell.
            voxels = get_location_voxels(location)
            array = make_voxels_array(voxels)

            for region in parameters.regions:
                if region != "DEFAULT":
                    # Region mesh is built against the full-cell reference voxels.
                    region_voxels = get_location_voxels(location, region)
                    region_array = make_voxels_array(region_voxels, reference=voxels)
                    mesh = construct_mesh_from_array(region_array, array)
                else:
                    mesh = construct_mesh_from_array(array, array)

                shape_samples[key][region].append(extract_mesh_wireframe(mesh))

    save_json(
        context.working_location,
        make_key(group_key, f"{series.name}.shape_samples.json"),
        shape_samples,
    )
@flow(name="group-cell-shapes_group-variance-explained")
def run_flow_group_variance_explained(
    context: ContextConfig, series: SeriesConfig, parameters: ParametersConfigVarianceExplained
) -> None:
    """Group cell shapes subflow for variance explained."""

    analysis_key = make_key(series.name, "analysis", "analysis.CELL_SHAPES_MODELS")
    group_key = make_key(series.name, "groups", "groups.CELL_SHAPES")

    keys = [condition["key"] for condition in series.conditions]
    # Split each condition key on underscores so each model is loaded once.
    superkeys = {key_group for key in keys for key_group in key.split("_")}

    variance = []

    for superkey in superkeys:
        model_key = make_key(analysis_key, f"{series.name}_{superkey}.CELL_SHAPES_MODELS.pkl")
        model = load_pickle.with_options(**OPTIONS)(context.working_location, model_key)

        # Slice the variance ratios to the requested number of components so
        # all columns have the same length even when the fitted model retains
        # more components than requested (unequal lengths raise ValueError).
        variance.append(
            pd.DataFrame(
                {
                    "key": [superkey] * parameters.components,
                    "mode": [f"PC{i}" for i in range(1, parameters.components + 1)],
                    "variance": model.explained_variance_ratio_[: parameters.components],
                }
            )
        )

    save_dataframe(
        context.working_location,
        make_key(group_key, f"{series.name}.variance_explained.csv"),
        pd.concat(variance),
        index=False,
    )