From 5540f05fb47db58223eacc1870a633975770b2b9 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 2 Jun 2026 18:35:32 +0000 Subject: [PATCH 01/11] Refactor ingestion pipeline and dataset templates * Introduce a new `dataset_factory` module for schema-driven dataset creation. * Implement `run_segy_ingestion` orchestrator to streamline SEG-Y ingestion. * Enhance dataset templates with dimension specification methods to ensure consistency in data types. * Update `__init__.py` files to include new ingestion components and improve module organization. * Remove deprecated `schema_resolver.py` to clean up the codebase. * Add unit tests to validate new functionalities and ensure robustness. --- refactor_plan_bak.md | 220 +++++++++ src/mdio/__init__.py | 4 + src/mdio/builder/dataset_builder.py | 6 +- src/mdio/builder/templates/base.py | 56 ++- src/mdio/builder/templates/seismic_2d_cdp.py | 6 +- .../templates/seismic_2d_streamer_shot.py | 2 +- src/mdio/builder/templates/seismic_3d_cdp.py | 8 +- src/mdio/builder/templates/seismic_3d_coca.py | 20 +- src/mdio/builder/templates/seismic_3d_obn.py | 21 +- .../templates/seismic_3d_offset_tiles.py | 20 +- .../templates/seismic_3d_receiver_gathers.py | 14 +- .../seismic_3d_shot_receiver_line.py | 20 +- .../templates/seismic_3d_streamer_field.py | 20 +- .../templates/seismic_3d_streamer_shot.py | 2 +- src/mdio/converters/segy.py | 439 +----------------- src/mdio/ingestion/__init__.py | 20 +- src/mdio/ingestion/dataset_factory.py | 262 +++++++++++ src/mdio/ingestion/metadata.py | 2 +- src/mdio/ingestion/schema/__init__.py | 20 + src/mdio/ingestion/schema/models.py | 227 +++++++++ src/mdio/ingestion/schema/resolver.py | 88 ++++ src/mdio/ingestion/schema_resolver.py | 211 --------- src/mdio/ingestion/segy/coordinates.py | 74 ++- src/mdio/ingestion/segy/file_headers.py | 2 +- .../ingestion/{ => segy}/index_strategies.py | 33 +- src/mdio/ingestion/segy/pipeline.py | 194 ++++++++ src/mdio/ingestion/segy/raw_headers.py | 64 +++ src/mdio/ingestion/segy/reader.py | 83 ++++ src/mdio/ingestion/segy/serializer.py | 101 ++++ src/mdio/ingestion/segy/validation.py | 2 +- src/mdio/segy/geometry.py | 114 +---- src/mdio/segy/utilities.py | 171 ------- tests/unit/ingestion/test_dataset_factory.py | 128 +++++ tests/unit/ingestion/test_metadata.py | 12 +- .../ingestion/test_no_template_mutation.py | 36 ++ tests/unit/ingestion/test_schema_effects.py | 112 +++++ .../test_schema_resolver.py} | 49 +- tests/unit/ingestion/test_segy_coordinates.py | 82 +--- .../unit/ingestion/test_segy_file_headers.py | 14 +- .../test_segy_grid_overrides.py | 72 ++- .../test_segy_index_strategies.py} | 51 +- tests/unit/ingestion/test_segy_pipeline.py | 90 ++++ tests/unit/ingestion/test_segy_reader.py | 136 ++++++ .../test_segy_spec_validation.py | 10 +- tests/unit/ingestion/test_segy_validation.py | 14 +- tests/unit/test_segy_file_header_modes.py | 20 +- 46 files changed, 2213 insertions(+), 1139 deletions(-) create mode 100644 refactor_plan_bak.md create mode 100644 src/mdio/ingestion/dataset_factory.py create mode 100644 src/mdio/ingestion/schema/__init__.py create mode 100644 src/mdio/ingestion/schema/models.py create mode 100644 src/mdio/ingestion/schema/resolver.py delete mode 100644 src/mdio/ingestion/schema_resolver.py rename src/mdio/ingestion/{ => segy}/index_strategies.py (91%) create mode 100644 src/mdio/ingestion/segy/pipeline.py create mode 100644 src/mdio/ingestion/segy/raw_headers.py create mode 100644 src/mdio/ingestion/segy/reader.py create mode 100644 src/mdio/ingestion/segy/serializer.py create mode 100644 tests/unit/ingestion/test_dataset_factory.py create mode 100644 tests/unit/ingestion/test_no_template_mutation.py create mode 100644 tests/unit/ingestion/test_schema_effects.py rename tests/unit/{test_ingestion_schema_resolver.py => ingestion/test_schema_resolver.py} (73%) rename tests/unit/{ => ingestion}/test_segy_grid_overrides.py (74%) rename tests/unit/{test_ingestion_index_strategies.py => ingestion/test_segy_index_strategies.py} (93%) create mode 100644 tests/unit/ingestion/test_segy_pipeline.py create mode 100644 tests/unit/ingestion/test_segy_reader.py rename tests/unit/{ => ingestion}/test_segy_spec_validation.py (91%) diff --git a/refactor_plan_bak.md b/refactor_plan_bak.md new file mode 100644 index 00000000..ce33bde9 --- /dev/null +++ b/refactor_plan_bak.md @@ -0,0 +1,220 @@ +--- +name: split-grid-overrides-v3 +overview: Split the current `grid_overrides_v3` branch into 8 dependency-ordered, atomically reviewable PRs against `main`, each with its own changelog story. Each PR is green on its own; PR 6 is the one that flips runtime over to the new orchestrator and is the natural minor-version-bump cut. +todos: + - id: pr1 + content: "PR 1: extract pure helpers (header_analysis, coordinate_utils, validation, metadata) into mdio/ingestion/ with no behavior change" + status: pending + - id: pr2 + content: "PR 2: add Pydantic GridOverrides class + dict deprecation shim, additive only" + status: pending + - id: pr3 + content: "PR 3: replace GridOverrider internals with IndexStrategy pattern, fix the broken test_segy_grid_overrides.py file" + status: pending + - id: pr4 + content: "PR 4: introduce SchemaResolver + ResolvedSchema and template declare_coordinate_specs/apply_resolved_dimensions hooks (dead code, wired in PR 6)" + status: pending + - id: pr5 + content: "PR 5: add HeaderAnalyzer and DatasetFactory services with unit tests (dead code, wired in PR 6)" + status: pending + - id: pr6 + content: "PR 6: introduce run_segy_ingestion orchestrator, slim segy_to_mdio to a shim, expose new public API, delete legacy GridOverrider, decide CLI fate" + status: pending + - id: pr7 + content: "PR 7: extract MDIO__IMPORT__RAW_HEADERS gating into _raw_headers_experimental.py for one-file removal" + status: pending + - id: pr8 + content: "PR 8: dependency bump (uv.lock) and template internal polish" + status: pending +isProject: false +--- + +# Split `grid_overrides_v3` Into 8 Atomic PRs + +## Strategy + +The current branch interleaves five themes: typed config, strategy refactor, schema resolution, services, pipeline rewire, and an experimental flag move. Treat the existing branch as the **source of truth for the final state** and replay it as a stack of branches off `main`. Each PR below cherry-picks a tight subset of files; nothing earlier than PR 6 changes the runtime path of `segy_to_mdio`, so each one merges without a behavior change visible to users. + +Suggested workflow per PR: + +1. `git checkout -b origin/main` +2. `git checkout grid_overrides_v3 -- ` +3. Trim back anything that pulls in scope from later PRs (notes per PR below). +4. Run `nox -s pre-commit` and `nox -s tests-3.13 -- ` per the workspace rule. +5. Open the PR with the suggested title/summary. + +```mermaid +flowchart LR + PR1[PR1 extract pure modules] --> PR2[PR2 Pydantic GridOverrides] + PR1 --> PR3[PR3 strategy pattern] + PR2 --> PR3 + PR3 --> PR4[PR4 schema resolver and template hooks] + PR4 --> PR5[PR5 HeaderAnalyzer and DatasetFactory] + PR5 --> PR6[PR6 pipeline orchestrator and public API] + PR6 --> PR7[PR7 raw-headers gating] + PR6 --> PR8[PR8 dep bump and template polish] +``` + +--- + +## PR 1 - Extract pure modules into `mdio/ingestion/` + +**Goal:** mechanical code move, zero behavior change. Gives the next PRs a home without dragging architecture along. + +**Files (copy from branch verbatim):** +- [src/mdio/ingestion/__init__.py](src/mdio/ingestion/__init__.py) - trimmed to only re-export what this PR introduces +- [src/mdio/ingestion/header_analysis.py](src/mdio/ingestion/header_analysis.py) (the pure analysis primitives + `StreamerShotGeometryType`, `ShotGunGeometryType`, `analyze_streamer_headers`, `analyze_lines_for_guns`, `analyze_non_indexed_headers`) +- [src/mdio/ingestion/coordinate_utils.py](src/mdio/ingestion/coordinate_utils.py) +- [src/mdio/ingestion/validation.py](src/mdio/ingestion/validation.py) (`grid_density_qc`, `validate_spec_in_template`) +- [src/mdio/ingestion/metadata.py](src/mdio/ingestion/metadata.py) MINUS the `attach_raw_binary_header` import (defer to PR 7) + +**Adjustments before commit:** +- In [src/mdio/ingestion/metadata.py](src/mdio/ingestion/metadata.py), drop `from mdio.ingestion._raw_headers_experimental import attach_raw_binary_header` and its call inside `add_segy_file_headers`. Inline the existing on-`main` raw-binary handling instead, or stub it as a TODO (PR 7 reintroduces the gating module). +- Trim `__init__.py` exports to only the symbols actually defined in this PR. +- Update import sites in `mdio/converters/segy.py` and any other on-`main` callers to point at the new module locations. + +**Tests:** existing tests must pass unchanged (`nox -s tests-3.13`). + +**PR title:** `refactor(ingestion): extract analysis, coordinate, validation, and metadata helpers` + +--- + +## PR 2 - Add Pydantic `GridOverrides` (additive) + +**Goal:** the typed model lands as a public class, dict path stays alive internally. + +**Files:** +- [src/mdio/segy/geometry.py](src/mdio/segy/geometry.py) - add the `GridOverrides` class **alongside** the existing `GridOverrider` (do not delete `GridOverrider` yet). +- [src/mdio/converters/segy.py](src/mdio/converters/segy.py) - add the `isinstance(grid_overrides, dict)` shim with `DeprecationWarning` and `GridOverrides.from_legacy_dict(...)` coercion. After coercion, immediately re-dump to dict via `model_dump(by_alias=True, exclude_defaults=True)` so the still-on-`main` `GridOverrider` keeps receiving the dict it expects. +- [src/mdio/__init__.py](src/mdio/__init__.py) - add `GridOverrides` to `__all__` and imports. +- [tests/unit/test_grid_overrides_pydantic.py](tests/unit/test_grid_overrides_pydantic.py) - new file, copy as-is. + +**Acceptance:** existing `tests/unit/test_segy_grid_overrides.py` (which imports `GridOverrider`) still passes; `test_grid_overrides_pydantic.py` passes. + +**PR title:** `feat(api): typed Pydantic GridOverrides with legacy dict deprecation` + +**Changelog line:** "`mdio.GridOverrides` is now the supported way to configure grid overrides. Passing a `dict` still works but emits a `DeprecationWarning`." + +--- + +## PR 3 - Replace `GridOverrider` with strategy pattern + +**Goal:** the "modernization to grid override code" PR you called out. Behavior preserved; the monolithic `GridOverrider.run(...)` becomes `IndexStrategyRegistry().create_strategy(...)` returning composable `IndexStrategy` instances. + +**Files:** +- [src/mdio/ingestion/index_strategies.py](src/mdio/ingestion/index_strategies.py) - new (`IndexStrategy` ABC + `RegularGridStrategy`, `NonBinnedStrategy`, `DuplicateHandlingStrategy`, `ChannelWrappingStrategy`, `ShotWrappingStrategy`, `ComponentSynthesisStrategy`, `CompositeStrategy`, `IndexStrategyRegistry`). +- [src/mdio/segy/geometry.py](src/mdio/segy/geometry.py) - rewrite `GridOverrider.run(...)` as a thin shim: build a `GridOverrides` from the input dict, call `IndexStrategyRegistry`, return the `(headers, names, chunks)` tuple shape that callers expect. Keep the public signature unchanged. +- [tests/unit/test_ingestion_index_strategies.py](tests/unit/test_ingestion_index_strategies.py) - new. +- [tests/unit/test_segy_grid_overrides.py](tests/unit/test_segy_grid_overrides.py) - **fix**: the file currently imports `GridOverrider` and `TemplateRegistry`; either keep the shim alive (above) so this file passes unchanged, or rewrite each test to drive `IndexStrategyRegistry` directly. The branch leaves it in a broken-import state, which is the kind of footgun this PR should clean up. + +**Acceptance:** integration tests `tests/integration/test_import_obn_grid_overrides.py` and `tests/integration/test_import_streamer_grid_overrides.py` pass; new strategy unit tests pass; old `test_segy_grid_overrides.py` either passes or is replaced. + +**PR title:** `refactor(ingestion): split GridOverrider into composable IndexStrategy classes` + +--- + +## PR 4 - Schema resolution + template-aware dimension layout + +**Goal:** declarative final-schema description; templates stop having ingestion poke private attributes. + +**Files:** +- [src/mdio/ingestion/schema_resolver.py](src/mdio/ingestion/schema_resolver.py) - new (`DimensionSpec`, `CoordinateSpec`, `ResolvedSchema`, `SchemaResolver`). +- [src/mdio/builder/templates/base.py](src/mdio/builder/templates/base.py) - add `declare_coordinate_specs()` (default impl over `physical_coordinate_names` + `logical_coordinate_names`) and `apply_resolved_dimensions(dim_names, chunk_shape)`. Add the `synthesize_missing_dims` and `_calculated_dims` attributes. +- Per-template overrides only where the default isn't right: + - [src/mdio/builder/templates/seismic_3d_obn.py](src/mdio/builder/templates/seismic_3d_obn.py) - per-shot-dim coord specs, `_calculated_dims = ("shot_index",)`, `synthesize_missing_dims = ("component",)`. + - [src/mdio/builder/templates/seismic_3d_streamer_field.py](src/mdio/builder/templates/seismic_3d_streamer_field.py) + - [src/mdio/builder/templates/seismic_3d_coca.py](src/mdio/builder/templates/seismic_3d_coca.py) + - [src/mdio/builder/templates/seismic_3d_cdp.py](src/mdio/builder/templates/seismic_3d_cdp.py), [src/mdio/builder/templates/seismic_2d_cdp.py](src/mdio/builder/templates/seismic_2d_cdp.py) - inline/crossline-indexed `cdp_x`/`cdp_y` specs. +- [tests/unit/test_ingestion_schema_resolver.py](tests/unit/test_ingestion_schema_resolver.py) - new. + +**Adjustments:** nothing in this PR consumes `ResolvedSchema` at runtime - the resolver is dead code until PR 5/6 wire it. That is intentional. + +**Acceptance:** all template tests under `tests/unit/v1/templates/` still pass; new resolver tests pass. + +**PR title:** `feat(ingestion): declarative ResolvedSchema and template-driven coordinate specs` + +--- + +## PR 5 - `HeaderAnalyzer` + `DatasetFactory` + +**Goal:** add the two small services that PR 6 will compose. They have a real changelog bullet on their own ("ingestion now reads only the headers required by the resolved schema"). + +**Files:** +- [src/mdio/ingestion/header_analyzer.py](src/mdio/ingestion/header_analyzer.py) +- [src/mdio/ingestion/dataset_factory.py](src/mdio/ingestion/dataset_factory.py) +- [tests/unit/test_ingestion_header_analyzer.py](tests/unit/test_ingestion_header_analyzer.py) +- [tests/unit/test_ingestion_dataset_factory.py](tests/unit/test_ingestion_dataset_factory.py) + +**Adjustments:** still no runtime use; pipeline switch is PR 6. + +**Acceptance:** new unit tests pass; everything else unchanged. + +**PR title:** `feat(ingestion): HeaderAnalyzer and DatasetFactory services` + +--- + +## PR 6 - Pipeline orchestrator + public API surface (the minor-version-bump PR) + +**Goal:** wire PRs 2-5 into a single `run_segy_ingestion`, replace the body of `segy_to_mdio` with a thin shim, surface the new public types. + +**Files:** +- [src/mdio/ingestion/pipeline.py](src/mdio/ingestion/pipeline.py) - new (`run_segy_ingestion`). +- [src/mdio/converters/segy.py](src/mdio/converters/segy.py) - body becomes the dict-deprecation shim + delegate to `run_segy_ingestion`. The `noqa: PLR0913` and old-style internals go away. +- [src/mdio/ingestion/__init__.py](src/mdio/ingestion/__init__.py) - expand to the full `__all__` from the branch. +- [src/mdio/__init__.py](src/mdio/__init__.py) - add `run_segy_ingestion`, `IndexStrategy`, `IndexStrategyRegistry`, `ResolvedSchema`, `CoordinateSpec`, `DimensionSpec`. +- [src/mdio/segy/geometry.py](src/mdio/segy/geometry.py) - **delete** the legacy `GridOverrider` shim that PR 3 kept alive. All callers now route through `run_segy_ingestion`. +- [src/mdio/commands/segy.py](src/mdio/commands/segy.py) - **fix the broken CLI**: it currently calls `segy_to_mdio(segy_path=..., index_bytes=..., index_names=..., chunksize=...)`, none of which match the v1 signature in `converters/segy.py`. Either update the CLI to construct a `SegySpec` + `mdio_template` + `GridOverrides` and call `segy_to_mdio` correctly, or wrap it in a `# TODO(v1.3): rewrite CLI for v1 API` and skip it from CI. Pick one explicitly; do not leave it silently broken. + +**Acceptance:** full `nox -s tests-3.13` passes; integration tests under `tests/integration/` pass; example notebooks (if any) still run. + +**PR title:** `feat(ingestion): run_segy_ingestion orchestrator and v1.2 public API` + +**Changelog highlights:** +- New public functions/classes: `run_segy_ingestion`, `IndexStrategy`, `IndexStrategyRegistry`, `ResolvedSchema`, `CoordinateSpec`, `DimensionSpec`. +- `segy_to_mdio` retained as a v1.x compatibility entry point. +- Memory: ingestion now only parses headers required by the resolved schema (`HeaderAnalyzer`). + +--- + +## PR 7 - Gate experimental raw-headers behind `_raw_headers_experimental.py` + +**Goal:** isolate the `MDIO__IMPORT__RAW_HEADERS` feature so removing it later is a one-file delete. + +**Files:** +- [src/mdio/ingestion/_raw_headers_experimental.py](src/mdio/ingestion/_raw_headers_experimental.py) - new. The module's own docstring is the PR description; copy it into the PR body. +- [src/mdio/ingestion/pipeline.py](src/mdio/ingestion/pipeline.py) - call `maybe_add_raw_headers(mdio_template, mdio_ds)` after `DatasetFactory().build(...)`. +- [src/mdio/ingestion/metadata.py](src/mdio/ingestion/metadata.py) - re-add the `attach_raw_binary_header` import + call inside `add_segy_file_headers` (the inverse of the trim in PR 1). +- [src/mdio/builder/templates/base.py](src/mdio/builder/templates/base.py) - keep the `_add_raw_headers` method (defined in PR 4) in place; no change here unless the prior PR omitted it. + +**Acceptance:** with `MDIO__IMPORT__RAW_HEADERS=1` and Zarr v3, raw_headers variable appears in output; without the env var, no behavior change. + +**PR title:** `refactor: gate experimental raw-headers feature behind a single module` + +--- + +## PR 8 - Dep bump + template polish + +**Goal:** the leftover small stuff. Optional; can ride along with PR 6 if reviewers prefer. + +**Files:** +- [uv.lock](uv.lock) +- Per-template polish in [src/mdio/builder/templates/](src/mdio/builder/templates/) (`_dim_names`/`_calculated_dims` cleanup, dead-method removal). + +**Acceptance:** full test suite green; `nox -s pre-commit` clean. + +**PR title:** `chore(deps): bump dependencies and polish template internals` + +--- + +## Risks & coupling notes + +- **PR 4-6 are the tightest cluster.** PR 4's `ResolvedSchema` is dead code at merge time, PR 5's services are dead code at merge time, PR 6 flips runtime. This staging is intentional - it keeps each diff small and reviewable, at the cost of two PRs that "do nothing yet". If reviewers push back on dead code, fold PRs 4 and 5 together but **never** fold them into PR 6. +- **PR 3 must keep `GridOverrider` callable** until PR 6 deletes it, otherwise existing callers in `mdio/converters/segy.py` (still on the v1.1 path) break mid-stack. +- **`tests/unit/test_segy_grid_overrides.py`** is currently broken on the branch (imports a `GridOverrider` symbol that no longer exists in `mdio.segy.geometry`). PR 3 is the place to fix it. Do not let it land broken in a separate PR. +- **CLI in `mdio/commands/segy.py`** has the same problem (calls v0-style `segy_to_mdio(segy_path=..., index_bytes=..., ...)`). PR 6 must either fix or explicitly defer with a tracked issue - this would be very embarrassing in a release notes diff. +- **Autodocs.** The split gives ~8 changelog bullets instead of one "big refactor" line. PR 2 (typed config), PR 3 (strategy pattern), PR 5 (memory win), and PR 6 (new public API) are all release-note-worthy on their own. + +## Suggested merge order for the next two minor releases + +- v1.2: PRs 1, 2, 7 (no runtime change, deprecation only, easy to revert). +- v1.3: PRs 3, 4, 5, 6, 8 (the actual architectural turn, with the orchestrator landing in PR 6 as the headline). diff --git a/src/mdio/__init__.py b/src/mdio/__init__.py index 7e1851c0..bfae0f30 100644 --- a/src/mdio/__init__.py +++ b/src/mdio/__init__.py @@ -8,6 +8,8 @@ from mdio.api.io import to_mdio from mdio.converters import mdio_to_segy from mdio.converters import segy_to_mdio +from mdio.ingestion import ResolvedSchema +from mdio.ingestion import run_segy_ingestion from mdio.optimize.access_pattern import OptimizedAccessPatternConfig from mdio.optimize.access_pattern import optimize_access_patterns from mdio.segy.geometry import GridOverrides @@ -27,4 +29,6 @@ "segy_to_mdio", "OptimizedAccessPatternConfig", "optimize_access_patterns", + "run_segy_ingestion", + "ResolvedSchema", ] diff --git a/src/mdio/builder/dataset_builder.py b/src/mdio/builder/dataset_builder.py index b2560e1d..81ab5b65 100644 --- a/src/mdio/builder/dataset_builder.py +++ b/src/mdio/builder/dataset_builder.py @@ -4,9 +4,13 @@ from datetime import datetime from enum import Enum from enum import auto +from importlib import metadata from typing import Any -from mdio import __version__ +try: + __version__ = metadata.version("multidimio") +except metadata.PackageNotFoundError: + __version__ = "unknown" from mdio.builder.formatting_html import dataset_builder_repr_html from mdio.builder.schemas.compressors import ZFP from mdio.builder.schemas.compressors import Blosc diff --git a/src/mdio/builder/templates/base.py b/src/mdio/builder/templates/base.py index 644ba91c..928c5926 100644 --- a/src/mdio/builder/templates/base.py +++ b/src/mdio/builder/templates/base.py @@ -114,6 +114,46 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: ) return tuple(specs) + def declare_dimension_specs(self) -> dict[str, ScalarType]: + """Declare the data types for each dimension in this template. + + This is the single source of truth for dimension-coordinate data types: both the + ingestion ``SchemaResolver`` and :meth:`_add_dimension_coordinate` read from it, so + the resolved schema and the built dataset cannot disagree. + + Returns: + A dictionary mapping dimension name to ScalarType. + """ + return dict.fromkeys(self.dimension_names, ScalarType.INT32) + + def _dim_dtype(self, name: str) -> ScalarType: + """Return the declared dtype for a dimension coordinate. + + Sourcing dimension-coordinate dtypes here (and in the ingestion ``SchemaResolver``) + from :meth:`declare_dimension_specs` keeps the built dataset and the resolved schema + from disagreeing without a separate post-build assertion. + + Args: + name: The dimension name. + + Returns: + The declared :class:`ScalarType`, defaulting to ``INT32`` when undeclared. + """ + return self.declare_dimension_specs().get(name, ScalarType.INT32) + + def _add_dimension_coordinate(self, name: str) -> None: + """Add a single dimension coordinate, sourcing its dtype from :meth:`declare_dimension_specs`. + + Args: + name: The dimension name; also the coordinate name and its sole dimension. + """ + self._builder.add_coordinate( + name, + dimensions=(name,), + data_type=self._dim_dtype(name), + metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), + ) + def build_dataset( self, name: str, @@ -354,6 +394,15 @@ def _load_dataset_attributes(self) -> dict[str, Any]: The dataset attributes as a dictionary """ + @property + def units(self) -> dict[str, AllUnitModel]: + """Return a copy of the template's configured units. + + Read-only view for collaborators (e.g. ingestion unit resolution) so they do not + reach into the private ``_units`` mapping. + """ + return dict(self._units) + def get_unit_by_key(self, key: str) -> AllUnitModel | None: """Get units by variable/dimension/coordinate name. Returns None if not found.""" return self._units.get(key, None) @@ -375,12 +424,7 @@ def _add_coordinates(self) -> None: """ # Add dimension coordinates for name in self._dim_names: - self._builder.add_coordinate( - name, - dimensions=(name,), - data_type=ScalarType.INT32, - metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), - ) + self._add_dimension_coordinate(name) # Add non-dimension coordinates # Note: coordinate_names may be modified at runtime by grid overrides, diff --git a/src/mdio/builder/templates/seismic_2d_cdp.py b/src/mdio/builder/templates/seismic_2d_cdp.py index fd2ca808..91185d09 100644 --- a/src/mdio/builder/templates/seismic_2d_cdp.py +++ b/src/mdio/builder/templates/seismic_2d_cdp.py @@ -48,18 +48,18 @@ def _add_coordinates(self) -> None: self._builder.add_coordinate( "cdp", dimensions=("cdp",), - data_type=ScalarType.INT32, + data_type=self._dim_dtype("cdp"), ) self._builder.add_coordinate( self._gather_domain, dimensions=(self._gather_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self._gather_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._gather_domain)), ) self._builder.add_coordinate( self.trace_domain, dimensions=(self.trace_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self.trace_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), ) diff --git a/src/mdio/builder/templates/seismic_2d_streamer_shot.py b/src/mdio/builder/templates/seismic_2d_streamer_shot.py index 4999fba8..7be80d03 100644 --- a/src/mdio/builder/templates/seismic_2d_streamer_shot.py +++ b/src/mdio/builder/templates/seismic_2d_streamer_shot.py @@ -44,7 +44,7 @@ def _add_coordinates(self) -> None: self._builder.add_coordinate( name, dimensions=(name,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(name), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), ) diff --git a/src/mdio/builder/templates/seismic_3d_cdp.py b/src/mdio/builder/templates/seismic_3d_cdp.py index 6490a7de..6cbb6bc6 100644 --- a/src/mdio/builder/templates/seismic_3d_cdp.py +++ b/src/mdio/builder/templates/seismic_3d_cdp.py @@ -48,23 +48,23 @@ def _add_coordinates(self) -> None: self._builder.add_coordinate( "inline", dimensions=("inline",), - data_type=ScalarType.INT32, + data_type=self._dim_dtype("inline"), ) self._builder.add_coordinate( "crossline", dimensions=("crossline",), - data_type=ScalarType.INT32, + data_type=self._dim_dtype("crossline"), ) self._builder.add_coordinate( self._gather_domain, dimensions=(self._gather_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self._gather_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._gather_domain)), ) self._builder.add_coordinate( self.trace_domain, dimensions=(self.trace_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self.trace_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), ) diff --git a/src/mdio/builder/templates/seismic_3d_coca.py b/src/mdio/builder/templates/seismic_3d_coca.py index fc296a33..d83d4fd6 100644 --- a/src/mdio/builder/templates/seismic_3d_coca.py +++ b/src/mdio/builder/templates/seismic_3d_coca.py @@ -34,34 +34,44 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="cdp_y", dimensions=("inline", "crossline"), dtype=ScalarType.FLOAT64), ) + def declare_dimension_specs(self) -> dict[str, ScalarType]: + """Declare the data types for each dimension in this template.""" + return { + "inline": ScalarType.INT32, + "crossline": ScalarType.INT32, + "offset": ScalarType.INT32, + "azimuth": ScalarType.FLOAT32, + self._data_domain: ScalarType.INT32, + } + def _add_coordinates(self) -> None: # Add dimension coordinates self._builder.add_coordinate( "inline", dimensions=("inline",), - data_type=ScalarType.INT32, + data_type=self._dim_dtype("inline"), ) self._builder.add_coordinate( "crossline", dimensions=("crossline",), - data_type=ScalarType.INT32, + data_type=self._dim_dtype("crossline"), ) self._builder.add_coordinate( "offset", dimensions=("offset",), - data_type=ScalarType.INT32, + data_type=self._dim_dtype("offset"), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("offset")), # same unit as X/Y ) self._builder.add_coordinate( "azimuth", dimensions=("azimuth",), - data_type=ScalarType.FLOAT32, + data_type=self._dim_dtype("azimuth"), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("azimuth")), ) self._builder.add_coordinate( self.trace_domain, dimensions=(self.trace_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self.trace_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), ) diff --git a/src/mdio/builder/templates/seismic_3d_obn.py b/src/mdio/builder/templates/seismic_3d_obn.py index e5b81659..503f6022 100644 --- a/src/mdio/builder/templates/seismic_3d_obn.py +++ b/src/mdio/builder/templates/seismic_3d_obn.py @@ -33,6 +33,7 @@ def __init__(self, data_domain: SeismicDataDomain = "time"): self._spatial_dim_names = ("component", "receiver", "shot_line", "gun", "shot_index") self._calculated_dims = ("shot_index",) + self.synthesize_missing_dims = ("component",) self._dim_names = (*self._spatial_dim_names, self._data_domain) self._physical_coord_names = ( "group_coord_x", @@ -63,33 +64,43 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="source_coord_y", dimensions=shot_dims, dtype=ScalarType.FLOAT64), ) + def declare_dimension_specs(self) -> dict[str, ScalarType]: + """Declare the data types for each dimension in this template.""" + return { + "component": ScalarType.UINT8, + "receiver": ScalarType.UINT32, + "shot_line": ScalarType.UINT32, + "gun": ScalarType.UINT8, + self._data_domain: ScalarType.INT32, + } + def _add_coordinates(self) -> None: # Add dimension coordinates # EXCLUDE: `shot_index` since it's 0-N (calculated dimension) self._builder.add_coordinate( "component", dimensions=("component",), - data_type=ScalarType.UINT8, + data_type=self._dim_dtype("component"), ) self._builder.add_coordinate( "receiver", dimensions=("receiver",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("receiver"), ) self._builder.add_coordinate( "shot_line", dimensions=("shot_line",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("shot_line"), ) self._builder.add_coordinate( "gun", dimensions=("gun",), - data_type=ScalarType.UINT8, + data_type=self._dim_dtype("gun"), ) self._builder.add_coordinate( self._data_domain, dimensions=(self._data_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self._data_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._data_domain)), ) diff --git a/src/mdio/builder/templates/seismic_3d_offset_tiles.py b/src/mdio/builder/templates/seismic_3d_offset_tiles.py index 3ec802c7..5eb5df54 100644 --- a/src/mdio/builder/templates/seismic_3d_offset_tiles.py +++ b/src/mdio/builder/templates/seismic_3d_offset_tiles.py @@ -41,36 +41,46 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="cdp_y", dimensions=("inline", "crossline"), dtype=ScalarType.FLOAT64), ) + def declare_dimension_specs(self) -> dict[str, ScalarType]: + """Declare the data types for each dimension in this template.""" + return { + "inline": ScalarType.INT32, + "crossline": ScalarType.INT32, + "inline_offset_tile": ScalarType.INT16, + "crossline_offset_tile": ScalarType.INT16, + self._data_domain: ScalarType.INT32, + } + def _add_coordinates(self) -> None: # Add dimension coordinates self._builder.add_coordinate( "inline", dimensions=("inline",), - data_type=ScalarType.INT32, + data_type=self._dim_dtype("inline"), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("inline")), ) self._builder.add_coordinate( "crossline", dimensions=("crossline",), - data_type=ScalarType.INT32, + data_type=self._dim_dtype("crossline"), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("crossline")), ) self._builder.add_coordinate( "inline_offset_tile", dimensions=("inline_offset_tile",), - data_type=ScalarType.INT16, + data_type=self._dim_dtype("inline_offset_tile"), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("inline_offset_tile")), ) self._builder.add_coordinate( "crossline_offset_tile", dimensions=("crossline_offset_tile",), - data_type=ScalarType.INT16, + data_type=self._dim_dtype("crossline_offset_tile"), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("crossline_offset_tile")), ) self._builder.add_coordinate( self.trace_domain, dimensions=(self.trace_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self.trace_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), ) diff --git a/src/mdio/builder/templates/seismic_3d_receiver_gathers.py b/src/mdio/builder/templates/seismic_3d_receiver_gathers.py index 232f7a22..911ecf87 100644 --- a/src/mdio/builder/templates/seismic_3d_receiver_gathers.py +++ b/src/mdio/builder/templates/seismic_3d_receiver_gathers.py @@ -45,25 +45,33 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="source_coord_y", dimensions=shot_dims, dtype=ScalarType.FLOAT64), ) + def declare_dimension_specs(self) -> dict[str, ScalarType]: + """Declare the data types for each dimension in this template.""" + return { + "receiver": ScalarType.UINT32, + "shot_line": ScalarType.UINT32, + self._data_domain: ScalarType.INT32, + } + def _add_coordinates(self) -> None: # Add dimension coordinates # Note: shot_index is calculated (0-N), so we don't add a coordinate for it self._builder.add_coordinate( "receiver", dimensions=("receiver",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("receiver"), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("receiver")), ) self._builder.add_coordinate( "shot_line", dimensions=("shot_line",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("shot_line"), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("shot_line")), ) self._builder.add_coordinate( self.trace_domain, dimensions=(self.trace_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self.trace_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)), ) diff --git a/src/mdio/builder/templates/seismic_3d_shot_receiver_line.py b/src/mdio/builder/templates/seismic_3d_shot_receiver_line.py index e5e2dbad..311ca046 100644 --- a/src/mdio/builder/templates/seismic_3d_shot_receiver_line.py +++ b/src/mdio/builder/templates/seismic_3d_shot_receiver_line.py @@ -45,32 +45,42 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="orig_field_record_num", dimensions=source_dims, dtype=ScalarType.UINT32), ) + def declare_dimension_specs(self) -> dict[str, ScalarType]: + """Declare the data types for each dimension in this template.""" + return { + "shot_line": ScalarType.UINT32, + "shot_point": ScalarType.UINT32, + "receiver_line": ScalarType.UINT32, + "receiver": ScalarType.UINT32, + self._data_domain: ScalarType.INT32, + } + def _add_coordinates(self) -> None: # Add dimension coordinates self._builder.add_coordinate( "shot_line", dimensions=("shot_line",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("shot_line"), ) self._builder.add_coordinate( "shot_point", dimensions=("shot_point",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("shot_point"), ) self._builder.add_coordinate( "receiver_line", dimensions=("receiver_line",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("receiver_line"), ) self._builder.add_coordinate( "receiver", dimensions=("receiver",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("receiver"), ) self._builder.add_coordinate( self._data_domain, dimensions=(self._data_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self._data_domain), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._data_domain)), ) diff --git a/src/mdio/builder/templates/seismic_3d_streamer_field.py b/src/mdio/builder/templates/seismic_3d_streamer_field.py index c2613e6a..d8e23368 100644 --- a/src/mdio/builder/templates/seismic_3d_streamer_field.py +++ b/src/mdio/builder/templates/seismic_3d_streamer_field.py @@ -52,33 +52,43 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="group_coord_y", dimensions=receiver_dims, dtype=ScalarType.FLOAT64), ) + def declare_dimension_specs(self) -> dict[str, ScalarType]: + """Declare the data types for each dimension in this template.""" + return { + "sail_line": ScalarType.UINT32, + "gun": ScalarType.UINT8, + "cable": ScalarType.UINT8, + "channel": ScalarType.UINT16, + self._data_domain: ScalarType.INT32, + } + def _add_coordinates(self) -> None: # Add dimension coordinates # EXCLUDE: `shot_index` since its 0-N self._builder.add_coordinate( "sail_line", dimensions=("sail_line",), - data_type=ScalarType.UINT32, + data_type=self._dim_dtype("sail_line"), ) self._builder.add_coordinate( "gun", dimensions=("gun",), - data_type=ScalarType.UINT8, + data_type=self._dim_dtype("gun"), ) self._builder.add_coordinate( "cable", dimensions=("cable",), - data_type=ScalarType.UINT8, + data_type=self._dim_dtype("cable"), ) self._builder.add_coordinate( "channel", dimensions=("channel",), - data_type=ScalarType.UINT16, + data_type=self._dim_dtype("channel"), ) self._builder.add_coordinate( self._data_domain, dimensions=(self._data_domain,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(self._data_domain), ) # Add non-dimension coordinates diff --git a/src/mdio/builder/templates/seismic_3d_streamer_shot.py b/src/mdio/builder/templates/seismic_3d_streamer_shot.py index 4709dba3..744c5fe6 100644 --- a/src/mdio/builder/templates/seismic_3d_streamer_shot.py +++ b/src/mdio/builder/templates/seismic_3d_streamer_shot.py @@ -46,7 +46,7 @@ def _add_coordinates(self) -> None: self._builder.add_coordinate( name, dimensions=(name,), - data_type=ScalarType.INT32, + data_type=self._dim_dtype(name), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), ) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 1dd9d20a..8692b40e 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -5,347 +5,20 @@ import logging from typing import TYPE_CHECKING -import numpy as np -import zarr -from segy.config import SegyFileSettings -from segy.config import SegyHeaderOverrides - -from mdio.api.io import _normalize_path -from mdio.api.io import to_mdio -from mdio.builder.schemas.chunk_grid import RegularChunkGrid -from mdio.builder.schemas.chunk_grid import RegularChunkShape -from mdio.builder.schemas.compressors import Blosc -from mdio.builder.schemas.compressors import BloscCname -from mdio.builder.schemas.dtype import ScalarType -from mdio.builder.schemas.v1.variable import VariableMetadata -from mdio.builder.xarray_builder import to_xarray_dataset -from mdio.constants import ZarrFormat -from mdio.converters.exceptions import GridTraceCountError -from mdio.converters.type_converter import to_structured_type -from mdio.core.config import MDIOSettings -from mdio.core.grid import Grid -from mdio.core.utils_write import MAX_COORDINATES_BYTES -from mdio.core.utils_write import MAX_SIZE_LIVE_MASK -from mdio.core.utils_write import get_constrained_chunksize -from mdio.ingestion.grid_qc import grid_density_qc -from mdio.ingestion.metadata import _add_grid_override_to_metadata -from mdio.ingestion.segy.coordinates import _get_coordinates -from mdio.ingestion.segy.coordinates import _get_spatial_coordinate_unit -from mdio.ingestion.segy.coordinates import _populate_coordinates -from mdio.ingestion.segy.coordinates import _update_template_units -from mdio.ingestion.segy.file_headers import _add_segy_file_headers -from mdio.ingestion.segy.validation import _validate_spec_in_template -from mdio.segy import blocked_io -from mdio.segy.file import get_segy_file_info +from mdio.ingestion.segy.pipeline import run_segy_ingestion from mdio.segy.geometry import GridOverrides -from mdio.segy.utilities import get_grid_plan + +logger = logging.getLogger(__name__) if TYPE_CHECKING: from pathlib import Path from typing import Any - from segy.arrays import HeaderArray as SegyHeaderArray + from segy.config import SegyHeaderOverrides from segy.schema import SegySpec from upath import UPath - from xarray import Dataset as xr_Dataset - from mdio.builder.schemas import Dataset from mdio.builder.templates.base import AbstractDatasetTemplate - from mdio.core.dimension import Dimension - from mdio.segy.file import SegyFileArguments - from mdio.segy.file import SegyFileInfo - -logger = logging.getLogger(__name__) - - -def _patch_add_coordinates_for_non_binned( - template: AbstractDatasetTemplate, - non_binned_dims: set[str], -) -> None: - """Patch template's _add_coordinates to skip adding non-binned dims as dimension coordinates. - - When NonBinned override is used, dimensions like 'offset' or 'azimuth' become coordinates - instead of dimensions. However, template subclasses may still try to add them as 1D - dimension coordinates (e.g., with dimensions=("offset",)). Since 'offset' is no longer - a dimension, the builder substitutes 'trace', resulting in wrong coordinate dimensions. - - This function patches the template's _add_coordinates method to intercept calls to - builder.add_coordinate and skip adding coordinates that are non-binned dims with - single-element dimension tuples. These coordinates will be added later by build_dataset - with the correct spatial_dimension_names (e.g., (inline, crossline, trace)). - - Args: - template: The template to patch - non_binned_dims: Set of dimension names that became coordinates due to NonBinned override - """ - # Check if already patched to avoid duplicate patching - if hasattr(template, "_mdio_non_binned_patched"): - return - - # Store the original _add_coordinates method - original_add_coordinates = template._add_coordinates - - def patched_add_coordinates() -> None: - """Wrapper that intercepts builder.add_coordinate calls for non-binned dims.""" - # Store the original add_coordinate method from the builder - original_builder_add_coordinate = template._builder.add_coordinate - - def filtered_add_coordinate( # noqa: ANN202 - name: str, - *, - dimensions: tuple[str, ...], - **kwargs, # noqa: ANN003 - ): - """Skip adding non-binned dims as 1D dimension coordinates.""" - # Check if this is a non-binned dim being added as a 1D dimension coordinate - # (i.e., the coordinate name matches a non-binned dim and has only 1 dimension) - if name in non_binned_dims and len(dimensions) == 1: - logger.debug( - "Skipping 1D coordinate '%s' with dims %s - will be added with full spatial dims", - name, - dimensions, - ) - return template._builder # Return builder for chaining, but don't add - - # Otherwise, call the original method - return original_builder_add_coordinate(name, dimensions=dimensions, **kwargs) - - # Temporarily replace builder's add_coordinate - template._builder.add_coordinate = filtered_add_coordinate - - try: - # Call the original _add_coordinates - original_add_coordinates() - finally: - # Restore the original add_coordinate method - template._builder.add_coordinate = original_builder_add_coordinate - - # Replace the template's _add_coordinates method - template._add_coordinates = patched_add_coordinates - - # Mark as patched to prevent duplicate patching - template._mdio_non_binned_patched = True - - -def _update_template_from_grid_overrides( - template: AbstractDatasetTemplate, - grid_overrides: GridOverrides | None, - segy_dimensions: list[Dimension], - full_chunk_shape: tuple[int, ...], - chunk_size: tuple[int, ...], -) -> None: - """Update template attributes to match grid plan results after grid overrides. - - This function modifies the template in-place to reflect changes from grid overrides: - - Updates chunk shape if it changed due to overrides - - Updates dimension names if they changed due to overrides - - Adds non-binned dimensions as coordinates for NonBinned override - - Patches _add_coordinates to skip adding non-binned dims as dimension coordinates - - Args: - template: The template to update - grid_overrides: Grid override configuration - segy_dimensions: Dimensions returned from grid planning - full_chunk_shape: Original template chunk shape - chunk_size: Chunk size returned from grid planning - """ - # Update template to match grid_plan results after grid overrides - # Extract actual spatial dimensions from segy_dimensions (excluding vertical dimension) - actual_spatial_dims = tuple(dim.name for dim in segy_dimensions[:-1]) - - # Align chunk_size with actual dimensions - truncate if dimensions were filtered out - num_actual_spatial = len(actual_spatial_dims) - num_chunk_spatial = len(chunk_size) - 1 # Exclude vertical dimension chunk - if num_actual_spatial != num_chunk_spatial: - # Truncate chunk_size to match actual dimensions - chunk_size = chunk_size[:num_actual_spatial] + (chunk_size[-1],) - - if full_chunk_shape != chunk_size: - logger.debug( - "Adjusting template chunk shape from %s to %s to match grid after overrides", - full_chunk_shape, - chunk_size, - ) - template._var_chunk_shape = chunk_size - - # Update dimensions if they don't match grid_plan results - if template.spatial_dimension_names != actual_spatial_dims: - logger.debug( - "Adjusting template dimensions from %s to %s to match grid after overrides", - template.spatial_dimension_names, - actual_spatial_dims, - ) - template._dim_names = actual_spatial_dims + (template.trace_domain,) - template._grid_overrides_applied = True - - # If using NonBinned override, expose non-binned dims as logical coordinates on the template instance - # and patch _add_coordinates to skip adding them as 1D dimension coordinates - if grid_overrides is not None and grid_overrides.non_binned and grid_overrides.non_binned_dims: - non_binned_dims = tuple(grid_overrides.non_binned_dims) - logger.debug( - "NonBinned grid override: exposing non-binned dims as coordinates: %s", - non_binned_dims, - ) - # Append any missing names; keep existing order and avoid duplicates - existing = set(template.coordinate_names) - to_add = tuple(n for n in non_binned_dims if n not in existing) - if to_add: - template._logical_coord_names = template._logical_coord_names + to_add - template._grid_overrides_applied = True - - # Patch _add_coordinates to skip adding non-binned dims as 1D dimension coordinates - # This prevents them from being added with wrong dimensions (e.g., just "trace") - # They will be added later by build_dataset with full spatial_dimension_names - _patch_add_coordinates_for_non_binned(template, set(non_binned_dims)) - - -def _scan_for_headers( - segy_file_kwargs: SegyFileArguments, - segy_file_info: SegyFileInfo, - template: AbstractDatasetTemplate, - grid_overrides: GridOverrides | None = None, -) -> tuple[list[Dimension], SegyHeaderArray]: - """Extract trace dimensions and index headers from the SEG-Y file. - - This is an expensive operation. - It scans the SEG-Y file in chunks by using ProcessPoolExecutor. - - Note: - If grid_overrides are applied to the template before calling this function, - the chunk_size returned from get_grid_plan should match the template's chunk shape. - """ - full_chunk_shape = template.full_chunk_shape - segy_dimensions, chunk_size, segy_headers = get_grid_plan( - segy_file_kwargs=segy_file_kwargs, - segy_file_info=segy_file_info, - return_headers=True, - template=template, - chunksize=full_chunk_shape, - grid_overrides=grid_overrides, - ) - - _update_template_from_grid_overrides( - template=template, - grid_overrides=grid_overrides, - segy_dimensions=segy_dimensions, - full_chunk_shape=full_chunk_shape, - chunk_size=chunk_size, - ) - - return segy_dimensions, segy_headers - - -def _build_and_check_grid( - segy_dimensions: list[Dimension], - segy_file_info: SegyFileInfo, - segy_headers: SegyHeaderArray, -) -> Grid: - """Build and check the grid from the SEG-Y headers and dimensions. - - Args: - segy_dimensions: List of of all SEG-Y dimensions to build grid from. - segy_file_info: SegyFileInfo instance containing the SEG-Y file information. - segy_headers: Headers read in from SEG-Y file for building the trace map. - - Returns: - A grid instance populated with the dimensions and trace index map. - - Raises: - GridTraceCountError: If number of traces in SEG-Y file does not match the parsed grid - """ - grid = Grid(dims=segy_dimensions) - num_traces = segy_file_info.num_traces - grid_density_qc(grid, num_traces) - grid.build_map(segy_headers) - # Check grid validity by comparing trace numbers - if np.sum(grid.live_mask) != num_traces: - for dim_name in grid.dim_names: - dim_min, dim_max = grid.get_min(dim_name), grid.get_max(dim_name) - logger.warning("%s min: %s max: %s", dim_name, dim_min, dim_max) - logger.warning("Ingestion grid shape: %s.", grid.shape) - raise GridTraceCountError(np.sum(grid.live_mask), num_traces) - return grid - - -def _add_raw_headers_to_template(mdio_template: AbstractDatasetTemplate) -> AbstractDatasetTemplate: - """Add raw headers capability to the MDIO template by monkey-patching its _add_variables method. - - This function modifies the template's _add_variables method to also add a raw headers variable - with the following characteristics: - - Same rank as the Headers variable (all dimensions except vertical) - - Name: "RawHeaders" - - Type: ScalarType.HEADERS - - No coordinates - - zstd compressor - - No additional metadata - - Chunked the same as the Headers variable - - Args: - mdio_template: The MDIO template to mutate - Returns: - The mutated MDIO template - """ - # Check if raw headers enhancement has already been applied to avoid duplicate additions - if hasattr(mdio_template, "_mdio_raw_headers_enhanced"): - return mdio_template - - # Store the original _add_variables method - original_add_variables = mdio_template._add_variables - - def enhanced_add_variables() -> None: - # Call the original method first - original_add_variables() - - # Now add the raw headers variable - chunk_shape = mdio_template.full_chunk_shape[:-1] - - # Create chunk grid metadata - chunk_metadata = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunk_shape)) - - # Add the raw headers variable using the builder's add_variable method - mdio_template._builder.add_variable( - name="raw_headers", - long_name="Raw Headers", - dimensions=mdio_template.spatial_dimension_names, - data_type=ScalarType.BYTES240, - compressor=Blosc(cname=BloscCname.zstd), - coordinates=None, # No coordinates as specified - metadata=VariableMetadata(chunk_grid=chunk_metadata), - ) - - # Replace the template's _add_variables method - mdio_template._add_variables = enhanced_add_variables - - # Mark the template as enhanced to prevent duplicate monkey-patching - mdio_template._mdio_raw_headers_enhanced = True - - return mdio_template - - -def _chunk_variable(ds: Dataset, target_variable_name: str) -> None: - """Determines and sets the chunking for a specific Variable in the Dataset.""" - # Find variable index by name - index = next((i for i, obj in enumerate(ds.variables) if obj.name == target_variable_name), None) - - def determine_target_size(var_type: str) -> int: - """Determines the target size (in bytes) for a Variable based on its type.""" - if var_type == "bool": - return MAX_SIZE_LIVE_MASK - return MAX_COORDINATES_BYTES - - # Create the chunk grid metadata - var_type = ds.variables[index].data_type - full_shape = tuple(dim.size for dim in ds.variables[index].dimensions) - target_size = determine_target_size(var_type) - - chunk_shape = get_constrained_chunksize(full_shape, var_type, target_size) - chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunk_shape)) - - # Create variable metadata if it doesn't exist - if ds.variables[index].metadata is None: - ds.variables[index].metadata = VariableMetadata() - - ds.variables[index].metadata.chunk_grid = chunk_grid def _coerce_grid_overrides( @@ -354,7 +27,7 @@ def _coerce_grid_overrides( """Normalize public ``grid_overrides`` input into a :class:`GridOverrides` model. The internal ingestion pipeline only accepts the typed model. A legacy ``dict`` is - converted via :meth:`GridOverrides.from_legacy_dict` and a deprecation message is logged. + converted and a deprecation message is logged. """ if grid_overrides is None: return None @@ -369,7 +42,7 @@ def _coerce_grid_overrides( return GridOverrides.model_validate(grid_overrides) -def segy_to_mdio( # noqa PLR0913 +def segy_to_mdio( # noqa: PLR0913 segy_spec: SegySpec, mdio_template: AbstractDatasetTemplate, input_path: UPath | Path | str, @@ -391,101 +64,15 @@ def segy_to_mdio( # noqa PLR0913 grid_overrides: Option to add grid overrides. Prefer a :class:`mdio.GridOverrides` instance; ``dict`` is still accepted but emits a :class:`DeprecationWarning`. segy_header_overrides: Option to override specific SEG-Y headers during ingestion. - - Raises: - FileExistsError: If the output location already exists and overwrite is False. """ typed_grid_overrides = _coerce_grid_overrides(grid_overrides) - settings = MDIOSettings() - - _validate_spec_in_template(segy_spec, mdio_template) - - input_path = _normalize_path(input_path) - output_path = _normalize_path(output_path) - - if not overwrite and output_path.exists(): - err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." - raise FileExistsError(err) - - segy_settings = SegyFileSettings(storage_options=input_path.storage_options) - segy_file_kwargs: SegyFileArguments = { - "url": input_path.as_posix(), - "spec": segy_spec, - "settings": segy_settings, - "header_overrides": segy_header_overrides, - } - segy_file_info = get_segy_file_info(segy_file_kwargs) - - segy_dimensions, segy_headers = _scan_for_headers( - segy_file_kwargs, - segy_file_info, - template=mdio_template, - grid_overrides=typed_grid_overrides, - ) - grid = _build_and_check_grid(segy_dimensions, segy_file_info, segy_headers) - - _, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template) - - # Explicitly delete segy_headers to free memory - coordinate values have been copied - del segy_headers - - header_dtype = to_structured_type(segy_spec.trace.header.dtype) - - if settings.raw_headers: - if zarr.config.get("default_zarr_format") == ZarrFormat.V2: - logger.warning("Raw headers are only supported for Zarr v3. Skipping raw headers.") - else: - logger.warning("MDIO__IMPORT__RAW_HEADERS is experimental and expected to change or be removed.") - mdio_template = _add_raw_headers_to_template(mdio_template) - - spatial_unit = _get_spatial_coordinate_unit(segy_file_info) - mdio_template = _update_template_units(mdio_template, spatial_unit) - mdio_ds: Dataset = mdio_template.build_dataset(name=mdio_template.name, sizes=grid.shape, header_dtype=header_dtype) - - _add_grid_override_to_metadata(dataset=mdio_ds, grid_overrides=typed_grid_overrides) - - # Dynamically chunk the variables based on their type - _chunk_variable(ds=mdio_ds, target_variable_name="trace_mask") # trace_mask is a Variable and not a Coordinate - for coord in mdio_template.coordinate_names: - _chunk_variable(ds=mdio_ds, target_variable_name=coord) - - xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds) - - xr_dataset, drop_vars_delayed = _populate_coordinates( - dataset=xr_dataset, - grid=grid, - coords=non_dim_coords, - spatial_coordinate_scalar=segy_file_info.coordinate_scalar, - ) - - xr_dataset = _add_segy_file_headers(xr_dataset, segy_file_info) - - xr_dataset.trace_mask.data[:] = grid.live_mask - # IMPORTANT: Do not drop the "trace_mask" here, as it will be used later in - # blocked_io.to_zarr() -> _workers.trace_worker() - - # This will create the Zarr store with the correct structure but with empty arrays - to_mdio(xr_dataset, output_path=output_path, mode="w", compute=False) - - # This will write the non-dimension coordinates and trace mask - # We also remove dimensions that don't have associated coordinates - unindexed_dims = [d for d in xr_dataset.dims if d not in xr_dataset.coords] - [drop_vars_delayed.remove(d) for d in unindexed_dims] - meta_ds = xr_dataset[drop_vars_delayed + ["trace_mask"]] - to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) - - # Now we can drop them to simplify chunked write of the data variable - xr_dataset = xr_dataset.drop_vars(drop_vars_delayed) - - # Write the headers and traces in chunks using grid_map to indicate dead traces - default_variable_name = mdio_template.default_variable_name - # This is an memory-expensive and time-consuming read-write operation - # performed in chunks to save the memory - blocked_io.to_zarr( - segy_file_kwargs=segy_file_kwargs, + return run_segy_ingestion( + segy_spec=segy_spec, + mdio_template=mdio_template, + input_path=input_path, output_path=output_path, - grid_map=grid.map, - dataset=xr_dataset, - data_variable_name=default_variable_name, + overwrite=overwrite, + grid_overrides=typed_grid_overrides, + segy_header_overrides=segy_header_overrides, ) diff --git a/src/mdio/ingestion/__init__.py b/src/mdio/ingestion/__init__.py index 00baad73..6624f7e6 100644 --- a/src/mdio/ingestion/__init__.py +++ b/src/mdio/ingestion/__init__.py @@ -1 +1,19 @@ -"""MDIO ingestion helpers.""" +"""MDIO ingestion pipeline components. + +This is the advanced ingestion namespace. ``CoordinateSpec`` is intentionally not +re-exported here; its canonical home is :mod:`mdio.builder.templates.types`. +""" + +from mdio.ingestion.schema import DimensionSpec +from mdio.ingestion.schema import ResolvedSchema +from mdio.ingestion.segy.index_strategies import IndexStrategy +from mdio.ingestion.segy.index_strategies import IndexStrategyRegistry +from mdio.ingestion.segy.pipeline import run_segy_ingestion + +__all__ = [ + "DimensionSpec", + "IndexStrategy", + "IndexStrategyRegistry", + "ResolvedSchema", + "run_segy_ingestion", +] diff --git a/src/mdio/ingestion/dataset_factory.py b/src/mdio/ingestion/dataset_factory.py new file mode 100644 index 00000000..84d9b496 --- /dev/null +++ b/src/mdio/ingestion/dataset_factory.py @@ -0,0 +1,262 @@ +"""Schema-driven factory for MDIO datasets.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING +from typing import Any + +from mdio.builder.dataset_builder import MDIODatasetBuilder +from mdio.builder.schemas import compressors +from mdio.builder.schemas.chunk_grid import RegularChunkGrid +from mdio.builder.schemas.chunk_grid import RegularChunkShape +from mdio.builder.schemas.dtype import ScalarType +from mdio.builder.schemas.v1.variable import CoordinateMetadata +from mdio.builder.schemas.v1.variable import VariableMetadata +from mdio.core.utils_write import MAX_COORDINATES_BYTES +from mdio.core.utils_write import MAX_SIZE_LIVE_MASK +from mdio.core.utils_write import get_constrained_chunksize + +if TYPE_CHECKING: + from mdio.builder.schemas import Dataset + from mdio.builder.schemas.dtype import StructuredType + from mdio.builder.schemas.v1.units import AllUnitModel + from mdio.ingestion.schema import ResolvedSchema + +logger = logging.getLogger(__name__) + + +def _chunk_variable(ds: Dataset, target_variable_name: str) -> None: + """Determines and sets the chunking for a specific Variable in the Dataset.""" + index = next((i for i, obj in enumerate(ds.variables) if obj.name == target_variable_name), None) + if index is None: + return + + def determine_target_size(var_type: str) -> int: + """Determines the target size (in bytes) for a Variable based on its type.""" + if var_type == "bool": + return MAX_SIZE_LIVE_MASK + return MAX_COORDINATES_BYTES + + var_type = ds.variables[index].data_type + full_shape = tuple(dim.size for dim in ds.variables[index].dimensions) + target_size = determine_target_size(var_type) + + chunk_shape = get_constrained_chunksize(full_shape, var_type, target_size) + chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=chunk_shape)) + + if ds.variables[index].metadata is None: + ds.variables[index].metadata = VariableMetadata() + + ds.variables[index].metadata.chunk_grid = chunk_grid + + +def _resolve_chunks(chunk_shape: tuple[int, ...], sizes: tuple[int, ...]) -> tuple[int, ...]: + """Resolve chunk shapes by substituting -1 with actual sizes. + + Args: + chunk_shape: Configured chunk shape. + sizes: Actual sizes of each dimension. + + Returns: + tuple[int, ...]: Resolved chunk shape. + """ + return tuple(size if chunk_size == -1 else chunk_size for chunk_size, size in zip(chunk_shape, sizes, strict=True)) + + +def _create_dataset_builder(schema: ResolvedSchema) -> MDIODatasetBuilder: + """Create and initialize the MDIODatasetBuilder with attributes. + + Args: + schema: Resolved schema. + + Returns: + MDIODatasetBuilder: The initialized dataset builder. + """ + attributes = dict(schema.metadata) if schema.metadata else {} + attributes["defaultVariableName"] = schema.default_variable_name + + return MDIODatasetBuilder(name=schema.name, attributes=attributes) + + +def _add_dimensions_and_coordinates( + builder: MDIODatasetBuilder, + schema: ResolvedSchema, + sizes: tuple[int, ...], + units: dict[str, AllUnitModel], +) -> None: + """Add dimensions and coordinates to the builder. + + Args: + builder: MDIO dataset builder. + schema: Resolved schema. + sizes: Actual sizes of each dimension. + units: Dictionary mapping coordinate/dimension names to AllUnitModel. + + Raises: + ValueError: If a coordinate with the same name already exists but has different attributes. + """ + for dim_spec, size in zip(schema.dimensions, sizes, strict=True): + builder.add_dimension(dim_spec.name, size) + + for dim_spec in schema.dimensions: + if dim_spec.is_calculated and dim_spec.name != "trace": + continue + builder.add_coordinate( + name=dim_spec.name, + dimensions=(dim_spec.name,), + data_type=dim_spec.dtype, + metadata=CoordinateMetadata(units_v1=units.get(dim_spec.name)), + ) + + compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) + for coord_spec in schema.coordinates: + try: + builder.add_coordinate( + name=coord_spec.name, + dimensions=coord_spec.dimensions, + data_type=coord_spec.dtype, + compressor=compressor, + metadata=CoordinateMetadata(units_v1=units.get(coord_spec.name)), + ) + except ValueError as exc: + if "same name twice" not in str(exc): + raise + + +def _add_trace_mask_and_headers( + builder: MDIODatasetBuilder, + schema: ResolvedSchema, + resolved_chunks: tuple[int, ...], + header_dtype: StructuredType | None, +) -> None: + """Add trace mask and headers variables to the builder. + + Args: + builder: MDIO dataset builder. + schema: Resolved schema. + resolved_chunks: Resolved chunk shapes. + header_dtype: Structured dtype for trace headers. + """ + compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) + spatial_dim_names = tuple(dim.name for dim in schema.dimensions if dim.is_spatial) + coordinate_names = [coord.name for coord in schema.coordinates] + + builder.add_variable( + name="trace_mask", + dimensions=spatial_dim_names, + data_type=ScalarType.BOOL, + compressor=compressor, + coordinates=coordinate_names, + ) + + if header_dtype is not None: + chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=resolved_chunks[:-1])) + builder.add_variable( + name="headers", + dimensions=spatial_dim_names, + data_type=header_dtype, + compressor=compressor, + coordinates=coordinate_names, + metadata=VariableMetadata(chunk_grid=chunk_grid), + ) + + +def _add_main_and_extra_variables( + builder: MDIODatasetBuilder, + schema: ResolvedSchema, + resolved_chunks: tuple[int, ...], + units: dict[str, AllUnitModel], + extra_variables: list[dict[str, Any]], +) -> None: + """Add main data and extra variables to the builder. + + Args: + builder: MDIO dataset builder. + schema: Resolved schema. + resolved_chunks: Resolved chunk shapes. + units: Dictionary mapping coordinate/dimension names to AllUnitModel. + extra_variables: Optional list of additional variables. + """ + compressor = compressors.Blosc(cname=compressors.BloscCname.zstd) + coordinate_names = [coord.name for coord in schema.coordinates] + + chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=resolved_chunks)) + builder.add_variable( + name=schema.default_variable_name, + dimensions=tuple(dim.name for dim in schema.dimensions), + data_type=ScalarType.FLOAT32, + compressor=compressor, + coordinates=coordinate_names, + metadata=VariableMetadata(chunk_grid=chunk_grid, units_v1=units.get(schema.default_variable_name)), + ) + + for var_dict in extra_variables: + builder.add_variable(**var_dict) + + +def _apply_dynamic_chunking(ds: Dataset, coordinate_names: list[str]) -> None: + """Dynamically chunk trace_mask and coordinates based on their data types and sizes. + + Args: + ds: The dataset model. + coordinate_names: List of coordinate names. + """ + _chunk_variable(ds=ds, target_variable_name="trace_mask") + for coord_name in coordinate_names: + _chunk_variable(ds=ds, target_variable_name=coord_name) + + +def build_mdio_dataset( + schema: ResolvedSchema, + sizes: tuple[int, ...], + header_dtype: StructuredType | None = None, + units: dict[str, AllUnitModel] | None = None, + extra_variables: list[dict[str, Any]] | None = None, +) -> Dataset: + """Build an MDIO Dataset model purely from a ResolvedSchema and dimensions sizes. + + Args: + schema: Resolved schema describing dimensions, coordinates, metadata, etc. + sizes: Actual sizes of each dimension. + header_dtype: Structured dtype for trace headers. + units: Dictionary mapping coordinate/dimension names to AllUnitModel. + extra_variables: Optional list of additional variables to add (e.g. raw_headers). + + Returns: + Dataset: The completed Dataset schema model. + """ + units = units or {} + extra_variables = extra_variables or [] + + resolved_chunks = _resolve_chunks(schema.chunk_shape, sizes) + builder = _create_dataset_builder(schema) + + _add_dimensions_and_coordinates( + builder=builder, + schema=schema, + sizes=sizes, + units=units, + ) + + _add_trace_mask_and_headers( + builder=builder, + schema=schema, + resolved_chunks=resolved_chunks, + header_dtype=header_dtype, + ) + + _add_main_and_extra_variables( + builder=builder, + schema=schema, + resolved_chunks=resolved_chunks, + units=units, + extra_variables=extra_variables, + ) + + ds = builder.build() + + coordinate_names = [coord.name for coord in schema.coordinates] + _apply_dynamic_chunking(ds=ds, coordinate_names=coordinate_names) + + return ds diff --git a/src/mdio/ingestion/metadata.py b/src/mdio/ingestion/metadata.py index 4cc617f0..cd86535c 100644 --- a/src/mdio/ingestion/metadata.py +++ b/src/mdio/ingestion/metadata.py @@ -9,7 +9,7 @@ from mdio.segy.geometry import GridOverrides -def _add_grid_override_to_metadata(dataset: Dataset, grid_overrides: GridOverrides | None) -> None: +def add_grid_override_to_metadata(dataset: Dataset, grid_overrides: GridOverrides | None) -> None: """Add grid override to Dataset metadata if needed.""" if dataset.metadata.attributes is None: dataset.metadata.attributes = {} diff --git a/src/mdio/ingestion/schema/__init__.py b/src/mdio/ingestion/schema/__init__.py new file mode 100644 index 00000000..2d82ac07 --- /dev/null +++ b/src/mdio/ingestion/schema/__init__.py @@ -0,0 +1,20 @@ +"""Immutable schema models that describe a dataset before any data is scanned. + +``SchemaResolver`` lives in :mod:`mdio.ingestion.schema.resolver` and is imported from there +directly to avoid a circular import (the resolver depends on the SEG-Y index-strategy +registry, which in turn depends on these models). +""" + +from mdio.ingestion.schema.models import CollapseToTraceEffect +from mdio.ingestion.schema.models import DimensionSpec +from mdio.ingestion.schema.models import InsertTraceDimEffect +from mdio.ingestion.schema.models import ResolvedSchema +from mdio.ingestion.schema.models import SchemaEffect + +__all__ = [ + "CollapseToTraceEffect", + "DimensionSpec", + "InsertTraceDimEffect", + "ResolvedSchema", + "SchemaEffect", +] diff --git a/src/mdio/ingestion/schema/models.py b/src/mdio/ingestion/schema/models.py new file mode 100644 index 00000000..cce198f9 --- /dev/null +++ b/src/mdio/ingestion/schema/models.py @@ -0,0 +1,227 @@ +"""Immutable schema models that fully describe a dataset before any data is scanned.""" + +from __future__ import annotations + +from abc import ABC +from abc import abstractmethod +from typing import TYPE_CHECKING +from typing import Any + +from pydantic import BaseModel +from pydantic import Field + +from mdio.builder.schemas.dtype import ScalarType +from mdio.builder.templates.types import CoordinateSpec # noqa: TC001 (pydantic needs this at runtime) + +if TYPE_CHECKING: + from collections.abc import Iterable + + +class DimensionSpec(BaseModel): + """Specification for a dimension in the final dataset. + + Attributes: + name: Dimension name (e.g. ``"inline"``, ``"shot_point"``, ``"trace"``, ``"time"``). + is_spatial: Whether this is a spatial dimension. ``False`` only for the vertical + (data-domain) dimension. + is_calculated: Whether this dimension's coordinate values are produced by an index + strategy at ingest time (e.g. ``shot_index`` from a template, or ``trace`` added + by a grid override) rather than read directly. The pipeline uses this to give a + clear error if a required strategy was not enabled. + dtype: Data type for the dimension coordinate. + """ + + name: str + is_spatial: bool = True + is_calculated: bool = False + dtype: ScalarType = ScalarType.INT32 + + +class ResolvedSchema(BaseModel): + """Final resolved schema for dataset ingestion. + + This represents the complete, resolved schema after applying template configuration + and grid overrides. It contains everything needed to build the dataset without + any further decision-making. + + Attributes: + name: Name of the dataset/template + dimensions: Ordered list of dimension specifications + coordinates: List of coordinate specifications + chunk_shape: Tuple of chunk sizes for each dimension + metadata: Additional metadata attributes + default_variable_name: Name of the main data variable + """ + + name: str + dimensions: list[DimensionSpec] + coordinates: list[CoordinateSpec] + chunk_shape: tuple[int, ...] + metadata: dict[str, Any] = Field(default_factory=dict) + default_variable_name: str = "amplitude" + + def required_header_fields(self) -> set[str]: + """Names that must be readable from SEG-Y trace headers to materialize this schema.""" + fields = {dim.name for dim in self.dimensions if dim.is_spatial and not dim.is_calculated} + fields.update(coord.name for coord in self.coordinates) + # coordinate_scalar is always needed to scale X/Y coordinates. + fields.add("coordinate_scalar") + return fields + + def spatial_dimensions(self) -> list[DimensionSpec]: + """Get only spatial dimensions (excludes vertical/trace domain).""" + return [dim for dim in self.dimensions if dim.is_spatial] + + def missing_calculated_dimensions(self, produced_names: Iterable[str]) -> list[str]: + """Return calculated spatial dimensions that no index strategy produced. + + Calculated dimensions (e.g. ``shot_index``, or ``trace`` from a grid override) are + not read directly from SEG-Y headers; an index strategy must materialize them. The + pipeline uses this to fail clearly when the required strategy was not enabled. + + Args: + produced_names: Names of dimensions actually produced after header analysis. + + Returns: + Calculated spatial dimension names absent from ``produced_names``. + """ + produced = set(produced_names) + return [ + dim.name for dim in self.dimensions if dim.is_spatial and dim.is_calculated and dim.name not in produced + ] + + +_TRACE_DIM = "trace" + + +class SchemaEffect(ABC): + """How a trace-producing index strategy reshapes a :class:`ResolvedSchema`. + + A grid override changes both the trace headers (handled by an + :class:`~mdio.ingestion.segy.index_strategies.IndexStrategy`) and the resolved schema + layout (dimensions, chunk shape, coordinates). To keep those two views from drifting, + the index-strategy registry is the single place that maps a ``GridOverrides`` to the + matching :class:`SchemaEffect`; the :class:`~mdio.ingestion.schema.resolver.SchemaResolver` + simply applies whatever effect it is handed. + """ + + @abstractmethod + def apply(self, schema: ResolvedSchema) -> ResolvedSchema: + """Return a new schema with this effect applied; ``schema`` is left unchanged.""" + + +class InsertTraceDimEffect(SchemaEffect): + """Insert a calculated ``trace`` dimension just before the vertical axis. + + Mirrors the ``HasDuplicates`` override: no spatial dimension is collapsed, a single + extra ``trace`` axis disambiguates duplicate index tuples. + + Args: + chunksize: Chunk size assigned to the inserted ``trace`` dimension. + """ + + def __init__(self, chunksize: int = 1) -> None: + self.chunksize = chunksize + + def apply(self, schema: ResolvedSchema) -> ResolvedSchema: + """Insert the ``trace`` dimension and its chunk before the vertical dimension.""" + new_dimensions: list[DimensionSpec] = [] + new_chunk_shape: list[int] = [] + for dim, chunk in zip(schema.dimensions, schema.chunk_shape, strict=True): + if dim.is_spatial: + new_dimensions.append(dim) + new_chunk_shape.append(chunk) + continue + new_dimensions.append(DimensionSpec(name=_TRACE_DIM, is_spatial=True, is_calculated=True)) + new_chunk_shape.append(self.chunksize) + new_dimensions.append(dim) + new_chunk_shape.append(chunk) + + return schema.model_copy(update={"dimensions": new_dimensions, "chunk_shape": tuple(new_chunk_shape)}) + + +class CollapseToTraceEffect(SchemaEffect): + """Collapse selected spatial dimensions into a single ``trace`` dimension. + + Mirrors the ``NonBinned`` override. The collapsed dimensions are re-expressed as + non-dimension coordinates over ``trace``, and any coordinate that referenced a + collapsed dimension is rewritten to depend on ``trace`` instead. + + Args: + chunksize: Chunk size assigned to the inserted ``trace`` dimension. + collapse_dims: Names of spatial dimensions to collapse. ``None`` collapses every + spatial dimension except the first; an empty tuple collapses nothing. + """ + + def __init__(self, chunksize: int | None, collapse_dims: tuple[str, ...] | None = None) -> None: + self.chunksize = chunksize + self.collapse_dims = collapse_dims + + def _resolve_collapse_dims(self, schema: ResolvedSchema) -> tuple[str, ...]: + """Resolve the effective set of dimension names to collapse for ``schema``.""" + if self.collapse_dims is not None: + return self.collapse_dims + spatial = schema.spatial_dimensions() + return tuple(dim.name for dim in spatial[1:]) if len(spatial) > 1 else () + + def apply(self, schema: ResolvedSchema) -> ResolvedSchema: + """Collapse the configured dimensions into ``trace`` and rewrite coordinates.""" + collapse = self._resolve_collapse_dims(schema) + collapse_set = set(collapse) + + new_dimensions: list[DimensionSpec] = [] + new_chunk_shape: list[int] = [] + replaced_count = 0 + for dim, chunk in zip(schema.dimensions, schema.chunk_shape, strict=True): + if dim.name in collapse_set: + replaced_count += 1 + continue + if dim.is_spatial: + new_dimensions.append(dim) + new_chunk_shape.append(chunk) + continue + if replaced_count > 0: + new_dimensions.append(DimensionSpec(name=_TRACE_DIM, is_spatial=True, is_calculated=True)) + new_chunk_shape.append(self.chunksize) + new_dimensions.append(dim) + new_chunk_shape.append(chunk) + + new_coordinates = self._rewrite_coordinates(schema, collapse, collapse_set, replaced_count) + + return schema.model_copy( + update={ + "dimensions": new_dimensions, + "coordinates": new_coordinates, + "chunk_shape": tuple(new_chunk_shape), + } + ) + + @staticmethod + def _rewrite_coordinates( + schema: ResolvedSchema, + collapse: tuple[str, ...], + collapse_set: set[str], + replaced_count: int, + ) -> list[CoordinateSpec]: + """Rewrite coordinate dimensions and append collapsed dims as ``trace`` coordinates.""" + rewritten: list[CoordinateSpec] = [] + for coord in schema.coordinates: + had_collapsed = any(name in collapse_set for name in coord.dimensions) + dims = tuple(name for name in coord.dimensions if name not in collapse_set) + if had_collapsed and replaced_count > 0: + dims = (*dims, _TRACE_DIM) + rewritten.append(coord.model_copy(update={"dimensions": dims})) + + existing = {coord.name for coord in rewritten} + dtype_by_dim = {dim.name: dim.dtype for dim in schema.dimensions} + for name in collapse: + if name in existing: + continue + rewritten.append( + CoordinateSpec( + name=name, + dimensions=(_TRACE_DIM,), + dtype=dtype_by_dim.get(name, ScalarType.INT32), + ) + ) + return rewritten diff --git a/src/mdio/ingestion/schema/resolver.py b/src/mdio/ingestion/schema/resolver.py new file mode 100644 index 00000000..8dffd12e --- /dev/null +++ b/src/mdio/ingestion/schema/resolver.py @@ -0,0 +1,88 @@ +"""Schema resolution: turn a template + grid overrides into a final, ingestion-ready schema.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from mdio.builder.schemas.dtype import ScalarType +from mdio.ingestion.schema.models import DimensionSpec +from mdio.ingestion.schema.models import ResolvedSchema +from mdio.ingestion.segy.index_strategies import IndexStrategyRegistry + +if TYPE_CHECKING: + from mdio.builder.templates.base import AbstractDatasetTemplate + from mdio.segy.geometry import GridOverrides + + +class SchemaResolver: + """Resolves template + grid overrides into a final schema. + + This class takes a template and optional grid overrides and produces a + :class:`ResolvedSchema` that completely specifies the dataset structure before any data + is scanned or processed. All override semantics (which dimensions collapse, where a + ``trace`` dimension is inserted) are owned by + :class:`~mdio.ingestion.segy.index_strategies.IndexStrategyRegistry`; this resolver only + applies the resulting :class:`~mdio.ingestion.schema.models.SchemaEffect`. + """ + + def __init__(self) -> None: + self._registry = IndexStrategyRegistry() + + def resolve( + self, + template: AbstractDatasetTemplate, + grid_overrides: GridOverrides | None = None, + ) -> ResolvedSchema: + """Resolve template and overrides into final schema. + + Args: + template: The MDIO dataset template. + grid_overrides: Optional grid override configuration. + + Returns: + ResolvedSchema with all dimensions, coordinates, and metadata resolved. + """ + schema = self._template_to_schema(template) + + if not grid_overrides: + return schema + + # Grid-override provenance is attached to the dataset at assembly time + # (mdio.ingestion.metadata.add_grid_override_to_metadata); the resolver only + # reshapes the schema, keeping it override-mechanics-only. + effect = self._registry.schema_effect(grid_overrides) + if effect is not None: + schema = effect.apply(schema) + + return schema + + def _template_to_schema(self, template: AbstractDatasetTemplate) -> ResolvedSchema: + """Convert a template to a resolved schema without overrides.""" + calculated = set(template.calculated_dimension_names) + dim_dtypes = template.declare_dimension_specs() + dimensions = [ + DimensionSpec( + name=name, + is_spatial=True, + is_calculated=name in calculated, + dtype=dim_dtypes.get(name, ScalarType.INT32), + ) + for name in template.spatial_dimension_names + ] + vertical_name = template.dimension_names[-1] + dimensions.append( + DimensionSpec( + name=vertical_name, + is_spatial=False, + dtype=dim_dtypes.get(vertical_name, ScalarType.INT32), + ) + ) + + return ResolvedSchema( + name=template.name, + dimensions=dimensions, + coordinates=list(template.declare_coordinate_specs()), + chunk_shape=template.full_chunk_shape, + metadata=template._load_dataset_attributes() or {}, + default_variable_name=template.default_variable_name, + ) diff --git a/src/mdio/ingestion/schema_resolver.py b/src/mdio/ingestion/schema_resolver.py deleted file mode 100644 index f28da1df..00000000 --- a/src/mdio/ingestion/schema_resolver.py +++ /dev/null @@ -1,211 +0,0 @@ -"""Schema resolution: turn a template + grid overrides into a final, ingestion-ready schema.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING -from typing import Any - -from pydantic import BaseModel -from pydantic import Field - -from mdio.builder.templates.types import CoordinateSpec # noqa: TC001 (pydantic needs this at runtime) - -if TYPE_CHECKING: - from mdio.builder.templates.base import AbstractDatasetTemplate - from mdio.segy.geometry import GridOverrides - - -class DimensionSpec(BaseModel): - """Specification for a dimension in the final dataset. - - Attributes: - name: Dimension name (e.g. ``"inline"``, ``"shot_point"``, ``"trace"``, ``"time"``). - is_spatial: Whether this is a spatial dimension. ``False`` only for the vertical - (data-domain) dimension. - is_calculated: Whether this dimension's coordinate values are produced by an index - strategy at ingest time (e.g. ``shot_index`` from a template, or ``trace`` added - by a grid override) rather than read directly. The pipeline uses this to give a - clear error if a required strategy was not enabled. - """ - - name: str - is_spatial: bool = True - is_calculated: bool = False - - -class ResolvedSchema(BaseModel): - """Final resolved schema for dataset ingestion. - - This represents the complete, resolved schema after applying template configuration - and grid overrides. It contains everything needed to build the dataset without - any further decision-making. - - Attributes: - name: Name of the dataset/template - dimensions: Ordered list of dimension specifications - coordinates: List of coordinate specifications - chunk_shape: Tuple of chunk sizes for each dimension - metadata: Additional metadata attributes - default_variable_name: Name of the main data variable - """ - - name: str - dimensions: list[DimensionSpec] - coordinates: list[CoordinateSpec] - chunk_shape: tuple[int, ...] - metadata: dict[str, Any] = Field(default_factory=dict) - default_variable_name: str = "amplitude" - - def required_header_fields(self) -> set[str]: - """Names that must be readable from SEG-Y trace headers to materialize this schema.""" - fields = {dim.name for dim in self.dimensions if dim.is_spatial and not dim.is_calculated} - fields.update(coord.name for coord in self.coordinates) - # coordinate_scalar is always needed to scale X/Y coordinates. - fields.add("coordinate_scalar") - return fields - - def spatial_dimensions(self) -> list[DimensionSpec]: - """Get only spatial dimensions (excludes vertical/trace domain).""" - return [dim for dim in self.dimensions if dim.is_spatial] - - -class SchemaResolver: - """Resolves template + grid overrides into a final schema. - - This class takes a template and optional grid overrides and produces - a ResolvedSchema that completely specifies the dataset structure before - any data is scanned or processed. - """ - - def resolve( - self, - template: AbstractDatasetTemplate, - grid_overrides: GridOverrides | None = None, - ) -> ResolvedSchema: - """Resolve template and overrides into final schema. - - Args: - template: The MDIO dataset template - grid_overrides: Optional grid override configuration - - Returns: - ResolvedSchema with all dimensions, coordinates, and metadata resolved - """ - schema = self._template_to_schema(template) - - if grid_overrides: - schema = self._apply_override_transformations(schema, grid_overrides) - - return schema - - def _template_to_schema(self, template: AbstractDatasetTemplate) -> ResolvedSchema: - """Convert a template to a resolved schema without overrides.""" - calculated = set(template.calculated_dimension_names) - dimensions = [ - DimensionSpec(name=name, is_spatial=True, is_calculated=name in calculated) - for name in template.spatial_dimension_names - ] - dimensions.append(DimensionSpec(name=template.dimension_names[-1], is_spatial=False)) - - return ResolvedSchema( - name=template.name, - dimensions=dimensions, - coordinates=list(template.declare_coordinate_specs()), - chunk_shape=template.full_chunk_shape, - metadata=template._load_dataset_attributes() or {}, - default_variable_name=template.default_variable_name, - ) - - def _apply_override_transformations( - self, - schema: ResolvedSchema, - grid_overrides: GridOverrides, - ) -> ResolvedSchema: - """Apply grid override transformations to the schema.""" - schema_dict = schema.model_dump() - - if grid_overrides.non_binned: - schema_dict = self._apply_non_binned_transform(schema_dict, grid_overrides) - elif grid_overrides.has_duplicates: - schema_dict = self._apply_duplicate_transform(schema_dict) - - schema_dict["metadata"]["gridOverrides"] = grid_overrides.to_legacy_dict() - - return ResolvedSchema(**schema_dict) - - def _apply_non_binned_transform( - self, - schema_dict: dict, - grid_overrides: GridOverrides, - ) -> dict: - """Replace selected spatial dimensions with a single ``trace`` dimension.""" - dimensions = schema_dict["dimensions"] - chunk_shape = list(schema_dict["chunk_shape"]) - - replace_dims = grid_overrides.non_binned_dims - if replace_dims is None: - # Default: replace all spatial dims except the first. - spatial_dims = [d for d in dimensions if d["is_spatial"]] - replace_dims = [d["name"] for d in spatial_dims[1:]] if len(spatial_dims) > 1 else [] - - new_dimensions = [] - new_chunk_shape = [] - replaced_count = 0 - - for i, dim in enumerate(dimensions): - if dim["name"] in replace_dims: - replaced_count += 1 - continue - if dim["is_spatial"]: - new_dimensions.append(dim) - new_chunk_shape.append(chunk_shape[i]) - else: - if replaced_count > 0: - new_dimensions.append(DimensionSpec(name="trace", is_spatial=True, is_calculated=True).model_dump()) - new_chunk_shape.append(grid_overrides.chunksize) - new_dimensions.append(dim) - new_chunk_shape.append(chunk_shape[i]) - - schema_dict["dimensions"] = new_dimensions - schema_dict["chunk_shape"] = tuple(new_chunk_shape) - - # Rewrite coordinate dimension references: collapsed dims drop out, replaced by ``trace``. - replaced_dims_set = set(replace_dims) - updated_coordinates = [] - for coord in schema_dict["coordinates"]: - original_dims = coord["dimensions"] - had_collapsed_dims = any(d in replaced_dims_set for d in original_dims) - coord_dims = [d for d in original_dims if d not in replaced_dims_set] - - if had_collapsed_dims and replaced_count > 0: - coord_dims.append("trace") - - updated_coord = dict(coord) - updated_coord["dimensions"] = tuple(coord_dims) - updated_coordinates.append(updated_coord) - schema_dict["coordinates"] = updated_coordinates - - return schema_dict - - def _apply_duplicate_transform(self, schema_dict: dict) -> dict: - """Insert a ``trace`` dimension with chunksize 1 before the vertical dimension.""" - dimensions = schema_dict["dimensions"] - chunk_shape = list(schema_dict["chunk_shape"]) - - new_dimensions = [] - new_chunk_shape = [] - - for i, dim in enumerate(dimensions): - if dim["is_spatial"]: - new_dimensions.append(dim) - new_chunk_shape.append(chunk_shape[i]) - else: - new_dimensions.append(DimensionSpec(name="trace", is_spatial=True, is_calculated=True).model_dump()) - new_chunk_shape.append(1) - new_dimensions.append(dim) - new_chunk_shape.append(chunk_shape[i]) - - schema_dict["dimensions"] = new_dimensions - schema_dict["chunk_shape"] = tuple(new_chunk_shape) - - return schema_dict diff --git a/src/mdio/ingestion/segy/coordinates.py b/src/mdio/ingestion/segy/coordinates.py index 9f49f837..783f7095 100644 --- a/src/mdio/ingestion/segy/coordinates.py +++ b/src/mdio/ingestion/segy/coordinates.py @@ -5,10 +5,10 @@ import logging from typing import TYPE_CHECKING -import numpy as np from segy.standards.codes import MeasurementSystem as SegyMeasurementSystem from segy.standards.fields import binary as binary_header_fields +from mdio.builder.schemas.v1.units import AllUnitModel from mdio.builder.schemas.v1.units import AngleUnitEnum from mdio.builder.schemas.v1.units import AngleUnitModel from mdio.builder.schemas.v1.units import LengthUnitEnum @@ -21,7 +21,6 @@ from xarray import Dataset as xr_Dataset from mdio.builder.templates.base import AbstractDatasetTemplate - from mdio.core.dimension import Dimension from mdio.core.grid import Grid from mdio.segy.file import SegyFileInfo @@ -41,33 +40,7 @@ ] -def _get_coordinates( - grid: Grid, - segy_headers: SegyHeaderArray, - mdio_template: AbstractDatasetTemplate, -) -> tuple[list[Dimension], dict[str, SegyHeaderArray]]: - """Get the data dim and non-dim coordinates from the SEG-Y headers and MDIO template. - - Select a subset of the segy_dimensions that corresponds to the MDIO dimensions - The dimensions are ordered as in the MDIO template. - The last dimension is always the vertical domain dimension - - Args: - grid: Inferred MDIO grid for SEG-Y file. - segy_headers: Headers read in from SEG-Y file. - mdio_template: The MDIO template to use for the conversion. - - Returns: - A tuple containing: - - A list of dimension coordinates (1-D arrays). - - A dict of non-dimension coordinates (str: N-D arrays). - """ - dimensions_coords = [grid.select_dim(name) for name in mdio_template.dimension_names] - non_dim_coords = {name: np.array(segy_headers[name]) for name in mdio_template.coordinate_names} - return dimensions_coords, non_dim_coords - - -def _populate_coordinates( +def populate_coordinates( dataset: xr_Dataset, grid: Grid, coords: dict[str, SegyHeaderArray], @@ -100,7 +73,7 @@ def _populate_coordinates( return dataset, drop_vars_delayed -def _get_spatial_coordinate_unit(segy_file_info: SegyFileInfo) -> LengthUnitModel | None: +def get_spatial_coordinate_unit(segy_file_info: SegyFileInfo) -> LengthUnitModel | None: """Get the coordinate unit from the SEG-Y headers.""" measurement_system_code = int(segy_file_info.binary_header_dict[MEASUREMENT_SYSTEM_KEY]) @@ -121,21 +94,26 @@ def _get_spatial_coordinate_unit(segy_file_info: SegyFileInfo) -> LengthUnitMode return LengthUnitModel(length=unit) -def _update_template_units(template: AbstractDatasetTemplate, unit: LengthUnitModel | None) -> AbstractDatasetTemplate: - """Update the template with dynamic and some pre-defined units.""" - new_units = {key: AngleUnitModel(angle=AngleUnitEnum.DEGREES) for key in ANGLE_UNIT_KEYS} - - if unit is None: - template.add_units(new_units) - return template - - for key in SPATIAL_UNIT_KEYS: - current_value = template.get_unit_by_key(key) - if current_value is not None: - logger.warning("Unit for %s already in template. Will keep the original unit: %s", key, current_value) - continue - - new_units[key] = unit - - template.add_units(new_units) - return template +def resolve_units( + template: AbstractDatasetTemplate, + unit: LengthUnitModel | None, +) -> dict[str, AllUnitModel]: + """Resolve the units for the template and dynamic SEG-Y units without mutating the template.""" + resolved = dict(template.units) + + for key in ANGLE_UNIT_KEYS: + if key not in resolved: + resolved[key] = AngleUnitModel(angle=AngleUnitEnum.DEGREES) + + if unit is not None: + for key in SPATIAL_UNIT_KEYS: + if key not in resolved: + resolved[key] = unit + else: + logger.warning( + "Unit for %s already in template. Will keep the original unit: %s", + key, + resolved[key], + ) + + return resolved diff --git a/src/mdio/ingestion/segy/file_headers.py b/src/mdio/ingestion/segy/file_headers.py index fe69116f..5c056b81 100644 --- a/src/mdio/ingestion/segy/file_headers.py +++ b/src/mdio/ingestion/segy/file_headers.py @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) -def _add_segy_file_headers(xr_dataset: xr_Dataset, segy_file_info: SegyFileInfo) -> xr_Dataset: +def add_segy_file_headers(xr_dataset: xr_Dataset, segy_file_info: SegyFileInfo) -> xr_Dataset: """Attach the SEG-Y text and binary file headers as attrs on a scalar variable.""" settings = MDIOSettings() mode = settings.save_segy_file_header diff --git a/src/mdio/ingestion/index_strategies.py b/src/mdio/ingestion/segy/index_strategies.py similarity index 91% rename from src/mdio/ingestion/index_strategies.py rename to src/mdio/ingestion/segy/index_strategies.py index ede35e39..6e80b29f 100644 --- a/src/mdio/ingestion/index_strategies.py +++ b/src/mdio/ingestion/segy/index_strategies.py @@ -21,6 +21,8 @@ from numpy.lib import recfunctions as rfn from mdio.core import Dimension +from mdio.ingestion.schema.models import CollapseToTraceEffect +from mdio.ingestion.schema.models import InsertTraceDimEffect from mdio.ingestion.segy.header_analysis import ShotGunGeometryType from mdio.ingestion.segy.header_analysis import StreamerShotGeometryType from mdio.ingestion.segy.header_analysis import analyze_lines_for_guns @@ -35,6 +37,7 @@ from segy.arrays import HeaderArray from mdio.builder.templates.base import AbstractDatasetTemplate + from mdio.ingestion.schema.models import SchemaEffect from mdio.segy.geometry import GridOverrides logger = logging.getLogger(__name__) @@ -351,7 +354,35 @@ def compute_dimensions(self, headers: HeaderArray, dim_names: tuple[str, ...]) - class IndexStrategyRegistry: - """Picks the right :class:`IndexStrategy` from grid overrides + template hints.""" + """Picks the right :class:`IndexStrategy` from grid overrides + template hints. + + The registry is the single source of truth for override semantics: it maps a + :class:`~mdio.segy.geometry.GridOverrides` both to the header-transforming + :class:`IndexStrategy` (:meth:`create_strategy`) and to the schema-reshaping + :class:`~mdio.ingestion.schema.models.SchemaEffect` (:meth:`schema_effect`). Keeping + both derivations here prevents the header view and the schema view from drifting. + """ + + def schema_effect(self, grid_overrides: GridOverrides | None) -> SchemaEffect | None: + """Return the schema reshaping implied by ``grid_overrides``, if any. + + Only ``non_binned`` and ``has_duplicates`` change the dimension/coordinate layout; + every other override transforms headers in place without reshaping the schema. + + Args: + grid_overrides: Typed grid override configuration, or ``None``. + + Returns: + The matching :class:`SchemaEffect`, or ``None`` when no layout change applies. + """ + if not grid_overrides: + return None + if grid_overrides.non_binned: + collapse_dims = None if grid_overrides.non_binned_dims is None else tuple(grid_overrides.non_binned_dims) + return CollapseToTraceEffect(chunksize=grid_overrides.chunksize, collapse_dims=collapse_dims) + if grid_overrides.has_duplicates: + return InsertTraceDimEffect(chunksize=1) + return None def create_strategy( self, diff --git a/src/mdio/ingestion/segy/pipeline.py b/src/mdio/ingestion/segy/pipeline.py new file mode 100644 index 00000000..8dd69706 --- /dev/null +++ b/src/mdio/ingestion/segy/pipeline.py @@ -0,0 +1,194 @@ +"""Ingestion pipeline for SEG-Y to MDIO.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import numpy as np +from segy.config import SegyFileSettings + +from mdio.api.io import _normalize_path +from mdio.converters.exceptions import GridTraceCountError +from mdio.converters.type_converter import to_structured_type +from mdio.core.grid import Grid +from mdio.ingestion.dataset_factory import build_mdio_dataset +from mdio.ingestion.grid_qc import grid_density_qc +from mdio.ingestion.metadata import add_grid_override_to_metadata +from mdio.ingestion.schema.resolver import SchemaResolver +from mdio.ingestion.segy.coordinates import get_spatial_coordinate_unit +from mdio.ingestion.segy.coordinates import resolve_units +from mdio.ingestion.segy.raw_headers import build_raw_header_variables +from mdio.ingestion.segy.reader import read_index_headers +from mdio.ingestion.segy.serializer import serialize_to_mdio +from mdio.ingestion.segy.validation import validate_spec_in_template +from mdio.segy.file import get_segy_file_info + +if TYPE_CHECKING: + from pathlib import Path + + from segy.config import SegyHeaderOverrides + from segy.schema import SegySpec + from upath import UPath + + from mdio.builder.templates.base import AbstractDatasetTemplate + from mdio.core.dimension import Dimension + from mdio.ingestion.schema import ResolvedSchema + from mdio.segy.file import SegyFileArguments + from mdio.segy.geometry import GridOverrides + +logger = logging.getLogger(__name__) + + +def _resolve_output_path(output_path: UPath | Path | str, overwrite: bool) -> UPath: + """Normalize the output path and enforce the overwrite policy. + + Args: + output_path: Requested output location. + overwrite: Whether an existing location may be overwritten. + + Returns: + The normalized output path. + + Raises: + FileExistsError: If the location exists and ``overwrite`` is False. + """ + output_path = _normalize_path(output_path) + if not overwrite and output_path.exists(): + err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended." + raise FileExistsError(err) + return output_path + + +def _verify_calculated_dimensions( + schema: ResolvedSchema, + dimensions: list[Dimension], + template_name: str, +) -> None: + """Ensure every calculated dimension required by the schema was produced. + + Args: + schema: The resolved schema. + dimensions: Dimensions produced by header analysis. + template_name: Template name, used only for the error message. + + Raises: + ValueError: If a required calculated dimension is missing. + """ + missing = schema.missing_calculated_dimensions(dim.name for dim in dimensions) + if missing: + err = ( + f"Required computed fields {sorted(missing)} for template {template_name} not found " + f"after grid overrides. Please ensure the correct grid overrides are applied." + ) + raise ValueError(err) + + +def _build_grid(dimensions: list[Dimension], indexed_headers: np.ndarray, num_traces: int) -> Grid: + """Build the ingestion grid, run density QC, and verify the live trace count. + + Args: + dimensions: Ordered spatial + vertical dimensions. + indexed_headers: Transformed trace headers used to build the grid map. + num_traces: Number of traces reported by the SEG-Y file. + + Returns: + The built and validated grid. + + Raises: + GridTraceCountError: If the live trace count does not match ``num_traces``. + """ + grid = Grid(dims=dimensions) + grid_density_qc(grid, num_traces) + grid.build_map(indexed_headers) + + live_trace_count = int(np.sum(grid.live_mask)) + if live_trace_count != num_traces: + for dim_name in grid.dim_names: + logger.warning("%s min: %s max: %s", dim_name, grid.get_min(dim_name), grid.get_max(dim_name)) + logger.warning("Ingestion grid shape: %s.", grid.shape) + raise GridTraceCountError(live_trace_count, num_traces) + + return grid + + +def run_segy_ingestion( # noqa: PLR0913 + segy_spec: SegySpec, + mdio_template: AbstractDatasetTemplate, + input_path: UPath | Path | str, + output_path: UPath | Path | str, + overwrite: bool = False, + grid_overrides: GridOverrides | None = None, + segy_header_overrides: SegyHeaderOverrides | None = None, +) -> None: + """Convert SEG-Y to MDIO. + + Pipeline phases: schema resolution, header analysis, index strategy, grid build, + dataset build, data write. + + Args: + segy_spec: The SEG-Y specification to use for the conversion. + mdio_template: The MDIO template to use for the conversion. + input_path: The universal path of the input SEG-Y file. + output_path: The universal path for the output MDIO v1 file. + overwrite: Whether to overwrite the output file if it already exists. Defaults to False. + grid_overrides: Grid override configuration for non-standard geometries. + segy_header_overrides: Option to override specific SEG-Y headers during ingestion. + + Raises: + FileExistsError: If the output location already exists and overwrite is False. + ValueError: If required fields are missing from segy_spec or required computed + dimensions are not produced after grid overrides are applied. + GridTraceCountError: If the live trace count in the built grid does not match + the number of traces reported by the SEG-Y file. + """ + validate_spec_in_template(segy_spec, mdio_template) + + input_path = _normalize_path(input_path) + output_path = _resolve_output_path(output_path, overwrite) + + segy_file_kwargs: SegyFileArguments = { + "url": input_path.as_posix(), + "spec": segy_spec, + "settings": SegyFileSettings(storage_options=input_path.storage_options), + "header_overrides": segy_header_overrides, + } + segy_file_info = get_segy_file_info(segy_file_kwargs) + + spatial_unit = get_spatial_coordinate_unit(segy_file_info) + units = resolve_units(mdio_template, spatial_unit) + + schema = SchemaResolver().resolve(mdio_template, grid_overrides) + + indexed_headers, dimensions = read_index_headers( + segy_file_kwargs=segy_file_kwargs, + file_info=segy_file_info, + schema=schema, + grid_overrides=grid_overrides, + synthesize_dims=mdio_template.synthesize_missing_dims, + template=mdio_template, + ) + _verify_calculated_dimensions(schema, dimensions, mdio_template.name) + + grid = _build_grid(dimensions, indexed_headers, segy_file_info.num_traces) + + header_dtype = to_structured_type(segy_spec.trace.header.dtype) + extra_variables = build_raw_header_variables(schema) + mdio_ds = build_mdio_dataset( + schema=schema, + sizes=grid.shape, + header_dtype=header_dtype, + units=units, + extra_variables=extra_variables, + ) + add_grid_override_to_metadata(dataset=mdio_ds, grid_overrides=grid_overrides) + + serialize_to_mdio( + mdio_ds=mdio_ds, + grid=grid, + indexed_headers=indexed_headers, + schema=schema, + file_info=segy_file_info, + segy_file_kwargs=segy_file_kwargs, + output_path=output_path, + ) diff --git a/src/mdio/ingestion/segy/raw_headers.py b/src/mdio/ingestion/segy/raw_headers.py new file mode 100644 index 00000000..fea93562 --- /dev/null +++ b/src/mdio/ingestion/segy/raw_headers.py @@ -0,0 +1,64 @@ +"""Experimental raw trace-header ingestion, isolated for one-file removal. + +The ``MDIO__IMPORT__RAW_HEADERS`` feature is experimental and expected to change or be +removed. All of its gating and variable construction lives here so that retiring it is a +single-file delete plus removing the call site in :mod:`mdio.ingestion.segy.pipeline`. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import zarr + +from mdio.builder.schemas.chunk_grid import RegularChunkGrid +from mdio.builder.schemas.chunk_grid import RegularChunkShape +from mdio.builder.schemas.compressors import Blosc +from mdio.builder.schemas.compressors import BloscCname +from mdio.builder.schemas.dtype import ScalarType +from mdio.builder.schemas.v1.variable import VariableMetadata +from mdio.constants import ZarrFormat +from mdio.core.config import MDIOSettings + +if TYPE_CHECKING: + from typing import Any + + from mdio.ingestion.schema import ResolvedSchema + +logger = logging.getLogger(__name__) + + +def build_raw_header_variables(schema: ResolvedSchema) -> list[dict[str, Any]]: + """Return extra-variable specs for the experimental raw-headers feature. + + Args: + schema: The resolved schema; its spatial dimensions and chunk shape size the + ``raw_headers`` variable. + + Returns: + A single-element list describing the ``raw_headers`` variable when the feature is + enabled and the active Zarr format supports it (v3); otherwise an empty list. + """ + settings = MDIOSettings() + if not settings.raw_headers: + return [] + + if zarr.config.get("default_zarr_format") == ZarrFormat.V2: + logger.warning("Raw headers are only supported for Zarr v3. Skipping raw headers.") + return [] + + logger.warning("MDIO__IMPORT__RAW_HEADERS is experimental and expected to change or be removed.") + spatial_dim_names = tuple(dim.name for dim in schema.dimensions if dim.is_spatial) + chunk_metadata = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=schema.chunk_shape[:-1])) + return [ + { + "name": "raw_headers", + "long_name": "Raw Headers", + "dimensions": spatial_dim_names, + "data_type": ScalarType.BYTES240, + "compressor": Blosc(cname=BloscCname.zstd), + "coordinates": None, + "metadata": VariableMetadata(chunk_grid=chunk_metadata), + } + ] diff --git a/src/mdio/ingestion/segy/reader.py b/src/mdio/ingestion/segy/reader.py new file mode 100644 index 00000000..9acf4b30 --- /dev/null +++ b/src/mdio/ingestion/segy/reader.py @@ -0,0 +1,83 @@ +"""SEG-Y index header reader and grid dimension builder.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +from mdio.core.dimension import Dimension +from mdio.ingestion.segy.index_strategies import IndexStrategyRegistry +from mdio.segy.parsers import parse_headers + +if TYPE_CHECKING: + import numpy as np + + from mdio.builder.templates.base import AbstractDatasetTemplate + from mdio.ingestion.schema import ResolvedSchema + from mdio.segy.file import SegyFileArguments + from mdio.segy.file import SegyFileInfo + from mdio.segy.geometry import GridOverrides + +logger = logging.getLogger(__name__) + + +def read_index_headers( # noqa: PLR0913 + segy_file_kwargs: SegyFileArguments, + file_info: SegyFileInfo, + schema: ResolvedSchema, + grid_overrides: GridOverrides | None, + synthesize_dims: tuple[str, ...], + template: AbstractDatasetTemplate | None = None, +) -> tuple[np.ndarray, list[Dimension]]: + """Parse SEG-Y headers, apply index strategy transformations, and build dimensions. + + Args: + segy_file_kwargs: Arguments for opening the SEG-Y file. + file_info: Metadata info of the SEG-Y file. + schema: Final resolved MDIO schema. + grid_overrides: Grid override parameters if any. + synthesize_dims: Dimensions to synthesize if missing from headers. + template: Optional dataset template for specialized strategy resolution. + + Returns: + tuple: + - np.ndarray: The transformed and indexed trace headers. + - list[Dimension]: The completed list of spatial + vertical Dimensions. + """ + # 1. Determine subset of header fields to parse + spec = segy_file_kwargs.get("spec") + spec_fields = {field.name for field in spec.trace.header.fields} if spec else set() + + # Drop any synthesized or missing dimensions/coordinates that aren't in the physical file spec + subset = tuple(f for f in schema.required_header_fields() if f in spec_fields) + + # 2. Parse headers + parsed_headers = parse_headers( + segy_file_kwargs=segy_file_kwargs, + num_traces=file_info.num_traces, + subset=subset, + ) + + # 3. Apply Index Strategy + strategy = IndexStrategyRegistry().create_strategy( + grid_overrides=grid_overrides, + synthesize_dims=synthesize_dims, + template=template, + ) + logger.info("Using index strategy: %s", strategy.name) + + indexed_headers = strategy.transform_headers(parsed_headers) + + # 4. Compute spatial dimensions + dim_names = tuple(d.name for d in schema.dimensions if d.is_spatial) + dimensions = strategy.compute_dimensions(indexed_headers, dim_names) + + # 5. Append vertical dimension + sample_labels = file_info.sample_labels / 1000 # normalize + if all(sample_labels.astype("int64") == sample_labels): + sample_labels = sample_labels.astype("int64") + + vertical_dim_name = schema.dimensions[-1].name + dimensions.append(Dimension(coords=sample_labels, name=vertical_dim_name)) + + return indexed_headers, dimensions diff --git a/src/mdio/ingestion/segy/serializer.py b/src/mdio/ingestion/segy/serializer.py new file mode 100644 index 00000000..037563af --- /dev/null +++ b/src/mdio/ingestion/segy/serializer.py @@ -0,0 +1,101 @@ +"""Serializer that formats the populated xarray dataset and writes it to MDIO / Zarr.""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import numpy as np + +from mdio.api.io import to_mdio +from mdio.builder.xarray_builder import to_xarray_dataset +from mdio.ingestion.segy.coordinates import populate_coordinates +from mdio.ingestion.segy.file_headers import add_segy_file_headers +from mdio.segy import blocked_io + +if TYPE_CHECKING: + from upath import UPath + + from mdio.builder.schemas import Dataset + from mdio.core.grid import Grid + from mdio.ingestion.schema import ResolvedSchema + from mdio.segy.file import SegyFileArguments + from mdio.segy.file import SegyFileInfo + +logger = logging.getLogger(__name__) + + +def serialize_to_mdio( # noqa: PLR0913 + mdio_ds: Dataset, + grid: Grid, + indexed_headers: np.ndarray, + schema: ResolvedSchema, + file_info: SegyFileInfo, + segy_file_kwargs: SegyFileArguments, + output_path: UPath, +) -> None: + """Serialize the constructed MDIO Dataset and populate it with trace data. + + This function coordinates coordinate population, trace mask writing, file headers, + staged to_mdio Zarr writes, and the final blocked-I/O writing. + + Args: + mdio_ds: The constructed Dataset schema model. + grid: The built ingestion Grid. + indexed_headers: Transformed trace headers. + schema: Resolved schema of the dataset. + file_info: Metadata info of the SEG-Y file. + segy_file_kwargs: SEG-Y file arguments. + output_path: Universal output path for writing. + """ + # 1. Extract non-dimension coordinates from transformed headers + non_dim_coords = {} + for coord in schema.coordinates: + if coord.name in indexed_headers.dtype.names: + non_dim_coords[coord.name] = np.array(indexed_headers[coord.name]) + else: + logger.warning( + "Coordinate '%s' expected by schema but not found in transformed headers.", + coord.name, + ) + + # 2. Convert to xarray dataset + xr_dataset = to_xarray_dataset(mdio_ds=mdio_ds) + + # 3. Populate coordinates with grid mappings + xr_dataset, drop_vars_delayed = populate_coordinates( + dataset=xr_dataset, + grid=grid, + coords=non_dim_coords, + spatial_coordinate_scalar=file_info.coordinate_scalar, + ) + + # 4. Attach SEG-Y text and binary file headers + xr_dataset = add_segy_file_headers(xr_dataset, file_info) + + # 5. Populate trace mask + xr_dataset.trace_mask.data[:] = grid.live_mask + + # 6. Initialize Zarr store with metadata (staged empty arrays) + to_mdio(xr_dataset, output_path=output_path, mode="w", compute=False) + + # 7. Write delayed coordinate variables and trace mask + unindexed_dims = [d for d in xr_dataset.dims if d not in xr_dataset.coords] + for d in unindexed_dims: + if d in drop_vars_delayed: + drop_vars_delayed.remove(d) + + meta_ds = xr_dataset[drop_vars_delayed + ["trace_mask"]] + to_mdio(meta_ds, output_path=output_path, mode="r+", compute=True) + + # Drop delayed vars to prepare for blocked-I/O writing + xr_dataset = xr_dataset.drop_vars(drop_vars_delayed) + + # 8. Write bulk trace data and structured headers via blocked-I/O workers + blocked_io.to_zarr( + segy_file_kwargs=segy_file_kwargs, + output_path=output_path, + grid_map=grid.map, + dataset=xr_dataset, + data_variable_name=schema.default_variable_name, + ) diff --git a/src/mdio/ingestion/segy/validation.py b/src/mdio/ingestion/segy/validation.py index 1c74c96f..f9a6d079 100644 --- a/src/mdio/ingestion/segy/validation.py +++ b/src/mdio/ingestion/segy/validation.py @@ -10,7 +10,7 @@ from mdio.builder.templates.base import AbstractDatasetTemplate -def _validate_spec_in_template(segy_spec: SegySpec, mdio_template: AbstractDatasetTemplate) -> None: +def validate_spec_in_template(segy_spec: SegySpec, mdio_template: AbstractDatasetTemplate) -> None: """Validate that the SegySpec has all required fields in the MDIO template.""" # Import here to avoid circular imports at module load time from mdio.builder.templates.seismic_3d_obn import Seismic3DObnReceiverGathersTemplate # noqa: PLC0415 diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index c2b80b80..aa24270a 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -1,9 +1,9 @@ -"""SEG-Y grid override configuration model and legacy executor shim. +"""SEG-Y grid override configuration model and template-compatibility helpers. The Pydantic :class:`GridOverrides` model is the supported public API for configuring -grid overrides. The :class:`GridOverrider` class is retained as a thin shim that -delegates to :class:`mdio.ingestion.index_strategies.IndexStrategyRegistry`; it preserves -the v1.1 ``run(...)`` contract for callers that still pass a legacy ``dict``. +grid overrides. Header transformation and schema reshaping are owned by +:class:`mdio.ingestion.segy.index_strategies.IndexStrategyRegistry`; this module only holds +the typed config plus the template-compatibility guards used to validate override pairings. """ from __future__ import annotations @@ -16,15 +16,7 @@ from pydantic import ConfigDict from pydantic import Field -from mdio.ingestion.index_strategies import IndexStrategyRegistry -from mdio.segy.exceptions import GridOverrideMissingParameterError -from mdio.segy.exceptions import GridOverrideUnknownError - if TYPE_CHECKING: - from collections.abc import Sequence - - from segy.arrays import HeaderArray - from mdio.builder.templates.base import AbstractDatasetTemplate @@ -142,101 +134,3 @@ def _validate_template_for_overrides( actual = type(template).__name__ if template is not None else "None" msg = f"calculate_shot_index only supports Seismic3DObnReceiverGathersTemplate, got {actual}." raise TypeError(msg) - - -class GridOverrider: - """Legacy facade that adapts the dict-based v1.1 API onto :class:`IndexStrategyRegistry`. - - Existing callers (notably :func:`mdio.segy.utilities.get_grid_plan`) still build a - legacy ``dict`` of grid overrides and call :meth:`run`. This class translates the dict - into a typed :class:`GridOverrides`, dispatches to the appropriate - :class:`IndexStrategy`, and returns the ``(headers, names, chunksize)`` tuple shape - those callers depend on. It will be removed once all callers move to the typed API. - """ - - def __init__(self) -> None: - self._registry = IndexStrategyRegistry() - - def run( - self, - index_headers: HeaderArray, - index_names: Sequence[str], - grid_overrides: dict[str, Any] | None, - chunksize: Sequence[int] | None = None, - template: AbstractDatasetTemplate | None = None, - ) -> tuple[HeaderArray, tuple[str, ...], tuple[int, ...] | None]: - """Run the configured grid overrides and return updated headers/names/chunks. - - Args: - index_headers: Parsed SEG-Y trace headers; structured numpy array. - index_names: Names of the index dimensions before any override is applied. - grid_overrides: Legacy dict of overrides (CamelCase keys). - chunksize: Optional chunk shape that may be expanded by overrides that add a - ``trace`` dimension. - template: Optional dataset template; used to identify coordinate fields and - to drive component synthesis for OBN. - - Returns: - Tuple of ``(transformed_headers, new_index_names, new_chunksize)``. The - chunksize tuple is ``None`` when the caller did not pass a chunksize. - - Raises: - GridOverrideUnknownError: When ``grid_overrides`` contains an unknown key. - GridOverrideMissingParameterError: When ``NonBinned`` is enabled without - ``chunksize`` or ``non_binned_dims``. - - Notes: - Header-precondition checks (``GridOverrideKeysError``) are delegated to - :meth:`IndexStrategy.validate_headers`; template-compatibility checks - (``TypeError``) are delegated to :func:`_validate_template_for_overrides`. - """ - grid_overrides = grid_overrides or {} - - field_names = set(GridOverrides.model_fields.keys()) - aliases = {field.alias for field in GridOverrides.model_fields.values() if field.alias} - valid_keys = field_names | aliases - for key in grid_overrides: - if key not in valid_keys: - raise GridOverrideUnknownError(key) - - config = GridOverrides.model_validate(grid_overrides) - - if config.non_binned: - missing: set[str] = set() - if config.chunksize is None: - missing.add("chunksize") - if not config.non_binned_dims: - missing.add("non_binned_dims") - if missing: - command = "NonBinned" - raise GridOverrideMissingParameterError(command, missing) - - _validate_template_for_overrides(config, template) - - synthesize_dims = _resolve_synthesize_dims(template) - strategy = self._registry.create_strategy( - grid_overrides=config, - synthesize_dims=synthesize_dims, - template=template, - ) - logger.debug("Selected grid override strategy: %s", strategy.name) - - strategy.validate_headers(index_headers) - new_headers = strategy.transform_headers(index_headers) - - new_names = list(index_names) - new_chunks = list(chunksize) if chunksize is not None else None - - # Both NonBinned and HasDuplicates add a 'trace' dim at index -1; HasDuplicates - # always uses chunksize 1, NonBinned uses the user-supplied value. - if config.non_binned or config.has_duplicates: - new_names.append("trace") - if new_chunks is not None: - inserted_chunk = config.chunksize if config.non_binned else 1 - new_chunks.insert(-1, inserted_chunk) - - return ( - new_headers, - tuple(new_names), - tuple(new_chunks) if new_chunks is not None else None, - ) diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index 64c89584..02be321d 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -9,187 +9,16 @@ import numpy as np from dask.array.core import normalize_chunks -from mdio.core import Dimension -from mdio.segy.geometry import GridOverrider -from mdio.segy.parsers import parse_headers - if TYPE_CHECKING: from dask.array import Array as DaskArray from numpy.typing import DTypeLike from numpy.typing import NDArray - from segy.arrays import HeaderArray from segy.schema import SegySpec - from mdio.builder.templates.base import AbstractDatasetTemplate - from mdio.segy.file import SegyFileArguments - from mdio.segy.file import SegyFileInfo - from mdio.segy.geometry import GridOverrides logger = logging.getLogger(__name__) -def get_grid_plan( # noqa: C901, PLR0912, PLR0913, PLR0915 - segy_file_kwargs: SegyFileArguments, - segy_file_info: SegyFileInfo, - chunksize: tuple[int, ...] | None, - template: AbstractDatasetTemplate, - return_headers: bool = False, - grid_overrides: GridOverrides | None = None, -) -> tuple[list[Dimension], tuple[int, ...]] | tuple[list[Dimension], tuple[int, ...], HeaderArray]: - """Infer dimension ranges, and increments. - - Generates multiple dimensions with the following steps: - 1. Read index headers - 2. Get min, max, and increments - 3. Create `Dimension` with appropriate range, index, and description. - 4. Create `Dimension` for sample axis using binary header. - - Args: - segy_file_kwargs: SEG-Y file arguments. - segy_file_info: SegyFileInfo instance containing the num_traces and sample_labels. - chunksize: Chunk sizes to be used in grid plan. - template: MDIO template where coordinate names and domain will be taken. - return_headers: Option to return parsed headers with `Dimension` objects. Default is False. - grid_overrides: Typed grid override configuration, or ``None`` for no overrides. - - Returns: - All index dimensions and chunksize or dimensions and chunksize together with header values. - - Raises: - ValueError: If computed fields are not found after grid overrides. - """ - # Keep only dimension and non-dimension coordinates excluding the vertical axis - horizontal_dimensions = template.spatial_dimension_names - horizontal_coordinates = horizontal_dimensions + template.coordinate_names - # Exclude calculated dimensions - they don't exist in SEG-Y headers - calculated_dims = set(template.calculated_dimension_names) - - # Remove any to be computed fields - preserve order by using list comprehension instead of set operations - computed_fields = set(template.calculated_dimension_names) - horizontal_coordinates = tuple(c for c in horizontal_coordinates if c not in computed_fields) - - # Ensure non_binned_dims are included in the headers to parse, even if not in template - if grid_overrides is not None and grid_overrides.non_binned_dims: - for dim in grid_overrides.non_binned_dims: - if dim not in horizontal_coordinates: - horizontal_coordinates = horizontal_coordinates + (dim,) - - # For OBN template: skip 'component' if not in SEG-Y spec (will be synthesized later) - # Import here to avoid circular imports at module load time - from mdio.builder.templates.seismic_3d_obn import Seismic3DObnReceiverGathersTemplate # noqa: PLC0415 - - spec = segy_file_kwargs.get("spec") - spec_fields = {field.name for field in spec.trace.header.fields} if spec else set() - fields_to_skip = calculated_dims.copy() - - if isinstance(template, Seismic3DObnReceiverGathersTemplate) and "component" not in spec_fields: - fields_to_skip.add("component") - - headers_subset = parse_headers( - segy_file_kwargs=segy_file_kwargs, - num_traces=segy_file_info.num_traces, - subset=tuple(c for c in horizontal_coordinates if c not in fields_to_skip), - ) - - # The legacy GridOverrider still consumes the dict shape; dump only at this boundary. - # Future PR will replace GridOverrider. - override_handler = GridOverrider() - headers_subset, horizontal_coordinates, chunksize = override_handler.run( - headers_subset, - horizontal_coordinates, - chunksize=chunksize, - grid_overrides=grid_overrides.to_legacy_dict() if grid_overrides is not None else {}, - template=template, - ) - - # After grid overrides, determine final spatial dimensions and their chunk sizes - non_binned_active = grid_overrides is not None and grid_overrides.non_binned - non_binned_dims: set[str] = ( - set(grid_overrides.non_binned_dims) - if non_binned_active and grid_overrides is not None and grid_overrides.non_binned_dims - else set() - ) - - # Create mapping from dimension name to original chunk size for easy lookup - original_spatial_dims = list(template.spatial_dimension_names) - original_chunks = list(template.full_chunk_shape[:-1]) # Exclude vertical (sample/time) dimension - dim_to_chunk = dict(zip(original_spatial_dims, original_chunks, strict=True)) - - # Final spatial dimensions: keep trace and original dims, exclude non-binned dims - final_spatial_dims = [] - final_spatial_chunks = [] - for name in horizontal_coordinates: - if name in non_binned_dims: - continue # Skip dimensions that became coordinates - if name == "trace": - chunk_val = ( - int(grid_overrides.chunksize) - if non_binned_active and grid_overrides is not None and grid_overrides.chunksize is not None - else 1 - ) - final_spatial_dims.append(name) - final_spatial_chunks.append(chunk_val) - elif name in dim_to_chunk: - # Use original chunk size for known dimensions - final_spatial_dims.append(name) - final_spatial_chunks.append(dim_to_chunk[name]) - - if len(computed_fields) > 0 and not computed_fields.issubset(headers_subset.dtype.names): - err = ( - f"Required computed fields {sorted(computed_fields)} for template {template.name} " - f"not found after grid overrides. Please ensure correct overrides are applied." - ) - raise ValueError(err) - - # Create dimensions from final_spatial_dims plus any computed fields that were added by grid overrides - all_dimension_names = list(final_spatial_dims) - added_computed_fields = [] - for computed_field in computed_fields: - if computed_field in headers_subset.dtype.names and computed_field not in all_dimension_names: - # Insert in template order - if computed_field in template.spatial_dimension_names: - insert_idx = template.spatial_dimension_names.index(computed_field) - # Find position in all_dimension_names that corresponds to this template position - actual_idx = min(insert_idx, len(all_dimension_names)) - all_dimension_names.insert(actual_idx, computed_field) - # Track where we inserted and what chunk size it should have - template_chunk_idx = template.spatial_dimension_names.index(computed_field) - chunk_val = template.full_chunk_shape[template_chunk_idx] - added_computed_fields.append((actual_idx, chunk_val)) - else: - all_dimension_names.append(computed_field) - added_computed_fields.append((len(all_dimension_names) - 1, 1)) - - # Build chunksize including chunks for computed fields - if added_computed_fields: - chunk_list = list(final_spatial_chunks) - for insert_idx, chunk_val in sorted(added_computed_fields, reverse=True): - chunk_list.insert(insert_idx, chunk_val) - chunksize = tuple(chunk_list + [template.full_chunk_shape[-1]]) - else: - chunksize = tuple(final_spatial_chunks + [template.full_chunk_shape[-1]]) - - dimensions = [] - for dim_name in all_dimension_names: - if dim_name not in headers_subset.dtype.names: - continue - dim_unique = np.unique(headers_subset[dim_name]) - dimensions.append(Dimension(coords=dim_unique, name=dim_name)) - - sample_labels = segy_file_info.sample_labels / 1000 # normalize - - if all(sample_labels.astype("int64") == sample_labels): - sample_labels = sample_labels.astype("int64") - - vertical_dim = Dimension(coords=sample_labels, name=template.trace_domain) - dimensions.append(vertical_dim) - - if return_headers: - return dimensions, chunksize, headers_subset - - return dimensions, chunksize - - def project_headers_to_segy_spec(headers: DaskArray, segy_spec: SegySpec) -> DaskArray: """Project stored MDIO trace headers onto the SegySpec trace header layout. diff --git a/tests/unit/ingestion/test_dataset_factory.py b/tests/unit/ingestion/test_dataset_factory.py new file mode 100644 index 00000000..cb32d925 --- /dev/null +++ b/tests/unit/ingestion/test_dataset_factory.py @@ -0,0 +1,128 @@ +"""Tests for the schema-driven ``build_mdio_dataset`` factory.""" + +from __future__ import annotations + +import numpy as np +import pytest + +from mdio.builder.schemas.dtype import ScalarType +from mdio.builder.schemas.v1.units import LengthUnitEnum +from mdio.builder.schemas.v1.units import LengthUnitModel +from mdio.builder.templates.types import CoordinateSpec +from mdio.converters.type_converter import to_structured_type +from mdio.ingestion.dataset_factory import build_mdio_dataset +from mdio.ingestion.schema import DimensionSpec +from mdio.ingestion.schema import ResolvedSchema + + +def _vars_by_name(dataset) -> dict: # noqa: ANN001 + """Index a built Dataset's variables (and coordinates) by name.""" + return {var.name: var for var in dataset.variables} + + +def _dim_names(variable) -> tuple[str, ...]: # noqa: ANN001 + """Return the ordered dimension names of a built Variable.""" + return tuple(dim.name for dim in variable.dimensions) + + +def _chunk_shape(variable) -> tuple[int, ...]: # noqa: ANN001 + """Return the configured chunk shape of a built Variable.""" + return tuple(variable.metadata.chunk_grid.configuration.chunk_shape) + + +@pytest.fixture +def basic_schema() -> ResolvedSchema: + """A minimal 2-spatial-dim + vertical schema with one non-dim coordinate.""" + return ResolvedSchema( + name="Toy3D", + dimensions=[ + DimensionSpec(name="inline", is_spatial=True, dtype=ScalarType.INT32), + DimensionSpec(name="crossline", is_spatial=True, dtype=ScalarType.INT32), + DimensionSpec(name="time", is_spatial=False, dtype=ScalarType.INT16), + ], + coordinates=[ + CoordinateSpec(name="cdp_x", dimensions=("inline", "crossline"), dtype=ScalarType.FLOAT64), + ], + chunk_shape=(2, -1, 4), + default_variable_name="amplitude", + ) + + +class TestBuildMdioDataset: + """Tests for the structure produced by ``build_mdio_dataset``.""" + + def test_builds_dimensions_coordinates_and_data_variable(self, basic_schema: ResolvedSchema) -> None: + """All dimensions, the non-dim coordinate, trace_mask and data variable are created.""" + dataset = build_mdio_dataset(schema=basic_schema, sizes=(2, 3, 4)) + variables = _vars_by_name(dataset) + + assert {"inline", "crossline", "time", "cdp_x", "trace_mask", "amplitude"}.issubset(variables) + assert _dim_names(variables["amplitude"]) == ("inline", "crossline", "time") + assert _dim_names(variables["trace_mask"]) == ("inline", "crossline") + assert _dim_names(variables["cdp_x"]) == ("inline", "crossline") + + def test_resolves_negative_one_chunk_against_sizes(self, basic_schema: ResolvedSchema) -> None: + """A ``-1`` chunk entry is replaced by the actual dimension size.""" + dataset = build_mdio_dataset(schema=basic_schema, sizes=(2, 3, 4)) + amplitude = _vars_by_name(dataset)["amplitude"] + # chunk_shape (2, -1, 4) with sizes (2, 3, 4) -> (2, 3, 4) + assert _chunk_shape(amplitude) == (2, 3, 4) + + def test_default_variable_name_recorded_in_attributes(self, basic_schema: ResolvedSchema) -> None: + """The default variable name is written to dataset attributes.""" + dataset = build_mdio_dataset(schema=basic_schema, sizes=(2, 3, 4)) + assert dataset.metadata.attributes["defaultVariableName"] == "amplitude" + + def test_header_dtype_adds_headers_variable(self, basic_schema: ResolvedSchema) -> None: + """Passing a header dtype adds a ``headers`` variable over the spatial dims only.""" + header_dtype = to_structured_type(np.dtype([("cdp_x", "int32"), ("cdp_y", "int32")])) + dataset = build_mdio_dataset(schema=basic_schema, sizes=(2, 3, 4), header_dtype=header_dtype) + + headers = _vars_by_name(dataset)["headers"] + assert _dim_names(headers) == ("inline", "crossline") + # Headers chunk drops the vertical axis. + assert _chunk_shape(headers) == (2, 3) + + def test_no_headers_variable_when_dtype_missing(self, basic_schema: ResolvedSchema) -> None: + """Without a header dtype, no ``headers`` variable is created.""" + dataset = build_mdio_dataset(schema=basic_schema, sizes=(2, 3, 4)) + assert "headers" not in _vars_by_name(dataset) + + def test_calculated_dimension_has_no_dimension_coordinate(self) -> None: + """A calculated spatial dim (e.g. ``shot_index``) gets no dimension coordinate.""" + schema = ResolvedSchema( + name="Calc", + dimensions=[ + DimensionSpec(name="receiver", is_spatial=True, dtype=ScalarType.UINT32), + DimensionSpec(name="shot_index", is_spatial=True, is_calculated=True, dtype=ScalarType.INT32), + DimensionSpec(name="time", is_spatial=False, dtype=ScalarType.INT16), + ], + coordinates=[], + chunk_shape=(2, 2, 4), + ) + dataset = build_mdio_dataset(schema=schema, sizes=(2, 2, 4)) + variables = _vars_by_name(dataset) + + # shot_index is still a data-variable dimension, but not its own 1-D coordinate. + assert _dim_names(variables["amplitude"]) == ("receiver", "shot_index", "time") + assert "shot_index" not in variables + assert "receiver" in variables + + def test_extra_variables_are_appended(self, basic_schema: ResolvedSchema) -> None: + """Extra variable dicts (e.g. raw_headers) are added to the dataset.""" + extra = [ + { + "name": "raw_headers", + "dimensions": ("inline", "crossline"), + "data_type": ScalarType.BYTES240, + } + ] + dataset = build_mdio_dataset(schema=basic_schema, sizes=(2, 3, 4), extra_variables=extra) + assert "raw_headers" in _vars_by_name(dataset) + + def test_units_attached_to_coordinate(self, basic_schema: ResolvedSchema) -> None: + """Units passed by name are attached to the matching coordinate metadata.""" + meter = LengthUnitModel(length=LengthUnitEnum.METER) + dataset = build_mdio_dataset(schema=basic_schema, sizes=(2, 3, 4), units={"cdp_x": meter}) + cdp_x = _vars_by_name(dataset)["cdp_x"] + assert cdp_x.metadata.units_v1 == meter diff --git a/tests/unit/ingestion/test_metadata.py b/tests/unit/ingestion/test_metadata.py index 2ba7d4c8..3ef7a928 100644 --- a/tests/unit/ingestion/test_metadata.py +++ b/tests/unit/ingestion/test_metadata.py @@ -4,7 +4,7 @@ from types import SimpleNamespace -from mdio.ingestion.metadata import _add_grid_override_to_metadata +from mdio.ingestion.metadata import add_grid_override_to_metadata from mdio.segy.geometry import GridOverrides @@ -14,19 +14,19 @@ def _make_dataset(attributes: dict | None) -> SimpleNamespace: class TestAddGridOverrideToMetadata: - """Tests for ``_add_grid_override_to_metadata``.""" + """Tests for ``add_grid_override_to_metadata``.""" def test_initializes_attributes_dict_when_none(self) -> None: """A ``None`` attributes dict gets replaced with an empty dict before insertion.""" dataset = _make_dataset(attributes=None) - _add_grid_override_to_metadata(dataset, grid_overrides=None) + add_grid_override_to_metadata(dataset, grid_overrides=None) assert dataset.metadata.attributes == {} def test_adds_grid_overrides_when_provided(self) -> None: """Active grid overrides should serialize under the ``gridOverrides`` key.""" dataset = _make_dataset(attributes=None) overrides = GridOverrides(has_duplicates=True, chunksize=4) - _add_grid_override_to_metadata(dataset, grid_overrides=overrides) + add_grid_override_to_metadata(dataset, grid_overrides=overrides) assert dataset.metadata.attributes == { "gridOverrides": {"HasDuplicates": True, "chunksize": 4}, } @@ -35,7 +35,7 @@ def test_preserves_existing_attributes(self) -> None: """Existing attribute keys should be preserved when adding overrides.""" dataset = _make_dataset(attributes={"existing": "value"}) overrides = GridOverrides(non_binned=True) - _add_grid_override_to_metadata(dataset, grid_overrides=overrides) + add_grid_override_to_metadata(dataset, grid_overrides=overrides) assert dataset.metadata.attributes == { "existing": "value", "gridOverrides": {"NonBinned": True}, @@ -44,5 +44,5 @@ def test_preserves_existing_attributes(self) -> None: def test_no_overrides_leaves_attributes_untouched(self) -> None: """Passing ``None`` overrides must not introduce a ``gridOverrides`` key.""" dataset = _make_dataset(attributes={"existing": "value"}) - _add_grid_override_to_metadata(dataset, grid_overrides=None) + add_grid_override_to_metadata(dataset, grid_overrides=None) assert dataset.metadata.attributes == {"existing": "value"} diff --git a/tests/unit/ingestion/test_no_template_mutation.py b/tests/unit/ingestion/test_no_template_mutation.py new file mode 100644 index 00000000..ce634764 --- /dev/null +++ b/tests/unit/ingestion/test_no_template_mutation.py @@ -0,0 +1,36 @@ +"""Guardrail: ingestion must never mutate template internals. + +The pre-v1.2 ingestion path mutated templates in place (reassigning ``_dim_names``, +``_var_chunk_shape``, ``_logical_coord_names``) and even monkeypatched template methods at +runtime. The refactor replaced that with an immutable ``ResolvedSchema``. This test fails +loudly if any ingestion module reintroduces template-internals mutation or method rebinding. +""" + +from __future__ import annotations + +import re +from pathlib import Path + +import mdio + +# Assignment to a private template attribute, e.g. ``template._dim_names = ...`` or +# rebinding a method/attribute on a template object, e.g. ``template._add_coordinates = ...``. +_MUTATION = re.compile(r"\b(?:template|mdio_template)\.\w+\s*=(?!=)") + +_INGESTION_ROOT = Path(mdio.__file__).parent / "ingestion" + + +def _ingestion_sources() -> list[Path]: + return sorted(p for p in _INGESTION_ROOT.rglob("*.py") if "__pycache__" not in p.parts) + + +def test_ingestion_never_mutates_template() -> None: + """No ingestion module assigns to a template attribute (mutation/monkeypatch guard).""" + offenders: list[str] = [] + for path in _ingestion_sources(): + for lineno, line in enumerate(path.read_text().splitlines(), start=1): + code = line.split("#", 1)[0] + if _MUTATION.search(code): + offenders.append(f"{path.name}:{lineno}: {line.strip()}") + + assert not offenders, "Ingestion must not mutate template internals:\n" + "\n".join(offenders) diff --git a/tests/unit/ingestion/test_schema_effects.py b/tests/unit/ingestion/test_schema_effects.py new file mode 100644 index 00000000..5bd4a224 --- /dev/null +++ b/tests/unit/ingestion/test_schema_effects.py @@ -0,0 +1,112 @@ +"""Unit tests for grid-override schema effects and their registry selection.""" + +from __future__ import annotations + +from mdio.builder.schemas.dtype import ScalarType +from mdio.builder.templates.types import CoordinateSpec +from mdio.ingestion.schema import CollapseToTraceEffect +from mdio.ingestion.schema import DimensionSpec +from mdio.ingestion.schema import InsertTraceDimEffect +from mdio.ingestion.schema import ResolvedSchema +from mdio.ingestion.segy.index_strategies import IndexStrategyRegistry +from mdio.segy.geometry import GridOverrides + + +def _schema() -> ResolvedSchema: + """A 3-spatial-dim + vertical schema with a coordinate spanning two spatial dims.""" + return ResolvedSchema( + name="Toy", + dimensions=[ + DimensionSpec(name="shot_point", is_spatial=True, dtype=ScalarType.UINT32), + DimensionSpec(name="cable", is_spatial=True, dtype=ScalarType.UINT8), + DimensionSpec(name="channel", is_spatial=True, dtype=ScalarType.UINT16), + DimensionSpec(name="time", is_spatial=False, dtype=ScalarType.INT32), + ], + coordinates=[ + CoordinateSpec( + name="group_coord_x", + dimensions=("shot_point", "cable", "channel"), + dtype=ScalarType.FLOAT64, + ), + ], + chunk_shape=(8, 1, 128, 2048), + ) + + +class TestRegistrySchemaEffectSelection: + """The registry maps GridOverrides to the matching SchemaEffect (single source).""" + + def test_no_overrides_returns_none(self) -> None: + """A None or empty override yields no schema effect.""" + registry = IndexStrategyRegistry() + assert registry.schema_effect(None) is None + assert registry.schema_effect(GridOverrides()) is None + + def test_header_only_overrides_return_none(self) -> None: + """Overrides that only transform headers (no layout change) yield no effect.""" + registry = IndexStrategyRegistry() + assert registry.schema_effect(GridOverrides(auto_channel_wrap=True)) is None + + def test_non_binned_default_collapse(self) -> None: + """NonBinned without explicit dims yields a default (None) CollapseToTraceEffect.""" + effect = IndexStrategyRegistry().schema_effect(GridOverrides(non_binned=True, chunksize=64)) + assert isinstance(effect, CollapseToTraceEffect) + assert effect.chunksize == 64 + assert effect.collapse_dims is None + + def test_non_binned_explicit_dims(self) -> None: + """NonBinned with explicit dims preserves them on the effect.""" + overrides = GridOverrides(non_binned=True, chunksize=128, non_binned_dims=["channel"]) + effect = IndexStrategyRegistry().schema_effect(overrides) + assert isinstance(effect, CollapseToTraceEffect) + assert effect.collapse_dims == ("channel",) + + def test_has_duplicates_inserts_trace(self) -> None: + """HasDuplicates yields a 1-wide InsertTraceDimEffect.""" + effect = IndexStrategyRegistry().schema_effect(GridOverrides(has_duplicates=True)) + assert isinstance(effect, InsertTraceDimEffect) + assert effect.chunksize == 1 + + +class TestInsertTraceDimEffect: + """InsertTraceDimEffect adds a calculated trace dim before the vertical axis.""" + + def test_inserts_trace_before_vertical(self) -> None: + """A 1-wide calculated trace dim is inserted; coordinates are untouched.""" + result = InsertTraceDimEffect(chunksize=1).apply(_schema()) + assert [d.name for d in result.dimensions] == ["shot_point", "cable", "channel", "trace", "time"] + assert result.chunk_shape == (8, 1, 128, 1, 2048) + trace = next(d for d in result.dimensions if d.name == "trace") + assert trace.is_calculated is True + # Coordinates are unchanged by duplicate handling. + assert result.coordinates[0].dimensions == ("shot_point", "cable", "channel") + + +class TestCollapseToTraceEffect: + """CollapseToTraceEffect collapses spatial dims into a single trace dim.""" + + def test_default_collapses_all_but_first(self) -> None: + """A None collapse set keeps the first spatial dim and collapses the rest.""" + result = CollapseToTraceEffect(chunksize=64, collapse_dims=None).apply(_schema()) + assert [d.name for d in result.dimensions] == ["shot_point", "trace", "time"] + assert result.chunk_shape == (8, 64, 2048) + + def test_explicit_collapse_dims(self) -> None: + """Only the named dims collapse; the coordinate is rewritten onto trace.""" + result = CollapseToTraceEffect(chunksize=128, collapse_dims=("channel",)).apply(_schema()) + assert [d.name for d in result.dimensions] == ["shot_point", "cable", "trace", "time"] + group_coord_x = next(c for c in result.coordinates if c.name == "group_coord_x") + assert group_coord_x.dimensions == ("shot_point", "cable", "trace") + + def test_collapsed_dim_becomes_trace_coordinate(self) -> None: + """A collapsed dimension is re-expressed as a coordinate over trace with its dtype.""" + result = CollapseToTraceEffect(chunksize=64, collapse_dims=("channel",)).apply(_schema()) + channel = next(c for c in result.coordinates if c.name == "channel") + assert channel.dimensions == ("trace",) + assert channel.dtype == ScalarType.UINT16 + + def test_empty_collapse_is_noop_for_dimensions(self) -> None: + """An empty collapse set inserts no trace dim and keeps the original layout.""" + result = CollapseToTraceEffect(chunksize=64, collapse_dims=()).apply(_schema()) + assert [d.name for d in result.dimensions] == ["shot_point", "cable", "channel", "time"] + assert result.chunk_shape == (8, 1, 128, 2048) diff --git a/tests/unit/test_ingestion_schema_resolver.py b/tests/unit/ingestion/test_schema_resolver.py similarity index 73% rename from tests/unit/test_ingestion_schema_resolver.py rename to tests/unit/ingestion/test_schema_resolver.py index 365b3df2..64fc469a 100644 --- a/tests/unit/test_ingestion_schema_resolver.py +++ b/tests/unit/ingestion/test_schema_resolver.py @@ -5,7 +5,9 @@ from mdio.builder.templates.seismic_3d_cdp import Seismic3DCdpGathersTemplate from mdio.builder.templates.seismic_3d_obn import Seismic3DObnReceiverGathersTemplate from mdio.builder.templates.seismic_3d_streamer_shot import Seismic3DStreamerShotGathersTemplate -from mdio.ingestion.schema_resolver import SchemaResolver +from mdio.ingestion.schema import DimensionSpec +from mdio.ingestion.schema import ResolvedSchema +from mdio.ingestion.schema.resolver import SchemaResolver from mdio.segy.geometry import GridOverrides @@ -77,13 +79,12 @@ def test_coordinate_dimensions_collapsed_when_referenced(self) -> None: group_coord_x = next(c for c in schema.coordinates if c.name == "group_coord_x") assert group_coord_x.dimensions == ("shot_point", "trace") - def test_non_binned_flag_recorded_in_metadata(self) -> None: - """The NonBinned flag is recorded under ``gridOverrides`` metadata.""" + def test_grid_override_provenance_not_in_schema_metadata(self) -> None: + """The resolver is mechanics-only: override provenance is attached at the dataset level.""" template = Seismic3DStreamerShotGathersTemplate(data_domain="time") overrides = GridOverrides(non_binned=True, chunksize=64) schema = SchemaResolver().resolve(template, overrides) - assert "gridOverrides" in schema.metadata - assert schema.metadata["gridOverrides"].get("NonBinned") is True + assert "gridOverrides" not in schema.metadata class TestSchemaResolverHasDuplicates: @@ -100,8 +101,40 @@ def test_inserts_trace_dim_with_chunksize_one(self) -> None: # before the vertical dim. assert schema.chunk_shape == (8, 1, 128, 1, 2048) - def test_has_duplicates_metadata(self) -> None: - """The HasDuplicates flag is recorded under ``gridOverrides`` metadata.""" + def test_has_duplicates_provenance_not_in_schema_metadata(self) -> None: + """The resolver does not record override provenance in schema metadata.""" template = Seismic3DStreamerShotGathersTemplate(data_domain="time") schema = SchemaResolver().resolve(template, GridOverrides(has_duplicates=True)) - assert schema.metadata["gridOverrides"].get("HasDuplicates") is True + assert "gridOverrides" not in schema.metadata + + +class TestMissingCalculatedDimensions: + """Tests for ``ResolvedSchema.missing_calculated_dimensions``.""" + + def _schema(self) -> ResolvedSchema: + return ResolvedSchema( + name="Calc", + dimensions=[ + DimensionSpec(name="receiver", is_spatial=True), + DimensionSpec(name="shot_index", is_spatial=True, is_calculated=True), + DimensionSpec(name="time", is_spatial=False), + ], + coordinates=[], + chunk_shape=(2, 2, 4), + ) + + def test_reports_missing_calculated_dim(self) -> None: + """A calculated dim absent from produced names is reported.""" + schema = self._schema() + assert schema.missing_calculated_dimensions(["receiver", "time"]) == ["shot_index"] + + def test_empty_when_calculated_dim_produced(self) -> None: + """Nothing is missing once the calculated dim is produced.""" + schema = self._schema() + assert schema.missing_calculated_dimensions(["receiver", "shot_index", "time"]) == [] + + def test_non_calculated_dims_are_never_reported(self) -> None: + """A missing non-calculated spatial dim is not flagged by this check.""" + schema = self._schema() + # 'receiver' is read from headers, not calculated, so its absence is not reported here. + assert schema.missing_calculated_dimensions(["time"]) == ["shot_index"] diff --git a/tests/unit/ingestion/test_segy_coordinates.py b/tests/unit/ingestion/test_segy_coordinates.py index 84962986..6a52d3f5 100644 --- a/tests/unit/ingestion/test_segy_coordinates.py +++ b/tests/unit/ingestion/test_segy_coordinates.py @@ -16,45 +16,14 @@ from mdio.builder.schemas.v1.units import LengthUnitEnum from mdio.builder.schemas.v1.units import LengthUnitModel from mdio.builder.templates.base import AbstractDatasetTemplate -from mdio.builder.templates.seismic_3d_poststack import Seismic3DPostStackTemplate -from mdio.ingestion.segy.coordinates import _get_coordinates -from mdio.ingestion.segy.coordinates import _get_spatial_coordinate_unit -from mdio.ingestion.segy.coordinates import _populate_coordinates -from mdio.ingestion.segy.coordinates import _update_template_units -from tests.unit.ingestion.testing_helpers import make_grid +from mdio.ingestion.segy.coordinates import get_spatial_coordinate_unit +from mdio.ingestion.segy.coordinates import populate_coordinates +from mdio.ingestion.segy.coordinates import resolve_units from tests.unit.ingestion.testing_helpers import make_grid_with_map -from tests.unit.ingestion.testing_helpers import make_header_array - - -class TestGetCoordinates: - """Tests for ``_get_coordinates``.""" - - def test_returns_dims_and_coords_in_template_order(self) -> None: - """Dim coords and non-dim coords should follow the template's declared order.""" - inline = np.array([1, 2, 3], dtype=np.int32) - crossline = np.array([10, 20], dtype=np.int32) - sample = np.array([0, 4, 8, 12], dtype=np.int32) - grid = make_grid([("inline", inline), ("crossline", crossline), ("time", sample)]) - - n = inline.size * crossline.size - cdp_x = np.arange(n, dtype=np.float64) - cdp_y = np.arange(n, dtype=np.float64) + 100.0 - headers = make_header_array({"cdp_x": cdp_x, "cdp_y": cdp_y}) - - template = Seismic3DPostStackTemplate(data_domain="time") - - dim_coords, non_dim = _get_coordinates(grid, headers, template) - - assert [d.name for d in dim_coords] == ["inline", "crossline", "time"] - np.testing.assert_array_equal(dim_coords[0].coords, inline) - np.testing.assert_array_equal(dim_coords[1].coords, crossline) - assert list(non_dim.keys()) == ["cdp_x", "cdp_y"] - np.testing.assert_array_equal(non_dim["cdp_x"], cdp_x) - np.testing.assert_array_equal(non_dim["cdp_y"], cdp_y) class TestPopulateCoordinates: - """Tests for the ``_populate_coordinates`` wrapper. + """Tests for the ``populate_coordinates`` wrapper. These pin the contract that wraps ``populate_dim_coordinates`` + ``populate_non_dim_coordinates``: dim names land in ``drop_vars`` before coord @@ -86,7 +55,7 @@ def test_wraps_dim_and_non_dim_population_in_order(self) -> None: } ) - dataset, drop_vars = _populate_coordinates( + dataset, drop_vars = populate_coordinates( dataset, grid, coords={"cdp_x": cdp_x}, @@ -102,7 +71,7 @@ def test_wraps_dim_and_non_dim_population_in_order(self) -> None: class TestGetSpatialCoordinateUnit: - """Tests for ``_get_spatial_coordinate_unit``.""" + """Tests for ``get_spatial_coordinate_unit``.""" @pytest.mark.parametrize( ("code", "expected_unit"), @@ -114,7 +83,7 @@ class TestGetSpatialCoordinateUnit: def test_known_measurement_codes(self, code: int, expected_unit: LengthUnitEnum) -> None: """Codes 1 (m) and 2 (ft) return the corresponding length unit.""" info = SimpleNamespace(binary_header_dict={"measurement_system_code": code}) - result = _get_spatial_coordinate_unit(info) + result = get_spatial_coordinate_unit(info) assert isinstance(result, LengthUnitModel) assert result.length == expected_unit @@ -122,40 +91,36 @@ def test_unknown_code_returns_none_and_warns(self, caplog: pytest.LogCaptureFixt """Unexpected codes should log a warning and return ``None``.""" info = SimpleNamespace(binary_header_dict={"measurement_system_code": 7}) with caplog.at_level(logging.WARNING, logger="mdio.ingestion.segy.coordinates"): - result = _get_spatial_coordinate_unit(info) + result = get_spatial_coordinate_unit(info) assert result is None assert any("Unexpected value in coordinate unit" in r.message for r in caplog.records) -class TestUpdateTemplateUnits: - """Tests for ``_update_template_units``.""" +class TestResolveUnits: + """Tests for ``resolve_units``.""" def _stub_template(self) -> MagicMock: template = MagicMock(spec=AbstractDatasetTemplate) - template.get_unit_by_key.return_value = None + template.units = {} return template def test_adds_angle_units_only_when_spatial_unit_missing(self) -> None: """Without a spatial unit, only angle units should be added.""" template = self._stub_template() - result = _update_template_units(template, unit=None) + result = resolve_units(template, unit=None) - template.add_units.assert_called_once() - added = template.add_units.call_args.args[0] - assert set(added.keys()) == {"angle", "azimuth"} - for unit in added.values(): + assert set(result.keys()) == {"angle", "azimuth"} + for unit in result.values(): assert isinstance(unit, AngleUnitModel) assert unit.angle == AngleUnitEnum.DEGREES - assert result is template def test_adds_spatial_units_when_unit_provided(self) -> None: """A non-None unit should populate all SPATIAL keys plus angle keys.""" template = self._stub_template() unit = LengthUnitModel(length=LengthUnitEnum.METER) - _update_template_units(template, unit=unit) + result = resolve_units(template, unit=unit) - added = template.add_units.call_args.args[0] expected_keys = { "angle", "azimuth", @@ -167,9 +132,9 @@ def test_adds_spatial_units_when_unit_provided(self) -> None: "group_coord_y", "offset", } - assert set(added.keys()) == expected_keys + assert set(result.keys()) == expected_keys for key in ("cdp_x", "cdp_y", "source_coord_x", "source_coord_y", "group_coord_x", "group_coord_y", "offset"): - assert added[key] is unit + assert result[key] is unit def test_preserves_pre_existing_spatial_units(self, caplog: pytest.LogCaptureFixture) -> None: """Keys that already have a template unit must not be overwritten.""" @@ -177,16 +142,11 @@ def test_preserves_pre_existing_spatial_units(self, caplog: pytest.LogCaptureFix new_unit = LengthUnitModel(length=LengthUnitEnum.METER) template = MagicMock(spec=AbstractDatasetTemplate) - - def fake_lookup(key: str) -> LengthUnitModel | None: - return existing if key == "cdp_x" else None - - template.get_unit_by_key.side_effect = fake_lookup + template.units = {"cdp_x": existing} with caplog.at_level(logging.WARNING, logger="mdio.ingestion.segy.coordinates"): - _update_template_units(template, unit=new_unit) + result = resolve_units(template, unit=new_unit) - added = template.add_units.call_args.args[0] - assert "cdp_x" not in added - assert added["cdp_y"] is new_unit + assert result["cdp_x"] is existing + assert result["cdp_y"] is new_unit assert any("already in template" in r.message for r in caplog.records) diff --git a/tests/unit/ingestion/test_segy_file_headers.py b/tests/unit/ingestion/test_segy_file_headers.py index 39db2339..97480060 100644 --- a/tests/unit/ingestion/test_segy_file_headers.py +++ b/tests/unit/ingestion/test_segy_file_headers.py @@ -12,7 +12,7 @@ from xarray import DataArray as xr_DataArray from xarray import Dataset as xr_Dataset -from mdio.ingestion.segy.file_headers import _add_segy_file_headers +from mdio.ingestion.segy.file_headers import add_segy_file_headers def _valid_text_header() -> str: @@ -37,14 +37,14 @@ def _empty_dataset() -> xr_Dataset: class TestAddSegyFileHeaders: - """Tests for ``_add_segy_file_headers``.""" + """Tests for ``add_segy_file_headers``.""" def test_disabled_returns_dataset_unchanged(self) -> None: """When the save flag is off the dataset must not be modified.""" info = _make_segy_info() ds = _empty_dataset() with patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "false"}): - result = _add_segy_file_headers(ds, info) + result = add_segy_file_headers(ds, info) assert "segy_file_header" not in result @@ -53,7 +53,7 @@ def test_attaches_headers_when_enabled(self) -> None: info = _make_segy_info() ds = _empty_dataset() with patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "true"}): - result = _add_segy_file_headers(ds, info) + result = add_segy_file_headers(ds, info) attrs = result["segy_file_header"].attrs assert attrs["textHeader"] == info.text_header @@ -69,7 +69,7 @@ def test_attaches_raw_binary_when_raw_flag_enabled(self) -> None: "MDIO__IMPORT__RAW_HEADERS": "true", } with patch.dict(os.environ, env): - result = _add_segy_file_headers(ds, info) + result = add_segy_file_headers(ds, info) encoded = base64.b64encode(b"abc").decode("ascii") assert result["segy_file_header"].attrs["rawBinaryHeader"] == encoded @@ -83,7 +83,7 @@ def test_invalid_row_count_raises(self) -> None: patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "true"}), pytest.raises(ValueError, match="Invalid text header line count"), ): - _add_segy_file_headers(ds, info) + add_segy_file_headers(ds, info) def test_invalid_column_count_raises(self) -> None: """Text header rows shorter than 80 chars must raise.""" @@ -95,4 +95,4 @@ def test_invalid_column_count_raises(self) -> None: patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "true"}), pytest.raises(ValueError, match="Invalid text header line widths"), ): - _add_segy_file_headers(ds, info) + add_segy_file_headers(ds, info) diff --git a/tests/unit/test_segy_grid_overrides.py b/tests/unit/ingestion/test_segy_grid_overrides.py similarity index 74% rename from tests/unit/test_segy_grid_overrides.py rename to tests/unit/ingestion/test_segy_grid_overrides.py index b7c862c8..34055fcc 100644 --- a/tests/unit/test_segy_grid_overrides.py +++ b/tests/unit/ingestion/test_segy_grid_overrides.py @@ -15,8 +15,12 @@ from numpy.testing import assert_array_equal from mdio.core import Dimension +from mdio.ingestion.segy.index_strategies import IndexStrategyRegistry +from mdio.segy.exceptions import GridOverrideMissingParameterError from mdio.segy.exceptions import GridOverrideUnknownError -from mdio.segy.geometry import GridOverrider +from mdio.segy.geometry import GridOverrides +from mdio.segy.geometry import _resolve_synthesize_dims +from mdio.segy.geometry import _validate_template_for_overrides if TYPE_CHECKING: from mdio.builder.templates.base import AbstractDatasetTemplate @@ -32,10 +36,58 @@ def run_override( headers: npt.NDArray, chunksize: tuple[int, ...] | None = None, template: AbstractDatasetTemplate | None = None, -) -> tuple[dict[str, Any], tuple[str], tuple[int]]: - """Initialize and run overrider.""" - overrider = GridOverrider() - return overrider.run(headers, index_names, grid_overrides, chunksize, template=template) +) -> tuple[npt.NDArray, tuple[str, ...], tuple[int, ...] | None]: + """Initialize and run overrider using IndexStrategyRegistry.""" + grid_overrides = grid_overrides or {} + + field_names = set(GridOverrides.model_fields.keys()) + aliases = {field.alias for field in GridOverrides.model_fields.values() if field.alias} + valid_keys = field_names | aliases + for key in grid_overrides: + if key not in valid_keys: + raise GridOverrideUnknownError(key) + + config = GridOverrides.model_validate(grid_overrides) + + if config.non_binned: + missing: set[str] = set() + if config.chunksize is None: + missing.add("chunksize") + if not config.non_binned_dims: + missing.add("non_binned_dims") + if missing: + command = "NonBinned" + raise GridOverrideMissingParameterError(command, missing) + + _validate_template_for_overrides(config, template) + + synthesize_dims = _resolve_synthesize_dims(template) + registry = IndexStrategyRegistry() + strategy = registry.create_strategy( + grid_overrides=config, + synthesize_dims=synthesize_dims, + template=template, + ) + + strategy.validate_headers(headers) + new_headers = strategy.transform_headers(headers) + + new_names = list(index_names) + new_chunks = list(chunksize) if chunksize is not None else None + + # Both NonBinned and HasDuplicates add a 'trace' dim at index -1; HasDuplicates + # always uses chunksize 1, NonBinned uses the user-supplied value. + if config.non_binned or config.has_duplicates: + new_names.append("trace") + if new_chunks is not None: + inserted_chunk = config.chunksize if config.non_binned else 1 + new_chunks.insert(-1, inserted_chunk) + + return ( + new_headers, + tuple(new_names), + tuple(new_chunks) if new_chunks is not None else None, + ) def get_dims(headers: npt.NDArray) -> list[Dimension]: @@ -143,9 +195,8 @@ def test_unknown_override( """Test exception if user provides a command that's not allowed.""" index_names = ("shot_point", "cable", "channel") chunksize = None - overrider = GridOverrider() with pytest.raises(GridOverrideUnknownError): - overrider.run(mock_streamer_headers, index_names, {"WrongCommand": True}, chunksize) + run_override({"WrongCommand": True}, index_names, mock_streamer_headers, chunksize) # OBN test fixtures and tests @@ -159,9 +210,10 @@ class TestObnGridOverrides: """Check grid overrides for OBN (Ocean Bottom Node) data. Note: The synthetic component behavior (when SEG-Y spec doesn't have a component field) - is handled by the template's internal `_synthetic_defaults` attribute in `utilities.get_grid_plan()`, - not by a grid override. See integration test `test_import_obn_synthetic_component` for - full flow coverage. These unit tests focus on grid override functionality. + is handled by ``ComponentSynthesisStrategy`` (driven by the template's + ``synthesize_missing_dims``) during header reading, not by a grid override. See integration + test `test_import_obn_synthetic_component` for full flow coverage. These unit tests focus on + grid override functionality. """ def test_calculate_shot_index_obn(self) -> None: diff --git a/tests/unit/test_ingestion_index_strategies.py b/tests/unit/ingestion/test_segy_index_strategies.py similarity index 93% rename from tests/unit/test_ingestion_index_strategies.py rename to tests/unit/ingestion/test_segy_index_strategies.py index 0cac710c..70384ed3 100644 --- a/tests/unit/test_ingestion_index_strategies.py +++ b/tests/unit/ingestion/test_segy_index_strategies.py @@ -1,6 +1,6 @@ """Unit tests for the v1.2 ingestion index strategies and the strategy registry. -These tests exercise individual :class:`mdio.ingestion.index_strategies.IndexStrategy` +These tests exercise individual :class:`mdio.ingestion.segy.index_strategies.IndexStrategy` subclasses with synthetic structured numpy arrays (mimicking the shape semantics of :class:`segy.arrays.HeaderArray`) so they remain fast and do not require any real SEG-Y data. @@ -8,21 +8,52 @@ from __future__ import annotations +from typing import TYPE_CHECKING +from typing import Any + import numpy as np import pytest from mdio.builder.template_registry import TemplateRegistry -from mdio.ingestion.index_strategies import ChannelWrappingStrategy -from mdio.ingestion.index_strategies import ComponentSynthesisStrategy -from mdio.ingestion.index_strategies import CompositeStrategy -from mdio.ingestion.index_strategies import DuplicateHandlingStrategy -from mdio.ingestion.index_strategies import IndexStrategyRegistry -from mdio.ingestion.index_strategies import NonBinnedStrategy -from mdio.ingestion.index_strategies import RegularGridStrategy -from mdio.ingestion.index_strategies import ShotWrappingStrategy +from mdio.ingestion.segy.index_strategies import ChannelWrappingStrategy + +if TYPE_CHECKING: + from mdio.builder.templates.base import AbstractDatasetTemplate +from mdio.ingestion.segy.index_strategies import ComponentSynthesisStrategy +from mdio.ingestion.segy.index_strategies import CompositeStrategy +from mdio.ingestion.segy.index_strategies import DuplicateHandlingStrategy +from mdio.ingestion.segy.index_strategies import IndexStrategyRegistry +from mdio.ingestion.segy.index_strategies import NonBinnedStrategy +from mdio.ingestion.segy.index_strategies import RegularGridStrategy +from mdio.ingestion.segy.index_strategies import ShotWrappingStrategy from mdio.segy.exceptions import GridOverrideKeysError -from mdio.segy.geometry import GridOverrider from mdio.segy.geometry import GridOverrides +from mdio.segy.geometry import _resolve_synthesize_dims +from mdio.segy.geometry import _validate_template_for_overrides + + +class GridOverrider: + """Mock GridOverrider shim for template and keys validation tests.""" + + def run( + self, + headers: np.ndarray, + index_names: tuple[str, ...], # noqa: ARG002 + grid_overrides: dict[str, Any], + template: AbstractDatasetTemplate | None, + ) -> np.ndarray: + """Run mock strategy validation and transform.""" + config = GridOverrides.model_validate(grid_overrides) + _validate_template_for_overrides(config, template) + synthesize_dims = _resolve_synthesize_dims(template) + registry = IndexStrategyRegistry() + strategy = registry.create_strategy( + grid_overrides=config, + synthesize_dims=synthesize_dims, + template=template, + ) + strategy.validate_headers(headers) + return strategy.transform_headers(headers) def _make_struct(data: dict[str, np.ndarray]) -> np.ndarray: diff --git a/tests/unit/ingestion/test_segy_pipeline.py b/tests/unit/ingestion/test_segy_pipeline.py new file mode 100644 index 00000000..296887e7 --- /dev/null +++ b/tests/unit/ingestion/test_segy_pipeline.py @@ -0,0 +1,90 @@ +"""Unit tests for the slim ingestion pipeline helpers.""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from mdio.ingestion.schema import DimensionSpec +from mdio.ingestion.schema import ResolvedSchema +from mdio.ingestion.segy import pipeline +from mdio.ingestion.segy.raw_headers import build_raw_header_variables + + +def _schema(dimensions: list[DimensionSpec]) -> ResolvedSchema: + return ResolvedSchema( + name="Toy", + dimensions=dimensions, + coordinates=[], + chunk_shape=tuple(2 for _ in dimensions), + ) + + +class TestResolveOutputPath: + """Tests for ``_resolve_output_path`` overwrite enforcement.""" + + def test_returns_path_when_absent(self, tmp_path) -> None: # noqa: ANN001 + """A non-existent location is returned normalized.""" + target = tmp_path / "out.mdio" + result = pipeline._resolve_output_path(str(target), overwrite=False) + assert result.as_posix().endswith("out.mdio") + + def test_raises_when_exists_without_overwrite(self, tmp_path) -> None: # noqa: ANN001 + """An existing location without overwrite raises FileExistsError.""" + target = tmp_path / "out.mdio" + target.mkdir() + with pytest.raises(FileExistsError, match="overwrite=True"): + pipeline._resolve_output_path(str(target), overwrite=False) + + def test_allows_existing_with_overwrite(self, tmp_path) -> None: # noqa: ANN001 + """An existing location with overwrite is allowed.""" + target = tmp_path / "out.mdio" + target.mkdir() + result = pipeline._resolve_output_path(str(target), overwrite=True) + assert result.as_posix().endswith("out.mdio") + + +class TestVerifyCalculatedDimensions: + """Tests for ``_verify_calculated_dimensions``.""" + + def test_passes_when_calculated_dim_present(self) -> None: + """No error when every calculated dim was produced.""" + schema = _schema( + [ + DimensionSpec(name="receiver", is_spatial=True), + DimensionSpec(name="shot_index", is_spatial=True, is_calculated=True), + DimensionSpec(name="time", is_spatial=False), + ] + ) + produced = [SimpleNamespace(name=n) for n in ("receiver", "shot_index", "time")] + pipeline._verify_calculated_dimensions(schema, produced, "Toy") + + def test_raises_when_calculated_dim_missing(self) -> None: + """A missing calculated dim raises a descriptive ValueError.""" + schema = _schema( + [ + DimensionSpec(name="receiver", is_spatial=True), + DimensionSpec(name="shot_index", is_spatial=True, is_calculated=True), + DimensionSpec(name="time", is_spatial=False), + ] + ) + produced = [SimpleNamespace(name=n) for n in ("receiver", "time")] + with pytest.raises(ValueError, match="shot_index"): + pipeline._verify_calculated_dimensions(schema, produced, "Toy") + + +class TestBuildRawHeaderVariables: + """Tests for the isolated experimental raw-headers feature.""" + + def test_disabled_by_default(self, monkeypatch: pytest.MonkeyPatch) -> None: + """With the feature flag off, no extra variables are produced.""" + monkeypatch.delenv("MDIO__IMPORT__RAW_HEADERS", raising=False) + schema = _schema( + [ + DimensionSpec(name="inline", is_spatial=True), + DimensionSpec(name="crossline", is_spatial=True), + DimensionSpec(name="time", is_spatial=False), + ] + ) + assert build_raw_header_variables(schema) == [] diff --git a/tests/unit/ingestion/test_segy_reader.py b/tests/unit/ingestion/test_segy_reader.py new file mode 100644 index 00000000..5017902e --- /dev/null +++ b/tests/unit/ingestion/test_segy_reader.py @@ -0,0 +1,136 @@ +"""Tests for ``read_index_headers``: header subset selection, strategy, dim building.""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import patch + +import numpy as np + +from mdio.builder.schemas.dtype import ScalarType +from mdio.builder.templates.types import CoordinateSpec +from mdio.ingestion.schema import DimensionSpec +from mdio.ingestion.schema import ResolvedSchema +from mdio.ingestion.segy import reader +from tests.unit.ingestion.testing_helpers import make_header_array + +_READER = "mdio.ingestion.segy.reader" + + +def _spec_with_fields(*names: str) -> SimpleNamespace: + """Build a fake SegySpec exposing ``spec.trace.header.fields`` with the given names.""" + fields = [SimpleNamespace(name=name) for name in names] + return SimpleNamespace(trace=SimpleNamespace(header=SimpleNamespace(fields=fields))) + + +def _file_info(num_traces: int, sample_labels: np.ndarray) -> SimpleNamespace: + """Build a fake SegyFileInfo carrying only the attributes the reader touches.""" + return SimpleNamespace(num_traces=num_traces, sample_labels=sample_labels) + + +def _schema(dimensions: list[DimensionSpec], coordinates: list[CoordinateSpec]) -> ResolvedSchema: + return ResolvedSchema( + name="ReaderToy", + dimensions=dimensions, + coordinates=coordinates, + chunk_shape=tuple(-1 for _ in dimensions), + ) + + +class TestReadIndexHeaders: + """Tests for the decomposed SEG-Y index-header reader.""" + + def test_regular_path_builds_spatial_and_vertical_dims(self) -> None: + """No overrides: spatial dims come from unique header values, vertical from samples.""" + schema = _schema( + dimensions=[ + DimensionSpec(name="inline", is_spatial=True), + DimensionSpec(name="crossline", is_spatial=True), + DimensionSpec(name="time", is_spatial=False), + ], + coordinates=[CoordinateSpec(name="cdp_x", dimensions=("inline", "crossline"), dtype=ScalarType.FLOAT64)], + ) + headers = make_header_array( + { + "inline": np.array([1, 1, 2, 2], dtype=np.int32), + "crossline": np.array([10, 11, 10, 11], dtype=np.int32), + "cdp_x": np.array([0.0, 1.0, 2.0, 3.0]), + } + ) + segy_file_kwargs = {"spec": _spec_with_fields("inline", "crossline", "cdp_x", "coordinate_scalar")} + + with patch(f"{_READER}.parse_headers", return_value=headers) as mock_parse: + indexed, dimensions = reader.read_index_headers( + segy_file_kwargs=segy_file_kwargs, + file_info=_file_info(4, np.array([0, 2000, 4000])), + schema=schema, + grid_overrides=None, + synthesize_dims=(), + ) + + # Only spec-present required fields are parsed. + subset = set(mock_parse.call_args.kwargs["subset"]) + assert subset == {"inline", "crossline", "cdp_x", "coordinate_scalar"} + + names = [d.name for d in dimensions] + assert names == ["inline", "crossline", "time"] + assert dimensions[0].coords.tolist() == [1, 2] + assert dimensions[1].coords.tolist() == [10, 11] + # sample labels normalized by 1000 and cast to int when integral. + assert dimensions[-1].coords.tolist() == [0, 2, 4] + assert indexed is headers + + def test_subset_excludes_fields_absent_from_spec(self) -> None: + """Required fields not in the SEG-Y spec are dropped from the parse subset.""" + schema = _schema( + dimensions=[ + DimensionSpec(name="inline", is_spatial=True), + DimensionSpec(name="component", is_spatial=True), + DimensionSpec(name="time", is_spatial=False), + ], + coordinates=[], + ) + headers = make_header_array({"inline": np.array([1, 2], dtype=np.int32)}) + # spec lacks 'component' on purpose. + segy_file_kwargs = {"spec": _spec_with_fields("inline", "coordinate_scalar")} + + with patch(f"{_READER}.parse_headers", return_value=headers) as mock_parse: + reader.read_index_headers( + segy_file_kwargs=segy_file_kwargs, + file_info=_file_info(2, np.array([0, 1000])), + schema=schema, + grid_overrides=None, + synthesize_dims=(), + ) + + subset = set(mock_parse.call_args.kwargs["subset"]) + assert "component" not in subset + assert subset == {"inline", "coordinate_scalar"} + + def test_synthesize_dims_produces_missing_dimension(self) -> None: + """A synthesized dim absent from headers is created and yields a dimension.""" + schema = _schema( + dimensions=[ + DimensionSpec(name="component", is_spatial=True), + DimensionSpec(name="receiver", is_spatial=True), + DimensionSpec(name="time", is_spatial=False), + ], + coordinates=[], + ) + headers = make_header_array({"receiver": np.array([5, 6, 7], dtype=np.uint32)}) + segy_file_kwargs = {"spec": _spec_with_fields("receiver", "coordinate_scalar")} + + with patch(f"{_READER}.parse_headers", return_value=headers): + indexed, dimensions = reader.read_index_headers( + segy_file_kwargs=segy_file_kwargs, + file_info=_file_info(3, np.array([0, 1000, 2000])), + schema=schema, + grid_overrides=None, + synthesize_dims=("component",), + ) + + names = [d.name for d in dimensions] + assert names == ["component", "receiver", "time"] + # ComponentSynthesisStrategy fills the missing dim with a constant 1. + assert "component" in indexed.dtype.names + assert dimensions[0].coords.tolist() == [1] diff --git a/tests/unit/test_segy_spec_validation.py b/tests/unit/ingestion/test_segy_spec_validation.py similarity index 91% rename from tests/unit/test_segy_spec_validation.py rename to tests/unit/ingestion/test_segy_spec_validation.py index afdfeac6..10115532 100644 --- a/tests/unit/test_segy_spec_validation.py +++ b/tests/unit/ingestion/test_segy_spec_validation.py @@ -9,11 +9,11 @@ from segy.standards import get_segy_standard from mdio.builder.templates.base import AbstractDatasetTemplate -from mdio.converters.segy import _validate_spec_in_template +from mdio.ingestion.segy.validation import validate_spec_in_template class TestValidateSpecInTemplate: - """Test cases for _validate_spec_in_template function.""" + """Test cases for validate_spec_in_template function.""" def test_validation_passes_with_all_required_fields(self) -> None: """Test that validation passes when all required fields are present.""" @@ -25,7 +25,7 @@ def test_validation_passes_with_all_required_fields(self) -> None: segy_spec = get_segy_standard(1.0) # Should not raise any exception - _validate_spec_in_template(segy_spec, template) + validate_spec_in_template(segy_spec, template) def test_validation_fails_with_missing_fields(self) -> None: """Test that validation fails when required fields are missing.""" @@ -44,7 +44,7 @@ def test_validation_fails_with_missing_fields(self) -> None: # Should raise ValueError listing the missing fields with pytest.raises(ValueError, match=r"Required fields.*not found in.*segy_spec") as exc_info: - _validate_spec_in_template(segy_spec, template) + validate_spec_in_template(segy_spec, template) error_message = str(exc_info.value) assert "custom_dim2" in error_message @@ -68,7 +68,7 @@ def test_validation_fails_with_missing_coordinate_scalar(self) -> None: # Should raise ValueError for missing coordinate_scalar with pytest.raises(ValueError, match=r"Required fields.*not found in.*segy_spec") as exc_info: - _validate_spec_in_template(segy_spec, template) + validate_spec_in_template(segy_spec, template) error_message = str(exc_info.value) assert "coordinate_scalar" in error_message diff --git a/tests/unit/ingestion/test_segy_validation.py b/tests/unit/ingestion/test_segy_validation.py index 38f1b75f..8159ec51 100644 --- a/tests/unit/ingestion/test_segy_validation.py +++ b/tests/unit/ingestion/test_segy_validation.py @@ -11,7 +11,7 @@ from mdio.builder.template_registry import TemplateRegistry from mdio.builder.templates.base import AbstractDatasetTemplate from mdio.builder.templates.seismic_3d_obn import Seismic3DObnReceiverGathersTemplate -from mdio.ingestion.segy.validation import _validate_spec_in_template +from mdio.ingestion.segy.validation import validate_spec_in_template class TestValidateSpecInTemplate: @@ -25,7 +25,7 @@ def test_passes_with_all_required_fields(self) -> None: template.calculated_dimension_names = () segy_spec = get_segy_standard(1.0) - _validate_spec_in_template(segy_spec, template) + validate_spec_in_template(segy_spec, template) def test_missing_fields_listed_in_error(self) -> None: """The error message must enumerate all missing required fields.""" @@ -40,7 +40,7 @@ def test_missing_fields_listed_in_error(self) -> None: spec = spec.customize(trace_header_fields=[HeaderField(name="custom_dim1", byte=189, format="int32")]) with pytest.raises(ValueError, match=r"Required fields.*not found in.*segy_spec") as exc: - _validate_spec_in_template(spec, template) + validate_spec_in_template(spec, template) msg = str(exc.value) assert "custom_dim2" in msg @@ -61,7 +61,7 @@ def test_missing_coordinate_scalar_raises(self) -> None: spec = spec.customize(trace_header_fields=kept) with pytest.raises(ValueError, match=r"coordinate_scalar"): - _validate_spec_in_template(spec, template) + validate_spec_in_template(spec, template) def test_calculated_dimensions_are_not_required(self) -> None: """Dimensions in ``calculated_dimension_names`` should not be required from the spec.""" @@ -72,7 +72,7 @@ def test_calculated_dimensions_are_not_required(self) -> None: template.calculated_dimension_names = ("calculated_only",) segy_spec = get_segy_standard(1.0) - _validate_spec_in_template(segy_spec, template) + validate_spec_in_template(segy_spec, template) def test_obn_template_excludes_component_requirement(self) -> None: """OBN templates synthesize ``component`` when absent → not required from spec.""" @@ -95,7 +95,7 @@ def test_obn_template_excludes_component_requirement(self) -> None: ] ) - _validate_spec_in_template(spec, template) + validate_spec_in_template(spec, template) def test_obn_template_missing_other_required_field_still_fails(self) -> None: """Even with the ``component`` carve-out, other missing fields should error.""" @@ -103,4 +103,4 @@ def test_obn_template_missing_other_required_field_still_fails(self) -> None: spec = get_segy_standard(1.0) # missing OBN-specific fields like 'receiver', 'shot_line', etc. with pytest.raises(ValueError, match=r"Required fields.*not found"): - _validate_spec_in_template(spec, template) + validate_spec_in_template(spec, template) diff --git a/tests/unit/test_segy_file_header_modes.py b/tests/unit/test_segy_file_header_modes.py index c363426e..f37eda4c 100644 --- a/tests/unit/test_segy_file_header_modes.py +++ b/tests/unit/test_segy_file_header_modes.py @@ -1,4 +1,4 @@ -"""Tests for ``_add_segy_file_headers`` mode handling. +"""Tests for ``add_segy_file_headers`` mode handling. Covers the three values of ``MDIO__IMPORT__SAVE_SEGY_FILE_HEADER``: 0 skips, 1 raises on a malformed text header, 2 corrects a malformed text header. @@ -13,7 +13,7 @@ import pytest import xarray as xr -from mdio.ingestion.segy.file_headers import _add_segy_file_headers +from mdio.ingestion.segy.file_headers import add_segy_file_headers from mdio.segy.file import SegyFileInfo @@ -57,7 +57,7 @@ def test_mode_zero_skips_header_save(self) -> None: """Mode 0 leaves the dataset without a ``segy_file_header`` variable.""" ds = xr.Dataset() with patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "0"}): - result = _add_segy_file_headers(ds, _segy_info(_malformed_header())) + result = add_segy_file_headers(ds, _segy_info(_malformed_header())) assert "segy_file_header" not in result @@ -66,7 +66,7 @@ def test_mode_one_accepts_well_formed(self) -> None: ds = xr.Dataset() header = _well_formed_header() with patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "1"}): - result = _add_segy_file_headers(ds, _segy_info(header)) + result = add_segy_file_headers(ds, _segy_info(header)) assert result["segy_file_header"].attrs["textHeader"] == header @@ -77,7 +77,7 @@ def test_mode_one_raises_on_malformed(self) -> None: patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "1"}), pytest.raises(ValueError, match="non-ASCII or non-printable"), ): - _add_segy_file_headers(ds, _segy_info(_malformed_header())) + add_segy_file_headers(ds, _segy_info(_malformed_header())) def test_mode_one_raises_on_replacement_char(self) -> None: """Mode 1 raises on U+FFFD.""" @@ -86,7 +86,7 @@ def test_mode_one_raises_on_replacement_char(self) -> None: patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "1"}), pytest.raises(ValueError, match="non-ASCII or non-printable"), ): - _add_segy_file_headers(ds, _segy_info(_replacement_char_header())) + add_segy_file_headers(ds, _segy_info(_replacement_char_header())) def test_mode_two_corrects_malformed(self, caplog: pytest.LogCaptureFixture) -> None: """Mode 2 repairs a NUL byte and stores a 40x80 header.""" @@ -95,7 +95,7 @@ def test_mode_two_corrects_malformed(self, caplog: pytest.LogCaptureFixture) -> patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "2"}), caplog.at_level(logging.WARNING, logger="mdio.ingestion.segy.file_headers"), ): - result = _add_segy_file_headers(ds, _segy_info(_malformed_header())) + result = add_segy_file_headers(ds, _segy_info(_malformed_header())) stored = result["segy_file_header"].attrs["textHeader"] assert "\x00" not in stored @@ -110,7 +110,7 @@ def test_mode_two_corrects_replacement_char(self, caplog: pytest.LogCaptureFixtu patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "2"}), caplog.at_level(logging.WARNING, logger="mdio.ingestion.segy.file_headers"), ): - result = _add_segy_file_headers(ds, _segy_info(_replacement_char_header())) + result = add_segy_file_headers(ds, _segy_info(_replacement_char_header())) stored = result["segy_file_header"].attrs["textHeader"] assert "\ufffd" not in stored @@ -125,7 +125,7 @@ def test_mode_two_passes_through_well_formed(self, caplog: pytest.LogCaptureFixt patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "2"}), caplog.at_level(logging.WARNING, logger="mdio.ingestion.segy.file_headers"), ): - result = _add_segy_file_headers(ds, _segy_info(header)) + result = add_segy_file_headers(ds, _segy_info(header)) assert result["segy_file_header"].attrs["textHeader"] == header assert not any("Correcting" in record.message for record in caplog.records) @@ -140,7 +140,7 @@ def test_mode_two_repairs_double_newline_wrapped(self, caplog: pytest.LogCapture patch.dict(os.environ, {"MDIO__IMPORT__SAVE_SEGY_FILE_HEADER": "2"}), caplog.at_level(logging.WARNING, logger="mdio.ingestion.segy.file_headers"), ): - result = _add_segy_file_headers(ds, _segy_info(wrapped)) + result = add_segy_file_headers(ds, _segy_info(wrapped)) stored = result["segy_file_header"].attrs["textHeader"] stored_rows = stored.split("\n") From 62c7f0a818b23f363a2a82c1701429b6154a4365 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 2 Jun 2026 18:39:16 +0000 Subject: [PATCH 02/11] chore: stop tracking refactor_plan_bak.md Co-authored-by: Cursor --- refactor_plan_bak.md | 220 ------------------------------------------- 1 file changed, 220 deletions(-) delete mode 100644 refactor_plan_bak.md diff --git a/refactor_plan_bak.md b/refactor_plan_bak.md deleted file mode 100644 index ce33bde9..00000000 --- a/refactor_plan_bak.md +++ /dev/null @@ -1,220 +0,0 @@ ---- -name: split-grid-overrides-v3 -overview: Split the current `grid_overrides_v3` branch into 8 dependency-ordered, atomically reviewable PRs against `main`, each with its own changelog story. Each PR is green on its own; PR 6 is the one that flips runtime over to the new orchestrator and is the natural minor-version-bump cut. -todos: - - id: pr1 - content: "PR 1: extract pure helpers (header_analysis, coordinate_utils, validation, metadata) into mdio/ingestion/ with no behavior change" - status: pending - - id: pr2 - content: "PR 2: add Pydantic GridOverrides class + dict deprecation shim, additive only" - status: pending - - id: pr3 - content: "PR 3: replace GridOverrider internals with IndexStrategy pattern, fix the broken test_segy_grid_overrides.py file" - status: pending - - id: pr4 - content: "PR 4: introduce SchemaResolver + ResolvedSchema and template declare_coordinate_specs/apply_resolved_dimensions hooks (dead code, wired in PR 6)" - status: pending - - id: pr5 - content: "PR 5: add HeaderAnalyzer and DatasetFactory services with unit tests (dead code, wired in PR 6)" - status: pending - - id: pr6 - content: "PR 6: introduce run_segy_ingestion orchestrator, slim segy_to_mdio to a shim, expose new public API, delete legacy GridOverrider, decide CLI fate" - status: pending - - id: pr7 - content: "PR 7: extract MDIO__IMPORT__RAW_HEADERS gating into _raw_headers_experimental.py for one-file removal" - status: pending - - id: pr8 - content: "PR 8: dependency bump (uv.lock) and template internal polish" - status: pending -isProject: false ---- - -# Split `grid_overrides_v3` Into 8 Atomic PRs - -## Strategy - -The current branch interleaves five themes: typed config, strategy refactor, schema resolution, services, pipeline rewire, and an experimental flag move. Treat the existing branch as the **source of truth for the final state** and replay it as a stack of branches off `main`. Each PR below cherry-picks a tight subset of files; nothing earlier than PR 6 changes the runtime path of `segy_to_mdio`, so each one merges without a behavior change visible to users. - -Suggested workflow per PR: - -1. `git checkout -b origin/main` -2. `git checkout grid_overrides_v3 -- ` -3. Trim back anything that pulls in scope from later PRs (notes per PR below). -4. Run `nox -s pre-commit` and `nox -s tests-3.13 -- ` per the workspace rule. -5. Open the PR with the suggested title/summary. - -```mermaid -flowchart LR - PR1[PR1 extract pure modules] --> PR2[PR2 Pydantic GridOverrides] - PR1 --> PR3[PR3 strategy pattern] - PR2 --> PR3 - PR3 --> PR4[PR4 schema resolver and template hooks] - PR4 --> PR5[PR5 HeaderAnalyzer and DatasetFactory] - PR5 --> PR6[PR6 pipeline orchestrator and public API] - PR6 --> PR7[PR7 raw-headers gating] - PR6 --> PR8[PR8 dep bump and template polish] -``` - ---- - -## PR 1 - Extract pure modules into `mdio/ingestion/` - -**Goal:** mechanical code move, zero behavior change. Gives the next PRs a home without dragging architecture along. - -**Files (copy from branch verbatim):** -- [src/mdio/ingestion/__init__.py](src/mdio/ingestion/__init__.py) - trimmed to only re-export what this PR introduces -- [src/mdio/ingestion/header_analysis.py](src/mdio/ingestion/header_analysis.py) (the pure analysis primitives + `StreamerShotGeometryType`, `ShotGunGeometryType`, `analyze_streamer_headers`, `analyze_lines_for_guns`, `analyze_non_indexed_headers`) -- [src/mdio/ingestion/coordinate_utils.py](src/mdio/ingestion/coordinate_utils.py) -- [src/mdio/ingestion/validation.py](src/mdio/ingestion/validation.py) (`grid_density_qc`, `validate_spec_in_template`) -- [src/mdio/ingestion/metadata.py](src/mdio/ingestion/metadata.py) MINUS the `attach_raw_binary_header` import (defer to PR 7) - -**Adjustments before commit:** -- In [src/mdio/ingestion/metadata.py](src/mdio/ingestion/metadata.py), drop `from mdio.ingestion._raw_headers_experimental import attach_raw_binary_header` and its call inside `add_segy_file_headers`. Inline the existing on-`main` raw-binary handling instead, or stub it as a TODO (PR 7 reintroduces the gating module). -- Trim `__init__.py` exports to only the symbols actually defined in this PR. -- Update import sites in `mdio/converters/segy.py` and any other on-`main` callers to point at the new module locations. - -**Tests:** existing tests must pass unchanged (`nox -s tests-3.13`). - -**PR title:** `refactor(ingestion): extract analysis, coordinate, validation, and metadata helpers` - ---- - -## PR 2 - Add Pydantic `GridOverrides` (additive) - -**Goal:** the typed model lands as a public class, dict path stays alive internally. - -**Files:** -- [src/mdio/segy/geometry.py](src/mdio/segy/geometry.py) - add the `GridOverrides` class **alongside** the existing `GridOverrider` (do not delete `GridOverrider` yet). -- [src/mdio/converters/segy.py](src/mdio/converters/segy.py) - add the `isinstance(grid_overrides, dict)` shim with `DeprecationWarning` and `GridOverrides.from_legacy_dict(...)` coercion. After coercion, immediately re-dump to dict via `model_dump(by_alias=True, exclude_defaults=True)` so the still-on-`main` `GridOverrider` keeps receiving the dict it expects. -- [src/mdio/__init__.py](src/mdio/__init__.py) - add `GridOverrides` to `__all__` and imports. -- [tests/unit/test_grid_overrides_pydantic.py](tests/unit/test_grid_overrides_pydantic.py) - new file, copy as-is. - -**Acceptance:** existing `tests/unit/test_segy_grid_overrides.py` (which imports `GridOverrider`) still passes; `test_grid_overrides_pydantic.py` passes. - -**PR title:** `feat(api): typed Pydantic GridOverrides with legacy dict deprecation` - -**Changelog line:** "`mdio.GridOverrides` is now the supported way to configure grid overrides. Passing a `dict` still works but emits a `DeprecationWarning`." - ---- - -## PR 3 - Replace `GridOverrider` with strategy pattern - -**Goal:** the "modernization to grid override code" PR you called out. Behavior preserved; the monolithic `GridOverrider.run(...)` becomes `IndexStrategyRegistry().create_strategy(...)` returning composable `IndexStrategy` instances. - -**Files:** -- [src/mdio/ingestion/index_strategies.py](src/mdio/ingestion/index_strategies.py) - new (`IndexStrategy` ABC + `RegularGridStrategy`, `NonBinnedStrategy`, `DuplicateHandlingStrategy`, `ChannelWrappingStrategy`, `ShotWrappingStrategy`, `ComponentSynthesisStrategy`, `CompositeStrategy`, `IndexStrategyRegistry`). -- [src/mdio/segy/geometry.py](src/mdio/segy/geometry.py) - rewrite `GridOverrider.run(...)` as a thin shim: build a `GridOverrides` from the input dict, call `IndexStrategyRegistry`, return the `(headers, names, chunks)` tuple shape that callers expect. Keep the public signature unchanged. -- [tests/unit/test_ingestion_index_strategies.py](tests/unit/test_ingestion_index_strategies.py) - new. -- [tests/unit/test_segy_grid_overrides.py](tests/unit/test_segy_grid_overrides.py) - **fix**: the file currently imports `GridOverrider` and `TemplateRegistry`; either keep the shim alive (above) so this file passes unchanged, or rewrite each test to drive `IndexStrategyRegistry` directly. The branch leaves it in a broken-import state, which is the kind of footgun this PR should clean up. - -**Acceptance:** integration tests `tests/integration/test_import_obn_grid_overrides.py` and `tests/integration/test_import_streamer_grid_overrides.py` pass; new strategy unit tests pass; old `test_segy_grid_overrides.py` either passes or is replaced. - -**PR title:** `refactor(ingestion): split GridOverrider into composable IndexStrategy classes` - ---- - -## PR 4 - Schema resolution + template-aware dimension layout - -**Goal:** declarative final-schema description; templates stop having ingestion poke private attributes. - -**Files:** -- [src/mdio/ingestion/schema_resolver.py](src/mdio/ingestion/schema_resolver.py) - new (`DimensionSpec`, `CoordinateSpec`, `ResolvedSchema`, `SchemaResolver`). -- [src/mdio/builder/templates/base.py](src/mdio/builder/templates/base.py) - add `declare_coordinate_specs()` (default impl over `physical_coordinate_names` + `logical_coordinate_names`) and `apply_resolved_dimensions(dim_names, chunk_shape)`. Add the `synthesize_missing_dims` and `_calculated_dims` attributes. -- Per-template overrides only where the default isn't right: - - [src/mdio/builder/templates/seismic_3d_obn.py](src/mdio/builder/templates/seismic_3d_obn.py) - per-shot-dim coord specs, `_calculated_dims = ("shot_index",)`, `synthesize_missing_dims = ("component",)`. - - [src/mdio/builder/templates/seismic_3d_streamer_field.py](src/mdio/builder/templates/seismic_3d_streamer_field.py) - - [src/mdio/builder/templates/seismic_3d_coca.py](src/mdio/builder/templates/seismic_3d_coca.py) - - [src/mdio/builder/templates/seismic_3d_cdp.py](src/mdio/builder/templates/seismic_3d_cdp.py), [src/mdio/builder/templates/seismic_2d_cdp.py](src/mdio/builder/templates/seismic_2d_cdp.py) - inline/crossline-indexed `cdp_x`/`cdp_y` specs. -- [tests/unit/test_ingestion_schema_resolver.py](tests/unit/test_ingestion_schema_resolver.py) - new. - -**Adjustments:** nothing in this PR consumes `ResolvedSchema` at runtime - the resolver is dead code until PR 5/6 wire it. That is intentional. - -**Acceptance:** all template tests under `tests/unit/v1/templates/` still pass; new resolver tests pass. - -**PR title:** `feat(ingestion): declarative ResolvedSchema and template-driven coordinate specs` - ---- - -## PR 5 - `HeaderAnalyzer` + `DatasetFactory` - -**Goal:** add the two small services that PR 6 will compose. They have a real changelog bullet on their own ("ingestion now reads only the headers required by the resolved schema"). - -**Files:** -- [src/mdio/ingestion/header_analyzer.py](src/mdio/ingestion/header_analyzer.py) -- [src/mdio/ingestion/dataset_factory.py](src/mdio/ingestion/dataset_factory.py) -- [tests/unit/test_ingestion_header_analyzer.py](tests/unit/test_ingestion_header_analyzer.py) -- [tests/unit/test_ingestion_dataset_factory.py](tests/unit/test_ingestion_dataset_factory.py) - -**Adjustments:** still no runtime use; pipeline switch is PR 6. - -**Acceptance:** new unit tests pass; everything else unchanged. - -**PR title:** `feat(ingestion): HeaderAnalyzer and DatasetFactory services` - ---- - -## PR 6 - Pipeline orchestrator + public API surface (the minor-version-bump PR) - -**Goal:** wire PRs 2-5 into a single `run_segy_ingestion`, replace the body of `segy_to_mdio` with a thin shim, surface the new public types. - -**Files:** -- [src/mdio/ingestion/pipeline.py](src/mdio/ingestion/pipeline.py) - new (`run_segy_ingestion`). -- [src/mdio/converters/segy.py](src/mdio/converters/segy.py) - body becomes the dict-deprecation shim + delegate to `run_segy_ingestion`. The `noqa: PLR0913` and old-style internals go away. -- [src/mdio/ingestion/__init__.py](src/mdio/ingestion/__init__.py) - expand to the full `__all__` from the branch. -- [src/mdio/__init__.py](src/mdio/__init__.py) - add `run_segy_ingestion`, `IndexStrategy`, `IndexStrategyRegistry`, `ResolvedSchema`, `CoordinateSpec`, `DimensionSpec`. -- [src/mdio/segy/geometry.py](src/mdio/segy/geometry.py) - **delete** the legacy `GridOverrider` shim that PR 3 kept alive. All callers now route through `run_segy_ingestion`. -- [src/mdio/commands/segy.py](src/mdio/commands/segy.py) - **fix the broken CLI**: it currently calls `segy_to_mdio(segy_path=..., index_bytes=..., index_names=..., chunksize=...)`, none of which match the v1 signature in `converters/segy.py`. Either update the CLI to construct a `SegySpec` + `mdio_template` + `GridOverrides` and call `segy_to_mdio` correctly, or wrap it in a `# TODO(v1.3): rewrite CLI for v1 API` and skip it from CI. Pick one explicitly; do not leave it silently broken. - -**Acceptance:** full `nox -s tests-3.13` passes; integration tests under `tests/integration/` pass; example notebooks (if any) still run. - -**PR title:** `feat(ingestion): run_segy_ingestion orchestrator and v1.2 public API` - -**Changelog highlights:** -- New public functions/classes: `run_segy_ingestion`, `IndexStrategy`, `IndexStrategyRegistry`, `ResolvedSchema`, `CoordinateSpec`, `DimensionSpec`. -- `segy_to_mdio` retained as a v1.x compatibility entry point. -- Memory: ingestion now only parses headers required by the resolved schema (`HeaderAnalyzer`). - ---- - -## PR 7 - Gate experimental raw-headers behind `_raw_headers_experimental.py` - -**Goal:** isolate the `MDIO__IMPORT__RAW_HEADERS` feature so removing it later is a one-file delete. - -**Files:** -- [src/mdio/ingestion/_raw_headers_experimental.py](src/mdio/ingestion/_raw_headers_experimental.py) - new. The module's own docstring is the PR description; copy it into the PR body. -- [src/mdio/ingestion/pipeline.py](src/mdio/ingestion/pipeline.py) - call `maybe_add_raw_headers(mdio_template, mdio_ds)` after `DatasetFactory().build(...)`. -- [src/mdio/ingestion/metadata.py](src/mdio/ingestion/metadata.py) - re-add the `attach_raw_binary_header` import + call inside `add_segy_file_headers` (the inverse of the trim in PR 1). -- [src/mdio/builder/templates/base.py](src/mdio/builder/templates/base.py) - keep the `_add_raw_headers` method (defined in PR 4) in place; no change here unless the prior PR omitted it. - -**Acceptance:** with `MDIO__IMPORT__RAW_HEADERS=1` and Zarr v3, raw_headers variable appears in output; without the env var, no behavior change. - -**PR title:** `refactor: gate experimental raw-headers feature behind a single module` - ---- - -## PR 8 - Dep bump + template polish - -**Goal:** the leftover small stuff. Optional; can ride along with PR 6 if reviewers prefer. - -**Files:** -- [uv.lock](uv.lock) -- Per-template polish in [src/mdio/builder/templates/](src/mdio/builder/templates/) (`_dim_names`/`_calculated_dims` cleanup, dead-method removal). - -**Acceptance:** full test suite green; `nox -s pre-commit` clean. - -**PR title:** `chore(deps): bump dependencies and polish template internals` - ---- - -## Risks & coupling notes - -- **PR 4-6 are the tightest cluster.** PR 4's `ResolvedSchema` is dead code at merge time, PR 5's services are dead code at merge time, PR 6 flips runtime. This staging is intentional - it keeps each diff small and reviewable, at the cost of two PRs that "do nothing yet". If reviewers push back on dead code, fold PRs 4 and 5 together but **never** fold them into PR 6. -- **PR 3 must keep `GridOverrider` callable** until PR 6 deletes it, otherwise existing callers in `mdio/converters/segy.py` (still on the v1.1 path) break mid-stack. -- **`tests/unit/test_segy_grid_overrides.py`** is currently broken on the branch (imports a `GridOverrider` symbol that no longer exists in `mdio.segy.geometry`). PR 3 is the place to fix it. Do not let it land broken in a separate PR. -- **CLI in `mdio/commands/segy.py`** has the same problem (calls v0-style `segy_to_mdio(segy_path=..., index_bytes=..., ...)`). PR 6 must either fix or explicitly defer with a tracked issue - this would be very embarrassing in a release notes diff. -- **Autodocs.** The split gives ~8 changelog bullets instead of one "big refactor" line. PR 2 (typed config), PR 3 (strategy pattern), PR 5 (memory win), and PR 6 (new public API) are all release-note-worthy on their own. - -## Suggested merge order for the next two minor releases - -- v1.2: PRs 1, 2, 7 (no runtime change, deprecation only, easy to revert). -- v1.3: PRs 3, 4, 5, 6, 8 (the actual architectural turn, with the orchestrator landing in PR 6 as the headline). From 3d60cc441ac17df8e2eced73cad981a520045dd8 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 2 Jun 2026 18:51:02 +0000 Subject: [PATCH 03/11] Cleanup docstrings --- src/mdio/builder/templates/base.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/mdio/builder/templates/base.py b/src/mdio/builder/templates/base.py index 928c5926..e6214895 100644 --- a/src/mdio/builder/templates/base.py +++ b/src/mdio/builder/templates/base.py @@ -115,11 +115,7 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: return tuple(specs) def declare_dimension_specs(self) -> dict[str, ScalarType]: - """Declare the data types for each dimension in this template. - - This is the single source of truth for dimension-coordinate data types: both the - ingestion ``SchemaResolver`` and :meth:`_add_dimension_coordinate` read from it, so - the resolved schema and the built dataset cannot disagree. + """Declare data types for each dimension in this template. Returns: A dictionary mapping dimension name to ScalarType. @@ -129,23 +125,19 @@ def declare_dimension_specs(self) -> dict[str, ScalarType]: def _dim_dtype(self, name: str) -> ScalarType: """Return the declared dtype for a dimension coordinate. - Sourcing dimension-coordinate dtypes here (and in the ingestion ``SchemaResolver``) - from :meth:`declare_dimension_specs` keeps the built dataset and the resolved schema - from disagreeing without a separate post-build assertion. - Args: name: The dimension name. Returns: - The declared :class:`ScalarType`, defaulting to ``INT32`` when undeclared. + The declared ScalarType, defaulting to INT32. """ return self.declare_dimension_specs().get(name, ScalarType.INT32) def _add_dimension_coordinate(self, name: str) -> None: - """Add a single dimension coordinate, sourcing its dtype from :meth:`declare_dimension_specs`. + """Add a single dimension coordinate. Args: - name: The dimension name; also the coordinate name and its sole dimension. + name: The dimension name. """ self._builder.add_coordinate( name, @@ -444,7 +436,7 @@ def _add_coordinates(self) -> None: raise def _add_trace_mask(self) -> None: - """Add trace mask variables.""" + """Add trace mask variable.""" self._builder.add_variable( name="trace_mask", dimensions=self.spatial_dimension_names, @@ -454,7 +446,7 @@ def _add_trace_mask(self) -> None: ) def _add_trace_headers(self, header_dtype: StructuredType) -> None: - """Add trace mask variables.""" + """Add trace headers variable.""" chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=self.full_chunk_shape[:-1])) self._builder.add_variable( name="headers", From abbaaabeb1606b7f473ff28c323ca948a3c7eeba Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 2 Jun 2026 18:53:20 +0000 Subject: [PATCH 04/11] Remove deep copy requirement --- src/mdio/builder/templates/base.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/mdio/builder/templates/base.py b/src/mdio/builder/templates/base.py index e6214895..c6e001dd 100644 --- a/src/mdio/builder/templates/base.py +++ b/src/mdio/builder/templates/base.py @@ -2,7 +2,6 @@ from __future__ import annotations -import copy from abc import ABC from abc import abstractmethod from typing import TYPE_CHECKING @@ -298,32 +297,32 @@ def trace_domain(self) -> str: @property def spatial_dimension_names(self) -> tuple[str, ...]: """Returns the names of the dimensions excluding the last axis.""" - return copy.deepcopy(self._dim_names[:-1]) + return self._dim_names[:-1] @property def dimension_names(self) -> tuple[str, ...]: """Returns the names of the dimensions.""" - return copy.deepcopy(self._dim_names) + return self._dim_names @property def calculated_dimension_names(self) -> tuple[str, ...]: - """Returns the names of the dimensions.""" - return copy.deepcopy(self._calculated_dims) + """Returns the names of the calculated dimensions.""" + return self._calculated_dims @property def physical_coordinate_names(self) -> tuple[str, ...]: """Returns the names of the physical (world) coordinates.""" - return copy.deepcopy(self._physical_coord_names) + return self._physical_coord_names @property def logical_coordinate_names(self) -> tuple[str, ...]: """Returns the names of the logical (grid) coordinates.""" - return copy.deepcopy(self._logical_coord_names) + return self._logical_coord_names @property def coordinate_names(self) -> tuple[str, ...]: """Returns names of all coordinates.""" - return copy.deepcopy(self._physical_coord_names + self._logical_coord_names) + return self._physical_coord_names + self._logical_coord_names @property def full_chunk_shape(self) -> tuple[int, ...]: From cfb3380cc01cdd36a4ea3cd1c930097e9ff12d58 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 2 Jun 2026 19:13:14 +0000 Subject: [PATCH 05/11] Make method name more intentional and clear --- src/mdio/builder/templates/base.py | 7 ++++--- src/mdio/builder/templates/seismic_3d_coca.py | 5 +++-- src/mdio/builder/templates/seismic_3d_obn.py | 5 +++-- src/mdio/builder/templates/seismic_3d_offset_tiles.py | 5 +++-- src/mdio/builder/templates/seismic_3d_receiver_gathers.py | 5 +++-- .../builder/templates/seismic_3d_shot_receiver_line.py | 5 +++-- src/mdio/builder/templates/seismic_3d_streamer_field.py | 5 +++-- src/mdio/builder/templates/types.py | 2 ++ src/mdio/ingestion/schema/resolver.py | 2 +- 9 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/mdio/builder/templates/base.py b/src/mdio/builder/templates/base.py index c6e001dd..ed23668b 100644 --- a/src/mdio/builder/templates/base.py +++ b/src/mdio/builder/templates/base.py @@ -18,6 +18,7 @@ from mdio.builder.schemas.v1.variable import CoordinateMetadata from mdio.builder.schemas.v1.variable import VariableMetadata from mdio.builder.templates.types import CoordinateSpec +from mdio.builder.templates.types import DimCoordinateTypes if TYPE_CHECKING: from mdio.builder.schemas.v1.dataset import Dataset @@ -113,8 +114,8 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: ) return tuple(specs) - def declare_dimension_specs(self) -> dict[str, ScalarType]: - """Declare data types for each dimension in this template. + def declare_dim_coordinate_types(self) -> DimCoordinateTypes: + """Declare data types for each dimension coordinate in this template. Returns: A dictionary mapping dimension name to ScalarType. @@ -130,7 +131,7 @@ def _dim_dtype(self, name: str) -> ScalarType: Returns: The declared ScalarType, defaulting to INT32. """ - return self.declare_dimension_specs().get(name, ScalarType.INT32) + return self.declare_dim_coordinate_types().get(name, ScalarType.INT32) def _add_dimension_coordinate(self, name: str) -> None: """Add a single dimension coordinate. diff --git a/src/mdio/builder/templates/seismic_3d_coca.py b/src/mdio/builder/templates/seismic_3d_coca.py index d83d4fd6..5974e099 100644 --- a/src/mdio/builder/templates/seismic_3d_coca.py +++ b/src/mdio/builder/templates/seismic_3d_coca.py @@ -7,6 +7,7 @@ from mdio.builder.schemas.v1.variable import CoordinateMetadata from mdio.builder.templates.base import AbstractDatasetTemplate from mdio.builder.templates.types import CoordinateSpec +from mdio.builder.templates.types import DimCoordinateTypes from mdio.builder.templates.types import SeismicDataDomain @@ -34,8 +35,8 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="cdp_y", dimensions=("inline", "crossline"), dtype=ScalarType.FLOAT64), ) - def declare_dimension_specs(self) -> dict[str, ScalarType]: - """Declare the data types for each dimension in this template.""" + def declare_dim_coordinate_types(self) -> DimCoordinateTypes: + """Declare the data types for each dimension coordinate in this template.""" return { "inline": ScalarType.INT32, "crossline": ScalarType.INT32, diff --git a/src/mdio/builder/templates/seismic_3d_obn.py b/src/mdio/builder/templates/seismic_3d_obn.py index 503f6022..de58ae53 100644 --- a/src/mdio/builder/templates/seismic_3d_obn.py +++ b/src/mdio/builder/templates/seismic_3d_obn.py @@ -6,6 +6,7 @@ from mdio.builder.schemas.v1.variable import CoordinateMetadata from mdio.builder.templates.base import AbstractDatasetTemplate from mdio.builder.templates.types import CoordinateSpec +from mdio.builder.templates.types import DimCoordinateTypes from mdio.builder.templates.types import SeismicDataDomain @@ -64,8 +65,8 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="source_coord_y", dimensions=shot_dims, dtype=ScalarType.FLOAT64), ) - def declare_dimension_specs(self) -> dict[str, ScalarType]: - """Declare the data types for each dimension in this template.""" + def declare_dim_coordinate_types(self) -> DimCoordinateTypes: + """Declare the data types for each dimension coordinate in this template.""" return { "component": ScalarType.UINT8, "receiver": ScalarType.UINT32, diff --git a/src/mdio/builder/templates/seismic_3d_offset_tiles.py b/src/mdio/builder/templates/seismic_3d_offset_tiles.py index 5eb5df54..8d0ea79d 100644 --- a/src/mdio/builder/templates/seismic_3d_offset_tiles.py +++ b/src/mdio/builder/templates/seismic_3d_offset_tiles.py @@ -7,6 +7,7 @@ from mdio.builder.schemas.v1.variable import CoordinateMetadata from mdio.builder.templates.base import AbstractDatasetTemplate from mdio.builder.templates.types import CoordinateSpec +from mdio.builder.templates.types import DimCoordinateTypes from mdio.builder.templates.types import SeismicDataDomain @@ -41,8 +42,8 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="cdp_y", dimensions=("inline", "crossline"), dtype=ScalarType.FLOAT64), ) - def declare_dimension_specs(self) -> dict[str, ScalarType]: - """Declare the data types for each dimension in this template.""" + def declare_dim_coordinate_types(self) -> DimCoordinateTypes: + """Declare the data types for each dimension coordinate in this template.""" return { "inline": ScalarType.INT32, "crossline": ScalarType.INT32, diff --git a/src/mdio/builder/templates/seismic_3d_receiver_gathers.py b/src/mdio/builder/templates/seismic_3d_receiver_gathers.py index 911ecf87..a820595d 100644 --- a/src/mdio/builder/templates/seismic_3d_receiver_gathers.py +++ b/src/mdio/builder/templates/seismic_3d_receiver_gathers.py @@ -7,6 +7,7 @@ from mdio.builder.schemas.v1.variable import CoordinateMetadata from mdio.builder.templates.base import AbstractDatasetTemplate from mdio.builder.templates.types import CoordinateSpec +from mdio.builder.templates.types import DimCoordinateTypes class Seismic3DReceiverGathersTemplate(AbstractDatasetTemplate): @@ -45,8 +46,8 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="source_coord_y", dimensions=shot_dims, dtype=ScalarType.FLOAT64), ) - def declare_dimension_specs(self) -> dict[str, ScalarType]: - """Declare the data types for each dimension in this template.""" + def declare_dim_coordinate_types(self) -> DimCoordinateTypes: + """Declare the data types for each dimension coordinate in this template.""" return { "receiver": ScalarType.UINT32, "shot_line": ScalarType.UINT32, diff --git a/src/mdio/builder/templates/seismic_3d_shot_receiver_line.py b/src/mdio/builder/templates/seismic_3d_shot_receiver_line.py index 311ca046..5f2c1ad7 100644 --- a/src/mdio/builder/templates/seismic_3d_shot_receiver_line.py +++ b/src/mdio/builder/templates/seismic_3d_shot_receiver_line.py @@ -6,6 +6,7 @@ from mdio.builder.schemas.v1.variable import CoordinateMetadata from mdio.builder.templates.base import AbstractDatasetTemplate from mdio.builder.templates.types import CoordinateSpec +from mdio.builder.templates.types import DimCoordinateTypes from mdio.builder.templates.types import SeismicDataDomain @@ -45,8 +46,8 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="orig_field_record_num", dimensions=source_dims, dtype=ScalarType.UINT32), ) - def declare_dimension_specs(self) -> dict[str, ScalarType]: - """Declare the data types for each dimension in this template.""" + def declare_dim_coordinate_types(self) -> DimCoordinateTypes: + """Declare the data types for each dimension coordinate in this template.""" return { "shot_line": ScalarType.UINT32, "shot_point": ScalarType.UINT32, diff --git a/src/mdio/builder/templates/seismic_3d_streamer_field.py b/src/mdio/builder/templates/seismic_3d_streamer_field.py index d8e23368..b7cf1c62 100644 --- a/src/mdio/builder/templates/seismic_3d_streamer_field.py +++ b/src/mdio/builder/templates/seismic_3d_streamer_field.py @@ -6,6 +6,7 @@ from mdio.builder.schemas.v1.variable import CoordinateMetadata from mdio.builder.templates.base import AbstractDatasetTemplate from mdio.builder.templates.types import CoordinateSpec +from mdio.builder.templates.types import DimCoordinateTypes from mdio.builder.templates.types import SeismicDataDomain @@ -52,8 +53,8 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]: CoordinateSpec(name="group_coord_y", dimensions=receiver_dims, dtype=ScalarType.FLOAT64), ) - def declare_dimension_specs(self) -> dict[str, ScalarType]: - """Declare the data types for each dimension in this template.""" + def declare_dim_coordinate_types(self) -> DimCoordinateTypes: + """Declare the data types for each dimension coordinate in this template.""" return { "sail_line": ScalarType.UINT32, "gun": ScalarType.UINT8, diff --git a/src/mdio/builder/templates/types.py b/src/mdio/builder/templates/types.py index 70d9de57..f333be52 100644 --- a/src/mdio/builder/templates/types.py +++ b/src/mdio/builder/templates/types.py @@ -11,6 +11,8 @@ CdpGatherDomain: TypeAlias = Literal["offset", "angle"] +DimCoordinateTypes: TypeAlias = dict[str, ScalarType] + class CoordinateSpec(BaseModel): """Specification for a non-dimension coordinate declared by a template. diff --git a/src/mdio/ingestion/schema/resolver.py b/src/mdio/ingestion/schema/resolver.py index 8dffd12e..45053c2a 100644 --- a/src/mdio/ingestion/schema/resolver.py +++ b/src/mdio/ingestion/schema/resolver.py @@ -59,7 +59,7 @@ def resolve( def _template_to_schema(self, template: AbstractDatasetTemplate) -> ResolvedSchema: """Convert a template to a resolved schema without overrides.""" calculated = set(template.calculated_dimension_names) - dim_dtypes = template.declare_dimension_specs() + dim_dtypes = template.declare_dim_coordinate_types() dimensions = [ DimensionSpec( name=name, From daad52ee808a348f64e4af3da0a4ff2598d1536e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Jun 2026 13:16:53 +0000 Subject: [PATCH 06/11] Deslop and cleanup comments --- src/mdio/ingestion/segy/index_strategies.py | 151 +++++++++----------- 1 file changed, 69 insertions(+), 82 deletions(-) diff --git a/src/mdio/ingestion/segy/index_strategies.py b/src/mdio/ingestion/segy/index_strategies.py index 6e80b29f..8e533f3e 100644 --- a/src/mdio/ingestion/segy/index_strategies.py +++ b/src/mdio/ingestion/segy/index_strategies.py @@ -1,13 +1,11 @@ """Composable index strategies for transforming SEG-Y headers into indexable dimensions. -This module replaces the monolithic :class:`mdio.segy.geometry.GridOverrider` command -dispatch with a small set of single-responsibility :class:`IndexStrategy` objects that can -be composed via :class:`CompositeStrategy`. - -Strategies are selected by :class:`IndexStrategyRegistry` from the typed -:class:`mdio.segy.geometry.GridOverrides` configuration plus optional template hints. The -public contract preserved by :class:`mdio.segy.geometry.GridOverrider` (a thin shim around -this module) keeps end-to-end ingestion behavior identical to v1.1.x. +This module replaces the monolithic `GridOverrider` command dispatch with a small set of +single-responsibility `IndexStrategy` objects that can be composed via `CompositeStrategy`. + +Strategies are selected by `IndexStrategyRegistry` from the typed `GridOverrides` +configuration plus optional template hints. The public contract preserved by `GridOverrider` +(a thin shim around this module) keeps end-to-end ingestion behavior identical. """ from __future__ import annotations @@ -46,31 +44,31 @@ class IndexStrategy(ABC): """Abstract base for header indexing strategies. - A strategy transforms a raw header array (e.g. add or rebase fields) and computes - the resulting :class:`Dimension` list. Strategies are composable through - :class:`CompositeStrategy`. The default :meth:`compute_dimensions` builds dimensions - from unique header values; subclasses override only when they need different - semantics (currently just :class:`CompositeStrategy`). + A strategy transforms a raw header array (e.g., adding or rebasing fields) and + computes the resulting `Dimension` list. Strategies are composable through + `CompositeStrategy`. The default `compute_dimensions` builds dimensions from unique + header values; subclasses override only when they need different semantics + (currently just `CompositeStrategy`). - Subclasses with header preconditions set :attr:`required_keys` so the shim and - :class:`CompositeStrategy` can raise :class:`GridOverrideKeysError` with a clear - "missing fields X, Y, Z" message before numpy fails on a deeper key lookup. + Subclasses with header preconditions set `required_keys` so the shim and + `CompositeStrategy` can raise `GridOverrideKeysError` with a clear + "missing fields" message before NumPy fails on a deeper key lookup. """ @property def required_keys(self) -> frozenset[str]: - """Header field names that must be present before :meth:`transform_headers` runs. + """Header field names that must be present before `transform_headers` runs. Empty by default. Override on subclasses whose transform indexes specific fields. """ return frozenset() def validate_headers(self, headers: HeaderArray) -> None: - """Raise :class:`GridOverrideKeysError` if any required header field is missing. + """Raise `GridOverrideKeysError` if any required header field is missing. - Callers (the :class:`mdio.segy.geometry.GridOverrider` shim and - :class:`CompositeStrategy`) invoke this before each transform so failure points - at the user-facing override name rather than at a numpy structured-array key error. + Callers (the `GridOverrider` shim and `CompositeStrategy`) invoke this before each + transform so failure points at the user-facing override name rather than at a NumPy + structured-array key error. """ required = self.required_keys if not required: @@ -84,10 +82,9 @@ def transform_headers(self, headers: HeaderArray) -> HeaderArray: """Return a new header array with this strategy's transformation applied.""" def compute_dimensions(self, headers: HeaderArray, dim_names: tuple[str, ...]) -> list[Dimension]: - """Build one :class:`Dimension` per requested name from unique header values. + """Build one `Dimension` per requested name from unique header values. - Names absent from ``headers.dtype.names`` are silently skipped, matching the v1.1 - ``GridOverrider`` post-processing step. + Names absent from `headers.dtype.names` are silently skipped. """ return [ Dimension(coords=np.unique(headers[name]), name=name) for name in dim_names if name in headers.dtype.names @@ -108,21 +105,20 @@ def transform_headers(self, headers: HeaderArray) -> HeaderArray: class DuplicateHandlingStrategy(IndexStrategy): - """Disambiguate duplicate index tuples by appending a per-tuple ``trace`` counter. + """Disambiguate duplicate index tuples by appending a per-tuple `trace` counter. - Mirrors the v1.1 ``DuplicateIndex`` command: count occurrences of each unique - combination of dimension fields (excluding coordinate fields and any caller-declared - ``excluded_fields``), then attach the resulting 1-based counter as a new ``trace`` field - on the original headers. + Counts occurrences of each unique combination of dimension fields (excluding + coordinate fields and any caller-declared `excluded_fields`), then attaches the + resulting 1-based counter as a new `trace` field on the original headers. Args: coord_fields: Names of header fields that are template coordinates and must be excluded from the dimension grouping (their values vary independently of the grid index). excluded_fields: Additional fields to exclude from grouping. Used by - :class:`NonBinnedStrategy` to keep the explicit ``non_binned_dims`` from + `NonBinnedStrategy` to keep the explicit `non_binned_dims` from polluting the per-tuple counter. - dtype: NumPy dtype for the appended ``trace`` counter. + dtype: NumPy dtype for the appended `trace` counter. """ def __init__( @@ -144,7 +140,7 @@ def _dim_fields(self, headers: HeaderArray) -> list[str]: ] def transform_headers(self, headers: HeaderArray) -> HeaderArray: - """Append a per-(dim-tuple) ``trace`` counter to ``headers``.""" + """Append a per-dimension-tuple `trace` counter to headers.""" dim_fields = self._dim_fields(headers) dim_headers = headers[dim_fields] if dim_fields else headers with_trace = analyze_non_indexed_headers(dim_headers, dtype=self.dtype) @@ -157,19 +153,18 @@ def transform_headers(self, headers: HeaderArray) -> HeaderArray: class NonBinnedStrategy(DuplicateHandlingStrategy): - """Collapse selected non-binned dimensions into a single ``trace`` dimension. + """Collapse selected non-binned dimensions into a single `trace` dimension. - Inherits the per-tuple ``trace`` counter from :class:`DuplicateHandlingStrategy` and - captures ``chunksize`` so the :class:`mdio.segy.geometry.GridOverrider` shim can size - the new ``trace`` chunk correctly. + Inherits the per-tuple `trace` counter from `DuplicateHandlingStrategy` and + captures `chunksize` so the `GridOverrider` shim can size the new `trace` chunk correctly. Args: - chunksize: Chunk size to assign to the ``trace`` dimension. The strategy itself + chunksize: Chunk size to assign to the `trace` dimension. The strategy itself does not apply this value; the shim uses it when rewriting the chunksize tuple. - non_binned_dims: Header fields collapsed into ``trace``. They are excluded from + non_binned_dims: Header fields collapsed into `trace`. They are excluded from the duplicate grouping so the counter only varies along the remaining dims. coord_fields: Template coordinate names to exclude from grouping. - dtype: NumPy dtype for the appended ``trace`` counter. + dtype: NumPy dtype for the appended `trace` counter. """ def __init__( @@ -193,8 +188,7 @@ class ChannelWrappingStrategy(IndexStrategy): """Renumber streamer channels per cable when geometry is Type B. Detects whether channel numbering is per-cable (Type A; pass-through) or sequential - across cables (Type B; rebase to 1..N per cable). Mirrors the v1.1 ``AutoChannelWrap`` - command. + across cables (Type B; rebase to 1..N per cable). """ @property @@ -203,7 +197,7 @@ def required_keys(self) -> frozenset[str]: return frozenset({"shot_point", "cable", "channel"}) def transform_headers(self, headers: HeaderArray) -> HeaderArray: - """Rebase ``channel`` per cable for Type B geometry; pass through for Type A.""" + """Rebase `channel` per cable for Type B geometry; pass through for Type A.""" unique_cables, cable_chan_min, cable_chan_max, geom_type = analyze_streamer_headers(headers) logger.info("Ingesting dataset as %s", geom_type.name) @@ -221,20 +215,19 @@ def transform_headers(self, headers: HeaderArray) -> HeaderArray: class ShotWrappingStrategy(IndexStrategy): - """Derive a dense ``shot_index`` field from sparse or interleaved ``shot_point`` values. + """Derive a dense `shot_index` field from sparse or interleaved `shot_point` values. - Replaces the v1.1 ``AutoShotWrap`` (streamer) and ``CalculateShotIndex`` (OBN) - commands. The two callers differ only in: + The two configurations differ in: - * ``line_field`` -- ``sail_line`` for streamer, ``shot_line`` for OBN. - * ``always_calculate`` -- streamer skips the transform entirely for Type A geometries - (per-gun shot points are already dense), OBN always emits ``shot_index`` because the + * `line_field` -- `sail_line` for streamer, `shot_line` for OBN. + * `always_calculate` -- streamer skips the transform entirely for Type A geometries + (per-gun shot points are already dense), OBN always emits `shot_index` because the template declares it as a calculated dimension. Args: line_field: Header field used to group shots into independent lines. - always_calculate: When ``True``, emit ``shot_index`` for every geometry type. For - Type A this builds a 0-based ``np.searchsorted`` over sorted unique shot + always_calculate: When `True`, emit `shot_index` for every geometry type. For + Type A this builds a 0-based `np.searchsorted` over sorted unique shot points per line. """ @@ -246,18 +239,14 @@ def __init__(self, line_field: str, always_calculate: bool = False) -> None: @property def required_keys(self) -> frozenset[str]: - """Streamer (``sail_line``) needs cable+channel too; OBN (``shot_line``) does not. - - Mirrors the v1.1 split between ``AutoShotWrap.required_keys`` and - ``CalculateShotIndex.required_keys``. - """ + """Streamer (`sail_line`) needs cable and channel too; OBN (`shot_line`) does not.""" base = {self.line_field, "gun", "shot_point"} if self.line_field == self._STREAMER_LINE_FIELD: base |= {"cable", "channel"} return frozenset(base) def transform_headers(self, headers: HeaderArray) -> HeaderArray: - """Append ``shot_index`` derived from ``shot_point`` per line.""" + """Append `shot_index` derived from `shot_point` per line.""" unique_lines, unique_guns_per_line, geom_type = analyze_lines_for_guns(headers, line_field=self.line_field) logger.info("Ingesting dataset as shot type: %s (line_field=%s)", geom_type.name, self.line_field) @@ -294,9 +283,8 @@ def transform_headers(self, headers: HeaderArray) -> HeaderArray: class ComponentSynthesisStrategy(IndexStrategy): """Synthesize template-required dimension fields that are absent from the headers. - Currently used to fill the ``component`` dimension with a constant value of 1 for - OBN templates whose SEG-Y spec does not include a component header. Mirrors the - v1.1 ``GridOverrider._synthesize_obn_component`` behavior. + Currently used to fill the `component` dimension with a constant value of 1 for + OBN templates whose SEG-Y spec does not include a component header. Args: synthesize_dims: Names of dimension fields to synthesize when missing. @@ -338,8 +326,8 @@ def transform_headers(self, headers: HeaderArray) -> HeaderArray: """Validate then run each child strategy's transform in sequence. Each step re-validates against the running header array, so a strategy that - produces a field (e.g. :class:`ComponentSynthesisStrategy` adding ``component``) - can satisfy a later strategy's :attr:`required_keys`. + produces a field (e.g. `ComponentSynthesisStrategy` adding `component`) + can satisfy a later strategy's `required_keys`. """ result = headers for strategy in self.strategies: @@ -354,26 +342,25 @@ def compute_dimensions(self, headers: HeaderArray, dim_names: tuple[str, ...]) - class IndexStrategyRegistry: - """Picks the right :class:`IndexStrategy` from grid overrides + template hints. + """Picks the right `IndexStrategy` from grid overrides and template hints. The registry is the single source of truth for override semantics: it maps a - :class:`~mdio.segy.geometry.GridOverrides` both to the header-transforming - :class:`IndexStrategy` (:meth:`create_strategy`) and to the schema-reshaping - :class:`~mdio.ingestion.schema.models.SchemaEffect` (:meth:`schema_effect`). Keeping - both derivations here prevents the header view and the schema view from drifting. + `GridOverrides` both to the header-transforming `IndexStrategy` (`create_strategy`) + and to the schema-reshaping `SchemaEffect` (`schema_effect`). Keeping both derivations + here prevents the header view and the schema view from drifting. """ def schema_effect(self, grid_overrides: GridOverrides | None) -> SchemaEffect | None: - """Return the schema reshaping implied by ``grid_overrides``, if any. + """Return the schema reshaping implied by `grid_overrides`, if any. - Only ``non_binned`` and ``has_duplicates`` change the dimension/coordinate layout; + Only `non_binned` and `has_duplicates` change the dimension/coordinate layout; every other override transforms headers in place without reshaping the schema. Args: - grid_overrides: Typed grid override configuration, or ``None``. + grid_overrides: Typed grid override configuration, or `None`. Returns: - The matching :class:`SchemaEffect`, or ``None`` when no layout change applies. + The matching `SchemaEffect`, or `None` when no layout change applies. """ if not grid_overrides: return None @@ -392,27 +379,27 @@ def create_strategy( ) -> IndexStrategy: """Build a strategy (possibly composite) for the given config. - Strategy ordering, when multiple flags are set, mirrors v1.1 behavior: + Strategy ordering, when multiple flags are set, mirrors previous behavior: - 1. ``ComponentSynthesisStrategy`` (so later strategies can rely on the synthesized + 1. `ComponentSynthesisStrategy` (so later strategies can rely on the synthesized field being present). - 2. ``ChannelWrappingStrategy`` (rebases ``channel`` before any shot calculation). - 3. ``ShotWrappingStrategy`` for ``auto_shot_wrap`` (streamer; ``sail_line``). - 4. ``ShotWrappingStrategy`` for ``calculate_shot_index`` (OBN; ``shot_line``, - ``always_calculate=True``). - 5. ``NonBinnedStrategy`` or ``DuplicateHandlingStrategy`` (mutually exclusive; - ``non_binned`` wins when both are set, matching v1.x semantics). + 2. `ChannelWrappingStrategy` (rebases `channel` before any shot calculation). + 3. `ShotWrappingStrategy` for `auto_shot_wrap` (streamer; `sail_line`). + 4. `ShotWrappingStrategy` for `calculate_shot_index` (OBN; `shot_line`, + `always_calculate=True`). + 5. `NonBinnedStrategy` or `DuplicateHandlingStrategy` (mutually exclusive; + `non_binned` wins when both are set). Args: - grid_overrides: Typed grid override configuration, or ``None`` for no + grid_overrides: Typed grid override configuration, or `None` for no user-driven overrides. - synthesize_dims: Dimensions to synthesize if missing (e.g. ``component``). + synthesize_dims: Dimensions to synthesize if missing (e.g., `component`). template: Optional dataset template; used to look up coordinate names so duplicate-handling counters group on dimension fields only. Returns: - A single :class:`IndexStrategy` instance. Returns - :class:`RegularGridStrategy` when no overrides and no synthesis are required. + A single `IndexStrategy` instance. Returns `RegularGridStrategy` when no + overrides and no synthesis are required. """ strategies: list[IndexStrategy] = [] From bf39d7f216e7d1f71fedb99edf29cb72d5601694 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Jun 2026 13:21:03 +0000 Subject: [PATCH 07/11] Make base `ingestion` module more generic and move segy specific vocabulary and requirements up the hierarchy --- src/mdio/ingestion/schema/models.py | 6 ++---- src/mdio/ingestion/segy/reader.py | 7 ++++++- src/mdio/ingestion/segy/validation.py | 5 ++++- tests/unit/ingestion/test_schema_resolver.py | 11 ++++++----- tests/unit/ingestion/test_segy_reader.py | 6 +++--- .../ingestion/test_segy_spec_validation.py | 19 ++++++++++++++++++- tests/unit/ingestion/test_segy_validation.py | 18 +++++++++++++++++- 7 files changed, 56 insertions(+), 16 deletions(-) diff --git a/src/mdio/ingestion/schema/models.py b/src/mdio/ingestion/schema/models.py index cce198f9..e9d72b26 100644 --- a/src/mdio/ingestion/schema/models.py +++ b/src/mdio/ingestion/schema/models.py @@ -60,12 +60,10 @@ class ResolvedSchema(BaseModel): metadata: dict[str, Any] = Field(default_factory=dict) default_variable_name: str = "amplitude" - def required_header_fields(self) -> set[str]: - """Names that must be readable from SEG-Y trace headers to materialize this schema.""" + def required_fields(self) -> set[str]: + """Names that must be readable from the source to materialize this schema.""" fields = {dim.name for dim in self.dimensions if dim.is_spatial and not dim.is_calculated} fields.update(coord.name for coord in self.coordinates) - # coordinate_scalar is always needed to scale X/Y coordinates. - fields.add("coordinate_scalar") return fields def spatial_dimensions(self) -> list[DimensionSpec]: diff --git a/src/mdio/ingestion/segy/reader.py b/src/mdio/ingestion/segy/reader.py index 9acf4b30..990966c9 100644 --- a/src/mdio/ingestion/segy/reader.py +++ b/src/mdio/ingestion/segy/reader.py @@ -8,6 +8,7 @@ from mdio.core.dimension import Dimension from mdio.ingestion.segy.index_strategies import IndexStrategyRegistry from mdio.segy.parsers import parse_headers +from mdio.segy.scalar import SCALE_COORDINATE_KEYS if TYPE_CHECKING: import numpy as np @@ -49,7 +50,11 @@ def read_index_headers( # noqa: PLR0913 spec_fields = {field.name for field in spec.trace.header.fields} if spec else set() # Drop any synthesized or missing dimensions/coordinates that aren't in the physical file spec - subset = tuple(f for f in schema.required_header_fields() if f in spec_fields) + required_fields = schema.required_fields() + if any(field in SCALE_COORDINATE_KEYS for field in required_fields): + required_fields = required_fields | {"coordinate_scalar"} + + subset = tuple(f for f in required_fields if f in spec_fields) # 2. Parse headers parsed_headers = parse_headers( diff --git a/src/mdio/ingestion/segy/validation.py b/src/mdio/ingestion/segy/validation.py index f9a6d079..3828fda3 100644 --- a/src/mdio/ingestion/segy/validation.py +++ b/src/mdio/ingestion/segy/validation.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING +from mdio.segy.scalar import SCALE_COORDINATE_KEYS + if TYPE_CHECKING: from segy.schema import SegySpec @@ -24,7 +26,8 @@ def validate_spec_in_template(segy_spec: SegySpec, mdio_template: AbstractDatase if isinstance(mdio_template, Seismic3DObnReceiverGathersTemplate): required_fields.discard("component") - required_fields = required_fields | {"coordinate_scalar"} + if any(field in SCALE_COORDINATE_KEYS for field in required_fields): + required_fields = required_fields | {"coordinate_scalar"} missing_fields = required_fields - header_fields if missing_fields: diff --git a/tests/unit/ingestion/test_schema_resolver.py b/tests/unit/ingestion/test_schema_resolver.py index 64fc469a..ba748115 100644 --- a/tests/unit/ingestion/test_schema_resolver.py +++ b/tests/unit/ingestion/test_schema_resolver.py @@ -35,14 +35,15 @@ def test_obn_template_marks_shot_index_as_calculated(self) -> None: assert shot_index.is_calculated is True assert shot_index.is_spatial is True - def test_cdp_required_header_fields(self) -> None: - """Required header fields cover spatial dims, coordinates, and ``coordinate_scalar``.""" + def test_cdp_required_fields(self) -> None: + """Required fields cover spatial dims and coordinates.""" template = Seismic3DCdpGathersTemplate(data_domain="time", gather_domain="offset") schema = SchemaResolver().resolve(template, grid_overrides=None) - # Spatial dim header keys + coordinate header keys + always-present coordinate_scalar. - required = schema.required_header_fields() - assert {"inline", "crossline", "offset", "cdp_x", "cdp_y", "coordinate_scalar"}.issubset(required) + # Spatial dim keys + coordinate keys. + required = schema.required_fields() + assert {"inline", "crossline", "offset", "cdp_x", "cdp_y"}.issubset(required) + assert "coordinate_scalar" not in required class TestSchemaResolverNonBinned: diff --git a/tests/unit/ingestion/test_segy_reader.py b/tests/unit/ingestion/test_segy_reader.py index 5017902e..9aefc2f7 100644 --- a/tests/unit/ingestion/test_segy_reader.py +++ b/tests/unit/ingestion/test_segy_reader.py @@ -92,7 +92,7 @@ def test_subset_excludes_fields_absent_from_spec(self) -> None: ) headers = make_header_array({"inline": np.array([1, 2], dtype=np.int32)}) # spec lacks 'component' on purpose. - segy_file_kwargs = {"spec": _spec_with_fields("inline", "coordinate_scalar")} + segy_file_kwargs = {"spec": _spec_with_fields("inline")} with patch(f"{_READER}.parse_headers", return_value=headers) as mock_parse: reader.read_index_headers( @@ -105,7 +105,7 @@ def test_subset_excludes_fields_absent_from_spec(self) -> None: subset = set(mock_parse.call_args.kwargs["subset"]) assert "component" not in subset - assert subset == {"inline", "coordinate_scalar"} + assert subset == {"inline"} def test_synthesize_dims_produces_missing_dimension(self) -> None: """A synthesized dim absent from headers is created and yields a dimension.""" @@ -118,7 +118,7 @@ def test_synthesize_dims_produces_missing_dimension(self) -> None: coordinates=[], ) headers = make_header_array({"receiver": np.array([5, 6, 7], dtype=np.uint32)}) - segy_file_kwargs = {"spec": _spec_with_fields("receiver", "coordinate_scalar")} + segy_file_kwargs = {"spec": _spec_with_fields("receiver")} with patch(f"{_READER}.parse_headers", return_value=headers): indexed, dimensions = reader.read_index_headers( diff --git a/tests/unit/ingestion/test_segy_spec_validation.py b/tests/unit/ingestion/test_segy_spec_validation.py index 10115532..8a995f18 100644 --- a/tests/unit/ingestion/test_segy_spec_validation.py +++ b/tests/unit/ingestion/test_segy_spec_validation.py @@ -53,7 +53,7 @@ def test_validation_fails_with_missing_fields(self) -> None: assert "CustomTemplate" in error_message def test_validation_fails_with_missing_coordinate_scalar(self) -> None: - """Test that validation fails when coordinate_scalar is missing, even with all other fields.""" + """Test that validation fails when coordinate_scalar is missing and fields require scaling.""" template = MagicMock(spec=AbstractDatasetTemplate) template.name = "TestTemplate" template.spatial_dimension_names = ("inline", "crossline") @@ -73,3 +73,20 @@ def test_validation_fails_with_missing_coordinate_scalar(self) -> None: error_message = str(exc_info.value) assert "coordinate_scalar" in error_message assert "TestTemplate" in error_message + + def test_validation_passes_with_missing_coordinate_scalar_when_not_needed(self) -> None: + """Test that validation passes when coordinate_scalar is missing but no fields require scaling.""" + template = MagicMock(spec=AbstractDatasetTemplate) + template.name = "TestTemplate" + template.spatial_dimension_names = ("inline", "crossline") + template.coordinate_names = ("source_to_receiver_distance",) + + # Create SegySpec with all standard fields except coordinate_scalar + spec = get_segy_standard(1.0) + # Remove coordinate_scalar from the standard fields + standard_fields = [field for field in spec.trace.header.fields if field.name != "coordinate_scalar"] + standard_fields.append(HeaderField(name="not_coordinate_scalar", byte=71, format="int16")) + segy_spec = spec.customize(trace_header_fields=standard_fields) + + # Should not raise any exception + validate_spec_in_template(segy_spec, template) diff --git a/tests/unit/ingestion/test_segy_validation.py b/tests/unit/ingestion/test_segy_validation.py index 8159ec51..62a73589 100644 --- a/tests/unit/ingestion/test_segy_validation.py +++ b/tests/unit/ingestion/test_segy_validation.py @@ -48,7 +48,7 @@ def test_missing_fields_listed_in_error(self) -> None: assert "CustomTemplate" in msg def test_missing_coordinate_scalar_raises(self) -> None: - """A spec without ``coordinate_scalar`` must always fail.""" + """A spec without ``coordinate_scalar`` must fail if fields require scaling.""" template = MagicMock(spec=AbstractDatasetTemplate) template.name = "TestTemplate" template.spatial_dimension_names = ("inline", "crossline") @@ -63,6 +63,22 @@ def test_missing_coordinate_scalar_raises(self) -> None: with pytest.raises(ValueError, match=r"coordinate_scalar"): validate_spec_in_template(spec, template) + def test_missing_coordinate_scalar_passes_if_not_needed(self) -> None: + """A spec without ``coordinate_scalar`` passes if no fields require scaling.""" + template = MagicMock(spec=AbstractDatasetTemplate) + template.name = "TestTemplate" + template.spatial_dimension_names = ("inline", "crossline") + template.coordinate_names = ("source_to_receiver_distance",) + template.calculated_dimension_names = () + + spec = get_segy_standard(1.0) + kept = [f for f in spec.trace.header.fields if f.name != "coordinate_scalar"] + kept.append(HeaderField(name="not_coordinate_scalar", byte=71, format="int16")) + spec = spec.customize(trace_header_fields=kept) + + # Should not raise ValueError because 'offset' is not in SCALE_COORDINATE_KEYS + validate_spec_in_template(spec, template) + def test_calculated_dimensions_are_not_required(self) -> None: """Dimensions in ``calculated_dimension_names`` should not be required from the spec.""" template = MagicMock(spec=AbstractDatasetTemplate) From 922d1da1ed7475277911dcea53f528b3b81b5c2a Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Jun 2026 14:01:32 +0000 Subject: [PATCH 08/11] Fix public interface to be `segy_to_mdio` --- src/mdio/__init__.py | 2 -- src/mdio/converters/segy.py | 5 +++-- src/mdio/ingestion/__init__.py | 4 ++-- src/mdio/ingestion/segy/pipeline.py | 4 ++-- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/mdio/__init__.py b/src/mdio/__init__.py index bfae0f30..efd76492 100644 --- a/src/mdio/__init__.py +++ b/src/mdio/__init__.py @@ -9,7 +9,6 @@ from mdio.converters import mdio_to_segy from mdio.converters import segy_to_mdio from mdio.ingestion import ResolvedSchema -from mdio.ingestion import run_segy_ingestion from mdio.optimize.access_pattern import OptimizedAccessPatternConfig from mdio.optimize.access_pattern import optimize_access_patterns from mdio.segy.geometry import GridOverrides @@ -29,6 +28,5 @@ "segy_to_mdio", "OptimizedAccessPatternConfig", "optimize_access_patterns", - "run_segy_ingestion", "ResolvedSchema", ] diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 8692b40e..42ee7d7b 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -5,7 +5,6 @@ import logging from typing import TYPE_CHECKING -from mdio.ingestion.segy.pipeline import run_segy_ingestion from mdio.segy.geometry import GridOverrides logger = logging.getLogger(__name__) @@ -67,7 +66,9 @@ def segy_to_mdio( # noqa: PLR0913 """ typed_grid_overrides = _coerce_grid_overrides(grid_overrides) - return run_segy_ingestion( + from mdio.ingestion.segy.pipeline import segy_to_mdio as _ingest_segy_to_mdio # noqa: PLC0415 + + return _ingest_segy_to_mdio( segy_spec=segy_spec, mdio_template=mdio_template, input_path=input_path, diff --git a/src/mdio/ingestion/__init__.py b/src/mdio/ingestion/__init__.py index 6624f7e6..91047c18 100644 --- a/src/mdio/ingestion/__init__.py +++ b/src/mdio/ingestion/__init__.py @@ -8,12 +8,12 @@ from mdio.ingestion.schema import ResolvedSchema from mdio.ingestion.segy.index_strategies import IndexStrategy from mdio.ingestion.segy.index_strategies import IndexStrategyRegistry -from mdio.ingestion.segy.pipeline import run_segy_ingestion +from mdio.ingestion.segy.pipeline import segy_to_mdio __all__ = [ "DimensionSpec", "IndexStrategy", "IndexStrategyRegistry", "ResolvedSchema", - "run_segy_ingestion", + "segy_to_mdio", ] diff --git a/src/mdio/ingestion/segy/pipeline.py b/src/mdio/ingestion/segy/pipeline.py index 8dd69706..0752d360 100644 --- a/src/mdio/ingestion/segy/pipeline.py +++ b/src/mdio/ingestion/segy/pipeline.py @@ -112,7 +112,7 @@ def _build_grid(dimensions: list[Dimension], indexed_headers: np.ndarray, num_tr return grid -def run_segy_ingestion( # noqa: PLR0913 +def segy_to_mdio( # noqa: PLR0913 segy_spec: SegySpec, mdio_template: AbstractDatasetTemplate, input_path: UPath | Path | str, @@ -135,7 +135,7 @@ def run_segy_ingestion( # noqa: PLR0913 grid_overrides: Grid override configuration for non-standard geometries. segy_header_overrides: Option to override specific SEG-Y headers during ingestion. - Raises: + Raises: # noqa: DOC502 FileExistsError: If the output location already exists and overwrite is False. ValueError: If required fields are missing from segy_spec or required computed dimensions are not produced after grid overrides are applied. From cae31bb90cc6225f088f7ebf22ee504befd78a05 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Jun 2026 14:06:35 +0000 Subject: [PATCH 09/11] Update raw headers warning with phase-out plan --- src/mdio/ingestion/segy/raw_headers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdio/ingestion/segy/raw_headers.py b/src/mdio/ingestion/segy/raw_headers.py index fea93562..fff55498 100644 --- a/src/mdio/ingestion/segy/raw_headers.py +++ b/src/mdio/ingestion/segy/raw_headers.py @@ -48,7 +48,7 @@ def build_raw_header_variables(schema: ResolvedSchema) -> list[dict[str, Any]]: logger.warning("Raw headers are only supported for Zarr v3. Skipping raw headers.") return [] - logger.warning("MDIO__IMPORT__RAW_HEADERS is experimental and expected to change or be removed.") + logger.warning("MDIO__IMPORT__RAW_HEADERS is planned to be deprecated in release 1.3 and removed in release 1.4.") spatial_dim_names = tuple(dim.name for dim in schema.dimensions if dim.is_spatial) chunk_metadata = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=schema.chunk_shape[:-1])) return [ From 21490a38f792bc8582fb1ef86a15b3a4fa68a170 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Jun 2026 16:04:56 +0000 Subject: [PATCH 10/11] Cleanup issues --- docs/tutorials/custom_template.ipynb | 608 +++++++++--------- src/mdio/builder/templates/base.py | 56 +- src/mdio/builder/templates/seismic_3d_obn.py | 4 +- src/mdio/converters/segy.py | 7 +- src/mdio/ingestion/segy/index_strategies.py | 22 +- src/mdio/ingestion/segy/pipeline.py | 2 + src/mdio/ingestion/segy/reader.py | 4 + src/mdio/segy/geometry.py | 43 +- .../ingestion/test_segy_grid_overrides.py | 24 +- .../ingestion/test_segy_index_strategies.py | 20 +- tests/unit/ingestion/test_segy_pipeline.py | 45 ++ 11 files changed, 446 insertions(+), 389 deletions(-) diff --git a/docs/tutorials/custom_template.ipynb b/docs/tutorials/custom_template.ipynb index 32efda96..520bc3a8 100644 --- a/docs/tutorials/custom_template.ipynb +++ b/docs/tutorials/custom_template.ipynb @@ -1,302 +1,310 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "85114119ae7a4db0", - "metadata": {}, - "source": [ - "# MDIO Template Usage\n", - "\n", - "```{article-info}\n", - ":author: Altay Sansal\n", - ":date: \"{sub-ref}`today`\"\n", - ":read-time: \"{sub-ref}`wordcount-minutes` min read\"\n", - ":class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light\n", - "```\n", - "\n", - "```{warning}\n", - "Most SEG-Y files correspond to standard seismic data types or field configurations. We recommend using\n", - "the built-in templates from the registry whenever possible. Create a custom template only when your file\n", - "is unusual and cannot be represented by existing templates. In many cases, you can simply customize the\n", - "SEG-Y header byte mapping during ingestion without defining a new template.\n", - "```\n", - "\n", - "In this tutorial we will walk through the Template Registry and show how to:\n", - "\n", - "- Discover available templates in the registry\n", - "- Define and register your own template\n", - "- Build a dataset model and convert it to an Xarray Dataset using your custom template\n", - "\n", - "If this is your first time with MDIO, you may want to skim the Quickstart first." - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MDIO Template Usage\n", + "\n", + "```{article-info}\n", + ":author: Altay Sansal\n", + ":date: \"{sub-ref}`today`\"\n", + ":read-time: \"{sub-ref}`wordcount-minutes` min read\"\n", + ":class-container: sd-p-0 sd-outline-muted sd-rounded-3 sd-font-weight-light\n", + "```\n", + "\n", + "```{warning}\n", + "Most SEG-Y files correspond to standard seismic data types or field configurations. We recommend using\n", + "the built-in templates from the registry whenever possible. Create a custom template only when your file\n", + "is unusual and cannot be represented by existing templates. In many cases, you can simply customize the\n", + "SEG-Y header byte mapping during ingestion without defining a new template.\n", + "```\n", + "\n", + "In this tutorial we will walk through the Template Registry and show how to:\n", + "\n", + "- Discover available templates in the registry\n", + "- Define and register your own template\n", + "- Build a dataset model and convert it to an Xarray Dataset using your custom template\n", + "\n", + "If this is your first time with MDIO, you may want to skim the Quickstart first." + ], + "id": "85114119ae7a4db0" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What is a Template and a Template Registry?\n", + "\n", + "A template defines how an MDIO dataset is structured: names of dimensions and coordinates, the default variable name, chunking hints, and attributes to be stored. Since many seismic datasets share common structures (e.g., 3D post-stack, 2D post-stack, pre-stack CDP/shot, etc.), MDIO ships with a pre-populated template registry and APIs to fetch or register templates.\n", + "\n", + "Fetching a template from it returns a copied instance you can freely customize without affecting others." + ], + "id": "a793f2cfb58f09cc" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from mdio.builder.template_registry import get_template\n", + "from mdio.builder.template_registry import get_template_registry\n", + "from mdio.builder.template_registry import list_templates\n", + "\n", + "registry = get_template_registry()\n", + "registry # pretty HTML in notebooks" + ], + "execution_count": null, + "outputs": [], + "id": "c7a760a019930d4e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can list all registered templates and get a list as well." + ], + "id": "810dbba2b6dba787" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "list_templates()" + ], + "execution_count": null, + "outputs": [], + "id": "38eb1da635c7be0f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining a Minimal Custom Template\n", + "\n", + "To define a custom template, subclass `AbstractDatasetTemplate` and set:\n", + "\n", + "- `_name`: a public name for the template\n", + "- `_dim_names`: names for each axis of your data variable (the last axis is the trace/time or trace/depth axis)\n", + "- `_physical_coord_names` and `_logical_coord_names`: optional additional coordinate variables to store along the spatial grid\n", + "- `_load_dataset_attributes()`: optional attributes stored at the dataset level\n", + "\n", + "Below we create a special template that can hold interval velocity field with multiple anisotropy parameters for a depth seismic volume.\n", + "\n", + "The dimensions, dimension-coordinates and non-dimension coordinates will automatically get created using the method\n", + "from the base class. However, since we want more variables, we override `_add_variables` to add them." + ], + "id": "d87bd9ec781a8a8e" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from mdio.builder.schemas import compressors\n", + "from mdio.builder.schemas.chunk_grid import RegularChunkGrid\n", + "from mdio.builder.schemas.chunk_grid import RegularChunkShape\n", + "from mdio.builder.schemas.dtype import ScalarType\n", + "from mdio.builder.schemas.v1.variable import VariableMetadata\n", + "from mdio.builder.templates.base import AbstractDatasetTemplate\n", + "\n", + "\n", + "class AnisotropicVelocityTemplate(AbstractDatasetTemplate):\n", + " \"\"\"A custom template that has unusual dimensions and coordinates.\"\"\"\n", + "\n", + " def __init__(self, data_domain: str = \"depth\") -> None:\n", + " super().__init__(data_domain)\n", + " # Dimension order matters; the last dimension is the depth\n", + " self._dim_names = (\"inline\", \"crossline\", self.trace_domain)\n", + " # Additional coordinates: these are added on top of dimension coordinates\n", + " self._physical_coord_names = (\"cdp_x\", \"cdp_y\")\n", + " self._var_chunk_shape = (128, 128, 128)\n", + " self._units = {}\n", + "\n", + " @property\n", + " def _name(self) -> str: # public name for the registry\n", + " return \"AnisotropicVelocity3DDepth\"\n", + "\n", + " @property\n", + " def _default_variable_name(self) -> str: # public name for the registry\n", + " return \"velocity\"\n", + "\n", + " def _load_dataset_attributes(self) -> dict:\n", + " return {\"surveyType\": \"3D\", \"gatherType\": \"line\"}\n", + "\n", + " def _add_variables(self) -> None:\n", + " \"\"\"Add the variables including default and extra.\"\"\"\n", + " for name in [\"velocity\", \"epsilon\", \"delta\"]:\n", + " chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=self.full_chunk_shape))\n", + " unit = self.get_unit_by_key(name)\n", + " self._builder.add_variable(\n", + " name=name,\n", + " dimensions=self._dim_names,\n", + " data_type=ScalarType.FLOAT32,\n", + " compressor=compressors.Blosc(cname=compressors.BloscCname.zstd),\n", + " coordinates=self.physical_coordinate_names,\n", + " metadata=VariableMetadata(chunk_grid=chunk_grid, units_v1=unit),\n", + " )\n", + "\n", + "\n", + "AnisotropicVelocityTemplate()" + ], + "execution_count": null, + "outputs": [], + "id": "cfc9d9b0e1b67a76" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Registering the Custom Template\n", + "\n", + "The registry returns a deep copy of the template on every fetch. To make the template discoverable by name, register it first, then retrieve it with `get_template`." + ], + "id": "15e61310ed0ffd97" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from mdio.builder.template_registry import register_template\n", + "\n", + "register_template(AnisotropicVelocityTemplate())\n", + "print(\"Registered:\", \"AnisotropicVelocity3DDepth\" in list_templates())\n", + "\n", + "custom_template = get_template(\"AnisotropicVelocity3DDepth\")\n", + "custom_template" + ], + "execution_count": null, + "outputs": [], + "id": "a4e1847b20da6768" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also set units at any time. For this demo we’ll set metric units. The spatial units will be inferred from the SEG-Y binary header during ingestion, but we can override them here. Ingestion will honor what is in the template." + ], + "id": "83b0772f1913c652" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from mdio.builder.schemas.v1.units import LengthUnitModel\n", + "from mdio.builder.schemas.v1.units import SpeedUnitModel\n", + "\n", + "custom_template.add_units(\n", + " {\n", + " \"depth\": LengthUnitModel(length=\"m\"),\n", + " \"cdp_x\": LengthUnitModel(length=\"m\"),\n", + " \"cdp_y\": LengthUnitModel(length=\"m\"),\n", + " \"velocity\": SpeedUnitModel(speed=\"m/s\"),\n", + " }\n", + ")\n", + "custom_template" + ], + "execution_count": null, + "outputs": [], + "id": "d7dca50d72d2f93" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Changing chunk size (chunks) on an existing template\n", + "\n", + "Often you will want to tweak the chunking strategy for performance. You can do this in two ways:\n", + "\n", + "- When defining a subclass, set a default in the constructor (e.g., `self._var_chunk_shape = (...)`).\n", + "- On an existing template instance, assign to the `full_chunk_shape` property once you know your final\n", + " dataset sizes (the tuple length must match the number of data dimensions).\n", + "\n", + "Below is a tiny demo showing how to modify the chunk shape on a fetched template. We first build the\n", + "template with known sizes to satisfy validation, then update `full_chunk_shape`.\n", + "\n", + "```{warning}\n", + "`AbstractDatasetTemplate.build_dataset` is deprecated as of 1.2 and is planned for removal in 1.2.5.\n", + "SEG-Y ingestion now builds datasets from a resolved schema via the schema-driven factory, so prefer\n", + "`mdio.segy_to_mdio` for end-to-end conversion. The `build_dataset` call below remains only to\n", + "illustrate the in-memory dataset model.\n", + "```\n", + "\n", + "```{note}\n", + "In the SEG-Y to MDIO conversion workflow, MDIO infers the final grid shape from the SEG-Y headers. It’s\n", + "common to set or adjust `full_chunk_shape` right before calling `segy_to_mdio`, using the same sizes\n", + "you expect for the final array.\n", + "```" + ], + "id": "367ade9824e72bc3" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# NOTE: build_dataset is deprecated (removal planned for 1.2.5); shown here for illustration only.\n", + "mdio_ds = custom_template.build_dataset(name=\"demo-only\", sizes=(300, 500, 1001))\n", + "# pick smaller chunks than the full array for better parallelism and IO\n", + "custom_template.full_chunk_shape = (64, 64, 64)\n", + "print(\"Chunk shape set to:\", custom_template.full_chunk_shape)\n", + "\n", + "custom_template" + ], + "execution_count": null, + "outputs": [], + "id": "75939231b58c204a" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Making Dummy Xarray Dataset\n", + "\n", + "We can now take the MDIO Dataset model and convert it to Xarray with our configuration. If ingesting from SEG-Y, this step\n", + "gets executed automatically by the converter before populating the data.\n", + "\n", + "Note that the whole dataset will be populated with the fill values." + ], + "id": "a76f17cdf235de13" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from mdio.builder.xarray_builder import to_xarray_dataset\n", + "\n", + "to_xarray_dataset(mdio_ds)" + ], + "execution_count": null, + "outputs": [], + "id": "ce3dcf9c7946ea07" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Recap: Key APIs Used\n", + "\n", + "- Template registry helpers: `get_template_registry`, `list_templates`, `register_template`, `get_template`\n", + "- Base template to subclass: `AbstractDatasetTemplate`\n", + "- Make Xarray Dataset from MDIO Data Model: `to_xarray_dataset`\n", + "\n", + "With these pieces, you can standardize how your seismic data is represented in MDIO and keep ingestion code concise and repeatable.\n" + ], + "id": "fc05aa3c81f8465c" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [], + "execution_count": null, + "outputs": [], + "id": "a15848ab5c0811d6" + } + ], + "metadata": { + "mystnb": { + "execution_mode": "force" + } }, - { - "cell_type": "markdown", - "id": "a793f2cfb58f09cc", - "metadata": {}, - "source": [ - "## What is a Template and a Template Registry?\n", - "\n", - "A template defines how an MDIO dataset is structured: names of dimensions and coordinates, the default variable name, chunking hints, and attributes to be stored. Since many seismic datasets share common structures (e.g., 3D post-stack, 2D post-stack, pre-stack CDP/shot, etc.), MDIO ships with a pre-populated template registry and APIs to fetch or register templates.\n", - "\n", - "Fetching a template from it returns a copied instance you can freely customize without affecting others." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7a760a019930d4e", - "metadata": {}, - "outputs": [], - "source": [ - "from mdio.builder.template_registry import get_template\n", - "from mdio.builder.template_registry import get_template_registry\n", - "from mdio.builder.template_registry import list_templates\n", - "\n", - "registry = get_template_registry()\n", - "registry # pretty HTML in notebooks" - ] - }, - { - "cell_type": "markdown", - "id": "810dbba2b6dba787", - "metadata": {}, - "source": [ - "We can list all registered templates and get a list as well." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38eb1da635c7be0f", - "metadata": {}, - "outputs": [], - "source": [ - "list_templates()" - ] - }, - { - "cell_type": "markdown", - "id": "d87bd9ec781a8a8e", - "metadata": {}, - "source": [ - "## Defining a Minimal Custom Template\n", - "\n", - "To define a custom template, subclass `AbstractDatasetTemplate` and set:\n", - "\n", - "- `_name`: a public name for the template\n", - "- `_dim_names`: names for each axis of your data variable (the last axis is the trace/time or trace/depth axis)\n", - "- `_physical_coord_names` and `_logical_coord_names`: optional additional coordinate variables to store along the spatial grid\n", - "- `_load_dataset_attributes()`: optional attributes stored at the dataset level\n", - "\n", - "Below we create a special template that can hold interval velocity field with multiple anisotropy parameters for a depth seismic volume.\n", - "\n", - "The dimensions, dimension-coordinates and non-dimension coordinates will automatically get created using the method\n", - "from the base class. However, since we want more variables, we override `_add_variables` to add them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfc9d9b0e1b67a76", - "metadata": {}, - "outputs": [], - "source": [ - "from mdio.builder.schemas import compressors\n", - "from mdio.builder.schemas.chunk_grid import RegularChunkGrid\n", - "from mdio.builder.schemas.chunk_grid import RegularChunkShape\n", - "from mdio.builder.schemas.dtype import ScalarType\n", - "from mdio.builder.schemas.v1.variable import VariableMetadata\n", - "from mdio.builder.templates.base import AbstractDatasetTemplate\n", - "\n", - "\n", - "class AnisotropicVelocityTemplate(AbstractDatasetTemplate):\n", - " \"\"\"A custom template that has unusual dimensions and coordinates.\"\"\"\n", - "\n", - " def __init__(self, data_domain: str = \"depth\") -> None:\n", - " super().__init__(data_domain)\n", - " # Dimension order matters; the last dimension is the depth\n", - " self._dim_names = (\"inline\", \"crossline\", self.trace_domain)\n", - " # Additional coordinates: these are added on top of dimension coordinates\n", - " self._physical_coord_names = (\"cdp_x\", \"cdp_y\")\n", - " self._var_chunk_shape = (128, 128, 128)\n", - " self._units = {}\n", - "\n", - " @property\n", - " def _name(self) -> str: # public name for the registry\n", - " return \"AnisotropicVelocity3DDepth\"\n", - "\n", - " @property\n", - " def _default_variable_name(self) -> str: # public name for the registry\n", - " return \"velocity\"\n", - "\n", - " def _load_dataset_attributes(self) -> dict:\n", - " return {\"surveyType\": \"3D\", \"gatherType\": \"line\"}\n", - "\n", - " def _add_variables(self) -> None:\n", - " \"\"\"Add the variables including default and extra.\"\"\"\n", - " for name in [\"velocity\", \"epsilon\", \"delta\"]:\n", - " chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=self.full_chunk_shape))\n", - " unit = self.get_unit_by_key(name)\n", - " self._builder.add_variable(\n", - " name=name,\n", - " dimensions=self._dim_names,\n", - " data_type=ScalarType.FLOAT32,\n", - " compressor=compressors.Blosc(cname=compressors.BloscCname.zstd),\n", - " coordinates=self.physical_coordinate_names,\n", - " metadata=VariableMetadata(chunk_grid=chunk_grid, units_v1=unit),\n", - " )\n", - "\n", - "\n", - "AnisotropicVelocityTemplate()" - ] - }, - { - "cell_type": "markdown", - "id": "15e61310ed0ffd97", - "metadata": {}, - "source": [ - "## Registering the Custom Template\n", - "\n", - "The registry returns a deep copy of the template on every fetch. To make the template discoverable by name, register it first, then retrieve it with `get_template`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4e1847b20da6768", - "metadata": {}, - "outputs": [], - "source": [ - "from mdio.builder.template_registry import register_template\n", - "\n", - "register_template(AnisotropicVelocityTemplate())\n", - "print(\"Registered:\", \"AnisotropicVelocity3DDepth\" in list_templates())\n", - "\n", - "custom_template = get_template(\"AnisotropicVelocity3DDepth\")\n", - "custom_template" - ] - }, - { - "cell_type": "markdown", - "id": "83b0772f1913c652", - "metadata": {}, - "source": [ - "You can also set units at any time. For this demo we’ll set metric units. The spatial units will be inferred from the SEG-Y binary header during ingestion, but we can override them here. Ingestion will honor what is in the template." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d7dca50d72d2f93", - "metadata": {}, - "outputs": [], - "source": [ - "from mdio.builder.schemas.v1.units import LengthUnitModel\n", - "from mdio.builder.schemas.v1.units import SpeedUnitModel\n", - "\n", - "custom_template.add_units(\n", - " {\n", - " \"depth\": LengthUnitModel(length=\"m\"),\n", - " \"cdp_x\": LengthUnitModel(length=\"m\"),\n", - " \"cdp_y\": LengthUnitModel(length=\"m\"),\n", - " \"velocity\": SpeedUnitModel(speed=\"m/s\"),\n", - " }\n", - ")\n", - "custom_template" - ] - }, - { - "cell_type": "markdown", - "id": "367ade9824e72bc3", - "metadata": {}, - "source": [ - "## Changing chunk size (chunks) on an existing template\n", - "\n", - "Often you will want to tweak the chunking strategy for performance. You can do this in two ways:\n", - "\n", - "- When defining a subclass, set a default in the constructor (e.g., `self._var_chunk_shape = (...)`).\n", - "- On an existing template instance, assign to the `full_chunk_shape` property once you know your final\n", - " dataset sizes (the tuple length must match the number of data dimensions).\n", - "\n", - "Below is a tiny demo showing how to modify the chunk shape on a fetched template. We first build the\n", - "template with known sizes to satisfy validation, then update `full_chunk_shape`.\n", - "\n", - "```{note}\n", - "In the SEG-Y to MDIO conversion workflow, MDIO infers the final grid shape from the SEG-Y headers. It’s\n", - "common to set or adjust `full_chunk_shape` right before calling `segy_to_mdio`, using the same sizes\n", - "you expect for the final array.\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75939231b58c204a", - "metadata": {}, - "outputs": [], - "source": [ - "mdio_ds = custom_template.build_dataset(name=\"demo-only\", sizes=(300, 500, 1001))\n", - "# pick smaller chunks than the full array for better parallelism and IO\n", - "custom_template.full_chunk_shape = (64, 64, 64)\n", - "print(\"Chunk shape set to:\", custom_template.full_chunk_shape)\n", - "\n", - "custom_template" - ] - }, - { - "cell_type": "markdown", - "id": "a76f17cdf235de13", - "metadata": {}, - "source": [ - "## Making Dummy Xarray Dataset\n", - "\n", - "We can now take the MDIO Dataset model and convert it to Xarray with our configuration. If ingesting from SEG-Y, this step\n", - "gets executed automatically by the converter before populating the data.\n", - "\n", - "Note that the whole dataset will be populated with the fill values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce3dcf9c7946ea07", - "metadata": {}, - "outputs": [], - "source": [ - "from mdio.builder.xarray_builder import to_xarray_dataset\n", - "\n", - "to_xarray_dataset(mdio_ds)" - ] - }, - { - "cell_type": "markdown", - "id": "fc05aa3c81f8465c", - "metadata": {}, - "source": [ - "## Recap: Key APIs Used\n", - "\n", - "- Template registry helpers: `get_template_registry`, `list_templates`, `register_template`, `get_template`\n", - "- Base template to subclass: `AbstractDatasetTemplate`\n", - "- Make Xarray Dataset from MDIO Data Model: `to_xarray_dataset`\n", - "\n", - "With these pieces, you can standardize how your seismic data is represented in MDIO and keep ingestion code concise and repeatable.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a15848ab5c0811d6", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "mystnb": { - "execution_mode": "force" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/src/mdio/builder/templates/base.py b/src/mdio/builder/templates/base.py index ed23668b..8e82db1d 100644 --- a/src/mdio/builder/templates/base.py +++ b/src/mdio/builder/templates/base.py @@ -2,6 +2,7 @@ from __future__ import annotations +import logging from abc import ABC from abc import abstractmethod from typing import TYPE_CHECKING @@ -24,6 +25,8 @@ from mdio.builder.schemas.v1.dataset import Dataset from mdio.builder.templates.types import SeismicDataDomain +logger = logging.getLogger(__name__) + class AbstractDatasetTemplate(ABC): """Abstract base class that defines the template method for Dataset building factory. @@ -46,12 +49,6 @@ def __init__(self, data_domain: SeismicDataDomain) -> None: self._var_chunk_shape: tuple[int, ...] = () self.synthesize_missing_dims: tuple[str, ...] = () - # TEMPORARY (removed with declare_coordinate_specs): set when grid overrides mutate this - # template in-place (dims collapsed into 'trace', extra coordinates added). Once mutated, - # the runtime layout intentionally diverges from the static declare_coordinate_specs() - # contract, so the drift guard in build_dataset() must not run. - self._grid_overrides_applied: bool = False - self._builder: MDIODatasetBuilder | None = None self._dim_sizes: tuple[int, ...] = () self._units: dict[str, AllUnitModel] = {} @@ -154,6 +151,12 @@ def build_dataset( ) -> Dataset: """Template method that builds the dataset. + .. deprecated:: 1.2 + ``build_dataset`` is deprecated and is planned for removal in 1.2.5. SEG-Y + ingestion now builds datasets from a resolved schema via the schema-driven + factory (:func:`mdio.ingestion.dataset_factory.build_mdio_dataset`); use + :func:`mdio.segy_to_mdio` for ingestion. + Args: name: The name of the dataset. sizes: The sizes of the dimensions. @@ -165,6 +168,11 @@ def build_dataset( Raises: ValueError: If coordinate already exists from subclass override. """ + logger.warning( + "AbstractDatasetTemplate.build_dataset is deprecated as of 1.2 and is planned for " + "removal in 1.2.5; SEG-Y ingestion builds datasets via the schema-driven factory. " + "Use `mdio.segy_to_mdio` for ingestion." + ) self._dim_sizes = sizes attributes = self._load_dataset_attributes() or {} @@ -186,10 +194,7 @@ def build_dataset( except ValueError as exc: # coordinate may already exist if "same name twice" not in str(exc): raise - # Skip the static drift guard when grid overrides have transformed the template: the - # runtime layout no longer matches the declared (override-free) specs by design. - if not self._grid_overrides_applied: - self._validate_declared_coordinate_specs() + self._validate_declared_coordinate_specs() self._add_variables() self._add_trace_mask() @@ -206,30 +211,6 @@ def add_units(self, units: dict[str, AllUnitModel]) -> None: raise ValueError(msg) self._units |= units - def apply_resolved_dimensions( - self, - dim_names: tuple[str, ...], - chunk_shape: tuple[int, ...], - ) -> None: - """Update the template's dimension layout from a resolved schema. - - Supported entry point for the ingestion pipeline to push back dimension names - and chunk shape after the SchemaResolver has applied grid overrides - (e.g. NonBinned, HasDuplicates), instead of mutating private attributes. - - Args: - dim_names: Final ordered dimension names. - chunk_shape: Chunk shape matching ``dim_names`` length. - - Raises: - ValueError: If ``len(chunk_shape) != len(dim_names)``. - """ - if len(chunk_shape) != len(dim_names): - msg = f"chunk_shape length {len(chunk_shape)} does not match dim_names length {len(dim_names)}" - raise ValueError(msg) - self._dim_names = tuple(dim_names) - self._var_chunk_shape = tuple(chunk_shape) - def _validate_declared_coordinate_specs(self) -> None: """Fail the build if :meth:`declare_coordinate_specs` drifted from the built coordinates. @@ -238,11 +219,8 @@ def _validate_declared_coordinate_specs(self) -> None: :meth:`_add_coordinates`, this guard ensures the two never diverge in name, dimensions, or dtype. The ingestion ``SchemaResolver`` trusts the declared specs, so silent drift would corrupt resolved schemas. The check runs for every template (built-in and - user-defined) on every ``build_dataset`` call that does not apply grid overrides. Grid - overrides mutate the template in-place (collapsing dims into ``trace`` and adding - coordinates), so the runtime layout intentionally diverges from the declared specs and - the guard is skipped for those builds. It is removed once ``_add_coordinates`` is derived - from the resolved schema and the duplication no longer exists. + user-defined) on every ``build_dataset`` call. It is removed once ``_add_coordinates`` + is derived from the resolved schema and the duplication no longer exists. Raises: ValueError: If the declared specs do not match the built non-dimension coordinates. diff --git a/src/mdio/builder/templates/seismic_3d_obn.py b/src/mdio/builder/templates/seismic_3d_obn.py index de58ae53..4ed6c334 100644 --- a/src/mdio/builder/templates/seismic_3d_obn.py +++ b/src/mdio/builder/templates/seismic_3d_obn.py @@ -25,8 +25,8 @@ class Seismic3DObnReceiverGathersTemplate(AbstractDatasetTemplate): Special handling for component dimension: If the SEG-Y spec does not contain a 'component' field, the ingestion process will automatically synthesize a component dimension with constant value 1 for - all traces. A warning is logged when this occurs. This is handled explicitly - in GridOverrider._synthesize_obn_component(). + all traces. A warning is logged when this occurs. This is driven by + ``synthesize_missing_dims`` and handled by ``ComponentSynthesisStrategy``. """ def __init__(self, data_domain: SeismicDataDomain = "time"): diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 42ee7d7b..d116799a 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -35,8 +35,8 @@ def _coerce_grid_overrides( return grid_overrides logger.warning( - "Passing `grid_overrides` as a dict is deprecated and will be removed in a " - "future release; pass a `mdio.GridOverrides` instance instead." + "Passing `grid_overrides` as a dict is deprecated as of 1.2 and is planned for removal " + "in 1.2.5; pass a `mdio.GridOverrides` instance instead." ) return GridOverrides.model_validate(grid_overrides) @@ -61,7 +61,8 @@ def segy_to_mdio( # noqa: PLR0913 output_path: The universal path for the output MDIO v1 file. overwrite: Whether to overwrite the output file if it already exists. Defaults to False. grid_overrides: Option to add grid overrides. Prefer a :class:`mdio.GridOverrides` - instance; ``dict`` is still accepted but emits a :class:`DeprecationWarning`. + instance; ``dict`` is still accepted (deprecated as of 1.2, planned for removal in + 1.2.5) but logs a deprecation warning. segy_header_overrides: Option to override specific SEG-Y headers during ingestion. """ typed_grid_overrides = _coerce_grid_overrides(grid_overrides) diff --git a/src/mdio/ingestion/segy/index_strategies.py b/src/mdio/ingestion/segy/index_strategies.py index 8e533f3e..5d97d6e0 100644 --- a/src/mdio/ingestion/segy/index_strategies.py +++ b/src/mdio/ingestion/segy/index_strategies.py @@ -4,8 +4,7 @@ single-responsibility `IndexStrategy` objects that can be composed via `CompositeStrategy`. Strategies are selected by `IndexStrategyRegistry` from the typed `GridOverrides` -configuration plus optional template hints. The public contract preserved by `GridOverrider` -(a thin shim around this module) keeps end-to-end ingestion behavior identical. +configuration plus optional template hints, preserving end-to-end ingestion behavior. """ from __future__ import annotations @@ -50,7 +49,7 @@ class IndexStrategy(ABC): header values; subclasses override only when they need different semantics (currently just `CompositeStrategy`). - Subclasses with header preconditions set `required_keys` so the shim and + Subclasses with header preconditions set `required_keys` so the ingestion reader and `CompositeStrategy` can raise `GridOverrideKeysError` with a clear "missing fields" message before NumPy fails on a deeper key lookup. """ @@ -66,7 +65,7 @@ def required_keys(self) -> frozenset[str]: def validate_headers(self, headers: HeaderArray) -> None: """Raise `GridOverrideKeysError` if any required header field is missing. - Callers (the `GridOverrider` shim and `CompositeStrategy`) invoke this before each + Callers (the ingestion reader and `CompositeStrategy`) invoke this before each transform so failure points at the user-facing override name rather than at a NumPy structured-array key error. """ @@ -155,12 +154,12 @@ def transform_headers(self, headers: HeaderArray) -> HeaderArray: class NonBinnedStrategy(DuplicateHandlingStrategy): """Collapse selected non-binned dimensions into a single `trace` dimension. - Inherits the per-tuple `trace` counter from `DuplicateHandlingStrategy` and - captures `chunksize` so the `GridOverrider` shim can size the new `trace` chunk correctly. + Inherits the per-tuple `trace` counter from `DuplicateHandlingStrategy`, excluding the + collapsed dims from the grouping so the counter only varies along the remaining dims. + The `trace` chunk size is owned by the schema reshaping (`CollapseToTraceEffect`), not + by this strategy. Args: - chunksize: Chunk size to assign to the `trace` dimension. The strategy itself - does not apply this value; the shim uses it when rewriting the chunksize tuple. non_binned_dims: Header fields collapsed into `trace`. They are excluded from the duplicate grouping so the counter only varies along the remaining dims. coord_fields: Template coordinate names to exclude from grouping. @@ -169,19 +168,15 @@ class NonBinnedStrategy(DuplicateHandlingStrategy): def __init__( self, - chunksize: int, non_binned_dims: Iterable[str], coord_fields: Iterable[str] = (), dtype: DTypeLike = np.int16, ) -> None: - non_binned_dims = tuple(non_binned_dims) super().__init__( coord_fields=coord_fields, - excluded_fields=non_binned_dims, + excluded_fields=tuple(non_binned_dims), dtype=dtype, ) - self.chunksize = chunksize - self.non_binned_dims = non_binned_dims class ChannelWrappingStrategy(IndexStrategy): @@ -421,7 +416,6 @@ def create_strategy( if grid_overrides.non_binned: strategies.append( NonBinnedStrategy( - chunksize=grid_overrides.chunksize, non_binned_dims=grid_overrides.non_binned_dims or (), coord_fields=coord_fields, ) diff --git a/src/mdio/ingestion/segy/pipeline.py b/src/mdio/ingestion/segy/pipeline.py index 0752d360..5fdb60a5 100644 --- a/src/mdio/ingestion/segy/pipeline.py +++ b/src/mdio/ingestion/segy/pipeline.py @@ -23,6 +23,7 @@ from mdio.ingestion.segy.serializer import serialize_to_mdio from mdio.ingestion.segy.validation import validate_spec_in_template from mdio.segy.file import get_segy_file_info +from mdio.segy.geometry import validate_overrides_for_template if TYPE_CHECKING: from pathlib import Path @@ -142,6 +143,7 @@ def segy_to_mdio( # noqa: PLR0913 GridTraceCountError: If the live trace count in the built grid does not match the number of traces reported by the SEG-Y file. """ + validate_overrides_for_template(grid_overrides, mdio_template) validate_spec_in_template(segy_spec, mdio_template) input_path = _normalize_path(input_path) diff --git a/src/mdio/ingestion/segy/reader.py b/src/mdio/ingestion/segy/reader.py index 990966c9..9f64d50a 100644 --- a/src/mdio/ingestion/segy/reader.py +++ b/src/mdio/ingestion/segy/reader.py @@ -71,6 +71,10 @@ def read_index_headers( # noqa: PLR0913 ) logger.info("Using index strategy: %s", strategy.name) + # Validate up front so a missing required field surfaces as a clear GridOverrideKeysError + # naming the override, rather than a deep NumPy key error inside the transform. Composite + # strategies additionally re-validate each child against the running header array. + strategy.validate_headers(parsed_headers) indexed_headers = strategy.transform_headers(parsed_headers) # 4. Compute spatial dimensions diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index aa24270a..b8cb88de 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -15,6 +15,9 @@ from pydantic import BaseModel from pydantic import ConfigDict from pydantic import Field +from pydantic import model_validator + +from mdio.segy.exceptions import GridOverrideMissingParameterError if TYPE_CHECKING: from mdio.builder.templates.base import AbstractDatasetTemplate @@ -63,6 +66,31 @@ class GridOverrides(BaseModel): description="Dimension names to collapse into the trace dimension when `non_binned` is True.", ) + @model_validator(mode="after") + def _check_non_binned_parameters(self) -> GridOverrides: + """Require the parameters ``non_binned`` depends on. + + ``chunksize`` and ``non_binned_dims`` are only meaningful when collapsing dims into a + ``trace`` axis. Enforcing the dependency on the model means every construction path + (typed instance or a coerced legacy dict) fails fast with the same error, so the + ingestion pipeline does not need to re-check it. + + Raises: + GridOverrideMissingParameterError: When ``non_binned`` is set without both + ``chunksize`` and ``non_binned_dims``. + """ + if not self.non_binned: + return self + missing: set[str] = set() + if self.chunksize is None: + missing.add("chunksize") + if not self.non_binned_dims: + missing.add("non_binned_dims") + if missing: + command = "NonBinned" + raise GridOverrideMissingParameterError(command, missing) + return self + def __bool__(self) -> bool: """Return True if any override flag is enabled.""" return ( @@ -74,7 +102,7 @@ def __bool__(self) -> bool: ) def to_legacy_dict(self) -> dict[str, Any]: - """Dump to the legacy ``CamelCase`` dict shape consumed by :class:`GridOverrider`.""" + """Dump to the legacy ``CamelCase`` dict shape stored in dataset metadata.""" return self.model_dump(by_alias=True, exclude_defaults=True) @@ -95,24 +123,29 @@ def _resolve_synthesize_dims(template: AbstractDatasetTemplate | None) -> tuple[ return () -def _validate_template_for_overrides( - config: GridOverrides, +def validate_overrides_for_template( + config: GridOverrides | None, template: AbstractDatasetTemplate | None, ) -> None: """Reject grid override / template pairings that v1.1 forbade. ``auto_shot_wrap`` is streamer-only and ``calculate_shot_index`` is OBN-only; using either with the wrong template silently produced wrong shot indices in v1.1 unless - the per-command validator caught it. This function restores that guard. + the per-command validator caught it. This is the one guard the :class:`GridOverrides` + model cannot enforce on its own (it depends on the chosen template), so the ingestion + pipeline calls it before any header parsing. Args: - config: Typed grid overrides extracted from the user's legacy dict. + config: Typed grid overrides, or ``None`` when no overrides were requested. template: Template chosen by the caller, or ``None`` if omitted. Raises: TypeError: When ``auto_shot_wrap`` is set without a streamer template, or ``calculate_shot_index`` is set without an OBN receiver-gathers template. """ + if not config: + return + if config.auto_shot_wrap: # Lazy import: see ``_resolve_synthesize_dims`` for the cycle rationale. from mdio.builder.templates.seismic_3d_streamer_field import ( # noqa: PLC0415 diff --git a/tests/unit/ingestion/test_segy_grid_overrides.py b/tests/unit/ingestion/test_segy_grid_overrides.py index 34055fcc..d17a27df 100644 --- a/tests/unit/ingestion/test_segy_grid_overrides.py +++ b/tests/unit/ingestion/test_segy_grid_overrides.py @@ -20,7 +20,7 @@ from mdio.segy.exceptions import GridOverrideUnknownError from mdio.segy.geometry import GridOverrides from mdio.segy.geometry import _resolve_synthesize_dims -from mdio.segy.geometry import _validate_template_for_overrides +from mdio.segy.geometry import validate_overrides_for_template if TYPE_CHECKING: from mdio.builder.templates.base import AbstractDatasetTemplate @@ -47,19 +47,13 @@ def run_override( if key not in valid_keys: raise GridOverrideUnknownError(key) + # NonBinned parameter requirements are enforced by the GridOverrides model itself, so + # model_validate raises here for an under-specified NonBinned config (matching production). config = GridOverrides.model_validate(grid_overrides) - if config.non_binned: - missing: set[str] = set() - if config.chunksize is None: - missing.add("chunksize") - if not config.non_binned_dims: - missing.add("non_binned_dims") - if missing: - command = "NonBinned" - raise GridOverrideMissingParameterError(command, missing) - - _validate_template_for_overrides(config, template) + # Template-compatibility is the one guard the model cannot self-enforce; mirror the + # production pipeline so this helper cannot drift from what `segy_to_mdio` runs. + validate_overrides_for_template(config, template) synthesize_dims = _resolve_synthesize_dims(template) registry = IndexStrategyRegistry() @@ -184,6 +178,12 @@ def test_non_binned(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None expected_trace_coords = np.arange(1, 21, dtype="int32") assert_array_equal(dims[2].coords, expected_trace_coords) + def test_non_binned_missing_parameters_raises(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None: + """NonBinned without `chunksize`/`non_binned_dims` raises GridOverrideMissingParameterError.""" + index_names = ("shot_point", "cable") + with pytest.raises(GridOverrideMissingParameterError, match="chunksize"): + run_override({"NonBinned": True}, index_names, mock_streamer_headers) + class TestStreamerGridOverrides: """Check grid overrides for shot data with streamer acquisition.""" diff --git a/tests/unit/ingestion/test_segy_index_strategies.py b/tests/unit/ingestion/test_segy_index_strategies.py index 70384ed3..14bfbf9f 100644 --- a/tests/unit/ingestion/test_segy_index_strategies.py +++ b/tests/unit/ingestion/test_segy_index_strategies.py @@ -29,7 +29,7 @@ from mdio.segy.exceptions import GridOverrideKeysError from mdio.segy.geometry import GridOverrides from mdio.segy.geometry import _resolve_synthesize_dims -from mdio.segy.geometry import _validate_template_for_overrides +from mdio.segy.geometry import validate_overrides_for_template class GridOverrider: @@ -44,7 +44,7 @@ def run( ) -> np.ndarray: """Run mock strategy validation and transform.""" config = GridOverrides.model_validate(grid_overrides) - _validate_template_for_overrides(config, template) + validate_overrides_for_template(config, template) synthesize_dims = _resolve_synthesize_dims(template) registry = IndexStrategyRegistry() strategy = registry.create_strategy( @@ -93,12 +93,11 @@ def test_synthesize_dims_only(self) -> None: assert strategy.synthesize_dims == ("component",) def test_non_binned_only(self) -> None: - """``non_binned`` -> NonBinnedStrategy with chunksize and excluded dims wired.""" + """``non_binned`` -> NonBinnedStrategy with the collapsed dims excluded from grouping.""" overrides = GridOverrides(non_binned=True, chunksize=64, non_binned_dims=["channel"]) strategy = IndexStrategyRegistry().create_strategy(grid_overrides=overrides) assert isinstance(strategy, NonBinnedStrategy) - assert strategy.chunksize == 64 - assert strategy.non_binned_dims == ("channel",) + assert strategy.excluded_fields == frozenset({"channel"}) def test_has_duplicates_only(self) -> None: """``has_duplicates`` -> DuplicateHandlingStrategy.""" @@ -259,12 +258,6 @@ def test_excluded_fields_dropped_from_grouping(self) -> None: class TestNonBinnedStrategy: """``NonBinned`` is the duplicate counter wired with explicit collapse dims.""" - def test_chunksize_and_dims_recorded(self) -> None: - """Constructor stores both for the ``GridOverrider`` shim to use.""" - strategy = NonBinnedStrategy(chunksize=4, non_binned_dims=("channel",)) - assert strategy.chunksize == 4 - assert strategy.non_binned_dims == ("channel",) - def test_collapse_dim_excluded_from_counter(self) -> None: """The non-binned dim must NOT participate in the duplicate counter grouping.""" headers = _make_struct( @@ -274,7 +267,7 @@ def test_collapse_dim_excluded_from_counter(self) -> None: "channel": np.array([10, 11, 10, 11], dtype=np.int32), } ) - strategy = NonBinnedStrategy(chunksize=4, non_binned_dims=("channel",)) + strategy = NonBinnedStrategy(non_binned_dims=("channel",)) out = strategy.transform_headers(headers) # Each (shot_point, cable) tuple appears once -> counters all equal 1. assert "trace" in out.dtype.names @@ -290,7 +283,6 @@ def test_coord_fields_also_excluded(self) -> None: } ) strategy = NonBinnedStrategy( - chunksize=4, non_binned_dims=("channel",), coord_fields=("cdp_x",), ) @@ -481,7 +473,7 @@ def test_strategies_run_in_order(self) -> None: composite = CompositeStrategy( [ ComponentSynthesisStrategy(("component",)), - NonBinnedStrategy(chunksize=4, non_binned_dims=("channel",)), + NonBinnedStrategy(non_binned_dims=("channel",)), ] ) out = composite.transform_headers(headers) diff --git a/tests/unit/ingestion/test_segy_pipeline.py b/tests/unit/ingestion/test_segy_pipeline.py index 296887e7..6b7e0697 100644 --- a/tests/unit/ingestion/test_segy_pipeline.py +++ b/tests/unit/ingestion/test_segy_pipeline.py @@ -6,10 +6,13 @@ import pytest +from mdio.builder.template_registry import TemplateRegistry +from mdio.converters.segy import segy_to_mdio from mdio.ingestion.schema import DimensionSpec from mdio.ingestion.schema import ResolvedSchema from mdio.ingestion.segy import pipeline from mdio.ingestion.segy.raw_headers import build_raw_header_variables +from mdio.segy.exceptions import GridOverrideMissingParameterError def _schema(dimensions: list[DimensionSpec]) -> ResolvedSchema: @@ -74,6 +77,48 @@ def test_raises_when_calculated_dim_missing(self) -> None: pipeline._verify_calculated_dimensions(schema, produced, "Toy") +class TestGridOverrideValidationThroughPublicApi: + """The public ``segy_to_mdio`` must reject bad override/template pairings. + + These exercise the shipped entry point (not a private helper) so the guards cannot + silently drop out of the pipeline again. Validation runs before any file IO, so the + dummy paths and ``segy_spec`` are never dereferenced. + """ + + def test_auto_shot_wrap_with_non_streamer_template_raises(self) -> None: + """``AutoShotWrap`` is streamer-only; an OBN template must raise TypeError.""" + with pytest.raises(TypeError, match="auto_shot_wrap only supports"): + segy_to_mdio( + segy_spec=None, + mdio_template=TemplateRegistry().get("ObnReceiverGathers3D"), + input_path="in.segy", + output_path="out.mdio", + grid_overrides={"AutoShotWrap": True}, + ) + + def test_calculate_shot_index_with_non_obn_template_raises(self) -> None: + """``CalculateShotIndex`` is OBN-only; a streamer template must raise TypeError.""" + with pytest.raises(TypeError, match="calculate_shot_index only supports"): + segy_to_mdio( + segy_spec=None, + mdio_template=TemplateRegistry().get("StreamerFieldRecords3D"), + input_path="in.segy", + output_path="out.mdio", + grid_overrides={"CalculateShotIndex": True}, + ) + + def test_non_binned_without_parameters_raises(self) -> None: + """``NonBinned`` without ``chunksize``/``non_binned_dims`` must raise before IO.""" + with pytest.raises(GridOverrideMissingParameterError, match="chunksize"): + segy_to_mdio( + segy_spec=None, + mdio_template=TemplateRegistry().get("ObnReceiverGathers3D"), + input_path="in.segy", + output_path="out.mdio", + grid_overrides={"NonBinned": True}, + ) + + class TestBuildRawHeaderVariables: """Tests for the isolated experimental raw-headers feature.""" From cf741937b94c1c953bf357013df8ae89f83e1079 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Jun 2026 16:37:10 +0000 Subject: [PATCH 11/11] Further cleanup --- src/mdio/ingestion/segy/index_strategies.py | 61 ++++++++++++++----- tests/unit/ingestion/test_metadata.py | 4 +- tests/unit/ingestion/test_schema_effects.py | 12 +--- tests/unit/ingestion/test_schema_resolver.py | 12 ++-- .../ingestion/test_segy_index_strategies.py | 20 +++++- 5 files changed, 75 insertions(+), 34 deletions(-) diff --git a/src/mdio/ingestion/segy/index_strategies.py b/src/mdio/ingestion/segy/index_strategies.py index 5d97d6e0..fe895c31 100644 --- a/src/mdio/ingestion/segy/index_strategies.py +++ b/src/mdio/ingestion/segy/index_strategies.py @@ -89,6 +89,16 @@ def compute_dimensions(self, headers: HeaderArray, dim_names: tuple[str, ...]) - Dimension(coords=np.unique(headers[name]), name=name) for name in dim_names if name in headers.dtype.names ] + def schema_effect(self) -> SchemaEffect | None: + """Schema reshape this strategy implies, or ``None`` if it leaves the layout unchanged. + + Most strategies only transform headers. Only strategies that introduce a ``trace`` + dimension the template did not declare (see :class:`DuplicateHandlingStrategy` and + :class:`NonBinnedStrategy`) reshape the resolved schema. Co-locating the reshape with + the header transform keeps the two views of an override from drifting. + """ + return None + @property def name(self) -> str: """Return the strategy's class name; useful for logging and tests.""" @@ -150,16 +160,22 @@ def transform_headers(self, headers: HeaderArray) -> HeaderArray: trace_values = np.array(with_trace["trace"]) return rfn.append_fields(headers, "trace", trace_values, usemask=False) + def schema_effect(self) -> SchemaEffect: + """Insert a chunksize-1 ``trace`` dimension to disambiguate duplicate index tuples.""" + return InsertTraceDimEffect(chunksize=1) + class NonBinnedStrategy(DuplicateHandlingStrategy): """Collapse selected non-binned dimensions into a single `trace` dimension. Inherits the per-tuple `trace` counter from `DuplicateHandlingStrategy`, excluding the - collapsed dims from the grouping so the counter only varies along the remaining dims. - The `trace` chunk size is owned by the schema reshaping (`CollapseToTraceEffect`), not - by this strategy. + collapsed dims from the grouping so the counter only varies along the remaining dims, and + owns the matching schema reshape (`CollapseToTraceEffect`). Both views of the override -- + the header transform and the schema layout, including the `trace` chunk size -- are + therefore defined together here. Args: + chunksize: Chunk size assigned to the inserted `trace` dimension by the schema effect. non_binned_dims: Header fields collapsed into `trace`. They are excluded from the duplicate grouping so the counter only varies along the remaining dims. coord_fields: Template coordinate names to exclude from grouping. @@ -168,15 +184,23 @@ class NonBinnedStrategy(DuplicateHandlingStrategy): def __init__( self, + chunksize: int, non_binned_dims: Iterable[str], coord_fields: Iterable[str] = (), dtype: DTypeLike = np.int16, ) -> None: + collapse_dims = tuple(non_binned_dims) super().__init__( coord_fields=coord_fields, - excluded_fields=tuple(non_binned_dims), + excluded_fields=collapse_dims, dtype=dtype, ) + self._chunksize = chunksize + self._collapse_dims = collapse_dims + + def schema_effect(self) -> SchemaEffect: + """Collapse the non-binned dims into a ``trace`` dimension sized by ``chunksize``.""" + return CollapseToTraceEffect(chunksize=self._chunksize, collapse_dims=self._collapse_dims) class ChannelWrappingStrategy(IndexStrategy): @@ -335,21 +359,30 @@ def compute_dimensions(self, headers: HeaderArray, dim_names: tuple[str, ...]) - """Delegate to the final child strategy.""" return self.strategies[-1].compute_dimensions(headers, dim_names) + def schema_effect(self) -> SchemaEffect | None: + """Return the single child reshape; at most one composed strategy produces one.""" + for strategy in self.strategies: + effect = strategy.schema_effect() + if effect is not None: + return effect + return None + class IndexStrategyRegistry: """Picks the right `IndexStrategy` from grid overrides and template hints. - The registry is the single source of truth for override semantics: it maps a - `GridOverrides` both to the header-transforming `IndexStrategy` (`create_strategy`) - and to the schema-reshaping `SchemaEffect` (`schema_effect`). Keeping both derivations - here prevents the header view and the schema view from drifting. + The registry maps a `GridOverrides` to the header-transforming `IndexStrategy` in one + place (`create_strategy`). The schema-reshaping `SchemaEffect` is not selected by a + second switch: it is read off that same strategy (`schema_effect`), so the header view + and the schema view of an override cannot drift. """ def schema_effect(self, grid_overrides: GridOverrides | None) -> SchemaEffect | None: """Return the schema reshaping implied by `grid_overrides`, if any. - Only `non_binned` and `has_duplicates` change the dimension/coordinate layout; - every other override transforms headers in place without reshaping the schema. + Derived from the same strategy that will transform headers, so layout changes stay in + lock-step with the header transform. Template and synthesis hints do not affect the + reshape, so they are omitted when building the strategy here. Args: grid_overrides: Typed grid override configuration, or `None`. @@ -359,12 +392,7 @@ def schema_effect(self, grid_overrides: GridOverrides | None) -> SchemaEffect | """ if not grid_overrides: return None - if grid_overrides.non_binned: - collapse_dims = None if grid_overrides.non_binned_dims is None else tuple(grid_overrides.non_binned_dims) - return CollapseToTraceEffect(chunksize=grid_overrides.chunksize, collapse_dims=collapse_dims) - if grid_overrides.has_duplicates: - return InsertTraceDimEffect(chunksize=1) - return None + return self.create_strategy(grid_overrides).schema_effect() def create_strategy( self, @@ -416,6 +444,7 @@ def create_strategy( if grid_overrides.non_binned: strategies.append( NonBinnedStrategy( + chunksize=grid_overrides.chunksize, non_binned_dims=grid_overrides.non_binned_dims or (), coord_fields=coord_fields, ) diff --git a/tests/unit/ingestion/test_metadata.py b/tests/unit/ingestion/test_metadata.py index 3ef7a928..142579c2 100644 --- a/tests/unit/ingestion/test_metadata.py +++ b/tests/unit/ingestion/test_metadata.py @@ -34,11 +34,11 @@ def test_adds_grid_overrides_when_provided(self) -> None: def test_preserves_existing_attributes(self) -> None: """Existing attribute keys should be preserved when adding overrides.""" dataset = _make_dataset(attributes={"existing": "value"}) - overrides = GridOverrides(non_binned=True) + overrides = GridOverrides(has_duplicates=True) add_grid_override_to_metadata(dataset, grid_overrides=overrides) assert dataset.metadata.attributes == { "existing": "value", - "gridOverrides": {"NonBinned": True}, + "gridOverrides": {"HasDuplicates": True}, } def test_no_overrides_leaves_attributes_untouched(self) -> None: diff --git a/tests/unit/ingestion/test_schema_effects.py b/tests/unit/ingestion/test_schema_effects.py index 5bd4a224..cd96fdd2 100644 --- a/tests/unit/ingestion/test_schema_effects.py +++ b/tests/unit/ingestion/test_schema_effects.py @@ -47,18 +47,12 @@ def test_header_only_overrides_return_none(self) -> None: registry = IndexStrategyRegistry() assert registry.schema_effect(GridOverrides(auto_channel_wrap=True)) is None - def test_non_binned_default_collapse(self) -> None: - """NonBinned without explicit dims yields a default (None) CollapseToTraceEffect.""" - effect = IndexStrategyRegistry().schema_effect(GridOverrides(non_binned=True, chunksize=64)) - assert isinstance(effect, CollapseToTraceEffect) - assert effect.chunksize == 64 - assert effect.collapse_dims is None - - def test_non_binned_explicit_dims(self) -> None: - """NonBinned with explicit dims preserves them on the effect.""" + def test_non_binned_wires_chunksize_and_dims(self) -> None: + """NonBinned yields a CollapseToTraceEffect carrying the override's chunksize and dims.""" overrides = GridOverrides(non_binned=True, chunksize=128, non_binned_dims=["channel"]) effect = IndexStrategyRegistry().schema_effect(overrides) assert isinstance(effect, CollapseToTraceEffect) + assert effect.chunksize == 128 assert effect.collapse_dims == ("channel",) def test_has_duplicates_inserts_trace(self) -> None: diff --git a/tests/unit/ingestion/test_schema_resolver.py b/tests/unit/ingestion/test_schema_resolver.py index ba748115..07444674 100644 --- a/tests/unit/ingestion/test_schema_resolver.py +++ b/tests/unit/ingestion/test_schema_resolver.py @@ -49,11 +49,12 @@ def test_cdp_required_fields(self) -> None: class TestSchemaResolverNonBinned: """NonBinned overrides collapse spatial dimensions into a single ``trace`` axis.""" - def test_default_collapse_keeps_first_spatial_dim(self) -> None: - """Default NonBinned keeps the first spatial dim and collapses the rest into ``trace``.""" + def test_collapses_named_dims_keeping_first_spatial_dim(self) -> None: + """NonBinned collapses the named trailing dims into ``trace``, keeping the leading dim.""" template = Seismic3DStreamerShotGathersTemplate(data_domain="time") # Streamer shot template default chunk shape is (8, 1, 128, 2048). - schema = SchemaResolver().resolve(template, GridOverrides(non_binned=True, chunksize=64)) + overrides = GridOverrides(non_binned=True, chunksize=64, non_binned_dims=["cable", "channel"]) + schema = SchemaResolver().resolve(template, overrides) names = [d.name for d in schema.dimensions] assert names == ["shot_point", "trace", "time"] @@ -74,7 +75,8 @@ def test_explicit_non_binned_dims(self) -> None: def test_coordinate_dimensions_collapsed_when_referenced(self) -> None: """Coordinates referencing collapsed dims are rewritten to depend on ``trace``.""" template = Seismic3DStreamerShotGathersTemplate(data_domain="time") - schema = SchemaResolver().resolve(template, GridOverrides(non_binned=True, chunksize=64)) + overrides = GridOverrides(non_binned=True, chunksize=64, non_binned_dims=["cable", "channel"]) + schema = SchemaResolver().resolve(template, overrides) # group_coord_x originally depends on (shot_point, cable, channel). After NonBinned # collapses cable+channel, it should depend on (shot_point, trace). group_coord_x = next(c for c in schema.coordinates if c.name == "group_coord_x") @@ -83,7 +85,7 @@ def test_coordinate_dimensions_collapsed_when_referenced(self) -> None: def test_grid_override_provenance_not_in_schema_metadata(self) -> None: """The resolver is mechanics-only: override provenance is attached at the dataset level.""" template = Seismic3DStreamerShotGathersTemplate(data_domain="time") - overrides = GridOverrides(non_binned=True, chunksize=64) + overrides = GridOverrides(non_binned=True, chunksize=64, non_binned_dims=["channel"]) schema = SchemaResolver().resolve(template, overrides) assert "gridOverrides" not in schema.metadata diff --git a/tests/unit/ingestion/test_segy_index_strategies.py b/tests/unit/ingestion/test_segy_index_strategies.py index 14bfbf9f..bea178fe 100644 --- a/tests/unit/ingestion/test_segy_index_strategies.py +++ b/tests/unit/ingestion/test_segy_index_strategies.py @@ -15,6 +15,8 @@ import pytest from mdio.builder.template_registry import TemplateRegistry +from mdio.ingestion.schema import CollapseToTraceEffect +from mdio.ingestion.schema import InsertTraceDimEffect from mdio.ingestion.segy.index_strategies import ChannelWrappingStrategy if TYPE_CHECKING: @@ -249,6 +251,12 @@ def test_excluded_fields_dropped_from_grouping(self) -> None: # shot_point alone so each row in the same shot_point gets a fresh counter. np.testing.assert_array_equal(out["trace"], [1, 2, 3]) + def test_owns_insert_trace_dim_effect(self) -> None: + """The strategy owns its schema reshape: a chunksize-1 inserted ``trace`` dim.""" + effect = DuplicateHandlingStrategy().schema_effect() + assert isinstance(effect, InsertTraceDimEffect) + assert effect.chunksize == 1 + # --------------------------------------------------------------------------- # NonBinnedStrategy @@ -267,7 +275,7 @@ def test_collapse_dim_excluded_from_counter(self) -> None: "channel": np.array([10, 11, 10, 11], dtype=np.int32), } ) - strategy = NonBinnedStrategy(non_binned_dims=("channel",)) + strategy = NonBinnedStrategy(chunksize=4, non_binned_dims=("channel",)) out = strategy.transform_headers(headers) # Each (shot_point, cable) tuple appears once -> counters all equal 1. assert "trace" in out.dtype.names @@ -283,6 +291,7 @@ def test_coord_fields_also_excluded(self) -> None: } ) strategy = NonBinnedStrategy( + chunksize=4, non_binned_dims=("channel",), coord_fields=("cdp_x",), ) @@ -290,6 +299,13 @@ def test_coord_fields_also_excluded(self) -> None: # Grouping is by shot_point only -> each shot_point has 2 rows -> counters {1, 2}. np.testing.assert_array_equal(out["trace"], [1, 2, 1, 2]) + def test_owns_collapse_to_trace_effect(self) -> None: + """The strategy owns its schema reshape, carrying chunksize and collapse dims.""" + effect = NonBinnedStrategy(chunksize=8, non_binned_dims=("channel",)).schema_effect() + assert isinstance(effect, CollapseToTraceEffect) + assert effect.chunksize == 8 + assert effect.collapse_dims == ("channel",) + # --------------------------------------------------------------------------- # ChannelWrappingStrategy @@ -473,7 +489,7 @@ def test_strategies_run_in_order(self) -> None: composite = CompositeStrategy( [ ComponentSynthesisStrategy(("component",)), - NonBinnedStrategy(non_binned_dims=("channel",)), + NonBinnedStrategy(chunksize=4, non_binned_dims=("channel",)), ] ) out = composite.transform_headers(headers)