Skip to content
608 changes: 308 additions & 300 deletions docs/tutorials/custom_template.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/mdio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from mdio.api.io import to_mdio
from mdio.converters import mdio_to_segy
from mdio.converters import segy_to_mdio
from mdio.ingestion import ResolvedSchema
from mdio.optimize.access_pattern import OptimizedAccessPatternConfig
from mdio.optimize.access_pattern import optimize_access_patterns
from mdio.segy.geometry import GridOverrides
Expand All @@ -27,4 +28,5 @@
"segy_to_mdio",
"OptimizedAccessPatternConfig",
"optimize_access_patterns",
"ResolvedSchema",
]
6 changes: 5 additions & 1 deletion src/mdio/builder/dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@
from datetime import datetime
from enum import Enum
from enum import auto
from importlib import metadata
from typing import Any

from mdio import __version__
try:
__version__ = metadata.version("multidimio")
except metadata.PackageNotFoundError:
__version__ = "unknown"
from mdio.builder.formatting_html import dataset_builder_repr_html
from mdio.builder.schemas.compressors import ZFP
from mdio.builder.schemas.compressors import Blosc
Expand Down
124 changes: 69 additions & 55 deletions src/mdio/builder/templates/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

import copy
import logging
from abc import ABC
from abc import abstractmethod
from typing import TYPE_CHECKING
Expand All @@ -19,11 +19,14 @@
from mdio.builder.schemas.v1.variable import CoordinateMetadata
from mdio.builder.schemas.v1.variable import VariableMetadata
from mdio.builder.templates.types import CoordinateSpec
from mdio.builder.templates.types import DimCoordinateTypes

if TYPE_CHECKING:
from mdio.builder.schemas.v1.dataset import Dataset
from mdio.builder.templates.types import SeismicDataDomain

logger = logging.getLogger(__name__)


class AbstractDatasetTemplate(ABC):
"""Abstract base class that defines the template method for Dataset building factory.
Expand All @@ -46,12 +49,6 @@ def __init__(self, data_domain: SeismicDataDomain) -> None:
self._var_chunk_shape: tuple[int, ...] = ()
self.synthesize_missing_dims: tuple[str, ...] = ()

# TEMPORARY (removed with declare_coordinate_specs): set when grid overrides mutate this
# template in-place (dims collapsed into 'trace', extra coordinates added). Once mutated,
# the runtime layout intentionally diverges from the static declare_coordinate_specs()
# contract, so the drift guard in build_dataset() must not run.
self._grid_overrides_applied: bool = False

self._builder: MDIODatasetBuilder | None = None
self._dim_sizes: tuple[int, ...] = ()
self._units: dict[str, AllUnitModel] = {}
Expand Down Expand Up @@ -114,6 +111,38 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]:
)
return tuple(specs)

def declare_dim_coordinate_types(self) -> DimCoordinateTypes:
    """Declare the scalar type of every dimension coordinate of this template.

    The base implementation assigns ``ScalarType.INT32`` to each name in
    ``self.dimension_names``.

    Returns:
        A dictionary mapping each dimension name to its ScalarType.
    """
    dim_names = self.dimension_names
    int32 = ScalarType.INT32
    return {dim_name: int32 for dim_name in dim_names}

def _dim_dtype(self, name: str) -> ScalarType:
    """Look up the dtype declared for one dimension coordinate.

    Args:
        name: The dimension name to look up.

    Returns:
        The ScalarType found under ``name`` in ``declare_dim_coordinate_types()``,
        or ``ScalarType.INT32`` when the name has no entry.
    """
    declared_types = self.declare_dim_coordinate_types()
    fallback = ScalarType.INT32
    return declared_types.get(name, fallback)

def _add_dimension_coordinate(self, name: str) -> None:
    """Register one dimension coordinate on the builder.

    The coordinate is indexed by its own dimension ``(name,)``, takes its dtype from
    ``_dim_dtype(name)`` and its units from ``get_unit_by_key(name)``.

    Args:
        name: The dimension name.
    """
    # NOTE(review): assumes ``self._builder`` has been set before this is called - confirm.
    add_coordinate = self._builder.add_coordinate
    dtype = self._dim_dtype(name)
    unit = self.get_unit_by_key(name)
    add_coordinate(
        name,
        dimensions=(name,),
        data_type=dtype,
        metadata=CoordinateMetadata(units_v1=unit),
    )

def build_dataset(
self,
name: str,
Expand All @@ -122,6 +151,12 @@ def build_dataset(
) -> Dataset:
"""Template method that builds the dataset.

.. deprecated:: 1.2
``build_dataset`` is deprecated and is planned for removal in 1.2.5. SEG-Y
ingestion now builds datasets from a resolved schema via the schema-driven
factory (:func:`mdio.ingestion.dataset_factory.build_mdio_dataset`); use
:func:`mdio.segy_to_mdio` for ingestion.

Args:
name: The name of the dataset.
sizes: The sizes of the dimensions.
Expand All @@ -133,6 +168,11 @@ def build_dataset(
Raises:
ValueError: If coordinate already exists from subclass override.
"""
logger.warning(
"AbstractDatasetTemplate.build_dataset is deprecated as of 1.2 and is planned for "
"removal in 1.2.5; SEG-Y ingestion builds datasets via the schema-driven factory. "
"Use `mdio.segy_to_mdio` for ingestion."
)
self._dim_sizes = sizes

attributes = self._load_dataset_attributes() or {}
Expand All @@ -154,10 +194,7 @@ def build_dataset(
except ValueError as exc: # coordinate may already exist
if "same name twice" not in str(exc):
raise
# Skip the static drift guard when grid overrides have transformed the template: the
# runtime layout no longer matches the declared (override-free) specs by design.
if not self._grid_overrides_applied:
self._validate_declared_coordinate_specs()
self._validate_declared_coordinate_specs()
self._add_variables()
self._add_trace_mask()

Expand All @@ -174,30 +211,6 @@ def add_units(self, units: dict[str, AllUnitModel]) -> None:
raise ValueError(msg)
self._units |= units

def apply_resolved_dimensions(
    self,
    dim_names: tuple[str, ...],
    chunk_shape: tuple[int, ...],
) -> None:
    """Replace the template's dimension names and chunk shape with resolved values.

    Intended as the supported way for the ingestion pipeline to hand back the final
    dimension layout after the SchemaResolver has applied grid overrides
    (e.g. NonBinned, HasDuplicates), rather than writing to private attributes directly.

    Args:
        dim_names: Final ordered dimension names.
        chunk_shape: Chunk shape; must have one entry per name in ``dim_names``.

    Raises:
        ValueError: If ``len(chunk_shape) != len(dim_names)``.
    """
    n_chunk_axes, n_dims = len(chunk_shape), len(dim_names)
    if n_chunk_axes == n_dims:
        # Both inputs are stored as tuples regardless of the sequence type passed in.
        self._dim_names = tuple(dim_names)
        self._var_chunk_shape = tuple(chunk_shape)
        return

    msg = f"chunk_shape length {n_chunk_axes} does not match dim_names length {n_dims}"
    raise ValueError(msg)

def _validate_declared_coordinate_specs(self) -> None:
"""Fail the build if :meth:`declare_coordinate_specs` drifted from the built coordinates.

Expand All @@ -206,11 +219,8 @@ def _validate_declared_coordinate_specs(self) -> None:
:meth:`_add_coordinates`, this guard ensures the two never diverge in name, dimensions,
or dtype. The ingestion ``SchemaResolver`` trusts the declared specs, so silent drift
would corrupt resolved schemas. The check runs for every template (built-in and
user-defined) on every ``build_dataset`` call that does not apply grid overrides. Grid
overrides mutate the template in-place (collapsing dims into ``trace`` and adding
coordinates), so the runtime layout intentionally diverges from the declared specs and
the guard is skipped for those builds. It is removed once ``_add_coordinates`` is derived
from the resolved schema and the duplication no longer exists.
user-defined) on every ``build_dataset`` call. It is removed once ``_add_coordinates``
is derived from the resolved schema and the duplication no longer exists.

Raises:
ValueError: If the declared specs do not match the built non-dimension coordinates.
Expand Down Expand Up @@ -266,32 +276,32 @@ def trace_domain(self) -> str:
@property
def spatial_dimension_names(self) -> tuple[str, ...]:
"""Returns the names of the dimensions excluding the last axis."""
return copy.deepcopy(self._dim_names[:-1])
return self._dim_names[:-1]

@property
def dimension_names(self) -> tuple[str, ...]:
"""Returns the names of the dimensions."""
return copy.deepcopy(self._dim_names)
return self._dim_names

@property
def calculated_dimension_names(self) -> tuple[str, ...]:
"""Returns the names of the dimensions."""
return copy.deepcopy(self._calculated_dims)
"""Returns the names of the calculated dimensions."""
return self._calculated_dims

@property
def physical_coordinate_names(self) -> tuple[str, ...]:
"""Returns the names of the physical (world) coordinates."""
return copy.deepcopy(self._physical_coord_names)
return self._physical_coord_names

@property
def logical_coordinate_names(self) -> tuple[str, ...]:
"""Returns the names of the logical (grid) coordinates."""
return copy.deepcopy(self._logical_coord_names)
return self._logical_coord_names

@property
def coordinate_names(self) -> tuple[str, ...]:
"""Returns names of all coordinates."""
return copy.deepcopy(self._physical_coord_names + self._logical_coord_names)
return self._physical_coord_names + self._logical_coord_names

@property
def full_chunk_shape(self) -> tuple[int, ...]:
Expand Down Expand Up @@ -354,6 +364,15 @@ def _load_dataset_attributes(self) -> dict[str, Any]:
The dataset attributes as a dictionary
"""

@property
def units(self) -> dict[str, AllUnitModel]:
    """Return a shallow copy of the template's configured units.

    Gives collaborators (e.g. ingestion unit resolution) access to the units without
    touching the private ``_units`` mapping. Adding or removing keys on the returned
    dictionary does not change the template's own mapping.
    """
    snapshot: dict[str, AllUnitModel] = {}
    snapshot.update(self._units)
    return snapshot

def get_unit_by_key(self, key: str) -> AllUnitModel | None:
    """Look up the unit registered under a variable/dimension/coordinate name.

    Args:
        key: The name to look up in the template's units.

    Returns:
        The unit stored under ``key``, or None when no unit is registered for it.
    """
    registered_units = self._units
    return registered_units.get(key)
Expand All @@ -375,12 +394,7 @@ def _add_coordinates(self) -> None:
"""
# Add dimension coordinates
for name in self._dim_names:
self._builder.add_coordinate(
name,
dimensions=(name,),
data_type=ScalarType.INT32,
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)),
)
self._add_dimension_coordinate(name)

# Add non-dimension coordinates
# Note: coordinate_names may be modified at runtime by grid overrides,
Expand All @@ -400,7 +414,7 @@ def _add_coordinates(self) -> None:
raise

def _add_trace_mask(self) -> None:
"""Add trace mask variables."""
"""Add trace mask variable."""
self._builder.add_variable(
name="trace_mask",
dimensions=self.spatial_dimension_names,
Expand All @@ -410,7 +424,7 @@ def _add_trace_mask(self) -> None:
)

def _add_trace_headers(self, header_dtype: StructuredType) -> None:
"""Add trace mask variables."""
"""Add trace headers variable."""
chunk_grid = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=self.full_chunk_shape[:-1]))
self._builder.add_variable(
name="headers",
Expand Down
6 changes: 3 additions & 3 deletions src/mdio/builder/templates/seismic_2d_cdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,18 @@ def _add_coordinates(self) -> None:
self._builder.add_coordinate(
"cdp",
dimensions=("cdp",),
data_type=ScalarType.INT32,
data_type=self._dim_dtype("cdp"),
)
self._builder.add_coordinate(
self._gather_domain,
dimensions=(self._gather_domain,),
data_type=ScalarType.INT32,
data_type=self._dim_dtype(self._gather_domain),
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._gather_domain)),
)
self._builder.add_coordinate(
self.trace_domain,
dimensions=(self.trace_domain,),
data_type=ScalarType.INT32,
data_type=self._dim_dtype(self.trace_domain),
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)),
)

Expand Down
2 changes: 1 addition & 1 deletion src/mdio/builder/templates/seismic_2d_streamer_shot.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def _add_coordinates(self) -> None:
self._builder.add_coordinate(
name,
dimensions=(name,),
data_type=ScalarType.INT32,
data_type=self._dim_dtype(name),
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)),
)

Expand Down
8 changes: 4 additions & 4 deletions src/mdio/builder/templates/seismic_3d_cdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,23 +48,23 @@ def _add_coordinates(self) -> None:
self._builder.add_coordinate(
"inline",
dimensions=("inline",),
data_type=ScalarType.INT32,
data_type=self._dim_dtype("inline"),
)
self._builder.add_coordinate(
"crossline",
dimensions=("crossline",),
data_type=ScalarType.INT32,
data_type=self._dim_dtype("crossline"),
)
self._builder.add_coordinate(
self._gather_domain,
dimensions=(self._gather_domain,),
data_type=ScalarType.INT32,
data_type=self._dim_dtype(self._gather_domain),
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self._gather_domain)),
)
self._builder.add_coordinate(
self.trace_domain,
dimensions=(self.trace_domain,),
data_type=ScalarType.INT32,
data_type=self._dim_dtype(self.trace_domain),
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)),
)

Expand Down
21 changes: 16 additions & 5 deletions src/mdio/builder/templates/seismic_3d_coca.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from mdio.builder.schemas.v1.variable import CoordinateMetadata
from mdio.builder.templates.base import AbstractDatasetTemplate
from mdio.builder.templates.types import CoordinateSpec
from mdio.builder.templates.types import DimCoordinateTypes
from mdio.builder.templates.types import SeismicDataDomain


Expand Down Expand Up @@ -34,34 +35,44 @@ def declare_coordinate_specs(self) -> tuple[CoordinateSpec, ...]:
CoordinateSpec(name="cdp_y", dimensions=("inline", "crossline"), dtype=ScalarType.FLOAT64),
)

def declare_dim_coordinate_types(self) -> DimCoordinateTypes:
    """Declare the scalar type of each dimension coordinate in this template.

    Returns:
        A dictionary keyed by dimension name: ``azimuth`` is FLOAT32, while ``inline``,
        ``crossline``, ``offset`` and the ``self._data_domain`` dimension are INT32.
    """
    int32 = ScalarType.INT32
    # Listed in declaration order; the data-domain entry comes last.
    typed_dims = [
        ("inline", int32),
        ("crossline", int32),
        ("offset", int32),
        ("azimuth", ScalarType.FLOAT32),
        (self._data_domain, int32),
    ]
    return dict(typed_dims)

def _add_coordinates(self) -> None:
# Add dimension coordinates
self._builder.add_coordinate(
"inline",
dimensions=("inline",),
data_type=ScalarType.INT32,
data_type=self._dim_dtype("inline"),
)
self._builder.add_coordinate(
"crossline",
dimensions=("crossline",),
data_type=ScalarType.INT32,
data_type=self._dim_dtype("crossline"),
)
self._builder.add_coordinate(
"offset",
dimensions=("offset",),
data_type=ScalarType.INT32,
data_type=self._dim_dtype("offset"),
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("offset")), # same unit as X/Y
)
self._builder.add_coordinate(
"azimuth",
dimensions=("azimuth",),
data_type=ScalarType.FLOAT32,
data_type=self._dim_dtype("azimuth"),
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key("azimuth")),
)
self._builder.add_coordinate(
self.trace_domain,
dimensions=(self.trace_domain,),
data_type=ScalarType.INT32,
data_type=self._dim_dtype(self.trace_domain),
metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(self.trace_domain)),
)

Expand Down
Loading
Loading