Skip to content

Commit

Permalink
Streamline endianness inference (#140)
Browse files Browse the repository at this point in the history
* Allow `None` endian in settings

* remove unnecessary settings in indexers

* streamline endianness detection

* add extra test for settings endian edge case

* add tests for exact spec

* Refactor variable names in test_segy_file

This change renames a couple of variables to improve clarity. The '_dict' variables in the segy_file tests now have names that describe their specific use case, which improves overall readability.

---------

Co-authored-by: Altay Sansal <[email protected]>
  • Loading branch information
tasansal and Altay Sansal authored Jun 24, 2024
1 parent 31eb70b commit 7264354
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 24 deletions.
6 changes: 3 additions & 3 deletions src/segy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pydantic_settings import BaseSettings
from pydantic_settings import SettingsConfigDict

from segy.schema import Endianness
from segy.schema import Endianness # noqa: TCH001


class SegyBaseSettings(BaseSettings):
Expand Down Expand Up @@ -45,8 +45,8 @@ class SegySettings(SegyBaseSettings):
default_factory=BinaryHeaderSettings,
description="Overrides for binary file header settings.",
)
endianness: Endianness = Field(
default=Endianness.BIG,
endianness: Endianness | None = Field(
default=None,
description="Override the inferred endianness of the file.",
)
storage_options: dict[str, Any] = Field(
Expand Down
46 changes: 31 additions & 15 deletions src/segy/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from segy.indexing import HeaderIndexer
from segy.indexing import TraceIndexer
from segy.schema import Endianness
from segy.schema import HeaderSpec
from segy.schema import ScalarType
from segy.standards import get_segy_standard
from segy.standards.mapping import SEGY_FORMAT_MAP
Expand Down Expand Up @@ -50,21 +49,29 @@ class SegyScanResult:
sample_format: ScalarType


def infer_endianness(buffer: bytes, bin_spec: HeaderSpec) -> SegyScanResult:
def infer_endianness(
fs: AbstractFileSystem,
url: str,
spec: SegySpec,
) -> SegyScanResult:
"""Infer endianness of a binary header buffer given header spec.
The binary header bytes are read from `url` using the offset and itemsize of the spec's binary header.
Args:
buffer: Bytes buffer of binary header.
bin_spec: Header spec defining how to parse binary header.
fs: FSSpec filesystem instance.
url: Path to the SEG-Y file.
spec: SEG-Y spec containing how to parse binary header.
Returns:
A SegyScanResult instance filled with inferred endianness, revision, and format.
Raises:
EndiannessInferenceError: When inference fails.
"""
bin_spec = spec.binary_header.model_copy(deep=True) # we will mutate, so copy
buffer = fs.read_block(url, offset=bin_spec.offset, length=bin_spec.itemsize)

for endianness in [Endianness.BIG, Endianness.LITTLE]:
bin_spec.endianness = endianness
bin_hdr = np.frombuffer(buffer, dtype=bin_spec.dtype)
Expand All @@ -91,15 +98,11 @@ def infer_endianness(buffer: bytes, bin_spec: HeaderSpec) -> SegyScanResult:

def infer_spec(fs: AbstractFileSystem, url: str) -> SegySpec:
"""Try to infer SEG-Y file revision and endianness to build a SegySpec."""
bin_spec = get_segy_standard(1.0).binary_header

buffer = fs.read_block(url, offset=bin_spec.offset, length=bin_spec.itemsize)
scan_result = infer_endianness(buffer, bin_spec)

spec = get_segy_standard(1.0)
scan_result = infer_endianness(fs, url, spec)
new_spec = get_segy_standard(scan_result.revision)
new_spec.trace.data.format = scan_result.sample_format
new_spec.endianness = scan_result.endianness

return new_spec


Expand Down Expand Up @@ -129,11 +132,27 @@ def __init__(
self.fs, self.url = url_to_fs(url, **self.settings.storage_options)
self._info = self.fs.info(self.url)

# Spec setting overrides.
if self.settings.binary.revision is not None:
self.spec = get_segy_standard(self.settings.binary.revision)
self.spec.endianness = self.settings.endianness

# Override/Infer endianness
if self.settings.endianness is None:
scan_result = infer_endianness(self.fs, self.url, self.spec)
self.spec.endianness = scan_result.endianness
else:
self.spec.endianness = self.settings.endianness

# Default, infer if no spec provided.
elif spec is None:
self.spec = infer_spec(self.fs, self.url)

# If a spec is provided, use it and infer the endianness only if it is None.
else:
self.spec = spec if spec is not None else infer_spec(self.fs, self.url)
self.spec = spec
if self.spec.endianness is None:
scan_result = infer_endianness(self.fs, self.url, self.spec)
self.spec.endianness = scan_result.endianness

self._update_spec()
self.accessors = TraceAccessor(self.spec.trace)
Expand Down Expand Up @@ -249,7 +268,6 @@ def sample(self) -> AbstractIndexer:
self.url,
self.spec.trace,
self.num_traces,
settings=self.settings,
transform_pipeline=self.accessors.sample_decode_pipeline,
)

Expand All @@ -261,7 +279,6 @@ def header(self) -> HeaderIndexer:
self.url,
self.spec.trace,
self.num_traces,
settings=self.settings,
transform_pipeline=self.accessors.header_decode_pipeline,
)

Expand All @@ -273,6 +290,5 @@ def trace(self) -> TraceIndexer:
self.url,
self.spec.trace,
self.num_traces,
settings=self.settings,
transform_pipeline=self.accessors.trace_decode_pipeline,
)
4 changes: 0 additions & 4 deletions src/segy/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from segy.arrays import HeaderArray
from segy.arrays import TraceArray
from segy.config import SegySettings
from segy.transforms import TransformPipeline

if TYPE_CHECKING:
Expand Down Expand Up @@ -103,7 +102,6 @@ class AbstractIndexer(ABC):
url: A string representing the URL of the file.
spec: An instance of BaseDataType.
max_value: An integer representing the maximum value of the index.
settings: Optional parsing settings.
transform_pipeline: The transforms pipeline to apply for decoding.
"""

Expand All @@ -115,14 +113,12 @@ def __init__( # noqa: PLR0913
url: str,
spec: BaseDataType,
max_value: int,
settings: SegySettings | None = None,
transform_pipeline: TransformPipeline | None = None,
):
self.fs = fs
self.url = url
self.spec = spec
self.max_value = max_value
self.settings = SegySettings() if settings is None else settings

self.transform_pipeline = (
TransformPipeline() if transform_pipeline is None else transform_pipeline
Expand Down
20 changes: 18 additions & 2 deletions tests/test_segy_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,14 @@ def test_infer_spec(
# Check if JSON-able dict representation is valid
assert segy_file.spec._repr_json_() == segy_file.spec.model_dump(mode="json")

# Test the other case where we exactly specify the spec.
spec_expected = get_segy_standard(standard)
spec_expected.endianness = endianness
spec_expected.trace.data.format = sample_format
segy_file_expected = SegyFile(test_config.uri, spec=spec_expected)

assert segy_file_expected.spec == segy_file.spec

def test_text_file_header(
self, mock_filesystem: MemoryFileSystem, default_text: str
) -> None:
Expand Down Expand Up @@ -314,8 +322,16 @@ def test_revision_endian_override(self, mock_filesystem: MemoryFileSystem) -> No
endianness=Endianness.BIG,
)

settings_dict = {"binary": {"revision": 1.0}, "endianness": "little"}
settings = SegySettings.model_validate(settings_dict)
# Ensure still infer correctly if endian not provided
settings_dict_rev_only = {"binary": {"revision": 1.0}}
settings = SegySettings.model_validate(settings_dict_rev_only)
segy_file = SegyFile(test_config.uri, settings=settings)

assert segy_file.spec.endianness == Endianness.BIG

# Now ensure overriding both
settings_dict_both = {"binary": {"revision": 1.0}, "endianness": "little"}
settings = SegySettings.model_validate(settings_dict_both)
segy_file = SegyFile(test_config.uri, settings=settings)

assert segy_file.spec.segy_standard == SegyStandard.REV1
Expand Down

0 comments on commit 7264354

Please sign in to comment.