Source code for tesseract_core.runtime.experimental

# Copyright 2025 Pasteur Labs. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

from collections.abc import Iterator, Sequence
from typing import (
    Annotated,
    Any,
    Callable,
    Union,
    get_args,
    get_origin,
)

from pydantic import (
    GetCoreSchemaHandler,
    GetJsonSchemaHandler,
    TypeAdapter,
)
from pydantic.json_schema import JsonSchemaValue
from pydantic_core import SchemaSerializer, SchemaValidator, core_schema

from tesseract_core.runtime.file_interactions import parent_path
from tesseract_core.runtime.schema_types import safe_issubclass


class LazySequence(Sequence):
    """Lazy sequence type that loads items from a file handle on access.

    This allows users to define a sequence of objects that are lazily loaded
    from a data source, and validated when accessed.

    When used as a Pydantic annotation, lazy sequences accept either a list of
    objects or a glob pattern to load objects from a file path.

    Example:
        >>> class MyModel(BaseModel):
        ...     objects: LazySequence[str]
        >>> model = MyModel.model_validate({"objects": ["item1", "item2"]})
        >>> model.objects[0]
        'item1'
        >>> model = MyModel.model_validate({"objects": "@/path/to/data/*.json"})
        >>> model.objects[1]
        'item2'
    """

    def __init__(self, keys: Sequence[Any], getter: Callable[[Any], Any]) -> None:
        """Initialize a LazySequence with the given keys and getter function.

        Args:
            keys: Sequence of keys to load items from.
            getter: Function that loads an item from a key.

        Example:
            >>> items = LazySequence(["item1", "item2"], lambda key: f"Loaded {key}")
            >>> items[0]
            'Loaded item1'
        """
        self.keys = keys
        self.getter = getter

    def __class_getitem__(cls, base_type: type) -> type:
        """Create a new type annotation based on the given wrapped type."""
        # Support for LazySequence[MyObject] syntax
        return Annotated[Sequence[base_type], PydanticLazySequenceAnnotation]

    @classmethod
    def __get_pydantic_core_schema__(cls, *args: Any, **kwargs: Any) -> None:
        """Reject schema generation for the bare (unparametrized) class.

        Raises:
            NotImplementedError: Always; only ``LazySequence[MyObject]`` carries
                the annotation needed for schema generation.
        """
        # Raise if LazySequence is accidentally used as Pydantic annotation without a wrapped type
        raise NotImplementedError(
            f"Generic {cls.__name__} objects do not support Pydantic schema generation. "
            f"Did you mean to use {cls.__name__}[MyObject]?"
        )

    def __getitem__(self, key: Union[int, slice]) -> Any:
        """Load the item at an integer index, or build a lazy view for a slice.

        Args:
            key: Integer index (negative values are forwarded to ``keys``) or a slice.

        Returns:
            For an integer, the result of calling the getter on ``keys[key]``.
            For a slice, a new ``LazySequence`` over ``keys[key]`` sharing the
            same getter; no item is loaded until it is accessed.

        Raises:
            TypeError: If ``key`` is neither an integer nor a slice.
        """
        if isinstance(key, slice):
            # Slice the keys only, so the sub-sequence stays lazy.
            return LazySequence(self.keys[key], self.getter)
        if not isinstance(key, int):
            raise TypeError("LazySequence indices must be integers or slices")
        return self.getter(self.keys[key])

    def __repr__(self) -> str:
        """Return a representation showing the keys (items are not loaded)."""
        return f"{self.__class__.__name__}(keys={self.keys})"

    def __len__(self) -> int:
        """Return the number of keys."""
        return len(self.keys)

    def __iter__(self) -> Iterator[Any]:
        """Iterate over the items, calling the getter only as each one is consumed."""
        return (self.getter(key) for key in self.keys)
class PydanticLazySequenceAnnotation:
    """Pydantic annotation for lazy sequences.

    Attached as ``Annotated`` metadata by ``LazySequence[...]``; the class only
    hosts schema hooks and refuses to be instantiated.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        raise RuntimeError(f"{self.__class__.__name__} cannot be instantiated")

    @classmethod
    def __get_pydantic_core_schema__(
        cls,
        _source_type: Any,
        _handler: GetCoreSchemaHandler,
    ) -> core_schema.CoreSchema:
        """Build the core schema Pydantic uses for the annotated sequence type.

        Validation accepts either a list of items or a string starting with
        ``@`` and turns it into a ``LazySequence``; serialization materializes
        the sequence into a list.

        Raises:
            ValueError: If the annotated type's origin is not a ``Sequence``.
        """
        origin = get_origin(_source_type)
        if not safe_issubclass(origin, Sequence):
            # should never happen, since we always use Annotated[Sequence[...], PydanticLazySequenceAnnotation]
            raise ValueError(
                f"LazySequence can only be used with Sequence types, not {origin}"
            )

        # A Sequence annotation carries exactly one type argument: the item type
        type_args = get_args(_source_type)
        assert len(type_args) == 1

        # Going through TypeAdapter yields a core schema for plain Python types
        # and Pydantic models alike, so no conditional logic is needed
        item_schema = TypeAdapter(type_args[0]).core_schema
        sequence_of_item_schema = core_schema.list_schema(item_schema)

        def create_sequence(maybe_path: Union[str, Sequence[Any]]) -> LazySequence:
            """Wrap the input in a LazySequence, expanding a glob pattern if one was given."""
            validator = SchemaValidator(item_schema)

            is_glob = isinstance(maybe_path, str) and maybe_path.startswith("@")
            if not is_glob:
                # In-memory items: each one is validated when it is accessed
                return LazySequence(maybe_path, validator.validate_python)

            # The input is a glob pattern, so items have to be loaded from files
            from .file_interactions import (
                expand_glob,
                guess_format_from_path,
                load_bytes,
                read_from_path,
            )

            # Drop the leading "@" marker before expanding the pattern
            matched_paths = expand_glob(maybe_path[1:])

            def load_item(key: str) -> Any:
                raw = read_from_path(key)
                fmt = guess_format_from_path(key)
                return validator.validate_python(
                    load_bytes(raw, fmt),
                    context={"base_dir": parent_path(key)},
                )

            return LazySequence(matched_paths, load_item)

        def serialize(obj: LazySequence, info: Any) -> Any:
            """Serialize a LazySequence by converting it to a list of items.

            This is not an encouraged use case, but it is supported for completeness.
            """
            loaded_items = list(obj)
            list_serializer = SchemaSerializer(sequence_of_item_schema)
            return list_serializer.to_python(loaded_items, **info.__dict__)

        # Accepted raw input: a list of items, or a string starting with "@"
        obj_or_path = core_schema.union_schema(
            [sequence_of_item_schema, core_schema.str_schema(pattern="^@")]
        )
        wrap_step = core_schema.no_info_plain_validator_function(
            create_sequence,
            serialization=core_schema.plain_serializer_function_ser_schema(
                serialize,
                info_arg=True,
                return_schema=sequence_of_item_schema,
            ),
        )
        # First check the raw input, then wrap it in a LazySequence
        load_schema = core_schema.chain_schema([obj_or_path, wrap_step])

        return core_schema.json_or_python_schema(
            json_schema=load_schema,
            python_schema=load_schema,
            serialization=core_schema.plain_serializer_function_ser_schema(
                serialize,
                info_arg=True,
            ),
        )

    @classmethod
    def __get_pydantic_json_schema__(
        cls, _core_schema: core_schema.CoreSchema, handler: GetJsonSchemaHandler
    ) -> JsonSchemaValue:
        """Return the JSON schema for the annotated type by delegating to ``handler``."""
        json_schema = handler(_core_schema)
        return json_schema