From 0d8d5c132b0ce5f74bf771d87b582c8e759dc94d Mon Sep 17 00:00:00 2001 From: Yuang Gao Date: Sat, 13 Jun 2026 12:09:44 -0700 Subject: [PATCH 1/2] ci(python): typecheck arrow.py and test_arrow.py --- python/pyproject.toml | 2 ++ python/python/lance/arrow.py | 47 +++++++++++++++++--------- python/python/lance/lance/__init__.pyi | 2 +- python/python/tests/test_arrow.py | 1 + 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index d2efab23579..ea4a96065fa 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -115,6 +115,8 @@ include = [ "python/lance/schema.py", "python/lance/file.py", "python/lance/util.py", + "python/lance/arrow.py", + "python/tests/test_arrow.py", ] # Dependencies like pyarrow make this difficult to enforce strictly. reportMissingTypeStubs = "warning" diff --git a/python/python/lance/arrow.py b/python/python/lance/arrow.py index 54da15705f8..e2e3adb2f38 100644 --- a/python/python/lance/arrow.py +++ b/python/python/lance/arrow.py @@ -4,10 +4,12 @@ """Extensions to PyArrows.""" import json +import typing from pathlib import Path from typing import Callable, Iterable, Optional, Union import pyarrow as pa +import pyarrow.compute as pc from ._arrow.bf16 import ( # noqa: F401 BFloat16, @@ -19,8 +21,10 @@ from .lance import bfloat16_array __all__ = [ + "BFloat16", "BFloat16Array", "BFloat16Type", + "PandasBFloat16Array", "bfloat16_array", "cast", "EncodedImageType", @@ -220,14 +224,15 @@ def from_uris( ['file::///tmp/1.png'] """ + storage: pa.Array if isinstance(uris, (pa.StringArray, pa.LargeStringArray)): - pass + storage = uris elif isinstance(uris, Iterable): - uris = pa.array((str(uri) for uri in uris), type=pa.string()) + storage = pa.array((str(uri) for uri in uris), type=pa.string()) else: raise TypeError("Cannot build a ImageURIArray from {}".format(type(uris))) - return cls.from_storage(ImageURIType(uris.type), uris) + return cls.from_storage(ImageURIType(storage.type), storage) def read_uris(self, storage_type=pa.binary()) -> "EncodedImageArray": """ @@ -268,7 +273,8 @@ def download(url): print("Failed to reach the server: ", e.reason) elif hasattr(e, "code"): print( - "The server could not fulfill the request. Error code: ", e.code + "The server could not fulfill the request. Error code: ", + getattr(e, "code"), ) images = [] @@ -365,22 +371,28 @@ def to_tensor( if not decoder: - def pillow_decoder(images): + def pillow_decoder(images) -> "np.ndarray": import io from PIL import Image return np.stack( - [Image.open(io.BytesIO(img)) for img in images.to_pylist()] + [ + np.asarray(Image.open(io.BytesIO(img))) + for img in images.to_pylist() + ] ) - def tensorflow_decoder(images): + def tensorflow_decoder(images) -> "np.ndarray": import tensorflow as tf decoded_to_tensor = tuple( tf.io.decode_image(img) for img in images.to_pylist() ) - return tf.stack(decoded_to_tensor, axis=0).numpy() + # tf.stack is typed as ``None`` by tensorflow's incomplete stubs. + return tf.stack( # pyright: ignore[reportOptionalCall] + decoded_to_tensor, axis=0 + ).numpy() decoders = [ ("tensorflow", tensorflow_decoder), @@ -401,9 +413,10 @@ def tensorflow_decoder(images): image_array = decoder(self.storage) if isinstance(image_array, pa.FixedShapeTensorType): - shape = image_array.shape - arrow_type = image_array.storage_type - tensor_array = image_array + tensor = typing.cast("pa.Array", image_array) + shape = tensor.shape + arrow_type = tensor.storage_type + tensor_array = tensor else: shape = image_array.shape[1:] arrow_type = pa.from_numpy_dtype(image_array.dtype) @@ -571,7 +584,8 @@ def cast( + f"got: {target_type}" ) np_arr = arr.to_numpy() - float_arr = np_arr.astype(target_type.to_pandas_dtype()) + float_type = typing.cast("pa.DataType", target_type) + float_arr = np_arr.astype(float_type.to_pandas_dtype()) return pa.array(float_arr) elif isinstance(target_type, BFloat16Type) or target_type in ["bfloat16", "bf16"]: if not pa.types.is_floating(arr.type): @@ -586,15 +600,16 @@ def cast( target_type ): # Casting fixed size list to fixed size list - if arr.type.list_size != target_type.list_size: + list_type = typing.cast("pa.DataType", target_type) + if arr.type.list_size != list_type.list_size: raise ValueError( "Only support casting fixed size list to fixed size list " f"with the same size, got: {arr.type} to {target_type}" ) - values = cast(arr.values, target_type.value_type) + values = cast(arr.values, list_type.value_type) return pa.FixedSizeListArray.from_arrays( - values=values, list_size=target_type.list_size + values=values, list_size=list_type.list_size ) # Fallback to normal cast. - return pa.compute.cast(arr, target_type, *args, **kwargs) + return pc.cast(arr, target_type, *args, **kwargs) diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 74db076db41..f65f53d2f40 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -698,7 +698,7 @@ class BFloat16: def __gt__(self, other: BFloat16) -> bool: ... def __ge__(self, other: BFloat16) -> bool: ... -def bfloat16_array(values: List[str | None]) -> BFloat16Array: ... +def bfloat16_array(values: Sequence[float | None]) -> BFloat16Array: ... class PyFullTextQuery: @staticmethod diff --git a/python/python/tests/test_arrow.py b/python/python/tests/test_arrow.py index b6e6024e0e7..1d4a27f2b4b 100644 --- a/python/python/tests/test_arrow.py +++ b/python/python/tests/test_arrow.py @@ -7,6 +7,7 @@ from pathlib import Path import lance +import lance.arrow import numpy as np import pandas as pd import pyarrow as pa From fdb65ccce92457695231f6eb177d148b9d2b99e1 Mon Sep 17 00:00:00 2001 From: Yuang Gao Date: Tue, 16 Jun 2026 21:02:47 -0700 Subject: [PATCH 2/2] ci(python): ignore optional and private imports in arrow typecheck --- python/python/lance/arrow.py | 11 ++++++----- python/python/tests/test_arrow.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/python/lance/arrow.py b/python/python/lance/arrow.py index e2e3adb2f38..da022cdc8f6 100644 --- a/python/python/lance/arrow.py +++ b/python/python/lance/arrow.py @@ -283,7 +283,9 @@ def download(url): if parsed_uri.scheme in ("http", "https"): images.append(download(uri)) else: - filesystem, path = fs.FileSystem.from_uri(uri.as_py()) + filesystem, path = fs.FileSystem.from_uri( # pyright: ignore[reportPrivateImportUsage] + uri.as_py() + ) with filesystem.open_input_stream(path) as f: images.append(f.read()) @@ -303,7 +305,7 @@ def __repr__(self): def pillow_metadata_decoder(images): import io - from PIL import Image + from PIL import Image # pyright: ignore[reportMissingImports] img = Image.open(io.BytesIO(images[0].as_py())) return img @@ -374,7 +376,7 @@ def to_tensor( def pillow_decoder(images) -> "np.ndarray": import io - from PIL import Image + from PIL import Image # pyright: ignore[reportMissingImports] return np.stack( [ @@ -389,7 +391,6 @@ def tensorflow_decoder(images) -> "np.ndarray": decoded_to_tensor = tuple( tf.io.decode_image(img) for img in images.to_pylist() ) - # tf.stack is typed as ``None`` by tensorflow's incomplete stubs. return tf.stack( # pyright: ignore[reportOptionalCall] decoded_to_tensor, axis=0 ).numpy() @@ -489,7 +490,7 @@ def to_encoded(self, encoder=None, storage_type=pa.binary()) -> "EncodedImageArr def pillow_encoder(x): import io - from PIL import Image + from PIL import Image # pyright: ignore[reportMissingImports] encoded_images = [] for y in x: diff --git a/python/python/tests/test_arrow.py b/python/python/tests/test_arrow.py index 1d4a27f2b4b..92ab52021ff 100644 --- a/python/python/tests/test_arrow.py +++ b/python/python/tests/test_arrow.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd import pyarrow as pa -import pytest +import pytest # pyright: ignore[reportMissingImports] from lance.arrow import ( BFloat16, BFloat16Array,