Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/python/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,8 @@ such as the row groups and column chunk metadata and statistics:
data_page_offset: 36
total_compressed_size: 106
total_uncompressed_size: 102
bloom_filter_offset: None
bloom_filter_length: None
Data Type Handling
------------------
Expand Down
18 changes: 16 additions & 2 deletions python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,9 @@ cdef class ColumnChunkMetaData(_Weakrefable):
dictionary_page_offset: {self.dictionary_page_offset}
data_page_offset: {self.data_page_offset}
total_compressed_size: {self.total_compressed_size}
total_uncompressed_size: {self.total_uncompressed_size}"""
total_uncompressed_size: {self.total_uncompressed_size}
bloom_filter_offset: {self.bloom_filter_offset}
bloom_filter_length: {self.bloom_filter_length}"""

def to_dict(self):
"""
Expand Down Expand Up @@ -507,7 +509,9 @@ cdef class ColumnChunkMetaData(_Weakrefable):
dictionary_page_offset=self.dictionary_page_offset,
data_page_offset=self.data_page_offset,
total_compressed_size=self.total_compressed_size,
total_uncompressed_size=self.total_uncompressed_size
total_uncompressed_size=self.total_uncompressed_size,
bloom_filter_offset=self.bloom_filter_offset,
bloom_filter_length=self.bloom_filter_length,
)
return d

Expand Down Expand Up @@ -645,6 +649,16 @@ cdef class ColumnChunkMetaData(_Weakrefable):
"""Uncompressed size in bytes (int)."""
return self.metadata.total_uncompressed_size()

@property
def bloom_filter_offset(self):
    """Offset of bloom filter relative to beginning of the file (int or None).

    Returns None when the column chunk metadata records no bloom filter
    offset.
    """
    # The C++ accessor is declared as returning optional[int64_t]. Unwrap it
    # explicitly so an absent value maps to None, instead of depending on an
    # implicit optional -> Python object coercion.
    if self.metadata.bloom_filter_offset().has_value():
        return self.metadata.bloom_filter_offset().value()
    return None

@property
def bloom_filter_length(self):
    """Length of bloom filter in bytes (int or None).

    Returns None when the column chunk metadata records no bloom filter
    length.
    """
    # The C++ accessor is declared as returning optional[int64_t]. Unwrap it
    # explicitly so an absent value maps to None, instead of depending on an
    # implicit optional -> Python object coercion.
    if self.metadata.bloom_filter_length().has_value():
        return self.metadata.bloom_filter_length().value()
    return None

@property
def has_offset_index(self):
"""Whether the column chunk has an offset index"""
Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/includes/libparquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
int64_t index_page_offset() const
int64_t total_compressed_size() const
int64_t total_uncompressed_size() const
optional[int64_t] bloom_filter_offset() const
optional[int64_t] bloom_filter_length() const
unique_ptr[CColumnCryptoMetaData] crypto_metadata() const
optional[ParquetIndexLocation] GetColumnIndexLocation() const
optional[ParquetIndexLocation] GetOffsetIndexLocation() const
Expand Down
35 changes: 35 additions & 0 deletions python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,41 @@ def test_column_chunk_key_value_metadata(parquet_test_datadir):
assert key_value_metadata2 is None


def test_bloom_filter_offset_in_metadata():
    # The bloom filter location must be readable from ColumnChunkMetaData,
    # both through its properties and through to_dict(), and must be None
    # for a column that was written without a bloom filter.
    num_rows = 1000
    table = pa.table({
        "a": [f"id_{i}" for i in range(num_rows)],
        "b": list(range(num_rows)),
    })

    # Request a bloom filter for column "a" only.
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink, bloom_filter_options={"a": {"ndv": num_rows}})
    file_metadata = pq.read_metadata(pa.BufferReader(sink.getvalue()))
    row_group = file_metadata.row_group(0)

    with_filter = row_group.column(0)      # column "a": bloom filter written
    without_filter = row_group.column(1)   # column "b": no bloom filter

    fields = ("bloom_filter_offset", "bloom_filter_length")

    # Properties: an int when the filter exists, None otherwise.
    for field in fields:
        present_value = getattr(with_filter, field)
        assert present_value is not None
        assert isinstance(present_value, int)
        assert getattr(without_filter, field) is None

    # to_dict() must carry the same values as the properties.
    as_dict = with_filter.to_dict()
    for field in fields:
        assert field in as_dict
        assert as_dict[field] == getattr(with_filter, field)

    as_dict_no_filter = without_filter.to_dict()
    for field in fields:
        assert as_dict_no_filter[field] is None


def test_internal_class_instantiation():
def msg(c):
return f"Do not call {c}'s constructor directly"
Expand Down
Loading