diff --git a/packages/markitdown/src/markitdown/converters/_zip_converter.py b/packages/markitdown/src/markitdown/converters/_zip_converter.py index f87e6c890..b532796b4 100644 --- a/packages/markitdown/src/markitdown/converters/_zip_converter.py +++ b/packages/markitdown/src/markitdown/converters/_zip_converter.py @@ -1,12 +1,11 @@ -import zipfile import io import os - -from typing import BinaryIO, Any, TYPE_CHECKING +import zipfile +from typing import Any, BinaryIO, TYPE_CHECKING from .._base_converter import DocumentConverter, DocumentConverterResult +from .._exceptions import FileConversionException, UnsupportedFormatException from .._stream_info import StreamInfo -from .._exceptions import UnsupportedFormatException, FileConversionException # Break otherwise circular import for type hinting if TYPE_CHECKING: @@ -18,59 +17,63 @@ ACCEPTED_FILE_EXTENSIONS = [".zip"] +# Default safety limits +_DEFAULT_MAX_FILE_COUNT = 100 +_DEFAULT_MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB per file +_DEFAULT_MAX_TOTAL_SIZE = 200 * 1024 * 1024 # 200 MB total uncompressed + class ZipConverter(DocumentConverter): """Converts ZIP files to markdown by extracting and converting all contained files. - The converter extracts the ZIP contents to a temporary directory, processes each file - using appropriate converters based on file extensions, and then combines the results - into a single markdown document. The temporary directory is cleaned up after processing. + The converter iterates over ZIP entries, processes each file using appropriate + converters based on file extensions, and combines the results into a single + markdown document. - Example output format: - ```markdown - Content from the zip file `example.zip`: + Safety limits guard against zip bombs and excessively large archives: - ## File: docs/readme.txt + - ``max_file_count``: maximum number of files to process (default 100). + Files beyond this limit are silently skipped with a notice appended. + - ``max_file_size``: maximum uncompressed size in bytes per individual file + (default 50 MB). Files that exceed this limit are skipped with a notice. + - ``max_total_size``: maximum total uncompressed bytes across all processed + files (default 200 MB). Processing stops when this budget is exhausted. - This is the content of readme.txt - Multiple lines are preserved + Example output format:: - ## File: images/example.jpg + Content from the zip file `example.zip`: - ImageSize: 1920x1080 - DateTimeOriginal: 2024-02-15 14:30:00 - Description: A beautiful landscape photo + ## File: docs/readme.txt - ## File: data/report.xlsx + This is the content of readme.txt - ## Sheet1 - | Column1 | Column2 | Column3 | - |---------|---------|---------| - | data1 | data2 | data3 | - | data4 | data5 | data6 | - ``` + ## File: data/report.xlsx - Key features: - - Maintains original file structure in headings - - Processes nested files recursively - - Uses appropriate converters for each file type - - Preserves formatting of converted content - - Cleans up temporary files after processing + ## Sheet1 + | Column1 | Column2 | + |---------|---------| + | data1 | data2 | """ def __init__( self, *, markitdown: "MarkItDown", + max_file_count: int = _DEFAULT_MAX_FILE_COUNT, + max_file_size: int = _DEFAULT_MAX_FILE_SIZE, + max_total_size: int = _DEFAULT_MAX_TOTAL_SIZE, ): super().__init__() self._markitdown = markitdown + self._max_file_count = max_file_count + self._max_file_size = max_file_size + self._max_total_size = max_total_size def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + **kwargs: Any, ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() @@ -88,13 +91,55 @@ def convert( self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + **kwargs: Any, ) -> DocumentConverterResult: file_path = stream_info.url or stream_info.local_path or stream_info.filename md_content = f"Content from the zip file `{file_path}`:\n\n" + files_processed = 0 + total_bytes = 0 + with zipfile.ZipFile(file_stream, "r") as zipObj: - for name in zipObj.namelist(): + for info in zipObj.infolist(): + name = info.filename + + # Skip directory entries + if name.endswith("/"): + continue + + # Guard against zip slip: skip entries with absolute paths or traversal sequences. + # Check for both Unix-style ("/") and OS-level absolute paths so the guard + # works correctly on Windows as well as POSIX. + if ( + name.startswith("/") + or os.path.isabs(name) + or ".." in name.split("/") + ): + continue + + if files_processed >= self._max_file_count: + md_content += ( + f"_Remaining files not processed: file count limit " + f"({self._max_file_count}) reached._\n" + ) + break + + uncompressed_size = info.file_size + if uncompressed_size > self._max_file_size: + md_content += ( + f"## File: {name}\n\n" + f"_Skipped: uncompressed size ({uncompressed_size:,} bytes) " + f"exceeds per-file limit ({self._max_file_size:,} bytes)._\n\n" + ) + continue + + if total_bytes + uncompressed_size > self._max_total_size: + md_content += ( + f"_Remaining files not processed: total size limit " + f"({self._max_total_size:,} bytes) reached._\n" + ) + break + try: z_file_stream = io.BytesIO(zipObj.read(name)) z_file_stream_info = StreamInfo( @@ -113,4 +158,7 @@ def convert( except FileConversionException: pass + files_processed += 1 + total_bytes += uncompressed_size + return DocumentConverterResult(markdown=md_content.strip()) diff --git a/packages/markitdown/tests/test_zip_converter.py b/packages/markitdown/tests/test_zip_converter.py new file mode 100644 index 000000000..10b4e5751 --- /dev/null +++ b/packages/markitdown/tests/test_zip_converter.py @@ -0,0 +1,136 @@ +"""Tests for ZipConverter safety limits: file count, per-file size, total size, and zip slip.""" + +import io +import zipfile +from unittest.mock import MagicMock + +from markitdown import StreamInfo +from markitdown.converters import ZipConverter + + +def _make_zip(files: list[tuple[str, bytes]]) -> bytes: + """Build an in-memory ZIP from a list of (name, data) tuples.""" + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_STORED) as zf: + for name, data in files: + zf.writestr(name, data) + return buf.getvalue() + + +def _mock_markitdown(content: str = "converted") -> MagicMock: + md = MagicMock() + result = MagicMock() + result.markdown = content + md.convert_stream.return_value = result + return md + + +class TestZipConverterFileLimits: + def test_file_count_limit_stops_processing(self): + files = [(f"file{i}.txt", f"content {i}".encode()) for i in range(5)] + zip_bytes = _make_zip(files) + md = _mock_markitdown("ok") + + converter = ZipConverter(markitdown=md, max_file_count=3) + result = converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip")) + + assert md.convert_stream.call_count == 3 + assert "file count limit" in result.markdown + + def test_file_count_limit_not_hit_when_under(self): + files = [(f"file{i}.txt", b"hi") for i in range(3)] + zip_bytes = _make_zip(files) + md = _mock_markitdown("ok") + + converter = ZipConverter(markitdown=md, max_file_count=10) + result = converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip")) + + assert md.convert_stream.call_count == 3 + assert "file count limit" not in result.markdown + + def test_per_file_size_limit_skips_oversized_file(self): + large_data = b"x" * 1000 + files = [("big.txt", large_data), ("small.txt", b"tiny")] + zip_bytes = _make_zip(files) + md = _mock_markitdown("ok") + + converter = ZipConverter(markitdown=md, max_file_size=500) + result = converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip")) + + assert "big.txt" in result.markdown + assert "exceeds per-file limit" in result.markdown + # small.txt should still be processed + assert md.convert_stream.call_count == 1 + + def test_total_size_limit_stops_processing(self): + # Two files each 600 bytes; total limit is 700 bytes - only first fits + files = [("a.txt", b"a" * 600), ("b.txt", b"b" * 600)] + zip_bytes = _make_zip(files) + md = _mock_markitdown("ok") + + converter = ZipConverter(markitdown=md, max_total_size=700) + result = converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip")) + + assert md.convert_stream.call_count == 1 + assert "total size limit" in result.markdown + + def test_directory_entries_are_skipped(self): + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.mkdir("subdir") # directory entry + zf.writestr("subdir/file.txt", "hello") + zip_bytes = buf.getvalue() + md = _mock_markitdown("ok") + + converter = ZipConverter(markitdown=md) + converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip")) + + # Only the file should be converted, not the directory entry + assert md.convert_stream.call_count == 1 + + +class TestZipConverterZipSlip: + def test_absolute_path_entry_is_skipped(self): + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + info = zipfile.ZipInfo("/etc/passwd") + zf.writestr(info, "root:x:0:0") + zf.writestr("safe.txt", "hello") + zip_bytes = buf.getvalue() + md = _mock_markitdown("ok") + + converter = ZipConverter(markitdown=md) + converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip")) + + # /etc/passwd should be skipped, only safe.txt converted + assert md.convert_stream.call_count == 1 + + def test_path_traversal_entry_is_skipped(self): + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + info = zipfile.ZipInfo("../../evil.txt") + zf.writestr(info, "malicious") + zf.writestr("safe.txt", "hello") + zip_bytes = buf.getvalue() + md = _mock_markitdown("ok") + + converter = ZipConverter(markitdown=md) + converter.convert(io.BytesIO(zip_bytes), StreamInfo(extension=".zip")) + + assert md.convert_stream.call_count == 1 + + +class TestZipConverterAccepts: + def test_accepts_zip_extension(self): + converter = ZipConverter(markitdown=MagicMock()) + assert converter.accepts(io.BytesIO(b""), StreamInfo(extension=".zip")) + + def test_accepts_zip_mimetype(self): + converter = ZipConverter(markitdown=MagicMock()) + assert converter.accepts( + io.BytesIO(b""), StreamInfo(mimetype="application/zip") + ) + + def test_rejects_other_extension(self): + converter = ZipConverter(markitdown=MagicMock()) + assert not converter.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf"))