Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 81 additions & 33 deletions packages/markitdown/src/markitdown/converters/_zip_converter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import zipfile
import io
import os

from typing import BinaryIO, Any, TYPE_CHECKING
import zipfile
from typing import Any, BinaryIO, TYPE_CHECKING

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import FileConversionException, UnsupportedFormatException
from .._stream_info import StreamInfo
from .._exceptions import UnsupportedFormatException, FileConversionException

# Break otherwise circular import for type hinting
if TYPE_CHECKING:
Expand All @@ -18,59 +17,63 @@

ACCEPTED_FILE_EXTENSIONS = [".zip"]

# Default safety limits
_DEFAULT_MAX_FILE_COUNT = 100
_DEFAULT_MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB per file
_DEFAULT_MAX_TOTAL_SIZE = 200 * 1024 * 1024 # 200 MB total uncompressed


class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.

The converter extracts the ZIP contents to a temporary directory, processes each file
using appropriate converters based on file extensions, and then combines the results
into a single markdown document. The temporary directory is cleaned up after processing.
The converter iterates over ZIP entries, processes each file using appropriate
converters based on file extensions, and combines the results into a single
markdown document.

Example output format:
```markdown
Content from the zip file `example.zip`:
Safety limits guard against zip bombs and excessively large archives:

## File: docs/readme.txt
- ``max_file_count``: maximum number of files to process (default 100).
Files beyond this limit are silently skipped with a notice appended.
- ``max_file_size``: maximum uncompressed size in bytes per individual file
(default 50 MB). Files that exceed this limit are skipped with a notice.
- ``max_total_size``: maximum total uncompressed bytes across all processed
files (default 200 MB). Processing stops when this budget is exhausted.

This is the content of readme.txt
Multiple lines are preserved
Example output format::

## File: images/example.jpg
Content from the zip file `example.zip`:

ImageSize: 1920x1080
DateTimeOriginal: 2024-02-15 14:30:00
Description: A beautiful landscape photo
## File: docs/readme.txt

## File: data/report.xlsx
This is the content of readme.txt

## Sheet1
| Column1 | Column2 | Column3 |
|---------|---------|---------|
| data1 | data2 | data3 |
| data4 | data5 | data6 |
```
## File: data/report.xlsx

Key features:
- Maintains original file structure in headings
- Processes nested files recursively
- Uses appropriate converters for each file type
- Preserves formatting of converted content
- Cleans up temporary files after processing
## Sheet1
| Column1 | Column2 |
|---------|---------|
| data1 | data2 |
"""

def __init__(
    self,
    *,
    markitdown: "MarkItDown",
    max_file_count: int = _DEFAULT_MAX_FILE_COUNT,
    max_file_size: int = _DEFAULT_MAX_FILE_SIZE,
    max_total_size: int = _DEFAULT_MAX_TOTAL_SIZE,
):
    """Create a ZipConverter bound to a parent MarkItDown instance.

    Args:
        markitdown: Parent MarkItDown used to convert each archive member.
        max_file_count: Maximum number of member files to process.
        max_file_size: Maximum uncompressed size, in bytes, allowed for a
            single member file.
        max_total_size: Maximum total uncompressed bytes across all
            processed members.
    """
    super().__init__()
    self._markitdown = markitdown
    self._max_file_count = max_file_count
    self._max_file_size = max_file_size
    self._max_total_size = max_total_size

def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
Expand All @@ -88,13 +91,55 @@ def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> DocumentConverterResult:
file_path = stream_info.url or stream_info.local_path or stream_info.filename
md_content = f"Content from the zip file `{file_path}`:\n\n"

files_processed = 0
total_bytes = 0

with zipfile.ZipFile(file_stream, "r") as zipObj:
for name in zipObj.namelist():
for info in zipObj.infolist():
name = info.filename

# Skip directory entries
if name.endswith("/"):
continue

# Guard against zip slip: skip entries with absolute paths or traversal sequences.
# Check for both Unix-style ("/") and OS-level absolute paths so the guard
# works correctly on Windows as well as POSIX.
if (
name.startswith("/")
or os.path.isabs(name)
or ".." in name.split("/")
):
continue

if files_processed >= self._max_file_count:
md_content += (
f"_Remaining files not processed: file count limit "
f"({self._max_file_count}) reached._\n"
)
break

uncompressed_size = info.file_size
if uncompressed_size > self._max_file_size:
md_content += (
f"## File: {name}\n\n"
f"_Skipped: uncompressed size ({uncompressed_size:,} bytes) "
f"exceeds per-file limit ({self._max_file_size:,} bytes)._\n\n"
)
continue

if total_bytes + uncompressed_size > self._max_total_size:
md_content += (
f"_Remaining files not processed: total size limit "
f"({self._max_total_size:,} bytes) reached._\n"
)
break

try:
z_file_stream = io.BytesIO(zipObj.read(name))
z_file_stream_info = StreamInfo(
Expand All @@ -113,4 +158,7 @@ def convert(
except FileConversionException:
pass

files_processed += 1
total_bytes += uncompressed_size

return DocumentConverterResult(markdown=md_content.strip())
136 changes: 136 additions & 0 deletions packages/markitdown/tests/test_zip_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""Tests for ZipConverter safety limits: file count, per-file size, total size, and zip slip."""

import io
import zipfile
from unittest.mock import MagicMock

from markitdown import StreamInfo
from markitdown.converters import ZipConverter


def _make_zip(files: list[tuple[str, bytes]]) -> bytes:
"""Build an in-memory ZIP from a list of (name, data) tuples."""
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_STORED) as zf:
for name, data in files:
zf.writestr(name, data)
return buf.getvalue()


def _mock_markitdown(content: str = "converted") -> MagicMock:
md = MagicMock()
result = MagicMock()
result.markdown = content
md.convert_stream.return_value = result
return md


class TestZipConverterFileLimits:
    """Safety limits: file count, per-file size, and total uncompressed size."""

    def test_file_count_limit_stops_processing(self):
        # Five small files, but the converter may only process three of them.
        archive = _make_zip(
            [(f"file{i}.txt", f"content {i}".encode()) for i in range(5)]
        )
        mock_md = _mock_markitdown("ok")
        converter = ZipConverter(markitdown=mock_md, max_file_count=3)

        result = converter.convert(io.BytesIO(archive), StreamInfo(extension=".zip"))

        assert mock_md.convert_stream.call_count == 3
        assert "file count limit" in result.markdown

    def test_file_count_limit_not_hit_when_under(self):
        archive = _make_zip([(f"file{i}.txt", b"hi") for i in range(3)])
        mock_md = _mock_markitdown("ok")
        converter = ZipConverter(markitdown=mock_md, max_file_count=10)

        result = converter.convert(io.BytesIO(archive), StreamInfo(extension=".zip"))

        assert mock_md.convert_stream.call_count == 3
        assert "file count limit" not in result.markdown

    def test_per_file_size_limit_skips_oversized_file(self):
        archive = _make_zip([("big.txt", b"x" * 1000), ("small.txt", b"tiny")])
        mock_md = _mock_markitdown("ok")
        converter = ZipConverter(markitdown=mock_md, max_file_size=500)

        result = converter.convert(io.BytesIO(archive), StreamInfo(extension=".zip"))

        assert "big.txt" in result.markdown
        assert "exceeds per-file limit" in result.markdown
        # small.txt should still be processed
        assert mock_md.convert_stream.call_count == 1

    def test_total_size_limit_stops_processing(self):
        # Two files each 600 bytes; total limit is 700 bytes - only first fits
        archive = _make_zip([("a.txt", b"a" * 600), ("b.txt", b"b" * 600)])
        mock_md = _mock_markitdown("ok")
        converter = ZipConverter(markitdown=mock_md, max_total_size=700)

        result = converter.convert(io.BytesIO(archive), StreamInfo(extension=".zip"))

        assert mock_md.convert_stream.call_count == 1
        assert "total size limit" in result.markdown

    def test_directory_entries_are_skipped(self):
        raw = io.BytesIO()
        with zipfile.ZipFile(raw, "w") as zf:
            zf.mkdir("subdir")  # directory entry
            zf.writestr("subdir/file.txt", "hello")
        mock_md = _mock_markitdown("ok")

        ZipConverter(markitdown=mock_md).convert(
            io.BytesIO(raw.getvalue()), StreamInfo(extension=".zip")
        )

        # Only the file should be converted, not the directory entry
        assert mock_md.convert_stream.call_count == 1


class TestZipConverterZipSlip:
    """Entries with absolute or traversal paths must never be converted."""

    def test_absolute_path_entry_is_skipped(self):
        raw = io.BytesIO()
        with zipfile.ZipFile(raw, "w") as zf:
            zf.writestr(zipfile.ZipInfo("/etc/passwd"), "root:x:0:0")
            zf.writestr("safe.txt", "hello")
        mock_md = _mock_markitdown("ok")

        ZipConverter(markitdown=mock_md).convert(
            io.BytesIO(raw.getvalue()), StreamInfo(extension=".zip")
        )

        # /etc/passwd should be skipped, only safe.txt converted
        assert mock_md.convert_stream.call_count == 1

    def test_path_traversal_entry_is_skipped(self):
        raw = io.BytesIO()
        with zipfile.ZipFile(raw, "w") as zf:
            zf.writestr(zipfile.ZipInfo("../../evil.txt"), "malicious")
            zf.writestr("safe.txt", "hello")
        mock_md = _mock_markitdown("ok")

        ZipConverter(markitdown=mock_md).convert(
            io.BytesIO(raw.getvalue()), StreamInfo(extension=".zip")
        )

        assert mock_md.convert_stream.call_count == 1


class TestZipConverterAccepts:
    """Stream acceptance decisions based on extension and mimetype."""

    def test_accepts_zip_extension(self):
        conv = ZipConverter(markitdown=MagicMock())
        assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".zip"))

    def test_accepts_zip_mimetype(self):
        conv = ZipConverter(markitdown=MagicMock())
        assert conv.accepts(io.BytesIO(b""), StreamInfo(mimetype="application/zip"))

    def test_rejects_other_extension(self):
        conv = ZipConverter(markitdown=MagicMock())
        assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf"))