Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 81 additions & 33 deletions packages/markitdown/src/markitdown/converters/_zip_converter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import zipfile
import io
import os

from typing import BinaryIO, Any, TYPE_CHECKING
import zipfile
from typing import Any, BinaryIO, TYPE_CHECKING

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._exceptions import FileConversionException, UnsupportedFormatException
from .._stream_info import StreamInfo
from .._exceptions import UnsupportedFormatException, FileConversionException

# Break otherwise circular import for type hinting
if TYPE_CHECKING:
Expand All @@ -18,59 +17,63 @@

ACCEPTED_FILE_EXTENSIONS = [".zip"]

# Default safety limits
_DEFAULT_MAX_FILE_COUNT = 100
_DEFAULT_MAX_FILE_SIZE = 50 * 1024 * 1024 # 50 MB per file
_DEFAULT_MAX_TOTAL_SIZE = 200 * 1024 * 1024 # 200 MB total uncompressed


class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.

The converter extracts the ZIP contents to a temporary directory, processes each file
using appropriate converters based on file extensions, and then combines the results
into a single markdown document. The temporary directory is cleaned up after processing.
The converter iterates over ZIP entries, processes each file using appropriate
converters based on file extensions, and combines the results into a single
markdown document.

Example output format:
```markdown
Content from the zip file `example.zip`:
Safety limits guard against zip bombs and excessively large archives:

## File: docs/readme.txt
- ``max_file_count``: maximum number of files to process (default 100).
Files beyond this limit are silently skipped with a notice appended.
- ``max_file_size``: maximum uncompressed size in bytes per individual file
(default 50 MB). Files that exceed this limit are skipped with a notice.
- ``max_total_size``: maximum total uncompressed bytes across all processed
files (default 200 MB). Processing stops when this budget is exhausted.

This is the content of readme.txt
Multiple lines are preserved
Example output format::

## File: images/example.jpg
Content from the zip file `example.zip`:

ImageSize: 1920x1080
DateTimeOriginal: 2024-02-15 14:30:00
Description: A beautiful landscape photo
## File: docs/readme.txt

## File: data/report.xlsx
This is the content of readme.txt

## Sheet1
| Column1 | Column2 | Column3 |
|---------|---------|---------|
| data1 | data2 | data3 |
| data4 | data5 | data6 |
```
## File: data/report.xlsx

Key features:
- Maintains original file structure in headings
- Processes nested files recursively
- Uses appropriate converters for each file type
- Preserves formatting of converted content
- Cleans up temporary files after processing
## Sheet1
| Column1 | Column2 |
|---------|---------|
| data1 | data2 |
"""

def __init__(
    self,
    *,
    markitdown: "MarkItDown",
    max_file_count: int = _DEFAULT_MAX_FILE_COUNT,
    max_file_size: int = _DEFAULT_MAX_FILE_SIZE,
    max_total_size: int = _DEFAULT_MAX_TOTAL_SIZE,
):
    """Create a ZipConverter bound to a parent MarkItDown instance.

    Args:
        markitdown: Parent MarkItDown used to convert each archive member.
        max_file_count: Maximum number of member files to process.
        max_file_size: Maximum uncompressed size, in bytes, allowed for a
            single member file.
        max_total_size: Maximum total uncompressed bytes across all
            processed members.
    """
    super().__init__()
    self._markitdown = markitdown
    self._max_file_count = max_file_count
    self._max_file_size = max_file_size
    self._max_total_size = max_total_size

def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
Expand All @@ -88,13 +91,55 @@ def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> DocumentConverterResult:
file_path = stream_info.url or stream_info.local_path or stream_info.filename
md_content = f"Content from the zip file `{file_path}`:\n\n"

files_processed = 0
total_bytes = 0

with zipfile.ZipFile(file_stream, "r") as zipObj:
for name in zipObj.namelist():
for info in zipObj.infolist():
name = info.filename

# Skip directory entries
if name.endswith("/"):
continue

# Guard against zip slip: skip entries with absolute paths or traversal sequences.
# Check for both Unix-style ("/") and OS-level absolute paths so the guard
# works correctly on Windows as well as POSIX.
if (
name.startswith("/")
or os.path.isabs(name)
or ".." in name.split("/")
):
continue

if files_processed >= self._max_file_count:
md_content += (
f"_Remaining files not processed: file count limit "
f"({self._max_file_count}) reached._\n"
)
break

uncompressed_size = info.file_size
if uncompressed_size > self._max_file_size:
md_content += (
f"## File: {name}\n\n"
f"_Skipped: uncompressed size ({uncompressed_size:,} bytes) "
f"exceeds per-file limit ({self._max_file_size:,} bytes)._\n\n"
)
continue

if total_bytes + uncompressed_size > self._max_total_size:
md_content += (
f"_Remaining files not processed: total size limit "
f"({self._max_total_size:,} bytes) reached._\n"
)
break

try:
z_file_stream = io.BytesIO(zipObj.read(name))
z_file_stream_info = StreamInfo(
Expand All @@ -113,4 +158,7 @@ def convert(
except FileConversionException:
pass

files_processed += 1
total_bytes += uncompressed_size

return DocumentConverterResult(markdown=md_content.strip())
136 changes: 136 additions & 0 deletions packages/markitdown/tests/test_zip_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
"""Tests for ZipConverter safety limits: file count, per-file size, total size, and zip slip."""

import io
import zipfile
from unittest.mock import MagicMock

from markitdown import StreamInfo
from markitdown.converters import ZipConverter


def _make_zip(files: list[tuple[str, bytes]]) -> bytes:
"""Build an in-memory ZIP from a list of (name, data) tuples."""
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_STORED) as zf:
for name, data in files:
zf.writestr(name, data)
return buf.getvalue()


def _mock_markitdown(content: str = "converted") -> MagicMock:
md = MagicMock()
result = MagicMock()
result.markdown = content
md.convert_stream.return_value = result
return md


class TestZipConverterFileLimits:
    """Safety limits: file count, per-file size, and total uncompressed size."""

    def test_file_count_limit_stops_processing(self):
        # Five small files, but the converter may only process three of them.
        archive = _make_zip(
            [(f"file{i}.txt", f"content {i}".encode()) for i in range(5)]
        )
        mock_md = _mock_markitdown("ok")
        converter = ZipConverter(markitdown=mock_md, max_file_count=3)

        result = converter.convert(io.BytesIO(archive), StreamInfo(extension=".zip"))

        assert mock_md.convert_stream.call_count == 3
        assert "file count limit" in result.markdown

    def test_file_count_limit_not_hit_when_under(self):
        archive = _make_zip([(f"file{i}.txt", b"hi") for i in range(3)])
        mock_md = _mock_markitdown("ok")
        converter = ZipConverter(markitdown=mock_md, max_file_count=10)

        result = converter.convert(io.BytesIO(archive), StreamInfo(extension=".zip"))

        assert mock_md.convert_stream.call_count == 3
        assert "file count limit" not in result.markdown

    def test_per_file_size_limit_skips_oversized_file(self):
        archive = _make_zip([("big.txt", b"x" * 1000), ("small.txt", b"tiny")])
        mock_md = _mock_markitdown("ok")
        converter = ZipConverter(markitdown=mock_md, max_file_size=500)

        result = converter.convert(io.BytesIO(archive), StreamInfo(extension=".zip"))

        assert "big.txt" in result.markdown
        assert "exceeds per-file limit" in result.markdown
        # small.txt should still be processed
        assert mock_md.convert_stream.call_count == 1

    def test_total_size_limit_stops_processing(self):
        # Two files each 600 bytes; total limit is 700 bytes - only first fits
        archive = _make_zip([("a.txt", b"a" * 600), ("b.txt", b"b" * 600)])
        mock_md = _mock_markitdown("ok")
        converter = ZipConverter(markitdown=mock_md, max_total_size=700)

        result = converter.convert(io.BytesIO(archive), StreamInfo(extension=".zip"))

        assert mock_md.convert_stream.call_count == 1
        assert "total size limit" in result.markdown

    def test_directory_entries_are_skipped(self):
        raw = io.BytesIO()
        with zipfile.ZipFile(raw, "w") as zf:
            zf.mkdir("subdir")  # directory entry
            zf.writestr("subdir/file.txt", "hello")
        mock_md = _mock_markitdown("ok")

        ZipConverter(markitdown=mock_md).convert(
            io.BytesIO(raw.getvalue()), StreamInfo(extension=".zip")
        )

        # Only the file should be converted, not the directory entry
        assert mock_md.convert_stream.call_count == 1


class TestZipConverterZipSlip:
    """Entries with absolute or traversal paths must never be converted."""

    def test_absolute_path_entry_is_skipped(self):
        raw = io.BytesIO()
        with zipfile.ZipFile(raw, "w") as zf:
            zf.writestr(zipfile.ZipInfo("/etc/passwd"), "root:x:0:0")
            zf.writestr("safe.txt", "hello")
        mock_md = _mock_markitdown("ok")

        ZipConverter(markitdown=mock_md).convert(
            io.BytesIO(raw.getvalue()), StreamInfo(extension=".zip")
        )

        # /etc/passwd should be skipped, only safe.txt converted
        assert mock_md.convert_stream.call_count == 1

    def test_path_traversal_entry_is_skipped(self):
        raw = io.BytesIO()
        with zipfile.ZipFile(raw, "w") as zf:
            zf.writestr(zipfile.ZipInfo("../../evil.txt"), "malicious")
            zf.writestr("safe.txt", "hello")
        mock_md = _mock_markitdown("ok")

        ZipConverter(markitdown=mock_md).convert(
            io.BytesIO(raw.getvalue()), StreamInfo(extension=".zip")
        )

        assert mock_md.convert_stream.call_count == 1


class TestZipConverterAccepts:
    """Stream acceptance decisions based on extension and mimetype."""

    def test_accepts_zip_extension(self):
        conv = ZipConverter(markitdown=MagicMock())
        assert conv.accepts(io.BytesIO(b""), StreamInfo(extension=".zip"))

    def test_accepts_zip_mimetype(self):
        conv = ZipConverter(markitdown=MagicMock())
        assert conv.accepts(io.BytesIO(b""), StreamInfo(mimetype="application/zip"))

    def test_rejects_other_extension(self):
        conv = ZipConverter(markitdown=MagicMock())
        assert not conv.accepts(io.BytesIO(b""), StreamInfo(extension=".pdf"))