Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 124 additions & 11 deletions packages/markitdown/src/markitdown/converters/_pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
from ._llm_caption import llm_caption

# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
Expand Down Expand Up @@ -74,6 +75,23 @@ def _merge_partial_numbering_lines(text: str) -> str:

ACCEPTED_FILE_EXTENSIONS = [".pdf"]

# Prompt used when captioning an individual embedded image (JPEG/JPEG2000)
# found inside a PDF page; asks the LLM for a faithful Markdown transcription
# of the text visible in that image only.
_PDF_IMAGE_LLM_PROMPT = (
    "You are an advanced document extraction AI. Extract and reproduce all text "
    "visible in this image exactly as it appears. Do not translate or generate new "
    "content. Preserve the original structure including sections, titles, headers, "
    "tables, lists, and code snippets in Markdown format. Only output valid Markdown."
)

# Prompt used as a last resort when no text could be extracted from the PDF at
# all: the entire document is sent to the LLM for a full Markdown rendition.
_PDF_FULL_LLM_PROMPT = (
    "You are an advanced document extraction AI. Your task is to analyze the provided "
    "document, understand its content and context, and produce a perfectly structured "
    "Markdown document from the text within it. Do not translate or generate new text. "
    "Retain the structure of the original content, ensuring that sections, titles, "
    "headers and important details are clearly separated. If the document contains any "
    "tables, lists and code snippets format them correctly to preserve their original "
    "meaning. Only a valid Markdown-formatted output is allowed."
)


def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str:
"""Convert a 2D list (rows/columns) into a nicely aligned Markdown table.
Expand Down Expand Up @@ -117,6 +135,51 @@ def fmt_row(row: list[str]) -> str:
return "\n".join(md)


def _collect_lt_images(element: Any) -> list[Any]:
"""Recursively collect LTImage objects from a pdfminer layout element."""
try:
from pdfminer.layout import LTFigure, LTImage
except ImportError:
return []

images = []
if isinstance(element, LTImage):
images.append(element)
elif isinstance(element, LTFigure):
for child in element:
images.extend(_collect_lt_images(child))
return images


def _get_lt_image_data(lt_image: Any) -> tuple[bytes, str] | None:
"""
Extract raw bytes and MIME type from a pdfminer LTImage.

Returns (bytes, mime_type) for JPEG and JPEG2000 images whose compressed
stream bytes can be sent directly to an LLM vision API. Returns None for
other formats (e.g. raw bitmap, JBIG2) that cannot be used as-is.

Uses pdfminer's own filter literals for comparison to avoid fragile
string conversion of PSLiteral objects.
"""
try:
from pdfminer.pdftypes import LITERALS_DCT_DECODE, LITERALS_JPX_DECODE

stream = lt_image.stream
for f, _ in stream.get_filters():
if f in LITERALS_DCT_DECODE:
data = stream.get_rawdata()
if data:
return data, "image/jpeg"
elif f in LITERALS_JPX_DECODE:
data = stream.get_rawdata()
if data:
return data, "image/jp2"
except Exception:
pass
return None


def _extract_form_content_from_words(page: Any) -> str | None:
"""
Extract form-style content from a PDF page by analyzing word positions.
Expand Down Expand Up @@ -565,25 +628,75 @@ def convert(

page.close() # Free cached page data immediately

# If no pages had form-style content, use pdfminer for
# the whole document (better text spacing for prose).
# If no pages had form-style content, discard pdfplumber results and
# use pdfminer for the whole document (better text spacing for prose).
if form_page_count == 0:
markdown_chunks = []
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
else:
markdown = "\n\n".join(markdown_chunks).strip()
text = pdfminer.high_level.extract_text(pdf_bytes)
if text and text.strip():
markdown_chunks.append(text)

# Second pass: scan for LTFigure elements containing embedded
# images and caption them with the LLM when available. This
# handles PDFs with mixed content (extractable text + images).
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client and llm_model:
from pdfminer.layout import LTFigure

pdf_bytes.seek(0)
for page_layout in pdfminer.high_level.extract_pages(pdf_bytes):
for element in page_layout:
# LTImage is always a child of LTFigure, never a
# direct child of LTPage (PDFPageAggregator wraps
# every image in begin_figure/end_figure).
if isinstance(element, LTFigure):
for lt_img in _collect_lt_images(element):
img_data = _get_lt_image_data(lt_img)
if img_data:
img_bytes, img_mime = img_data
ext = (
".jpg"
if img_mime == "image/jpeg"
else ".jp2"
)
caption = llm_caption(
io.BytesIO(img_bytes),
StreamInfo(
mimetype=img_mime, extension=ext
),
client=llm_client,
model=llm_model,
prompt=_PDF_IMAGE_LLM_PROMPT,
)
if caption:
markdown_chunks.append(caption)

markdown = "\n\n".join(markdown_chunks).strip()

except Exception:
# Fallback if pdfplumber fails
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)

# Fallback if still empty
if not markdown:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
# Last-resort fallback: send entire PDF to LLM when no text could be
# extracted at all (e.g. fully scanned PDFs with no recognized images).
if not markdown or not markdown.strip():
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client and llm_model:
pdf_bytes.seek(0)
markdown = llm_caption(
pdf_bytes,
stream_info,
client=llm_client,
model=llm_model,
prompt=_PDF_FULL_LLM_PROMPT,
)

# Post-process to merge MasterFormat-style partial numbering with following text
markdown = _merge_partial_numbering_lines(markdown)
if markdown:
markdown = _merge_partial_numbering_lines(markdown)

return DocumentConverterResult(markdown=markdown)
return DocumentConverterResult(markdown=markdown or "")