microsoft · Sghosh1999 · May 25, 2025 · May 25, 2025 · dillonstreator · Jun 10, 2025
diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml
@@ -52,7 +52,7 @@ pptx = ["python-pptx"]
 docx = ["mammoth", "lxml"]
 xlsx = ["pandas", "openpyxl"]
 xls = ["pandas", "xlrd"]
-pdf = ["pdfminer.six"]
+pdf = ["pdfminer.six", "pytesseract", "pdf2image"]
 outlook = ["olefile"]
 audio-transcription = ["pydub", "SpeechRecognition"]
 youtube-transcription = ["youtube-transcript-api"]

diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py
@@ -19,6 +19,13 @@
     # Preserve the error and stack trace for later
     _dependency_exc_info = sys.exc_info()
 
+# OCR dependencies (pytesseract, pdf2image) are optional: keep any import error so it can be re-raised only if OCR is actually needed
+_ocr_dependency_exc_info = None
+try:
+    import pytesseract
+    from pdf2image import convert_from_bytes
+except ImportError:
+    _ocr_dependency_exc_info = sys.exc_info()
 
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "application/pdf",
@@ -57,21 +64,32 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
-        # Check the dependencies
+        # Check dependencies
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
                 MISSING_DEPENDENCY_MESSAGE.format(
                     converter=type(self).__name__,
                     extension=".pdf",
                     feature="pdf",
                 )
-            ) from _dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
-                _dependency_exc_info[2]
-            )
-
-        assert isinstance(file_stream, io.IOBase)  # for mypy
-        return DocumentConverterResult(
-            markdown=pdfminer.high_level.extract_text(file_stream),
-        )
+            ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])
+
+        # Try to extract text with pdfminer
+        file_stream.seek(0)
+        text = pdfminer.high_level.extract_text(file_stream)
+        if text and text.strip():
+            return DocumentConverterResult(markdown=text)
+
+        # If pdfminer returned no text (empty or whitespace-only), fall back to OCR
+        if _ocr_dependency_exc_info is not None:
+            raise MissingDependencyException(
+                "OCR dependencies are missing. Please install pytesseract and pdf2image for OCR support."
+            ) from _ocr_dependency_exc_info[1].with_traceback(_ocr_dependency_exc_info[2])
+
+        file_stream.seek(0)
+        images = convert_from_bytes(file_stream.read())
+        ocr_text = []
+        for img in images:
+            ocr_text.append(pytesseract.image_to_string(img))
+        ocr_output = "\n\n".join(ocr_text)
+        return DocumentConverterResult(markdown=ocr_output)