tesseract-ocr · sking85522 · Mar 16, 2026 · Mar 16, 2026 · Mar 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -119,3 +119,4 @@ times.txt
 # CodeQL and build artifacts
 _codeql_detected_source_root
 install-sh
+venv/
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,25 @@
+# Use an official lightweight Python image.
+FROM python:3.12-slim
+
+# Allow statements and log messages to immediately appear in the Knative logs
+# (key=value form; the space-separated `ENV key value` syntax is legacy).
+ENV PYTHONUNBUFFERED=True
+
+# Install system dependencies including Tesseract OCR and languages.
+# --no-install-recommends keeps optional packages out of the image, and the
+# apt lists are removed in the same layer so they never reach the final image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    tesseract-ocr-hin \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set the working directory to /app
+WORKDIR /app
+
+# Copy the requirements file and install dependencies first, so this layer is
+# reused from cache until requirements.txt changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code
+COPY . .
+
+# Run the web service on container startup.
+# Shell form is used so the PORT variable is expanded; `exec` replaces the shell
+# so gunicorn receives signals directly. PORT falls back to 8080 when the
+# platform does not inject it. --timeout 0 disables gunicorn's worker timeout.
+CMD exec gunicorn --bind :${PORT:-8080} --workers 1 --threads 8 --timeout 0 app:app
diff --git a/app.py b/app.py
@@ -0,0 +1,95 @@
+import os
+from flask import Flask, render_template, request, flash, redirect, url_for
+from werkzeug.utils import secure_filename
+import pytesseract
+from PIL import Image, ImageEnhance, ImageFilter
+
+app = Flask(__name__)
+# Flask signs the session cookie (used by flash()) with this key. Do not ship a
+# hard-coded fallback: when SECRET_KEY is unset a random per-process key is
+# generated instead. That key changes on restart and is not shared between
+# worker processes, so set SECRET_KEY explicitly in production.
+app.secret_key = os.environ.get('SECRET_KEY') or os.urandom(32)
+
+# Setup upload folder next to this module. __file__ is the module's path;
+# __name__ is only the module name and would resolve against the current
+# working directory instead.
+UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads')
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+
+# Cap the request body size so oversized uploads are rejected (HTTP 413)
+# instead of being written to disk.
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16 MiB
+
+# Ensure allowable file types
+ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'webp', 'bmp', 'tiff'}
+
+def allowed_file(filename):
+    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS."""
+    # rpartition yields an empty separator when the name contains no dot at all.
+    _stem, dot, extension = filename.rpartition('.')
+    if not dot:
+        return False
+    return extension.lower() in ALLOWED_EXTENSIONS
+
+def preprocess_image(image_path):
+    """
+    Preprocess the image to enhance handwriting recognition.
+
+    The image is converted to grayscale, its contrast is doubled and a 3x3
+    median filter is applied to remove noise.
+
+    Args:
+        image_path: Path of the image file to load.
+
+    Returns:
+        A new in-memory PIL image in mode 'L'. The source file is closed
+        before this function returns.
+    """
+    # Open through a context manager so the file handle is always released;
+    # otherwise multi-frame formats such as GIF/TIFF keep it open, which can
+    # also prevent the caller from deleting the upload afterwards.
+    with Image.open(image_path) as source:
+        # Convert to grayscale; convert() loads the pixels and returns a copy,
+        # so the result stays valid after the file is closed.
+        img = source.convert('L')
+
+    # Enhance contrast
+    enhancer = ImageEnhance.Contrast(img)
+    img = enhancer.enhance(2.0)
+
+    # Apply a slight median filter to remove noise
+    img = img.filter(ImageFilter.MedianFilter(size=3))
+
+    return img
+
+# Language selections offered by templates/index.html. Anything else is
+# rejected so a user-supplied value is never handed to Tesseract unchecked.
+# Extend this set when more language packs are installed and offered.
+_SUPPORTED_LANGS = frozenset({'eng', 'hin', 'eng+hin'})
+
+
+@app.route('/', methods=['GET', 'POST'])
+def index():
+    """
+    Render the upload form and, on POST, run OCR on the uploaded image.
+
+    Form fields read on POST: ``file`` (the image), ``lang`` (default
+    ``'eng'``) and ``mode`` (``'handwriting'`` or anything else for printed
+    text). Invalid submissions flash a message and redirect back to the form.
+
+    Returns:
+        The rendered ``index.html`` with ``extracted_text`` and ``error``
+        (either may be None), or a redirect response for invalid submissions.
+    """
+    extracted_text = None
+    error = None
+
+    if request.method == 'POST':
+        # check if the post request has the file part
+        if 'file' not in request.files:
+            flash('No file part')
+            return redirect(request.url)
+        file = request.files['file']
+
+        # If the user does not select a file, the browser submits an
+        # empty file without a filename.
+        if file.filename == '':
+            flash('No selected file')
+            return redirect(request.url)
+
+        if not file or not allowed_file(file.filename):
+            flash('Invalid file type. Allowed: png, jpg, jpeg, gif, webp, bmp, tiff')
+            return redirect(request.url)
+
+        # Retrieve parameters for OCR
+        lang = request.form.get('lang', 'eng')
+        if lang not in _SUPPORTED_LANGS:
+            flash('Unsupported language selection')
+            return redirect(request.url)
+        ocr_mode = request.form.get('mode', 'printed')
+
+        if ocr_mode == 'handwriting':
+            # Custom configuration to help with handwriting:
+            # --psm 6 (Assume a single uniform block of text)
+            # --oem 1 (Neural nets LSTM only - better for handwriting)
+            custom_config = r'--oem 1 --psm 6'
+        else:
+            # Default for printed text
+            custom_config = r'--oem 3 --psm 3'
+
+        # Prefix the sanitized name with random hex so concurrent requests that
+        # upload the same filename never overwrite or delete each other's file
+        # (and so the stored name is never empty).
+        unique_prefix = os.urandom(8).hex()
+        filename = f"{unique_prefix}_{secure_filename(file.filename)}"
+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+
+        try:
+            # Saving happens inside the try so a partially written file is
+            # still cleaned up by the finally clause.
+            file.save(filepath)
+
+            # Preprocess image
+            img = preprocess_image(filepath)
+
+            # Perform OCR
+            extracted_text = pytesseract.image_to_string(img, lang=lang, config=custom_config)
+        except Exception as e:
+            # Request boundary: record the traceback server-side and show the
+            # failure on the page instead of returning a 500.
+            app.logger.exception('OCR processing failed')
+            error = f"Error during OCR processing: {e}"
+        finally:
+            # Clean up the file
+            if os.path.exists(filepath):
+                os.remove(filepath)
+
+    return render_template('index.html', extracted_text=extracted_text, error=error)
+
+if __name__ == '__main__':
+    # Direct-run entry point: listen on every interface, taking the port from
+    # the PORT environment variable (as set by Render) or 5000 by default.
+    listen_port = int(os.environ.get('PORT', 5000))
+    app.run(debug=False, port=listen_port, host='0.0.0.0')
diff --git a/render.yaml b/render.yaml
@@ -0,0 +1,9 @@
+# Render blueprint: a single web service that serves app.py with gunicorn.
+services:
+  - type: web
+    name: tesseract-ocr-web
+    # NOTE(review): pytesseract needs the Tesseract executable and language
+    # data, which the Dockerfile installs via apt. This build only runs pip,
+    # so confirm the native Python environment provides tesseract (eng/hin),
+    # or deploy the Dockerfile instead.
+    env: python
+    buildCommand: "pip install -r requirements.txt"
+    startCommand: "gunicorn app:app"
+    envVars:
+      - key: PYTHON_VERSION
+        value: 3.12.3
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+Flask==3.1.0
+pytesseract==0.3.13
+Pillow==11.1.0
+gunicorn==23.0.0
diff --git a/src/ccutil/helpers.h b/src/ccutil/helpers.h
@@ -198,11 +198,32 @@ inline int IntCastRounded(float x) {
 inline void ReverseN(void *ptr, int num_bytes) {
   assert(num_bytes == 1 || num_bytes == 2 || num_bytes == 4 || num_bytes == 8);
   char *cptr = static_cast<char *>(ptr);
-  int halfsize = num_bytes / 2;
-  for (int i = 0; i < halfsize; ++i) {
-    char tmp = cptr[i];
-    cptr[i] = cptr[num_bytes - 1 - i];
-    cptr[num_bytes - 1 - i] = tmp;
+  switch (num_bytes) {  // Reverse the bytes at ptr in place, one case per width.
+    case 2: {  // Swap the two bytes.
+      char tmp = cptr[0];
+      cptr[0] = cptr[1];
+      cptr[1] = tmp;
+      break;
+    }
+    case 4: {  // Swap the outer pair (0,3), then the inner pair (1,2).
+      char tmp = cptr[0];
+      cptr[0] = cptr[3];
+      cptr[3] = tmp;
+      tmp = cptr[1];
+      cptr[1] = cptr[2];
+      cptr[2] = tmp;
+      break;
+    }
+    case 8: {  // Mirror byte i with byte 7 - i for the first four bytes.
+      for (int i = 0; i < 4; ++i) {
+        char tmp = cptr[i];
+        cptr[i] = cptr[7 - i];
+        cptr[7 - i] = tmp;
+      }
+      break;
+    }
+    default:  // num_bytes == 1, or any other size not handled above: leave the bytes untouched.
+      break;
   }
 }
 

diff --git a/templates/index.html b/templates/index.html
@@ -0,0 +1,122 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Tesseract OCR Web Interface</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 0;
+            padding: 0;
+            background-color: #f4f4f9;
+        }
+        .container {
+            width: 80%;
+            margin: auto;
+            overflow: hidden;
+            padding-top: 50px;
+        }
+        .form-container {
+            background: #fff;
+            padding: 20px;
+            margin-bottom: 20px;
+            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+            border-radius: 8px;
+        }
+        .form-container h2 {
+            margin-top: 0;
+            color: #333;
+        }
+        input[type="file"], select, input[type="submit"] {
+            margin: 10px 0;
+            padding: 10px;
+            width: 100%;
+            border-radius: 4px;
+            border: 1px solid #ddd;
+            box-sizing: border-box;
+        }
+        input[type="submit"] {
+            background-color: #4CAF50;
+            color: #fff;
+            border: 0;
+            cursor: pointer;
+            font-size: 16px;
+        }
+        input[type="submit"]:hover {
+            background-color: #45a049;
+        }
+        .result-container {
+            background: #fff;
+            padding: 20px;
+            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+            white-space: pre-wrap; /* Keeps formatting */
+            border-radius: 8px;
+        }
+        .alert {
+            padding: 15px;
+            background-color: #f44336;
+            color: white;
+            margin-bottom: 20px;
+            border-radius: 4px;
+        }
+        label {
+            font-weight: bold;
+            display: block;
+            margin-top: 10px;
+        }
+    </style>
+</head>
+<body>
+
+<div class="container">
+    {% with messages = get_flashed_messages() %}
+      {% if messages %}
+        {% for message in messages %}
+          <div class="alert">
+            {{ message }}
+          </div>
+        {% endfor %}
+      {% endif %}
+    {% endwith %}
+
+    {% if error %}
+        <div class="alert">
+            {{ error }}
+        </div>
+    {% endif %}
+
+    <div class="form-container">
+        <h2>Tesseract OCR: Upload Image for Text Extraction</h2>
+        {# Posts multipart form data to "/" with the fields "file", "lang" and "mode". #}
+        <form method="POST" action="/" enctype="multipart/form-data">
+            <label for="file">Select an image:</label>
+            {# accept lists the same extensions as ALLOWED_EXTENSIONS in app.py so the file picker pre-filters; the server still validates. #}
+            <input type="file" name="file" id="file" accept=".png,.jpg,.jpeg,.gif,.webp,.bmp,.tiff" required>
+
+            <label for="lang">Select Language:</label>
+            <select name="lang" id="lang">
+                <option value="eng">English</option>
+                <option value="hin">Hindi</option>
+                <option value="eng+hin">English + Hindi</option>
+            </select>
+
+            <label for="mode">Select Document Type:</label>
+            <select name="mode" id="mode">
+                <option value="printed">Printed Text (Default)</option>
+                <option value="handwriting">Handwriting (Experimental)</option>
+            </select>
+
+            <input type="submit" value="Extract Text">
+        </form>
+    </div>
+
+    {% if extracted_text %}
+        <div class="result-container">
+            <h3>Extracted Text:</h3>
+            <div>{{ extracted_text }}</div>
+        </div>
+    {% endif %}
+
+</div>
+
+</body>
+</html>