Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/tesseract/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,16 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
explicit TessHOcrRenderer(const char *outputbase, bool font_info);
explicit TessHOcrRenderer(const char *outputbase);

void SetInputLanguages(const char *languages);

protected:
bool BeginDocumentHandler() override;
bool AddImageHandler(TessBaseAPI *api) override;
bool EndDocumentHandler() override;

private:
bool font_info_; // whether to print font information
std::string input_languages_;
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding std::string input_languages_ to TessHOcrRenderer changes the size/layout of this exported (TESS_API) C++ class, which can break binary compatibility for downstream code that links against libtesseract and instantiates TessHOcrRenderer. If ABI stability is a concern, consider storing the new state behind an indirection (pimpl/opaque pointer) or in a separate internal structure to minimize ABI impact.

Suggested change
std::string input_languages_;
// NOTE: Additional per-instance state (e.g. input languages) is stored
// out-of-line to avoid changing the ABI-visible layout of this class.

Copilot uses AI. Check for mistakes.
};

/**
Expand Down
120 changes: 119 additions & 1 deletion src/api/hocrrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,12 @@
**********************************************************************/

#include <tesseract/baseapi.h> // for TessBaseAPI
#include <cstring> // for strcmp
#include <locale> // for std::locale::classic
#include <memory> // for std::unique_ptr
#include <sstream> // for std::stringstream
#include <unordered_map> // for std::unordered_map
#include <unordered_set> // for std::unordered_set
#include <tesseract/renderer.h>
#include "helpers.h" // for copy_string
#include "tesseractclass.h" // for Tesseract
Expand Down Expand Up @@ -477,6 +480,86 @@ TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
font_info_ = font_info;
}

void TessHOcrRenderer::SetInputLanguages(const char *languages) {
if (languages && languages[0] != '\0') {
input_languages_ = languages;
} else {
input_languages_.clear();
}
}

static const std::unordered_map<std::string, std::string> &Iso639Map() {
static const std::unordered_map<std::string, std::string> map{
{"afr", "af"}, {"amh", "am"}, {"ara", "ar"}, {"asm", "as"},
{"aze", "az"}, {"bel", "be"}, {"ben", "bn"}, {"bod", "bo"},
Comment on lines +491 to +494
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Iso639Map() allocates the map with new and intentionally leaks it for the lifetime of the process. Unless there is a specific reason to avoid static destruction here, prefer a function-local static const std::unordered_map<...> value to avoid heap allocation/leaks and simplify the code.

Copilot uses AI. Check for mistakes.
{"bos", "bs"}, {"bre", "br"}, {"bul", "bg"}, {"cat", "ca"},
{"ceb", "ceb"}, {"ces", "cs"}, {"chi_sim", "zh"}, {"chi_tra", "zh"},
{"chr", "chr"}, {"cos", "co"}, {"cym", "cy"}, {"dan", "da"},
{"deu", "de"}, {"div", "dv"}, {"dzo", "dz"}, {"ell", "el"},
{"eng", "en"}, {"enm", "en"}, {"epo", "eo"}, {"est", "et"},
{"eus", "eu"}, {"fao", "fo"}, {"fas", "fa"}, {"fil", "fil"},
{"fin", "fi"}, {"fra", "fr"}, {"frk", "de"}, {"frm", "fr"},
{"fry", "fy"}, {"gla", "gd"}, {"gle", "ga"}, {"glg", "gl"},
{"grc", "el"}, {"guj", "gu"}, {"hat", "ht"}, {"heb", "he"},
{"hin", "hi"}, {"hrv", "hr"}, {"hun", "hu"}, {"hye", "hy"},
{"iku", "iu"}, {"ind", "id"}, {"isl", "is"}, {"ita", "it"},
{"jav", "jv"}, {"jpn", "ja"}, {"kan", "kn"}, {"kat", "ka"},
{"kaz", "kk"}, {"khm", "km"}, {"kir", "ky"}, {"kmr", "ku"},
{"kor", "ko"}, {"lao", "lo"}, {"lat", "la"}, {"lav", "lv"},
{"lit", "lt"}, {"ltz", "lb"}, {"mal", "ml"}, {"mar", "mr"},
{"mkd", "mk"}, {"mlt", "mt"}, {"mon", "mn"}, {"mri", "mi"},
{"msa", "ms"}, {"mya", "my"}, {"nep", "ne"}, {"nld", "nl"},
{"nor", "no"}, {"oci", "oc"}, {"ori", "or"}, {"pan", "pa"},
{"pol", "pl"}, {"por", "pt"}, {"pus", "ps"}, {"que", "qu"},
{"ron", "ro"}, {"rus", "ru"}, {"san", "sa"}, {"sin", "si"},
{"slk", "sk"}, {"slv", "sl"}, {"snd", "sd"}, {"spa", "es"},
{"sqi", "sq"}, {"srp", "sr"}, {"sun", "su"}, {"swa", "sw"},
{"swe", "sv"}, {"syr", "syr"}, {"tam", "ta"}, {"tat", "tt"},
{"tel", "te"}, {"tgk", "tg"}, {"tha", "th"}, {"tir", "ti"},
{"ton", "to"}, {"tur", "tr"}, {"uig", "ug"}, {"ukr", "uk"},
{"urd", "ur"}, {"uzb", "uz"}, {"vie", "vi"}, {"yid", "yi"},
{"yor", "yo"}, {"zho", "zh"},
};
return map;
}

// Translate a model name through the ISO 639 table. The full name is tried
// first; on a miss the last "_suffix" component is dropped and the lookup is
// repeated, so compound names resolve via their base name
// (deu_latf -> deu -> "de", chi_tra_vert -> chi_tra -> "zh").
// If no candidate matches, the model name is returned unchanged.
static std::string LookupIso639(const std::string &model) {
  const auto &table = Iso639Map();
  // Starts as the complete name and loses one trailing component per pass.
  std::string candidate = model;
  for (;;) {
    const auto match = table.find(candidate);
    if (match != table.end()) {
      return match->second;
    }
    const auto underscore = candidate.rfind('_');
    if (underscore == std::string::npos) {
      // Nothing left to strip: fall back to the caller's original name.
      return model;
    }
    candidate.erase(underscore);
  }
}

static const std::unordered_set<std::string> &ScriptNames() {
static const std::unordered_set<std::string> set{
"Arabic", "Armenian", "Bengali", "Canadian_Aboriginal", "Cherokee",
"Cyrillic", "Devanagari", "Ethiopic", "Fraktur", "Georgian", "Greek",
Comment on lines +550 to +553
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ScriptNames() also uses new for a function-local static and leaks it. Prefer a function-local static const std::unordered_set<std::string> object unless there's a measured reason to avoid its destructor at shutdown.

Copilot uses AI. Check for mistakes.
"Gujarati", "Gurmukhi", "HanS", "HanS_vert", "HanT", "HanT_vert",
"Hangul", "Hangul_vert", "Hebrew", "Japanese", "Japanese_vert",
"Kannada", "Khmer", "Lao", "Latin", "Malayalam", "Myanmar", "Oriya",
"Sinhala", "Syriac", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan",
"Vietnamese",
};
return set;
}

bool TessHOcrRenderer::BeginDocumentHandler() {
AppendString(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
Expand All @@ -496,8 +579,43 @@ bool TessHOcrRenderer::BeginDocumentHandler() {
if (font_info_) {
AppendString(" ocrp_font ocrp_fsize");
}
AppendString("'/>\n");
if (!input_languages_.empty()) {
std::string langs;
std::string scripts;
const auto &script_set = ScriptNames();
std::istringstream stream(input_languages_);
std::string token;
while (std::getline(stream, token, '+')) {
if (token.empty()) {
Comment on lines 563 to +590
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new header metadata behavior (splitting/mapping languages and emitting new meta tags) isn't covered by existing unit tests. Since src/api/hocrrenderer.cpp already has test coverage for other hOCR functionality, it would be good to add a focused test that exercises TessHOcrRenderer::BeginDocumentHandler() output with (1) languages set and (2) languages unset, and asserts the exact meta tag(s) emitted (including proper escaping).

Copilot uses AI. Check for mistakes.
continue;
}
if (script_set.count(token)) {
if (!scripts.empty()) {
scripts += ' ';
}
scripts += token;
} else {
if (!langs.empty()) {
langs += ' ';
}
langs += LookupIso639(token);
}
Comment on lines +583 to +603
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation transforms the -l string by splitting on +, converting some tokens (e.g. eng->en) and joining with spaces, so the emitted <meta name='ocr-langs' ...> does not preserve the original -l value (e.g. eng+fra). That differs from the PR description/issue request and also makes ocr-langs inconsistent with the existing lang='eng' attributes emitted elsewhere in hOCR. Please either emit the raw -l value (escaped) or update the approach/spec and ensure consistency across the document.

Copilot uses AI. Check for mistakes.
}
if (!langs.empty()) {
std::string escaped_langs = HOcrEscape(langs.c_str());
AppendString(" <meta name='ocr-langs' content='");
AppendString(escaped_langs.c_str());
AppendString("' />\n");
}
if (!scripts.empty()) {
std::string escaped_scripts = HOcrEscape(scripts.c_str());
AppendString(" <meta name='ocr-scripts' content='");
AppendString(escaped_scripts.c_str());
AppendString("' />\n");
}
}
AppendString(
"'/>\n"
" </head>\n"
" <body>\n");

Expand Down
1 change: 1 addition & 0 deletions src/tesseract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,
bool font_info;
api.GetBoolVariable("hocr_font_info", &font_info);
auto renderer = std::make_unique<tesseract::TessHOcrRenderer>(outputbase, font_info);
renderer->SetInputLanguages(api.GetInitLanguagesAsString());
if (renderer->happy()) {
renderers.push_back(std::move(renderer));
} else {
Expand Down