diff --git a/include/tesseract/renderer.h b/include/tesseract/renderer.h index a8745a09ee..89ba1f3cb0 100644 --- a/include/tesseract/renderer.h +++ b/include/tesseract/renderer.h @@ -173,6 +173,8 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer { explicit TessHOcrRenderer(const char *outputbase, bool font_info); explicit TessHOcrRenderer(const char *outputbase); + void SetInputLanguages(const char *languages); + protected: bool BeginDocumentHandler() override; bool AddImageHandler(TessBaseAPI *api) override; @@ -180,6 +182,7 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer { private: bool font_info_; // whether to print font information + std::string input_languages_; }; /** diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index 5f319ecac1..3de22b3719 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -18,9 +18,12 @@ **********************************************************************/ #include // for TessBaseAPI +#include // for strcmp #include // for std::locale::classic #include // for std::unique_ptr #include // for std::stringstream +#include // for std::unordered_map +#include // for std::unordered_set #include #include "helpers.h" // for copy_string #include "tesseractclass.h" // for Tesseract @@ -477,6 +480,86 @@ TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info) font_info_ = font_info; } +void TessHOcrRenderer::SetInputLanguages(const char *languages) { + if (languages && languages[0] != '\0') { + input_languages_ = languages; + } else { + input_languages_.clear(); + } +} + +static const std::unordered_map &Iso639Map() { + static const std::unordered_map map{ + {"afr", "af"}, {"amh", "am"}, {"ara", "ar"}, {"asm", "as"}, + {"aze", "az"}, {"bel", "be"}, {"ben", "bn"}, {"bod", "bo"}, + {"bos", "bs"}, {"bre", "br"}, {"bul", "bg"}, {"cat", "ca"}, + {"ceb", "ceb"}, {"ces", "cs"}, {"chi_sim", "zh"}, {"chi_tra", "zh"}, + {"chr", "chr"}, {"cos", "co"}, {"cym", "cy"}, {"dan", "da"}, + {"deu", "de"}, {"div", "dv"}, {"dzo", "dz"}, {"ell", "el"}, + {"eng", "en"}, {"enm", "en"}, {"epo", "eo"}, {"est", "et"}, + {"eus", "eu"}, {"fao", "fo"}, {"fas", "fa"}, {"fil", "fil"}, + {"fin", "fi"}, {"fra", "fr"}, {"frk", "de"}, {"frm", "fr"}, + {"fry", "fy"}, {"gla", "gd"}, {"gle", "ga"}, {"glg", "gl"}, + {"grc", "el"}, {"guj", "gu"}, {"hat", "ht"}, {"heb", "he"}, + {"hin", "hi"}, {"hrv", "hr"}, {"hun", "hu"}, {"hye", "hy"}, + {"iku", "iu"}, {"ind", "id"}, {"isl", "is"}, {"ita", "it"}, + {"jav", "jv"}, {"jpn", "ja"}, {"kan", "kn"}, {"kat", "ka"}, + {"kaz", "kk"}, {"khm", "km"}, {"kir", "ky"}, {"kmr", "ku"}, + {"kor", "ko"}, {"lao", "lo"}, {"lat", "la"}, {"lav", "lv"}, + {"lit", "lt"}, {"ltz", "lb"}, {"mal", "ml"}, {"mar", "mr"}, + {"mkd", "mk"}, {"mlt", "mt"}, {"mon", "mn"}, {"mri", "mi"}, + {"msa", "ms"}, {"mya", "my"}, {"nep", "ne"}, {"nld", "nl"}, + {"nor", "no"}, {"oci", "oc"}, {"ori", "or"}, {"pan", "pa"}, + {"pol", "pl"}, {"por", "pt"}, {"pus", "ps"}, {"que", "qu"}, + {"ron", "ro"}, {"rus", "ru"}, {"san", "sa"}, {"sin", "si"}, + {"slk", "sk"}, {"slv", "sl"}, {"snd", "sd"}, {"spa", "es"}, + {"sqi", "sq"}, {"srp", "sr"}, {"sun", "su"}, {"swa", "sw"}, + {"swe", "sv"}, {"syr", "syr"}, {"tam", "ta"}, {"tat", "tt"}, + {"tel", "te"}, {"tgk", "tg"}, {"tha", "th"}, {"tir", "ti"}, + {"ton", "to"}, {"tur", "tr"}, {"uig", "ug"}, {"ukr", "uk"}, + {"urd", "ur"}, {"uzb", "uz"}, {"vie", "vi"}, {"yid", "yi"}, + {"yor", "yo"}, {"zho", "zh"}, + }; + return map; +} + +// Look up a model name in the ISO 639 map, trying progressively shorter +// prefixes by stripping trailing _suffix parts. This handles compound +// model names like deu_latf -> deu -> "de", chi_tra_vert -> chi_tra -> "zh". +static std::string LookupIso639(const std::string &model) { + const auto &iso_map = Iso639Map(); + auto it = iso_map.find(model); + if (it != iso_map.end()) { + return it->second; + } + std::string prefix = model; + while (true) { + auto pos = prefix.rfind('_'); + if (pos == std::string::npos) { + break; + } + prefix = prefix.substr(0, pos); + it = iso_map.find(prefix); + if (it != iso_map.end()) { + return it->second; + } + } + return model; +} + +static const std::unordered_set &ScriptNames() { + static const std::unordered_set set{ + "Arabic", "Armenian", "Bengali", "Canadian_Aboriginal", "Cherokee", + "Cyrillic", "Devanagari", "Ethiopic", "Fraktur", "Georgian", "Greek", + "Gujarati", "Gurmukhi", "HanS", "HanS_vert", "HanT", "HanT_vert", + "Hangul", "Hangul_vert", "Hebrew", "Japanese", "Japanese_vert", + "Kannada", "Khmer", "Lao", "Latin", "Malayalam", "Myanmar", "Oriya", + "Sinhala", "Syriac", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", + "Vietnamese", + }; + return set; +} + bool TessHOcrRenderer::BeginDocumentHandler() { AppendString( "\n" @@ -496,8 +579,43 @@ bool TessHOcrRenderer::BeginDocumentHandler() { if (font_info_) { AppendString(" ocrp_font ocrp_fsize"); } + AppendString("'/>\n"); + if (!input_languages_.empty()) { + std::string langs; + std::string scripts; + const auto &script_set = ScriptNames(); + std::istringstream stream(input_languages_); + std::string token; + while (std::getline(stream, token, '+')) { + if (token.empty()) { + continue; + } + if (script_set.count(token)) { + if (!scripts.empty()) { + scripts += ' '; + } + scripts += token; + } else { + if (!langs.empty()) { + langs += ' '; + } + langs += LookupIso639(token); + } + } + if (!langs.empty()) { + std::string escaped_langs = HOcrEscape(langs.c_str()); + AppendString(" \n"); + } + if (!scripts.empty()) { + std::string escaped_scripts = HOcrEscape(scripts.c_str()); + AppendString(" \n"); + } + } AppendString( - "'/>\n" " \n" " \n"); diff --git a/src/tesseract.cpp b/src/tesseract.cpp index 8ed2c1d52f..3e7e905554 100644 --- a/src/tesseract.cpp +++ b/src/tesseract.cpp @@ -516,6 +516,7 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api, bool font_info; api.GetBoolVariable("hocr_font_info", &font_info); auto renderer = std::make_unique(outputbase, font_info); + renderer->SetInputLanguages(api.GetInitLanguagesAsString()); if (renderer->happy()) { renderers.push_back(std::move(renderer)); } else {