-
Notifications
You must be signed in to change notification settings - Fork 10.6k
Add languages parameter to hOCR output metadata #4531
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,9 +18,12 @@ | |
| **********************************************************************/ | ||
|
|
||
| #include <tesseract/baseapi.h> // for TessBaseAPI | ||
| #include <cstring> // for strcmp | ||
| #include <locale> // for std::locale::classic | ||
| #include <memory> // for std::unique_ptr | ||
| #include <sstream> // for std::stringstream | ||
| #include <unordered_map> // for std::unordered_map | ||
| #include <unordered_set> // for std::unordered_set | ||
| #include <tesseract/renderer.h> | ||
| #include "helpers.h" // for copy_string | ||
| #include "tesseractclass.h" // for Tesseract | ||
|
|
@@ -477,6 +480,86 @@ TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info) | |
| font_info_ = font_info; | ||
| } | ||
|
|
||
| void TessHOcrRenderer::SetInputLanguages(const char *languages) { | ||
| if (languages && languages[0] != '\0') { | ||
| input_languages_ = languages; | ||
| } else { | ||
| input_languages_.clear(); | ||
| } | ||
| } | ||
|
|
||
| static const std::unordered_map<std::string, std::string> &Iso639Map() { | ||
| static const std::unordered_map<std::string, std::string> map{ | ||
| {"afr", "af"}, {"amh", "am"}, {"ara", "ar"}, {"asm", "as"}, | ||
| {"aze", "az"}, {"bel", "be"}, {"ben", "bn"}, {"bod", "bo"}, | ||
|
Comment on lines
+491
to
+494
|
||
| {"bos", "bs"}, {"bre", "br"}, {"bul", "bg"}, {"cat", "ca"}, | ||
| {"ceb", "ceb"}, {"ces", "cs"}, {"chi_sim", "zh"}, {"chi_tra", "zh"}, | ||
| {"chr", "chr"}, {"cos", "co"}, {"cym", "cy"}, {"dan", "da"}, | ||
| {"deu", "de"}, {"div", "dv"}, {"dzo", "dz"}, {"ell", "el"}, | ||
| {"eng", "en"}, {"enm", "en"}, {"epo", "eo"}, {"est", "et"}, | ||
| {"eus", "eu"}, {"fao", "fo"}, {"fas", "fa"}, {"fil", "fil"}, | ||
| {"fin", "fi"}, {"fra", "fr"}, {"frk", "de"}, {"frm", "fr"}, | ||
| {"fry", "fy"}, {"gla", "gd"}, {"gle", "ga"}, {"glg", "gl"}, | ||
| {"grc", "el"}, {"guj", "gu"}, {"hat", "ht"}, {"heb", "he"}, | ||
| {"hin", "hi"}, {"hrv", "hr"}, {"hun", "hu"}, {"hye", "hy"}, | ||
| {"iku", "iu"}, {"ind", "id"}, {"isl", "is"}, {"ita", "it"}, | ||
| {"jav", "jv"}, {"jpn", "ja"}, {"kan", "kn"}, {"kat", "ka"}, | ||
| {"kaz", "kk"}, {"khm", "km"}, {"kir", "ky"}, {"kmr", "ku"}, | ||
| {"kor", "ko"}, {"lao", "lo"}, {"lat", "la"}, {"lav", "lv"}, | ||
| {"lit", "lt"}, {"ltz", "lb"}, {"mal", "ml"}, {"mar", "mr"}, | ||
| {"mkd", "mk"}, {"mlt", "mt"}, {"mon", "mn"}, {"mri", "mi"}, | ||
| {"msa", "ms"}, {"mya", "my"}, {"nep", "ne"}, {"nld", "nl"}, | ||
| {"nor", "no"}, {"oci", "oc"}, {"ori", "or"}, {"pan", "pa"}, | ||
| {"pol", "pl"}, {"por", "pt"}, {"pus", "ps"}, {"que", "qu"}, | ||
| {"ron", "ro"}, {"rus", "ru"}, {"san", "sa"}, {"sin", "si"}, | ||
| {"slk", "sk"}, {"slv", "sl"}, {"snd", "sd"}, {"spa", "es"}, | ||
| {"sqi", "sq"}, {"srp", "sr"}, {"sun", "su"}, {"swa", "sw"}, | ||
| {"swe", "sv"}, {"syr", "syr"}, {"tam", "ta"}, {"tat", "tt"}, | ||
| {"tel", "te"}, {"tgk", "tg"}, {"tha", "th"}, {"tir", "ti"}, | ||
| {"ton", "to"}, {"tur", "tr"}, {"uig", "ug"}, {"ukr", "uk"}, | ||
| {"urd", "ur"}, {"uzb", "uz"}, {"vie", "vi"}, {"yid", "yi"}, | ||
| {"yor", "yo"}, {"zho", "zh"}, | ||
| }; | ||
| return map; | ||
| } | ||
|
|
||
| // Look up a model name in the ISO 639 map, trying progressively shorter | ||
| // prefixes by stripping trailing _suffix parts. This handles compound | ||
| // model names like deu_latf -> deu -> "de", chi_tra_vert -> chi_tra -> "zh". | ||
| static std::string LookupIso639(const std::string &model) { | ||
| const auto &iso_map = Iso639Map(); | ||
| auto it = iso_map.find(model); | ||
| if (it != iso_map.end()) { | ||
| return it->second; | ||
| } | ||
| std::string prefix = model; | ||
| while (true) { | ||
| auto pos = prefix.rfind('_'); | ||
| if (pos == std::string::npos) { | ||
| break; | ||
| } | ||
| prefix = prefix.substr(0, pos); | ||
| it = iso_map.find(prefix); | ||
| if (it != iso_map.end()) { | ||
| return it->second; | ||
| } | ||
| } | ||
| return model; | ||
| } | ||
|
|
||
| static const std::unordered_set<std::string> &ScriptNames() { | ||
| static const std::unordered_set<std::string> set{ | ||
| "Arabic", "Armenian", "Bengali", "Canadian_Aboriginal", "Cherokee", | ||
| "Cyrillic", "Devanagari", "Ethiopic", "Fraktur", "Georgian", "Greek", | ||
|
Comment on lines
+550
to
+553
|
||
| "Gujarati", "Gurmukhi", "HanS", "HanS_vert", "HanT", "HanT_vert", | ||
| "Hangul", "Hangul_vert", "Hebrew", "Japanese", "Japanese_vert", | ||
| "Kannada", "Khmer", "Lao", "Latin", "Malayalam", "Myanmar", "Oriya", | ||
| "Sinhala", "Syriac", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", | ||
| "Vietnamese", | ||
| }; | ||
| return set; | ||
| } | ||
|
|
||
| bool TessHOcrRenderer::BeginDocumentHandler() { | ||
| AppendString( | ||
| "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" | ||
|
|
@@ -496,8 +579,43 @@ bool TessHOcrRenderer::BeginDocumentHandler() { | |
| if (font_info_) { | ||
| AppendString(" ocrp_font ocrp_fsize"); | ||
| } | ||
| AppendString("'/>\n"); | ||
| if (!input_languages_.empty()) { | ||
| std::string langs; | ||
| std::string scripts; | ||
| const auto &script_set = ScriptNames(); | ||
| std::istringstream stream(input_languages_); | ||
| std::string token; | ||
| while (std::getline(stream, token, '+')) { | ||
| if (token.empty()) { | ||
|
Comment on lines
563
to
+590
|
||
| continue; | ||
| } | ||
| if (script_set.count(token)) { | ||
| if (!scripts.empty()) { | ||
| scripts += ' '; | ||
| } | ||
| scripts += token; | ||
| } else { | ||
| if (!langs.empty()) { | ||
| langs += ' '; | ||
| } | ||
| langs += LookupIso639(token); | ||
| } | ||
|
Comment on lines
+583
to
+603
|
||
| } | ||
| if (!langs.empty()) { | ||
| std::string escaped_langs = HOcrEscape(langs.c_str()); | ||
| AppendString(" <meta name='ocr-langs' content='"); | ||
| AppendString(escaped_langs.c_str()); | ||
| AppendString("' />\n"); | ||
| } | ||
| if (!scripts.empty()) { | ||
| std::string escaped_scripts = HOcrEscape(scripts.c_str()); | ||
| AppendString(" <meta name='ocr-scripts' content='"); | ||
| AppendString(escaped_scripts.c_str()); | ||
| AppendString("' />\n"); | ||
| } | ||
| } | ||
| AppendString( | ||
| "'/>\n" | ||
| " </head>\n" | ||
| " <body>\n"); | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding
std::string input_languages_toTessHOcrRendererchanges the size/layout of this exported (TESS_API) C++ class, which can break binary compatibility for downstream code that links against libtesseract and instantiatesTessHOcrRenderer. If ABI stability is a concern, consider storing the new state behind an indirection (pimpl/opaque pointer) or in a separate internal structure to minimize ABI impact.