diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
index 3b31b65aae..717c6afcd9 100644
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@@ -765,6 +765,8 @@ int TessBaseAPI::Recognize(ETEXT_DESC *monitor) {
   delete page_res_;
   if (block_list_->empty()) {
     page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_);
+    // An empty page still counts as a completed recognition pass.
+    recognition_done_ = true;
     return 0; // Empty page.
   }
 
diff --git a/unittest/baseapi_test.cc b/unittest/baseapi_test.cc
index 9aa4702405..6c497b64fe 100644
--- a/unittest/baseapi_test.cc
+++ b/unittest/baseapi_test.cc
@@ -157,6 +157,39 @@ TEST_F(TesseractTest, HOCRContainsBaseline) {
   src_pix.destroy();
 }
 
+// Tests that the hOCR, UTF-8 text and TSV outputs are all non-null on an
+// empty page.
+// Regression test for https://github.com/tesseract-ocr/tesseract/issues/4112
+TEST_F(TesseractTest, EmptyPageOutputConsistency) {
+  tesseract::TessBaseAPI api;
+  if (api.Init(TessdataPath().c_str(), "eng", tesseract::OEM_LSTM_ONLY) == -1) {
+    // eng.traineddata not found.
+    GTEST_SKIP();
+  }
+  // Create a blank white image (no text to detect).
+  Image blank_pix = pixCreate(200, 200, 8);
+  CHECK(blank_pix);
+  pixSetAll(blank_pix);
+  api.SetImage(blank_pix);
+  // SetImage() takes its own copy of the image, so release ours right away;
+  // a failing ASSERT below then cannot leak it.
+  blank_pix.destroy();
+  ASSERT_EQ(api.Recognize(nullptr), 0);
+
+  // Each output format should return non-null, even on an empty page.
+  char *hocr = api.GetHOCRText(0);
+  EXPECT_TRUE(hocr != nullptr);
+  delete[] hocr;
+
+  char *utf8 = api.GetUTF8Text();
+  EXPECT_TRUE(utf8 != nullptr);
+  delete[] utf8;
+
+  char *tsv = api.GetTSVText(0);
+  EXPECT_TRUE(tsv != nullptr);
+  delete[] tsv;
+}
+
 // Tests that Tesseract gets exactly the right answer on some page numbers.
 TEST_F(TesseractTest, AdaptToWordStrTest) {
 #ifdef DISABLED_LEGACY_ENGINE