From 5b5d6cca9a4202a78f15edba53715e2a0d291d6b Mon Sep 17 00:00:00 2001
From: ksqsf
Date: Fri, 6 Mar 2026 20:41:53 +0100
Subject: [PATCH] feat(syllabifier): trim leading delimiters

---
Review notes (this section is not part of the commit message):
- The new tests read the end-vertex type through the key that the preceding
  find() assertion checked (3 or 5) and the spelling through
  syllable_id_["a"]. Assuming these containers have std::map-style
  operator[], indexing a missing key (vertices[1], sp[0]) would insert a
  default entry and make the expectation pass vacuously.
- The post-image blob hash on the test file's "index" line predates this
  revision and was not recomputed.

 src/rime/algo/syllabifier.cc   | 14 ++++++++--
 src/rime/gear/abc_segmentor.cc |  2 +-
 test/syllabifier_test.cc       | 51 ++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/src/rime/algo/syllabifier.cc b/src/rime/algo/syllabifier.cc
index d3577628ac..b7ed6b7973 100644
--- a/src/rime/algo/syllabifier.cc
+++ b/src/rime/algo/syllabifier.cc
@@ -53,12 +53,19 @@ int Syllabifier::BuildSyllableGraph(const string& input,
 
     if (current_pos > farthest)
       farthest = current_pos;
-    DLOG(INFO) << "current_pos: " << current_pos;
+
+    // consume leading delimiters
+    size_t begin_pos = current_pos;
+    while (begin_pos < input.length() &&
+           delimiters_.find(input[begin_pos]) != string::npos)
+      ++begin_pos;
+    DLOG(INFO) << "current_pos: " << current_pos
+               << ", begin_pos: " << begin_pos;
 
     // see where we can go by advancing a syllable
     vector<Prism::Match> matches;
     set<SyllableId> exact_match_syllables;
-    auto current_input = input.substr(current_pos);
+    auto current_input = input.substr(begin_pos);
     prism.CommonPrefixSearch(current_input, &matches);
     if (corrector_) {
       for (auto& m : matches) {
@@ -78,12 +85,13 @@ int Syllabifier::BuildSyllableGraph(const string& input,
       }
     }
 
+    size_t leading_gap = begin_pos - current_pos;
     if (!matches.empty()) {
       auto& end_vertices(graph->edges[current_pos]);
       for (const auto& m : matches) {
         if (m.length == 0)
           continue;
-        size_t end_pos = current_pos + m.length;
+        size_t end_pos = current_pos + leading_gap + m.length;
         // consume trailing delimiters
         while (end_pos < input.length() &&
                delimiters_.find(input[end_pos]) != string::npos)
diff --git a/src/rime/gear/abc_segmentor.cc b/src/rime/gear/abc_segmentor.cc
index f99bd55f24..1e9deb8883 100644
--- a/src/rime/gear/abc_segmentor.cc
+++ b/src/rime/gear/abc_segmentor.cc
@@ -44,7 +44,7 @@ bool AbcSegmentor::Proceed(Segmentation* segmentation) {
   bool expecting_an_initial = true;
   for (; k < input.length(); ++k) {
     bool is_letter = alphabet_.find(input[k]) != string::npos;
-    bool is_delimiter = (k != j) && (delimiter_.find(input[k]) != string::npos);
+    bool is_delimiter = (k != 0) && (delimiter_.find(input[k]) != string::npos);
     if (!is_letter && !is_delimiter)
       break;
     bool is_initial = initials_.find(input[k]) != string::npos;
diff --git a/test/syllabifier_test.cc b/test/syllabifier_test.cc
index 6dc4430d34..fa748ff39d 100644
--- a/test/syllabifier_test.cc
+++ b/test/syllabifier_test.cc
@@ -160,3 +160,54 @@ TEST_F(RimeSyllabifierTest, TransposedSyllableGraph) {
   ASSERT_FALSE(NULL == g.indices[0][syllable_id_["chan"]][0]);
   EXPECT_EQ(4, g.indices[0][syllable_id_["chan"]][0]->end_pos);
 }
+
+TEST_F(RimeSyllabifierTest, TrimLeadingDelimiters) {
+  rime::Syllabifier s(" '");
+  rime::SyllableGraph g;
+  const rime::string input("''a");
+  s.BuildSyllableGraph(input, *prism_, &g);
+  EXPECT_EQ(input.length(), g.input_length);
+  EXPECT_EQ(input.length(), g.interpreted_length);
+  EXPECT_EQ(2, g.vertices.size());
+  ASSERT_FALSE(g.vertices.end() == g.vertices.find(3));
+  EXPECT_EQ(rime::kNormalSpelling, g.vertices[3]);
+  rime::SpellingMap& sp(g.edges[0][3]);
+  EXPECT_EQ(1, sp.size());
+  ASSERT_FALSE(sp.end() == sp.find(syllable_id_["a"]));
+  EXPECT_EQ(rime::kNormalSpelling, sp[syllable_id_["a"]].type);
+  EXPECT_EQ(0.0, sp[syllable_id_["a"]].credibility);
+}
+
+TEST_F(RimeSyllabifierTest, TrimTrailingDelimiters) {
+  rime::Syllabifier s(" '");
+  rime::SyllableGraph g;
+  const rime::string input("a''");
+  s.BuildSyllableGraph(input, *prism_, &g);
+  EXPECT_EQ(input.length(), g.input_length);
+  EXPECT_EQ(input.length(), g.interpreted_length);
+  EXPECT_EQ(2, g.vertices.size());
+  ASSERT_FALSE(g.vertices.end() == g.vertices.find(3));
+  EXPECT_EQ(rime::kNormalSpelling, g.vertices[3]);
+  rime::SpellingMap& sp(g.edges[0][3]);
+  EXPECT_EQ(1, sp.size());
+  ASSERT_FALSE(sp.end() == sp.find(syllable_id_["a"]));
+  EXPECT_EQ(rime::kNormalSpelling, sp[syllable_id_["a"]].type);
+  EXPECT_EQ(0.0, sp[syllable_id_["a"]].credibility);
+}
+
+TEST_F(RimeSyllabifierTest, TrimBothLeadingAndTrailingDelimiters) {
+  rime::Syllabifier s(" '");
+  rime::SyllableGraph g;
+  const rime::string input("''a''");
+  s.BuildSyllableGraph(input, *prism_, &g);
+  EXPECT_EQ(input.length(), g.input_length);
+  EXPECT_EQ(input.length(), g.interpreted_length);
+  EXPECT_EQ(2, g.vertices.size());
+  ASSERT_FALSE(g.vertices.end() == g.vertices.find(5));
+  EXPECT_EQ(rime::kNormalSpelling, g.vertices[5]);
+  rime::SpellingMap& sp(g.edges[0][5]);
+  EXPECT_EQ(1, sp.size());
+  ASSERT_FALSE(sp.end() == sp.find(syllable_id_["a"]));
+  EXPECT_EQ(rime::kNormalSpelling, sp[syllable_id_["a"]].type);
+  EXPECT_EQ(0.0, sp[syllable_id_["a"]].credibility);
+}