-
-
Notifications
You must be signed in to change notification settings - Fork 34.4k
gh-130273: Add pure Python implementation of unicodedata.iter_graphemes() #148218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
ambv
wants to merge
6
commits into
python:main
Choose a base branch
from
ambv:gh-130273-iter-graphemes
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 1 commit
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
df60e53
Add pure Python implementation of unicodedata.iter_graphemes()
ambv 6262980
Add _py_grapheme to stdlib_module_names.h
ambv 38db422
Make the first argument positional-only
ambv 70bdb56
Make _py_grapheme standalone by generating property tables
ambv 5701c0b
Fix newlines to make linter happy
ambv e073e06
Achieve 100% statement and branch test coverage
ambv File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,198 @@ | ||
| """Pure Python implementation of unicodedata.iter_graphemes(). | ||
|
|
||
| Uses the extended grapheme cluster rules from Unicode TR29. | ||
| """ | ||
|
|
||
| import sys | ||
| import unicodedata | ||
|
|
||
|
|
||
class Segment:
    """A grapheme cluster located inside a larger string.

    ``start`` and ``end`` are the slice bounds of the cluster within the
    string the segment was created from; ``str(segment)`` returns exactly
    that slice.
    """

    # Fixed attribute set: no per-instance __dict__ is created.
    __slots__ = ('_string', 'start', 'end')

    def __init__(self, string, start, end):
        self._string, self.start, self.end = string, start, end

    def __str__(self):
        # The text is sliced on demand; only the bounds and a reference
        # to the source string are stored.
        lo, hi = self.start, self.end
        return self._string[lo:hi]

    def __repr__(self):
        return f"<Segment {self.start}:{self.end}>"
|
|
||
|
|
||
# Grapheme_Cluster_Break property values.  These are compared against the
# result of unicodedata.grapheme_cluster_break() in this module.
_GCB_Other = "Other"
_GCB_Prepend = "Prepend"
_GCB_CR = "CR"
_GCB_LF = "LF"
_GCB_Control = "Control"
_GCB_Extend = "Extend"
_GCB_Regional_Indicator = "Regional_Indicator"
_GCB_SpacingMark = "SpacingMark"
_GCB_L = "L"
_GCB_V = "V"
_GCB_T = "T"
_GCB_LV = "LV"
_GCB_LVT = "LVT"
_GCB_ZWJ = "ZWJ"

# Indic_Conjunct_Break property values.  These are compared against the
# result of unicodedata.indic_conjunct_break() in this module.
_InCB_None = "None"
_InCB_Linker = "Linker"
_InCB_Consonant = "Consonant"
_InCB_Extend = "Extend"

# States of the Extended_Pictographic state machine used for rule GB11.
_EP_INIT, _EP_STARTED, _EP_ZWJ, _EP_MATCHED = range(4)

# States of the Indic_Conjunct_Break state machine used for rule GB9c.
_INCB_INIT, _INCB_STARTED, _INCB_LINKER, _INCB_MATCHED = range(4)
|
|
||
|
|
||
def _update_ext_pict_state(state, gcb, ext_pict):
    """Advance the GB11 (emoji ZWJ sequence) state machine by one character.

    *state* is the current ``_EP_*`` state, *gcb* the character's
    Grapheme_Cluster_Break value and *ext_pict* whether the character is
    Extended_Pictographic.  Returns the new ``_EP_*`` state;
    ``_EP_MATCHED`` is produced only for a pictographic character that
    arrives while the machine is in the ``_EP_ZWJ`` state.
    """
    if ext_pict:
        if state == _EP_ZWJ:
            return _EP_MATCHED
        return _EP_STARTED

    # Extend keeps a sequence open and ZWJ arms the match, but only when
    # the previous state was STARTED or MATCHED.
    in_sequence = state in (_EP_STARTED, _EP_MATCHED)
    if in_sequence and gcb == _GCB_Extend:
        return _EP_STARTED
    if in_sequence and gcb == _GCB_ZWJ:
        return _EP_ZWJ
    return _EP_INIT
|
|
||
|
|
||
def _update_incb_state(state, incb):
    """Advance the GB9c (Indic conjunct) state machine by one character.

    *state* is the current ``_INCB_*`` state and *incb* the character's
    Indic_Conjunct_Break value.  Returns the new ``_INCB_*`` state;
    ``_INCB_MATCHED`` is produced only for a consonant that arrives while
    the machine is in the ``_INCB_LINKER`` state.
    """
    if incb == _InCB_Consonant:
        if state == _INCB_LINKER:
            return _INCB_MATCHED
        return _INCB_STARTED

    if state == _INCB_INIT:
        # Nothing to continue: no consonant has opened a sequence.
        return _INCB_INIT

    if incb == _InCB_Linker:
        return _INCB_LINKER
    if incb == _InCB_Extend:
        # Extend preserves an already-seen linker; otherwise the sequence
        # is merely kept open.
        if state == _INCB_LINKER:
            return _INCB_LINKER
        return _INCB_STARTED
    return _INCB_INIT
|
|
||
|
|
||
def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
    """Return True if a grapheme cluster break occurs between two characters.

    *prev_gcb* and *curr_gcb* are the Grapheme_Cluster_Break values of the
    characters on either side of the candidate boundary.  *ep_state* and
    *incb_state* are the GB11 and GB9c state machine states, and *ri_flag*
    is the value returned when both characters are Regional_Indicator.
    The rules are evaluated in order and the first applicable rule decides.
    """
    controls = (_GCB_CR, _GCB_LF, _GCB_Control)

    # GB3: Do not break between a CR and LF.
    if (prev_gcb, curr_gcb) == (_GCB_CR, _GCB_LF):
        return False

    # GB4, GB5: Break after and before controls.
    if prev_gcb in controls or curr_gcb in controls:
        return True

    # GB6, GB7, GB8: Do not break Hangul syllable sequences.  Which
    # followers keep the syllable together depends on the previous value.
    if prev_gcb == _GCB_L:
        hangul_followers = (_GCB_L, _GCB_V, _GCB_LV, _GCB_LVT)
    elif prev_gcb in (_GCB_LV, _GCB_V):
        hangul_followers = (_GCB_V, _GCB_T)
    elif prev_gcb in (_GCB_LVT, _GCB_T):
        hangul_followers = (_GCB_T,)
    else:
        hangul_followers = ()
    if curr_gcb in hangul_followers:
        return False

    # GB9, GB9a: Do not break before extending characters, ZWJ or
    # SpacingMarks.
    if curr_gcb in (_GCB_Extend, _GCB_ZWJ, _GCB_SpacingMark):
        return False

    # GB9b: Do not break after Prepend characters.
    if prev_gcb == _GCB_Prepend:
        return False

    # GB9c, GB11: Do not break within Indic conjunct clusters or emoji
    # ZWJ sequences.
    if incb_state == _INCB_MATCHED or ep_state == _EP_MATCHED:
        return False

    # GB12/GB13: Do not break within emoji flag sequences.
    if prev_gcb == curr_gcb == _GCB_Regional_Indicator:
        return ri_flag

    # GB999: Otherwise, break everywhere.
    return True
|
|
||
|
|
||
def iter_graphemes(string, start=0, end=sys.maxsize):
    """Iterate over grapheme clusters in a string.

    Uses extended grapheme cluster rules from TR29.

    *start* and *end* select the part of *string* to segment.  They are
    treated like slice bounds: a negative value counts from the end of
    the string and out-of-range values are clamped.

    Returns an iterator yielding Segment objects with start/end attributes
    and str() support.

    Raises TypeError if *string* is not a str, or if *start* or *end*
    cannot be used as an integer index.
    """
    if not isinstance(string, str):
        raise TypeError(
            "iter_graphemes() argument 1 must be str, not "
            f"{type(string).__name__}"
        )

    # Validate the bounds eagerly.  The segmentation itself runs lazily in
    # a generator, so without this a bad bound would only be reported when
    # the caller first advances the returned iterator.
    import operator
    start = operator.index(start)
    end = operator.index(end)

    length = len(string)
    # Adjust indices (matching CPython's ADJUST_INDICES macro)
    if end > length:
        end = length
    elif end < 0:
        end = max(end + length, 0)
    if start < 0:
        start = max(start + length, 0)

    return _iter_grapheme_clusters(string, start, end)
|
|
||
|
|
||
def _iter_grapheme_clusters(string, start, end):
    """Generate a Segment for each grapheme cluster in string[start:end].

    The characters are scanned once from *start* up to (not including)
    *end*.  For each character the GB11 and GB9c state machines and the
    regional-indicator flag are updated first, and the break decision
    between the previous and the current character is taken afterwards.
    *start* and *end* are used as given; no clamping is done here.
    """
    prev_gcb = _GCB_Other
    ep_state = _EP_INIT
    incb_state = _INCB_INIT
    # Toggles on each consecutive Regional_Indicator and resets on any
    # other character; it is passed to _grapheme_break() as ri_flag.
    ri_toggle = False

    seg_start = index = start
    while index < end:
        char = string[index]
        curr_gcb = unicodedata.grapheme_cluster_break(char)
        is_ext_pict = unicodedata.extended_pictographic(char)
        incb = unicodedata.indic_conjunct_break(char)

        ep_state = _update_ext_pict_state(ep_state, curr_gcb, is_ext_pict)
        if curr_gcb == _GCB_Regional_Indicator:
            ri_toggle = not ri_toggle
        else:
            ri_toggle = False
        incb_state = _update_incb_state(incb_state, incb)

        # Never break before the first character of the current segment.
        at_boundary = index != seg_start and _grapheme_break(
            prev_gcb, curr_gcb, ep_state, ri_toggle, incb_state
        )
        prev_gcb = curr_gcb

        if at_boundary:
            yield Segment(string, seg_start, index)
            seg_start = index

        index += 1

    # Emit the segment that is still open when the scan ends.
    if seg_start < end:
        yield Segment(string, seg_start, end)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1300,16 +1300,103 @@ class MyStr(str): | |
| self.assertIs(type(normalize(form, MyStr(input_str))), str) | ||
|
|
||
|
|
||
| class GraphemeBreakTest(unittest.TestCase): | ||
| class BaseGraphemeBreakTest: | ||
| iter_graphemes = staticmethod(unicodedata.iter_graphemes) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not needed. It is initialized in subclasses. |
||
|
|
||
| def test_grapheme_break_types(self): | ||
| self.assertRaises(TypeError, self.iter_graphemes) | ||
| self.assertRaises(TypeError, self.iter_graphemes, b'x') | ||
|
|
||
| def test_grapheme_break_empty(self): | ||
| graphemes = self._graphemes | ||
| self.assertEqual(graphemes(''), []) | ||
|
|
||
| def test_grapheme_break_simple(self): | ||
| graphemes = self._graphemes | ||
| self.assertEqual(graphemes('abcd'), ['a', 'b', 'c', 'd']) | ||
| self.assertEqual(graphemes('abcd', 1), ['b', 'c', 'd']) | ||
| self.assertEqual(graphemes('abcd', 1, 3), ['b', 'c']) | ||
| self.assertEqual(graphemes('abcd', -3), ['b', 'c', 'd']) | ||
| self.assertEqual(graphemes('abcd', 1, -1), ['b', 'c']) | ||
| self.assertEqual(graphemes('abcd', 3, 1), []) | ||
| self.assertEqual(graphemes('abcd', 5), []) | ||
| self.assertEqual(graphemes('abcd', 0, 5), ['a', 'b', 'c', 'd']) | ||
| self.assertEqual(graphemes('abcd', -5), ['a', 'b', 'c', 'd']) | ||
| self.assertEqual(graphemes('abcd', 0, -5), []) | ||
|
|
||
| def test_grapheme_break_rules(self): | ||
| graphemes = self._graphemes | ||
| # GB3 | ||
| self.assertEqual(graphemes('\r\n'), ['\r\n']) | ||
| # GB4 | ||
| self.assertEqual(graphemes('\r\u0308'), ['\r', '\u0308']) | ||
| self.assertEqual(graphemes('\n\u0308'), ['\n', '\u0308']) | ||
| self.assertEqual(graphemes('\0\u0308'), ['\0', '\u0308']) | ||
| # GB5 | ||
| self.assertEqual(graphemes('\u06dd\r'), ['\u06dd', '\r']) | ||
| self.assertEqual(graphemes('\u06dd\n'), ['\u06dd', '\n']) | ||
| self.assertEqual(graphemes('\u06dd\0'), ['\u06dd', '\0']) | ||
| # GB6 | ||
| self.assertEqual(graphemes('\u1100\u1160'), ['\u1100\u1160']) | ||
| self.assertEqual(graphemes('\u1100\uAC00'), ['\u1100\uAC00']) | ||
| self.assertEqual(graphemes('\u1100\uAC01'), ['\u1100\uAC01']) | ||
| # GB7 | ||
| self.assertEqual(graphemes('\uAC00\u1160'), ['\uAC00\u1160']) | ||
| self.assertEqual(graphemes('\uAC00\u11A8'), ['\uAC00\u11A8']) | ||
| self.assertEqual(graphemes('\u1160\u1160'), ['\u1160\u1160']) | ||
| self.assertEqual(graphemes('\u1160\u11A8'), ['\u1160\u11A8']) | ||
| # GB8 | ||
| self.assertEqual(graphemes('\uAC01\u11A8'), ['\uAC01\u11A8']) | ||
| self.assertEqual(graphemes('\u11A8\u11A8'), ['\u11A8\u11A8']) | ||
| # GB9 | ||
| self.assertEqual(graphemes('a\u0300'), ['a\u0300']) | ||
| self.assertEqual(graphemes('a\u200D'), ['a\u200D']) | ||
| # GB9a | ||
| self.assertEqual(graphemes('\u0905\u0903'), ['\u0905\u0903']) | ||
| # GB9b | ||
| self.assertEqual(graphemes('\u06dd\u0661'), ['\u06dd\u0661']) | ||
| # GB9c | ||
| self.assertEqual(graphemes('\u0915\u094d\u0924'), | ||
| ['\u0915\u094d\u0924']) | ||
| self.assertEqual(graphemes('\u0915\u094D\u094D\u0924'), | ||
| ['\u0915\u094D\u094D\u0924']) | ||
| self.assertEqual(graphemes('\u0915\u094D\u0924\u094D\u092F'), | ||
| ['\u0915\u094D\u0924\u094D\u092F']) | ||
| # GB11 | ||
| self.assertEqual(graphemes( | ||
| '\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F' | ||
| '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'), | ||
| ['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F' | ||
| '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC']) | ||
| # GB12 | ||
| self.assertEqual(graphemes( | ||
| '\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'), | ||
| ['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']) | ||
| # GB13 | ||
| self.assertEqual(graphemes( | ||
| 'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'), | ||
| ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3']) | ||
|
|
||
| def test_segment_object(self): | ||
| segments = list(self.iter_graphemes('spa\u0300m')) | ||
| self.assertEqual(len(segments), 4, segments) | ||
| segment = segments[2] | ||
| self.assertEqual(segment.start, 2) | ||
| self.assertEqual(segment.end, 4) | ||
| self.assertEqual(str(segment), 'a\u0300') | ||
ambv marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| def _graphemes(self, *args): | ||
| return list(map(str, self.iter_graphemes(*args))) | ||
|
|
||
| @requires_resource('network') | ||
| def test_grapheme_break(self): | ||
| def test_tr29_conformance(self): | ||
| TESTDATAFILE = "GraphemeBreakTest.txt" | ||
| testdata = download_test_data_file(TESTDATAFILE) | ||
|
|
||
| with testdata: | ||
| self.run_grapheme_break_tests(testdata) | ||
| self._run_grapheme_break_tests(testdata) | ||
ambv marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| def run_grapheme_break_tests(self, testdata): | ||
| def _run_grapheme_break_tests(self, testdata): | ||
| for line in testdata: | ||
| line, _, comment = line.partition('#') | ||
| line = line.strip() | ||
|
|
@@ -1330,19 +1417,32 @@ def run_grapheme_break_tests(self, testdata): | |
| self.assertEqual(chunks.pop(), '', line) | ||
| input = ''.join(chunks) | ||
| with self.subTest(line): | ||
| result = list(unicodedata.iter_graphemes(input)) | ||
| result = list(self.iter_graphemes(input)) | ||
| self.assertEqual(list(map(str, result)), chunks, comment) | ||
| self.assertEqual([x.start for x in result], breaks[:-1], comment) | ||
| self.assertEqual([x.end for x in result], breaks[1:], comment) | ||
| self.assertEqual([x.start for x in result], | ||
| breaks[:-1], comment) | ||
| self.assertEqual([x.end for x in result], | ||
| breaks[1:], comment) | ||
| for i in range(1, len(breaks) - 1): | ||
| result = list(unicodedata.iter_graphemes(input, breaks[i])) | ||
| self.assertEqual(list(map(str, result)), chunks[i:], comment) | ||
| self.assertEqual([x.start for x in result], breaks[i:-1], comment) | ||
| self.assertEqual([x.end for x in result], breaks[i+1:], comment) | ||
| result = list(self.iter_graphemes(input, breaks[i])) | ||
| self.assertEqual(list(map(str, result)), | ||
| chunks[i:], comment) | ||
| self.assertEqual([x.start for x in result], | ||
| breaks[i:-1], comment) | ||
| self.assertEqual([x.end for x in result], | ||
| breaks[i+1:], comment) | ||
|
|
||
|
|
||
| class GraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest): | ||
| iter_graphemes = staticmethod(unicodedata.iter_graphemes) | ||
|
|
||
| def test_segment_repr(self): | ||
| segment = list(unicodedata.iter_graphemes('spa\u0300m'))[2] | ||
| self.assertEqual(repr(segment), '<Segment 2:4>') | ||
| self.assertRaises(TypeError, iter, segment) | ||
| self.assertRaises(TypeError, len, segment) | ||
|
|
||
| def test_reference_loops(self): | ||
| # Test that reference loops involving GraphemeBreakIterator or | ||
| # Segment can be broken by the garbage collector. | ||
| class S(str): | ||
| pass | ||
|
|
||
|
|
@@ -1363,5 +1463,12 @@ class S(str): | |
| self.assertIsNone(wr()) | ||
|
|
||
|
|
||
| class PyGraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest): | ||
| @classmethod | ||
| def setUpClass(cls): | ||
| from _py_grapheme import iter_graphemes | ||
| cls.iter_graphemes = staticmethod(iter_graphemes) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| unittest.main() | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.