Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 198 additions & 0 deletions Lib/_py_grapheme.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
"""Pure Python implementation of unicodedata.iter_graphemes().

Uses the extended grapheme cluster rules from Unicode TR29.
"""

import sys
import unicodedata


class Segment:
    """A single grapheme cluster located inside a source string.

    The cluster is described by the half-open index range
    ``[start, end)`` into the string it was found in.  ``str(segment)``
    returns the text of that range; ``repr(segment)`` shows only the
    indices.
    """

    __slots__ = ('_string', 'start', 'end')

    def __init__(self, string, start, end):
        # Keep a reference to the whole string; the substring is only
        # materialised on demand in __str__().
        self._string, self.start, self.end = string, start, end

    def __str__(self):
        first, last = self.start, self.end
        return self._string[first:last]

    def __repr__(self):
        return f"<Segment {self.start}:{self.end}>"


# Grapheme_Cluster_Break property values (matching C #defines).
# In this module they are compared against the strings returned by
# unicodedata.grapheme_cluster_break().
_GCB_Other = "Other"
_GCB_Prepend = "Prepend"
_GCB_CR = "CR"
_GCB_LF = "LF"
_GCB_Control = "Control"
_GCB_Extend = "Extend"
_GCB_Regional_Indicator = "Regional_Indicator"
_GCB_SpacingMark = "SpacingMark"
_GCB_L = "L"
_GCB_V = "V"
_GCB_T = "T"
_GCB_LV = "LV"
_GCB_LVT = "LVT"
_GCB_ZWJ = "ZWJ"

# Indic_Conjunct_Break property values.
# Compared against the strings returned by
# unicodedata.indic_conjunct_break().
_InCB_None = "None"
_InCB_Linker = "Linker"
_InCB_Consonant = "Consonant"
_InCB_Extend = "Extend"

# Extended Pictographic FSM states (for GB11):
#   INIT    - not inside a candidate emoji ZWJ sequence
#   STARTED - an Extended_Pictographic character was seen, optionally
#             followed by Extend characters
#   ZWJ     - a ZWJ was seen right after the STARTED/MATCHED part
#   MATCHED - an Extended_Pictographic character directly follows that ZWJ
_EP_INIT = 0
_EP_STARTED = 1
_EP_ZWJ = 2
_EP_MATCHED = 3

# Indic Conjunct Break FSM states (for GB9c):
#   INIT    - not inside a candidate conjunct
#   STARTED - a Consonant was seen, optionally followed by Extend
#   LINKER  - at least one Linker was seen after the consonant
#   MATCHED - a Consonant follows the LINKER state
_INCB_INIT = 0
_INCB_STARTED = 1
_INCB_LINKER = 2
_INCB_MATCHED = 3


def _update_ext_pict_state(state, gcb, ext_pict):
    """Advance the GB11 (emoji ZWJ sequence) state machine by one character.

    *state* is the current ``_EP_*`` state, *gcb* the
    Grapheme_Cluster_Break value of the character being consumed and
    *ext_pict* is true when that character is Extended_Pictographic.
    Returns the new ``_EP_*`` state; ``_EP_MATCHED`` means the character
    completes ``ExtPict Extend* ZWJ ExtPict``.
    """
    if ext_pict:
        # A pictograph either completes a pending ZWJ sequence or
        # starts a fresh one.
        if state == _EP_ZWJ:
            return _EP_MATCHED
        return _EP_STARTED

    # Extend / ZWJ only matter while a sequence is in progress.
    if state not in (_EP_STARTED, _EP_MATCHED):
        return _EP_INIT
    if gcb == _GCB_Extend:
        return _EP_STARTED
    if gcb == _GCB_ZWJ:
        return _EP_ZWJ
    return _EP_INIT


def _update_incb_state(state, incb):
    """Advance the GB9c (Indic conjunct) state machine by one character.

    *state* is the current ``_INCB_*`` state and *incb* the
    Indic_Conjunct_Break value of the character being consumed.
    Returns the new ``_INCB_*`` state; ``_INCB_MATCHED`` means a
    consonant was reached after a consonant followed by Extend/Linker
    characters that included at least one Linker.
    """
    linker_seen = state == _INCB_LINKER

    if incb == _InCB_Consonant:
        return _INCB_MATCHED if linker_seen else _INCB_STARTED

    # Outside a candidate conjunct, nothing but a consonant changes state.
    if state == _INCB_INIT:
        return _INCB_INIT

    if incb == _InCB_Linker:
        return _INCB_LINKER
    if incb == _InCB_Extend:
        # Extend keeps a previously seen linker "remembered".
        return _INCB_LINKER if linker_seen else _INCB_STARTED
    return _INCB_INIT


def _grapheme_break(prev_gcb, curr_gcb, ep_state, ri_flag, incb_state):
    """Decide whether a cluster boundary lies between two adjacent characters.

    *prev_gcb* and *curr_gcb* are the Grapheme_Cluster_Break values of
    the previous and current character.  *ep_state* and *incb_state* are
    the GB11 and GB9c state-machine states after consuming the current
    character.  *ri_flag* is the value returned when both characters are
    Regional_Indicator.

    Returns True for a break, False for no break.  The rules are tried
    in TR29 order, so the first matching rule wins.
    """
    controls = (_GCB_CR, _GCB_LF, _GCB_Control)

    # GB3: CR LF stays together.
    if (prev_gcb, curr_gcb) == (_GCB_CR, _GCB_LF):
        return False

    # GB4 / GB5: always break after and before controls.
    if prev_gcb in controls or curr_gcb in controls:
        return True

    # GB6 / GB7 / GB8: keep Hangul syllable sequences together.
    if prev_gcb == _GCB_L:
        hangul_joined = curr_gcb in (_GCB_L, _GCB_V, _GCB_LV, _GCB_LVT)
    elif prev_gcb in (_GCB_LV, _GCB_V):
        hangul_joined = curr_gcb in (_GCB_V, _GCB_T)
    elif prev_gcb in (_GCB_LVT, _GCB_T):
        hangul_joined = curr_gcb == _GCB_T
    else:
        hangul_joined = False
    if hangul_joined:
        return False

    # GB9 / GB9a: never break before Extend, ZWJ or SpacingMark.
    if curr_gcb in (_GCB_Extend, _GCB_ZWJ, _GCB_SpacingMark):
        return False

    # GB9b: never break after Prepend.
    if prev_gcb == _GCB_Prepend:
        return False

    # GB9c / GB11: the state machines report a completed Indic conjunct
    # or emoji ZWJ sequence ending at the current character.
    if incb_state == _INCB_MATCHED or ep_state == _EP_MATCHED:
        return False

    # GB12 / GB13: between two regional indicators the caller-supplied
    # flag decides.
    if (prev_gcb == _GCB_Regional_Indicator
            and curr_gcb == _GCB_Regional_Indicator):
        return ri_flag

    # GB999: break everywhere else.
    return True


def iter_graphemes(string, start=0, end=sys.maxsize):
    """Iterate over grapheme clusters in a string.

    Uses extended grapheme cluster rules from TR29.

    *start* and *end* select the part of *string* to segment.  Negative
    values count from the end of the string and out-of-range values are
    clamped, as for slicing.

    Returns an iterator yielding Segment objects with start/end attributes
    and str() support.

    Raises TypeError immediately (not on first iteration) if *string* is
    not a str.
    """
    if not isinstance(string, str):
        # The argument is a whole string, not a single character, so the
        # message names the function and the expected type.
        raise TypeError(
            "iter_graphemes() argument 1 must be str, not "
            f"{type(string).__name__}"
        )

    length = len(string)
    # Adjust indices (matching CPython's ADJUST_INDICES macro)
    if end > length:
        end = length
    elif end < 0:
        end = max(end + length, 0)
    if start < 0:
        start = max(start + length, 0)

    return _iter_grapheme_clusters(string, start, end)


def _iter_grapheme_clusters(string, start, end):
    """Generate a Segment for every grapheme cluster in string[start:end].

    *start* and *end* are used as given, without any adjustment.  The
    string is scanned once; for each character the GB11 and GB9c state
    machines and the regional-indicator parity are updated before the
    boundary decision is made.
    """
    last_gcb = _GCB_Other
    ep_state = _EP_INIT
    incb_state = _INCB_INIT
    odd_ri = False  # toggled on each consecutive Regional_Indicator

    seg_start = start
    index = start
    while index < end:
        char = string[index]
        this_gcb = unicodedata.grapheme_cluster_break(char)
        is_ext_pict = unicodedata.extended_pictographic(char)
        incb = unicodedata.indic_conjunct_break(char)

        # Feed the current character to every tracker first, so the
        # boundary test below sees the post-character states.
        ep_state = _update_ext_pict_state(ep_state, this_gcb, is_ext_pict)
        if this_gcb == _GCB_Regional_Indicator:
            odd_ri = not odd_ri
        else:
            odd_ri = False
        incb_state = _update_incb_state(incb_state, incb)

        before_gcb, last_gcb = last_gcb, this_gcb

        # There is never a boundary check at the first character of a
        # segment.
        if index != seg_start:
            if _grapheme_break(before_gcb, this_gcb, ep_state, odd_ri,
                               incb_state):
                yield Segment(string, seg_start, index)
                seg_start = index

        index += 1

    # Flush the trailing cluster, if any characters remain.
    if seg_start < end:
        yield Segment(string, seg_start, end)
133 changes: 120 additions & 13 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1300,16 +1300,103 @@ class MyStr(str):
self.assertIs(type(normalize(form, MyStr(input_str))), str)


class GraphemeBreakTest(unittest.TestCase):
class BaseGraphemeBreakTest:
iter_graphemes = staticmethod(unicodedata.iter_graphemes)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not needed. It is initialized in subclasses.


def test_grapheme_break_types(self):
self.assertRaises(TypeError, self.iter_graphemes)
self.assertRaises(TypeError, self.iter_graphemes, b'x')

def test_grapheme_break_empty(self):
graphemes = self._graphemes
self.assertEqual(graphemes(''), [])

def test_grapheme_break_simple(self):
graphemes = self._graphemes
self.assertEqual(graphemes('abcd'), ['a', 'b', 'c', 'd'])
self.assertEqual(graphemes('abcd', 1), ['b', 'c', 'd'])
self.assertEqual(graphemes('abcd', 1, 3), ['b', 'c'])
self.assertEqual(graphemes('abcd', -3), ['b', 'c', 'd'])
self.assertEqual(graphemes('abcd', 1, -1), ['b', 'c'])
self.assertEqual(graphemes('abcd', 3, 1), [])
self.assertEqual(graphemes('abcd', 5), [])
self.assertEqual(graphemes('abcd', 0, 5), ['a', 'b', 'c', 'd'])
self.assertEqual(graphemes('abcd', -5), ['a', 'b', 'c', 'd'])
self.assertEqual(graphemes('abcd', 0, -5), [])

def test_grapheme_break_rules(self):
graphemes = self._graphemes
# GB3
self.assertEqual(graphemes('\r\n'), ['\r\n'])
# GB4
self.assertEqual(graphemes('\r\u0308'), ['\r', '\u0308'])
self.assertEqual(graphemes('\n\u0308'), ['\n', '\u0308'])
self.assertEqual(graphemes('\0\u0308'), ['\0', '\u0308'])
# GB5
self.assertEqual(graphemes('\u06dd\r'), ['\u06dd', '\r'])
self.assertEqual(graphemes('\u06dd\n'), ['\u06dd', '\n'])
self.assertEqual(graphemes('\u06dd\0'), ['\u06dd', '\0'])
# GB6
self.assertEqual(graphemes('\u1100\u1160'), ['\u1100\u1160'])
self.assertEqual(graphemes('\u1100\uAC00'), ['\u1100\uAC00'])
self.assertEqual(graphemes('\u1100\uAC01'), ['\u1100\uAC01'])
# GB7
self.assertEqual(graphemes('\uAC00\u1160'), ['\uAC00\u1160'])
self.assertEqual(graphemes('\uAC00\u11A8'), ['\uAC00\u11A8'])
self.assertEqual(graphemes('\u1160\u1160'), ['\u1160\u1160'])
self.assertEqual(graphemes('\u1160\u11A8'), ['\u1160\u11A8'])
# GB8
self.assertEqual(graphemes('\uAC01\u11A8'), ['\uAC01\u11A8'])
self.assertEqual(graphemes('\u11A8\u11A8'), ['\u11A8\u11A8'])
# GB9
self.assertEqual(graphemes('a\u0300'), ['a\u0300'])
self.assertEqual(graphemes('a\u200D'), ['a\u200D'])
# GB9a
self.assertEqual(graphemes('\u0905\u0903'), ['\u0905\u0903'])
# GB9b
self.assertEqual(graphemes('\u06dd\u0661'), ['\u06dd\u0661'])
# GB9c
self.assertEqual(graphemes('\u0915\u094d\u0924'),
['\u0915\u094d\u0924'])
self.assertEqual(graphemes('\u0915\u094D\u094D\u0924'),
['\u0915\u094D\u094D\u0924'])
self.assertEqual(graphemes('\u0915\u094D\u0924\u094D\u092F'),
['\u0915\u094D\u0924\u094D\u092F'])
# GB11
self.assertEqual(graphemes(
'\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
'\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'),
['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
'\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'])
# GB12
self.assertEqual(graphemes(
'\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
# GB13
self.assertEqual(graphemes(
'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])

def test_segment_object(self):
segments = list(self.iter_graphemes('spa\u0300m'))
self.assertEqual(len(segments), 4, segments)
segment = segments[2]
self.assertEqual(segment.start, 2)
self.assertEqual(segment.end, 4)
self.assertEqual(str(segment), 'a\u0300')

def _graphemes(self, *args):
return list(map(str, self.iter_graphemes(*args)))

@requires_resource('network')
def test_grapheme_break(self):
def test_tr29_conformance(self):
TESTDATAFILE = "GraphemeBreakTest.txt"
testdata = download_test_data_file(TESTDATAFILE)

with testdata:
self.run_grapheme_break_tests(testdata)
self._run_grapheme_break_tests(testdata)

def run_grapheme_break_tests(self, testdata):
def _run_grapheme_break_tests(self, testdata):
for line in testdata:
line, _, comment = line.partition('#')
line = line.strip()
Expand All @@ -1330,19 +1417,32 @@ def run_grapheme_break_tests(self, testdata):
self.assertEqual(chunks.pop(), '', line)
input = ''.join(chunks)
with self.subTest(line):
result = list(unicodedata.iter_graphemes(input))
result = list(self.iter_graphemes(input))
self.assertEqual(list(map(str, result)), chunks, comment)
self.assertEqual([x.start for x in result], breaks[:-1], comment)
self.assertEqual([x.end for x in result], breaks[1:], comment)
self.assertEqual([x.start for x in result],
breaks[:-1], comment)
self.assertEqual([x.end for x in result],
breaks[1:], comment)
for i in range(1, len(breaks) - 1):
result = list(unicodedata.iter_graphemes(input, breaks[i]))
self.assertEqual(list(map(str, result)), chunks[i:], comment)
self.assertEqual([x.start for x in result], breaks[i:-1], comment)
self.assertEqual([x.end for x in result], breaks[i+1:], comment)
result = list(self.iter_graphemes(input, breaks[i]))
self.assertEqual(list(map(str, result)),
chunks[i:], comment)
self.assertEqual([x.start for x in result],
breaks[i:-1], comment)
self.assertEqual([x.end for x in result],
breaks[i+1:], comment)


class GraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest):
iter_graphemes = staticmethod(unicodedata.iter_graphemes)

def test_segment_repr(self):
segment = list(unicodedata.iter_graphemes('spa\u0300m'))[2]
self.assertEqual(repr(segment), '<Segment 2:4>')
self.assertRaises(TypeError, iter, segment)
self.assertRaises(TypeError, len, segment)

def test_reference_loops(self):
# Test that reference loops involving GraphemeBreakIterator or
# Segment can be broken by the garbage collector.
class S(str):
pass

Expand All @@ -1363,5 +1463,12 @@ class S(str):
self.assertIsNone(wr())


class PyGraphemeBreakTest(unittest.TestCase, BaseGraphemeBreakTest):
@classmethod
def setUpClass(cls):
from _py_grapheme import iter_graphemes
cls.iter_graphemes = staticmethod(iter_graphemes)


if __name__ == "__main__":
unittest.main()
Loading