Coverage for src/indium/segments.py: 99%
135 statements
« prev ^ index » next coverage.py v7.10.7, created at 2026-01-08 22:34 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2026-01-08 22:34 +0000
1"""Grapheme-aware text operations without regex library dependency.
3This module provides safe text slicing and truncation that respects grapheme
4cluster boundaries. Prevents breaking emoji sequences, combining character
5sequences, and other multi-codepoint visual units.
7Implementation: UAX #29 (Unicode Text Segmentation)
8"""
10import bisect
11from collections.abc import Iterator
12from functools import lru_cache
13from typing import Optional
15from ._grapheme_data import (
16 CONTROL,
17 CR,
18 EXTEND,
19 EXTENDED_PICTOGRAPHIC,
20 GRAPHEME_BREAK_RANGES,
21 INCB_CONSONANT,
22 INCB_EXTEND,
23 INCB_LINKER,
24 LF,
25 LV,
26 LVT,
27 PREPEND,
28 REGIONAL_INDICATOR,
29 SPACINGMARK,
30 ZWJ,
31 L,
32 T,
33 V,
34)
37@lru_cache(maxsize=4096)
38def _get_break_property(codepoint: int) -> int:
39 """Get Grapheme_Cluster_Break property for a code point."""
40 # Binary search in the RLE table
41 # index-1 gives the range containing codepoint (or starting before it)
42 idx = bisect.bisect_right(GRAPHEME_BREAK_RANGES, (codepoint, 999))
43 if idx == 0:
44 return 0 # Should not happen if table covers 0
46 start, prop = GRAPHEME_BREAK_RANGES[idx - 1]
47 return prop
50def safe_truncate(text: str, max_graphemes: int) -> str:
51 """Truncate text to max grapheme clusters, not code points.
53 Ensures cut doesn't break:
54 - Emoji sequences (ZWJ, skin tone modifiers)
55 - Combining character sequences (base + marks)
56 - Regional indicator pairs (flag emoji)
58 Args:
59 text: Input string
60 max_graphemes: Maximum number of visual units (grapheme clusters)
62 Returns:
63 Truncated string at valid grapheme boundary
65 Raises:
66 ValueError: If max_graphemes is negative
68 Examples:
69 >>> safe_truncate("hello", 3)
70 'hel'
71 >>> safe_truncate("café", 3) # é is one grapheme
72 'caf'
73 >>> safe_truncate("👨👩👧", 1) # Family emoji is one grapheme
74 '👨\u200d👩\u200d👧'
75 >>> safe_truncate("hello👋🏽world", 6) # Waving hand with skin tone
76 'hello👋🏽'
77 """
78 if max_graphemes < 0:
79 raise ValueError(f"max_graphemes must be non-negative, got {max_graphemes}")
81 if max_graphemes == 0:
82 return ""
84 grapheme_count = 0
85 pos = 0
87 while pos < len(text) and grapheme_count < max_graphemes:
88 # Find end of current grapheme
89 grapheme_end = _find_grapheme_end(text, pos)
90 pos = grapheme_end
91 grapheme_count += 1
93 return text[:pos]
96def count_graphemes(text: str) -> int:
97 """Count grapheme clusters (visual units) in text.
99 Args:
100 text: Input string
102 Returns:
103 Number of grapheme clusters
105 Examples:
106 >>> count_graphemes("hello")
107 5
108 >>> count_graphemes("café") # é = e + combining acute
109 4
110 >>> count_graphemes("👨👩👧") # Family emoji
111 1
112 >>> count_graphemes("hello👋🏽") # Waving with skin tone
113 6
114 """
115 count = 0
116 pos = 0
118 while pos < len(text):
119 grapheme_end = _find_grapheme_end(text, pos)
120 count += 1
121 pos = grapheme_end
123 return count
126def grapheme_slice(text: str, start: int, end: Optional[int] = None) -> str:
127 """Slice text by grapheme indices, not code points.
129 Args:
130 text: Input string
131 start: Start grapheme index (inclusive)
132 end: End grapheme index (exclusive). If None, slice to end
134 Returns:
135 Substring from start to end grapheme indices
137 Raises:
138 ValueError: If start is negative or end < start
140 Examples:
141 >>> grapheme_slice("hello", 1, 4)
142 'ell'
143 >>> grapheme_slice("café", 0, 3)
144 'caf'
145 >>> grapheme_slice("👨👩👧test", 1, 3)
146 'te'
147 """
148 if start < 0:
149 raise ValueError(f"start must be non-negative, got {start}")
151 if end is not None and end < start:
152 raise ValueError(f"end ({end}) must be >= start ({start})")
154 # Find start position in code points
155 grapheme_count = 0
156 start_pos = 0
158 while start_pos < len(text) and grapheme_count < start:
159 grapheme_end = _find_grapheme_end(text, start_pos)
160 start_pos = grapheme_end
161 grapheme_count += 1
163 # If we ran out of text before reaching start, return empty
164 if start_pos >= len(text):
165 return ""
167 # If no end specified, return from start to end of text
168 if end is None:
169 return text[start_pos:]
171 # Find end position in code points
172 end_pos = start_pos
173 while end_pos < len(text) and grapheme_count < end:
174 grapheme_end = _find_grapheme_end(text, end_pos)
175 end_pos = grapheme_end
176 grapheme_count += 1
178 return text[start_pos:end_pos]
181def iter_graphemes(text: str) -> Iterator[str]:
182 """Iterate over grapheme clusters.
184 Args:
185 text: Input string
187 Yields:
188 Individual grapheme clusters
190 Examples:
191 >>> list(iter_graphemes("hello"))
192 ['h', 'e', 'l', 'l', 'o']
193 >>> list(iter_graphemes("café"))
194 ['c', 'a', 'f', 'é']
195 >>> list(iter_graphemes("a👋🏽b"))
196 ['a', '👋🏽', 'b']
197 """
198 pos = 0
200 while pos < len(text):
201 grapheme_end = _find_grapheme_end(text, pos)
202 yield text[pos:grapheme_end]
203 pos = grapheme_end
206# Internal helper: Find end of grapheme cluster starting at pos
207def _find_grapheme_end(text: str, pos: int) -> int:
208 """Find the end position of the grapheme cluster starting at pos.
210 Args:
211 text: Input string
212 pos: Start position of grapheme
214 Returns:
215 Position after the end of the grapheme cluster
216 """
217 if pos >= len(text):
218 return pos
220 # Start with first character
221 end = pos + 1
223 # Continue while we're not at a grapheme boundary
224 while end < len(text) and not _is_grapheme_boundary(text, end):
225 end += 1
227 return end
230def _is_grapheme_boundary(text: str, pos: int) -> bool:
231 """Check if position is a valid grapheme boundary.
233 Implements Unicode TR29 (Grapheme Cluster Boundaries).
235 Args:
236 text: Input string
237 pos: Position to check
239 Returns:
240 True if position is a valid grapheme boundary
241 """
242 if pos == 0 or pos >= len(text):
243 return True
245 curr_char = text[pos]
246 prev_char = text[pos - 1]
248 curr_prop = _get_break_property(ord(curr_char))
249 prev_prop = _get_break_property(ord(prev_char))
251 # GB3: CR x LF
252 if prev_prop == CR and curr_prop == LF:
253 return False
255 # GB4: (Control | CR | LF) ÷
256 if prev_prop in (CONTROL, CR, LF):
257 return True
259 # GB5: ÷ (Control | CR | LF)
260 if curr_prop in (CONTROL, CR, LF):
261 return True
263 # GB6: L x (L | V | LV | LVT)
264 if prev_prop == L and curr_prop in (L, V, LV, LVT):
265 return False
267 # GB7: (LV | V) x (V | T)
268 if prev_prop in (LV, V) and curr_prop in (V, T):
269 return False
271 # GB8: (LVT | T) x T
272 if prev_prop in (LVT, T) and curr_prop == T:
273 return False
275 # GB9: x (Extend | ZWJ)
276 # Note: InCB_Linker and InCB_Extend must also be treated as Extend for GB9
277 if curr_prop in (EXTEND, ZWJ, INCB_EXTEND, INCB_LINKER):
278 return False
280 # GB9a: x SpacingMark
281 if curr_prop == SPACINGMARK:
282 return False
284 # GB9b: Prepend x
285 if prev_prop == PREPEND:
286 return False
288 # GB9c: \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} \p{Zwj} ]* x \p{InCB=Consonant}
289 # Restriction: The Linker itself must be part of a valid syllable, i.e., preceded by Consonant.
290 # This fixes failures where 'a' + Virama + 'Ta' breaks (because 'a' is not InCB Consonant).
291 if curr_prop == INCB_CONSONANT:
292 # 1. Scan backwards for Linker
293 i = pos - 1
294 found_linker = False
295 linker_index = -1
297 while i >= 0:
298 prop = _get_break_property(ord(text[i]))
299 if prop == INCB_LINKER:
300 found_linker = True
301 linker_index = i
302 break
303 if prop not in (EXTEND, ZWJ, INCB_EXTEND, INCB_LINKER):
304 break # Sequence broken before finding Linker
305 i -= 1
307 if found_linker:
308 # 2. Verify Linker is attached to a Consonant
309 # Scan backwards from Linker skipping Extend/ZWJ/Linker
310 j = linker_index - 1
311 valid_base = False
312 while j >= 0:
313 prop_j = _get_break_property(ord(text[j]))
314 if prop_j == INCB_CONSONANT:
315 valid_base = True
316 break
317 if prop_j not in (EXTEND, ZWJ, INCB_EXTEND, INCB_LINKER):
318 break # Hit start of cluster or invalid char
319 j -= 1
321 if valid_base:
322 return False
324 # GB11: \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
325 if prev_prop == ZWJ and curr_prop == EXTENDED_PICTOGRAPHIC:
326 # Scan backwards to see if ZWJ is preceded by Extended_Pictographic + Extend*
327 i = pos - 2
328 while i >= 0:
329 prop = _get_break_property(ord(text[i]))
330 if prop == EXTENDED_PICTOGRAPHIC:
331 return False
332 # Treat InCB properties as Extend for this rule too?
333 # UAX #29 says "Extend". InCB_Extend and InCB_Linker ARE Extend.
334 if prop not in (EXTEND, INCB_EXTEND, INCB_LINKER):
335 break
336 i -= 1
338 # GB12/GB13: Regional_Indicator sequence
339 if prev_prop == REGIONAL_INDICATOR and curr_prop == REGIONAL_INDICATOR:
340 # We need to count RI characters backwards from pos
341 # If count is odd, it's a valid pair (break after second).
342 # If count is even, we are in middle of new pair (no break).
343 # The rule effectively says: "Do not break within a pair".
344 # A pair starts at an even offset from the start of the RI sequence.
346 ri_count = 0
347 i = pos - 1
348 while i >= 0 and _get_break_property(ord(text[i])) == REGIONAL_INDICATOR:
349 ri_count += 1
350 i -= 1
352 # If we have seen an odd number of RIs before 'curr',
353 # then 'prev' and 'curr' form a pair.
354 if ri_count % 2 == 1:
355 return False
357 # GB999: Any ÷ Any
358 return True