Coverage for src/indium/segments.py: 99%

135 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2026-01-08 22:34 +0000

1"""Grapheme-aware text operations without regex library dependency. 

2 

3This module provides safe text slicing and truncation that respects grapheme 

4cluster boundaries. Prevents breaking emoji sequences, combining character 

5sequences, and other multi-codepoint visual units. 

6 

7Implementation: UAX #29 (Unicode Text Segmentation) 

8""" 

9 

10import bisect 

11from collections.abc import Iterator 

12from functools import lru_cache 

13from typing import Optional 

14 

15from ._grapheme_data import ( 

16 CONTROL, 

17 CR, 

18 EXTEND, 

19 EXTENDED_PICTOGRAPHIC, 

20 GRAPHEME_BREAK_RANGES, 

21 INCB_CONSONANT, 

22 INCB_EXTEND, 

23 INCB_LINKER, 

24 LF, 

25 LV, 

26 LVT, 

27 PREPEND, 

28 REGIONAL_INDICATOR, 

29 SPACINGMARK, 

30 ZWJ, 

31 L, 

32 T, 

33 V, 

34) 

35 

36 

37@lru_cache(maxsize=4096) 

38def _get_break_property(codepoint: int) -> int: 

39 """Get Grapheme_Cluster_Break property for a code point.""" 

40 # Binary search in the RLE table 

41 # index-1 gives the range containing codepoint (or starting before it) 

42 idx = bisect.bisect_right(GRAPHEME_BREAK_RANGES, (codepoint, 999)) 

43 if idx == 0: 

44 return 0 # Should not happen if table covers 0 

45 

46 start, prop = GRAPHEME_BREAK_RANGES[idx - 1] 

47 return prop 

48 

49 

50def safe_truncate(text: str, max_graphemes: int) -> str: 

51 """Truncate text to max grapheme clusters, not code points. 

52 

53 Ensures cut doesn't break: 

54 - Emoji sequences (ZWJ, skin tone modifiers) 

55 - Combining character sequences (base + marks) 

56 - Regional indicator pairs (flag emoji) 

57 

58 Args: 

59 text: Input string 

60 max_graphemes: Maximum number of visual units (grapheme clusters) 

61 

62 Returns: 

63 Truncated string at valid grapheme boundary 

64 

65 Raises: 

66 ValueError: If max_graphemes is negative 

67 

68 Examples: 

69 >>> safe_truncate("hello", 3) 

70 'hel' 

71 >>> safe_truncate("café", 3) # é is one grapheme 

72 'caf' 

73 >>> safe_truncate("👨‍👩‍👧", 1) # Family emoji is one grapheme 

74 '👨\u200d👩\u200d👧' 

75 >>> safe_truncate("hello👋🏽world", 6) # Waving hand with skin tone 

76 'hello👋🏽' 

77 """ 

78 if max_graphemes < 0: 

79 raise ValueError(f"max_graphemes must be non-negative, got {max_graphemes}") 

80 

81 if max_graphemes == 0: 

82 return "" 

83 

84 grapheme_count = 0 

85 pos = 0 

86 

87 while pos < len(text) and grapheme_count < max_graphemes: 

88 # Find end of current grapheme 

89 grapheme_end = _find_grapheme_end(text, pos) 

90 pos = grapheme_end 

91 grapheme_count += 1 

92 

93 return text[:pos] 

94 

95 

96def count_graphemes(text: str) -> int: 

97 """Count grapheme clusters (visual units) in text. 

98 

99 Args: 

100 text: Input string 

101 

102 Returns: 

103 Number of grapheme clusters 

104 

105 Examples: 

106 >>> count_graphemes("hello") 

107 5 

108 >>> count_graphemes("café") # é = e + combining acute 

109 4 

110 >>> count_graphemes("👨‍👩‍👧") # Family emoji 

111 1 

112 >>> count_graphemes("hello👋🏽") # Waving with skin tone 

113 6 

114 """ 

115 count = 0 

116 pos = 0 

117 

118 while pos < len(text): 

119 grapheme_end = _find_grapheme_end(text, pos) 

120 count += 1 

121 pos = grapheme_end 

122 

123 return count 

124 

125 

126def grapheme_slice(text: str, start: int, end: Optional[int] = None) -> str: 

127 """Slice text by grapheme indices, not code points. 

128 

129 Args: 

130 text: Input string 

131 start: Start grapheme index (inclusive) 

132 end: End grapheme index (exclusive). If None, slice to end 

133 

134 Returns: 

135 Substring from start to end grapheme indices 

136 

137 Raises: 

138 ValueError: If start is negative or end < start 

139 

140 Examples: 

141 >>> grapheme_slice("hello", 1, 4) 

142 'ell' 

143 >>> grapheme_slice("café", 0, 3) 

144 'caf' 

145 >>> grapheme_slice("👨‍👩‍👧test", 1, 3) 

146 'te' 

147 """ 

148 if start < 0: 

149 raise ValueError(f"start must be non-negative, got {start}") 

150 

151 if end is not None and end < start: 

152 raise ValueError(f"end ({end}) must be >= start ({start})") 

153 

154 # Find start position in code points 

155 grapheme_count = 0 

156 start_pos = 0 

157 

158 while start_pos < len(text) and grapheme_count < start: 

159 grapheme_end = _find_grapheme_end(text, start_pos) 

160 start_pos = grapheme_end 

161 grapheme_count += 1 

162 

163 # If we ran out of text before reaching start, return empty 

164 if start_pos >= len(text): 

165 return "" 

166 

167 # If no end specified, return from start to end of text 

168 if end is None: 

169 return text[start_pos:] 

170 

171 # Find end position in code points 

172 end_pos = start_pos 

173 while end_pos < len(text) and grapheme_count < end: 

174 grapheme_end = _find_grapheme_end(text, end_pos) 

175 end_pos = grapheme_end 

176 grapheme_count += 1 

177 

178 return text[start_pos:end_pos] 

179 

180 

181def iter_graphemes(text: str) -> Iterator[str]: 

182 """Iterate over grapheme clusters. 

183 

184 Args: 

185 text: Input string 

186 

187 Yields: 

188 Individual grapheme clusters 

189 

190 Examples: 

191 >>> list(iter_graphemes("hello")) 

192 ['h', 'e', 'l', 'l', 'o'] 

193 >>> list(iter_graphemes("café")) 

194 ['c', 'a', 'f', 'é'] 

195 >>> list(iter_graphemes("a👋🏽b")) 

196 ['a', '👋🏽', 'b'] 

197 """ 

198 pos = 0 

199 

200 while pos < len(text): 

201 grapheme_end = _find_grapheme_end(text, pos) 

202 yield text[pos:grapheme_end] 

203 pos = grapheme_end 

204 

205 

206# Internal helper: Find end of grapheme cluster starting at pos 

207def _find_grapheme_end(text: str, pos: int) -> int: 

208 """Find the end position of the grapheme cluster starting at pos. 

209 

210 Args: 

211 text: Input string 

212 pos: Start position of grapheme 

213 

214 Returns: 

215 Position after the end of the grapheme cluster 

216 """ 

217 if pos >= len(text): 

218 return pos 

219 

220 # Start with first character 

221 end = pos + 1 

222 

223 # Continue while we're not at a grapheme boundary 

224 while end < len(text) and not _is_grapheme_boundary(text, end): 

225 end += 1 

226 

227 return end 

228 

229 

230def _is_grapheme_boundary(text: str, pos: int) -> bool: 

231 """Check if position is a valid grapheme boundary. 

232 

233 Implements Unicode TR29 (Grapheme Cluster Boundaries). 

234 

235 Args: 

236 text: Input string 

237 pos: Position to check 

238 

239 Returns: 

240 True if position is a valid grapheme boundary 

241 """ 

242 if pos == 0 or pos >= len(text): 

243 return True 

244 

245 curr_char = text[pos] 

246 prev_char = text[pos - 1] 

247 

248 curr_prop = _get_break_property(ord(curr_char)) 

249 prev_prop = _get_break_property(ord(prev_char)) 

250 

251 # GB3: CR x LF 

252 if prev_prop == CR and curr_prop == LF: 

253 return False 

254 

255 # GB4: (Control | CR | LF) ÷ 

256 if prev_prop in (CONTROL, CR, LF): 

257 return True 

258 

259 # GB5: ÷ (Control | CR | LF) 

260 if curr_prop in (CONTROL, CR, LF): 

261 return True 

262 

263 # GB6: L x (L | V | LV | LVT) 

264 if prev_prop == L and curr_prop in (L, V, LV, LVT): 

265 return False 

266 

267 # GB7: (LV | V) x (V | T) 

268 if prev_prop in (LV, V) and curr_prop in (V, T): 

269 return False 

270 

271 # GB8: (LVT | T) x T 

272 if prev_prop in (LVT, T) and curr_prop == T: 

273 return False 

274 

275 # GB9: x (Extend | ZWJ) 

276 # Note: InCB_Linker and InCB_Extend must also be treated as Extend for GB9 

277 if curr_prop in (EXTEND, ZWJ, INCB_EXTEND, INCB_LINKER): 

278 return False 

279 

280 # GB9a: x SpacingMark 

281 if curr_prop == SPACINGMARK: 

282 return False 

283 

284 # GB9b: Prepend x 

285 if prev_prop == PREPEND: 

286 return False 

287 

288 # GB9c: \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} \p{Zwj} ]* x \p{InCB=Consonant} 

289 # Restriction: The Linker itself must be part of a valid syllable, i.e., preceded by Consonant. 

290 # This fixes failures where 'a' + Virama + 'Ta' breaks (because 'a' is not InCB Consonant). 

291 if curr_prop == INCB_CONSONANT: 

292 # 1. Scan backwards for Linker 

293 i = pos - 1 

294 found_linker = False 

295 linker_index = -1 

296 

297 while i >= 0: 

298 prop = _get_break_property(ord(text[i])) 

299 if prop == INCB_LINKER: 

300 found_linker = True 

301 linker_index = i 

302 break 

303 if prop not in (EXTEND, ZWJ, INCB_EXTEND, INCB_LINKER): 

304 break # Sequence broken before finding Linker 

305 i -= 1 

306 

307 if found_linker: 

308 # 2. Verify Linker is attached to a Consonant 

309 # Scan backwards from Linker skipping Extend/ZWJ/Linker 

310 j = linker_index - 1 

311 valid_base = False 

312 while j >= 0: 

313 prop_j = _get_break_property(ord(text[j])) 

314 if prop_j == INCB_CONSONANT: 

315 valid_base = True 

316 break 

317 if prop_j not in (EXTEND, ZWJ, INCB_EXTEND, INCB_LINKER): 

318 break # Hit start of cluster or invalid char 

319 j -= 1 

320 

321 if valid_base: 

322 return False 

323 

324 # GB11: \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic} 

325 if prev_prop == ZWJ and curr_prop == EXTENDED_PICTOGRAPHIC: 

326 # Scan backwards to see if ZWJ is preceded by Extended_Pictographic + Extend* 

327 i = pos - 2 

328 while i >= 0: 

329 prop = _get_break_property(ord(text[i])) 

330 if prop == EXTENDED_PICTOGRAPHIC: 

331 return False 

332 # Treat InCB properties as Extend for this rule too? 

333 # UAX #29 says "Extend". InCB_Extend and InCB_Linker ARE Extend. 

334 if prop not in (EXTEND, INCB_EXTEND, INCB_LINKER): 

335 break 

336 i -= 1 

337 

338 # GB12/GB13: Regional_Indicator sequence 

339 if prev_prop == REGIONAL_INDICATOR and curr_prop == REGIONAL_INDICATOR: 

340 # We need to count RI characters backwards from pos 

341 # If count is odd, it's a valid pair (break after second). 

342 # If count is even, we are in middle of new pair (no break). 

343 # The rule effectively says: "Do not break within a pair". 

344 # A pair starts at an even offset from the start of the RI sequence. 

345 

346 ri_count = 0 

347 i = pos - 1 

348 while i >= 0 and _get_break_property(ord(text[i])) == REGIONAL_INDICATOR: 

349 ri_count += 1 

350 i -= 1 

351 

352 # If we have seen an odd number of RIs before 'curr', 

353 # then 'prev' and 'curr' form a pair. 

354 if ri_count % 2 == 1: 

355 return False 

356 

357 # GB999: Any ÷ Any 

358 return True