Coverage for src/indium/_unicode_data.py: 93%
30 statements
« prev ^ index » next coverage.py v7.10.7, created at 2026-01-08 22:34 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2026-01-08 22:34 +0000
1"""Unicode category helpers and utilities.
3Private module providing low-level Unicode operations.
4Zero external dependencies - stdlib only.
5"""
7import unicodedata
8from typing import Final
# Whitespace characters (not considered invisible for our purposes).
# NOTE: every member except ' ' is in Unicode category Cc (Control), so this
# set is what lets is_invisible() exempt them from INVISIBLE_CATEGORIES.
WHITESPACE_CHARS: Final[frozenset[str]] = frozenset({' ', '\n', '\t', '\r', '\v', '\f'})

# Invisible Unicode categories
# Cf = Format characters (zero-width, bidi controls, etc.)
# Cc = Control characters (NULL, backspace, etc.)
# Co = Private use characters
INVISIBLE_CATEGORIES: Final[frozenset[str]] = frozenset({'Cf', 'Cc', 'Co'})
def is_invisible(char: str) -> bool:
    r"""Check if character is invisible (Format, Control, or Private Use).

    Standard whitespace (the members of ``WHITESPACE_CHARS``) is never
    reported as invisible, even though newline, tab, etc. belong to the
    Cc (Control) category.

    Args:
        char: Single character to check

    Returns:
        True if character is in Cf, Cc, or Co Unicode category and is not
        standard whitespace

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> is_invisible('\u200B')  # ZERO WIDTH SPACE
        True
        >>> is_invisible('a')
        False
        >>> is_invisible(' ')  # Regular space is NOT invisible
        False
    """
    if len(char) != 1:
        raise ValueError(f"Expected single character, got {len(char)} characters")

    # Standard whitespace is NOT invisible; this must run before the category
    # lookup because '\n', '\t', '\r', '\v' and '\f' are all category Cc.
    if char in WHITESPACE_CHARS:
        return False

    category = unicodedata.category(char)
    return category in INVISIBLE_CATEGORIES
def is_combining(char: str) -> bool:
    r"""Check if character is a combining mark.

    Combining marks (accents, diacritics) attach to base characters.

    Args:
        char: Single character to check

    Returns:
        True if the character's Unicode general category is a Mark
        category: Mn (Nonspacing), Mc (Spacing) or Me (Enclosing)

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> is_combining('\u0301')  # COMBINING ACUTE ACCENT
        True
        >>> is_combining('\u20DD')  # COMBINING ENCLOSING CIRCLE (class 0)
        True
        >>> is_combining('a')
        False
    """
    if len(char) != 1:
        raise ValueError(f"Expected single character, got {len(char)} characters")

    # unicodedata.combining() only returns the Canonical Combining Class,
    # which is 0 for many combining characters (e.g., enclosing marks and
    # spacing marks such as Devanagari vowel signs).
    # We must check the category: Mn (Nonspacing), Mc (Spacing), Me (Enclosing)
    return unicodedata.category(char).startswith('M')
def is_whitespace(char: str) -> bool:
    r"""Check if character is standard whitespace.

    Only the members of ``WHITESPACE_CHARS`` count; other Unicode space or
    zero-width characters are not treated as standard whitespace.

    Args:
        char: Single character to check

    Returns:
        True if character is one of ``WHITESPACE_CHARS`` (space, newline,
        tab, carriage return, vertical tab, form feed)

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> is_whitespace(' ')
        True
        >>> is_whitespace('\n')
        True
        >>> is_whitespace('\u200B')  # ZERO WIDTH SPACE is NOT standard whitespace
        False
    """
    if len(char) != 1:
        raise ValueError(f"Expected single character, got {len(char)} characters")

    return char in WHITESPACE_CHARS
def get_unicode_name(char: str, default: str = "<unnamed>") -> str:
    r"""Get Unicode character name.

    Args:
        char: Single character
        default: Default name if character has no name

    Returns:
        Unicode character name or default

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> get_unicode_name('a')
        'LATIN SMALL LETTER A'
        >>> get_unicode_name('\u200B')
        'ZERO WIDTH SPACE'
        >>> get_unicode_name('\x00')  # control characters have no name
        '<unnamed>'
    """
    if len(char) != 1:
        raise ValueError(f"Expected single character, got {len(char)} characters")

    # unicodedata.name() returns its second argument instead of raising
    # ValueError when the character has no name assigned.
    return unicodedata.name(char, default)
def get_category(char: str) -> str:
    r"""Get Unicode category for character.

    Args:
        char: Single character

    Returns:
        Two-letter category code (e.g., 'Ll', 'Cf', 'Cc')

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> get_category('a')
        'Ll'
        >>> get_category('\u200B')
        'Cf'
    """
    if len(char) != 1:
        raise ValueError(f"Expected single character, got {len(char)} characters")

    return unicodedata.category(char)