Coverage for src/indium/_unicode_data.py: 93%

30 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2026-01-08 22:34 +0000

1"""Unicode category helpers and utilities. 

2 

3Private module providing low-level Unicode operations. 

4Zero external dependencies - stdlib only. 

5""" 

6 

7import unicodedata 

8from typing import Final 

9 

# Ordinary whitespace characters. These are deliberately excluded from the
# "invisible" classification used by this module.
WHITESPACE_CHARS: Final[frozenset[str]] = frozenset(
    (" ", "\n", "\t", "\r", "\v", "\f")
)

# Unicode general categories whose members are treated as invisible:
#   Cf - format characters (zero-width characters, bidi controls, ...)
#   Cc - control characters (NULL, backspace, ...)
#   Co - private-use characters
INVISIBLE_CATEGORIES: Final[frozenset[str]] = frozenset(("Cf", "Cc", "Co"))

18 

19 

def is_invisible(char: str) -> bool:
    r"""Report whether ``char`` is an invisible character.

    A character counts as invisible when its Unicode general category is
    listed in ``INVISIBLE_CATEGORIES`` (Cf, Cc or Co) and it is not one of
    the ordinary whitespace characters in ``WHITESPACE_CHARS``.

    Args:
        char: Single character to check

    Returns:
        True for a non-whitespace character in the Cf, Cc or Co category,
        False otherwise

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> is_invisible('\u200B')  # ZERO WIDTH SPACE
        True
        >>> is_invisible('a')
        False
        >>> is_invisible(' ')  # Regular space is NOT invisible
        False
    """
    if len(char) != 1:
        raise ValueError(f"Expected single character, got {len(char)} characters")

    # Whitespace such as '\n' or '\t' is category Cc, so it must be ruled out
    # before the category lookup is consulted.
    return (
        char not in WHITESPACE_CHARS
        and unicodedata.category(char) in INVISIBLE_CATEGORIES
    )

46 

47 

def is_combining(char: str) -> bool:
    r"""Report whether ``char`` is a combining mark.

    Combining marks (accents, diacritics, enclosing marks) attach to a
    preceding base character.

    Args:
        char: Single character to check

    Returns:
        True if the character's general category is a Mark category
        (Mn, Mc or Me), False otherwise

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> is_combining('\u0301')  # COMBINING ACUTE ACCENT
        True
        >>> is_combining('a')
        False
    """
    if len(char) != 1:
        raise ValueError(f"Expected single character, got {len(char)} characters")

    # unicodedata.combining() is not used here: it reports the canonical
    # combining class, which is 0 for some marks (e.g. enclosing marks).
    # The major class letter of the general category is "M" for every mark:
    # Mn (nonspacing), Mc (spacing) and Me (enclosing).
    major_class = unicodedata.category(char)[0]
    return major_class == 'M'

72 

73 

def is_whitespace(char: str) -> bool:
    r"""Report whether ``char`` is one of the standard whitespace characters.

    Membership is decided solely by the ``WHITESPACE_CHARS`` set.

    Args:
        char: Single character to check

    Returns:
        True if the character is contained in ``WHITESPACE_CHARS``
        (space, newline, tab, etc.), False otherwise

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> is_whitespace(' ')
        True
        >>> is_whitespace('\n')
        True
        >>> is_whitespace('\u200B')  # ZERO WIDTH SPACE is NOT standard whitespace
        False
    """
    if len(char) == 1:
        return char in WHITESPACE_CHARS

    raise ValueError(f"Expected single character, got {len(char)} characters")

95 

96 

def get_unicode_name(char: str, default: str = "<unnamed>") -> str:
    r"""Return the Unicode name of ``char``, or ``default`` if it has none.

    Args:
        char: Single character
        default: Value returned when the character has no name

    Returns:
        Unicode character name, or ``default`` when no name is defined

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> get_unicode_name('a')
        'LATIN SMALL LETTER A'
        >>> get_unicode_name('\u200B')
        'ZERO WIDTH SPACE'
        >>> get_unicode_name('\x00')  # no name is defined for NULL
        '<unnamed>'
    """
    if len(char) != 1:
        raise ValueError(f"Expected single character, got {len(char)} characters")

    # unicodedata.name() takes the fallback as its second argument and returns
    # it instead of raising ValueError, so no try/except is required.
    return unicodedata.name(char, default)

120 

121 

def get_category(char: str) -> str:
    r"""Look up the Unicode general category of ``char``.

    Args:
        char: Single character

    Returns:
        Two-letter category code (e.g., 'Ll', 'Cf', 'Cc')

    Raises:
        ValueError: If ``char`` is not exactly one character long

    Examples:
        >>> get_category('a')
        'Ll'
        >>> get_category('\u200B')
        'Cf'
    """
    if len(char) == 1:
        return unicodedata.category(char)

    raise ValueError(f"Expected single character, got {len(char)} characters")