Coverage for src/indium/invisibles.py: 100%

50 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2026-01-08 22:34 +0000

1"""Invisible character detection and sanitization. 

2 

3This module provides functions to reveal, sanitize, and analyze invisible 

4characters in Unicode text. Useful for security validation, log analysis, 

5and preventing invisible character attacks. 

6""" 

7 

8from typing import Final 

9 

10from ._unicode_data import ( 

11 get_category, 

12 get_unicode_name, 

13 is_invisible, 

14 is_whitespace, 

15) 

16 

# Special invisible characters requiring specific handling.
# Each constant is a single code point; the trailing comment gives its
# official Unicode character name.
ZWJ: Final[str] = '\u200D'  # ZERO WIDTH JOINER (essential for emoji); sanitize() can be told to keep it
ZWNJ: Final[str] = '\u200C'  # ZERO WIDTH NON-JOINER
ZWSP: Final[str] = '\u200B'  # ZERO WIDTH SPACE
SOFT_HYPHEN: Final[str] = '\u00AD'  # SOFT HYPHEN

22 

# Explicit bidirectional formatting controls: embeddings, overrides, isolates
# and their terminators. These are a security risk because they can make text
# display in a different order than it is stored (spoofed filenames, "Trojan
# Source"-style code tricks).
# NOTE(review): the implicit directional marks (U+200E, U+200F, U+061C) are
# not part of this set -- confirm that is intentional.
BIDI_CONTROLS: Final[frozenset[str]] = frozenset({
    '\u202A',  # LEFT-TO-RIGHT EMBEDDING
    '\u202B',  # RIGHT-TO-LEFT EMBEDDING
    '\u202C',  # POP DIRECTIONAL FORMATTING
    '\u202D',  # LEFT-TO-RIGHT OVERRIDE
    '\u202E',  # RIGHT-TO-LEFT OVERRIDE
    '\u2066',  # LEFT-TO-RIGHT ISOLATE
    '\u2067',  # RIGHT-TO-LEFT ISOLATE
    '\u2068',  # FIRST STRONG ISOLATE
    '\u2069',  # POP DIRECTIONAL ISOLATE
})

35 

36 

def reveal(text: str, *, substitute: str = "␣", format: str = "unicode") -> str:
    """Replace invisible characters with visible markers.

    Each character is classified in order: whitespace (per ``is_whitespace``)
    becomes ``substitute``; any other invisible character (per
    ``is_invisible``) becomes a marker chosen by ``format``; everything else
    is copied through unchanged.

    Args:
        text: Input string
        substitute: Marker for standard whitespace (default: "␣")
        format: Output format - "unicode" for <U+200B>, "hex" for \\u200b
            (code points above U+FFFF are written as \\UXXXXXXXX so the
            escape stays unambiguous), "name" for ZERO WIDTH SPACE

    Returns:
        String with invisibles replaced by visible markers

    Raises:
        ValueError: If format is not recognized

    Examples:
        >>> reveal("hello\\u200Bworld")
        'hello<U+200B>world'
        >>> reveal("hello\\u200Bworld", format="hex")
        'hello\\\\u200bworld'
        >>> reveal("hello\\u200Bworld", format="name")
        'hello<ZERO WIDTH SPACE>world'
        >>> reveal("hello world", substitute="·")
        'hello·world'
    """
    if format not in ("unicode", "hex", "name"):
        raise ValueError(f"Invalid format: {format!r}. Must be 'unicode', 'hex', or 'name'")

    result: list[str] = []

    for char in text:
        if is_whitespace(char):
            # Standard whitespace - use substitute marker
            result.append(substitute)
        elif is_invisible(char):
            # Invisible character - render according to the format parameter
            codepoint = ord(char)
            if format == "unicode":
                result.append(f"<U+{codepoint:04X}>")
            elif format == "hex":
                if codepoint > 0xFFFF:
                    # \u takes exactly four hex digits; a five- or six-digit
                    # value after \u would be misread as a BMP escape plus
                    # trailing text, so use the eight-digit \U form.
                    result.append(f"\\U{codepoint:08x}")
                else:
                    result.append(f"\\u{codepoint:04x}")
            else:  # format == "name"
                # Fall back to the U+XXXX notation when no name is available
                name = get_unicode_name(char, f"U+{codepoint:04X}")
                result.append(f"<{name}>")
        else:
            # Regular character - keep as-is
            result.append(char)

    return ''.join(result)

87 

88 

def sanitize(text: str, *, schema: str = "strict", preserve_zwj: bool = False) -> str:
    """Strip invisible characters from ``text``, leaving whitespace intact.

    Args:
        text: Input string
        schema: Sanitization schema:
            - "strict": Drop every invisible character; whitespace stays
            - "permissive": Same, but ZWJ is kept for emoji sequences
        preserve_zwj: If True, keep ZWJ regardless of the chosen schema

    Returns:
        Copy of ``text`` without the removed invisible characters

    Raises:
        ValueError: If schema is not recognized

    Examples:
        >>> sanitize("hello\\u200Bworld")
        'helloworld'
        >>> sanitize("hello world")  # Preserves spaces
        'hello world'
        >>> sanitize("family👨\\u200D👩\\u200D👧", schema="permissive")
        'family👨\\u200d👩\\u200d👧'
        >>> sanitize("family👨\\u200D👩\\u200D👧", preserve_zwj=True)
        'family👨\\u200d👩\\u200d👧'
    """
    if schema not in ("strict", "permissive"):
        raise ValueError(f"Invalid schema: {schema!r}. Must be 'strict' or 'permissive'")

    zwj_allowed = preserve_zwj or schema == "permissive"

    def _survives(ch: str) -> bool:
        # Whitespace is always retained.
        if is_whitespace(ch):
            return True
        # ZWJ is retained only when the caller opted in.
        if zwj_allowed and ch == ZWJ:
            return True
        # Anything else stays only if it is not invisible.
        return not is_invisible(ch)

    return ''.join(ch for ch in text if _survives(ch))

133 

134 

def detect_invisibles(text: str) -> list[tuple[int, str, str]]:
    """Locate every invisible character in ``text``.

    Args:
        text: Input string to analyze

    Returns:
        One (position, character, unicode_name) tuple per invisible
        character, in order of appearance. Characters classified as
        whitespace are never reported. When no Unicode name is available
        the "U+XXXX" notation is passed as the fallback name.

    Examples:
        >>> detect_invisibles("hello\\u200Bworld")
        [(5, '\\u200b', 'ZERO WIDTH SPACE')]
        >>> detect_invisibles("hello world")  # Space is not invisible
        []
        >>> text = "a\\u202Eb\\u200Bc"
        >>> invisibles = detect_invisibles(text)
        >>> len(invisibles)
        2
        >>> invisibles[0][2]  # Unicode name
        'RIGHT-TO-LEFT OVERRIDE'
    """
    return [
        (index, ch, get_unicode_name(ch, f"U+{ord(ch):04X}"))
        for index, ch in enumerate(text)
        if not is_whitespace(ch) and is_invisible(ch)
    ]

165 

166 

167def count_by_category(text: str) -> dict[str, int]: 

168 """Count characters by Unicode category. 

169 

170 Useful for analyzing text composition and identifying potential issues. 

171 

172 Args: 

173 text: Input string to analyze 

174 

175 Returns: 

176 Dict mapping category code to count 

177 

178 Examples: 

179 >>> count_by_category("hello") 

180 {'Ll': 5} 

181 >>> count_by_category("Hello World!") 

182 {'Lu': 2, 'Ll': 8, 'Zs': 1, 'Po': 1} 

183 >>> result = count_by_category("test\\u200B") 

184 >>> result['Cf'] # Format category (invisible) 

185 1 

186 """ 

187 counts: dict[str, int] = {} 

188 

189 for char in text: 

190 category = get_category(char) 

191 counts[category] = counts.get(category, 0) + 1 

192 

193 return counts