Coverage for src/indium/invisibles.py: 100%
50 statements
« prev ^ index » next coverage.py v7.10.7, created at 2026-01-08 22:34 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2026-01-08 22:34 +0000
1"""Invisible character detection and sanitization.
3This module provides functions to reveal, sanitize, and analyze invisible
4characters in Unicode text. Useful for security validation, log analysis,
5and preventing invisible character attacks.
6"""
8from typing import Final
10from ._unicode_data import (
11 get_category,
12 get_unicode_name,
13 is_invisible,
14 is_whitespace,
15)
# Individually named zero-width / formatting characters that callers may
# need to treat specially.
ZWJ: Final[str] = "\u200d"  # ZERO WIDTH JOINER (glues emoji sequences together)
ZWNJ: Final[str] = "\u200c"  # ZERO WIDTH NON-JOINER
ZWSP: Final[str] = "\u200b"  # ZERO WIDTH SPACE
SOFT_HYPHEN: Final[str] = "\u00ad"  # SOFT HYPHEN

# Bidirectional control characters. They can change the visual order of the
# surrounding text, which makes them a spoofing risk in untrusted input.
# The set is built from one implicitly concatenated string: iterating a str
# yields its individual characters.
BIDI_CONTROLS: Final[frozenset[str]] = frozenset(
    "\u202a"  # LEFT-TO-RIGHT EMBEDDING
    "\u202b"  # RIGHT-TO-LEFT EMBEDDING
    "\u202c"  # POP DIRECTIONAL FORMATTING
    "\u202d"  # LEFT-TO-RIGHT OVERRIDE
    "\u202e"  # RIGHT-TO-LEFT OVERRIDE
    "\u2066"  # LEFT-TO-RIGHT ISOLATE
    "\u2067"  # RIGHT-TO-LEFT ISOLATE
    "\u2068"  # FIRST STRONG ISOLATE
    "\u2069"  # POP DIRECTIONAL ISOLATE
)
def reveal(text: str, *, substitute: str = "␣", format: str = "unicode") -> str:
    """Replace invisible characters with visible markers.

    Each character is classified with ``is_whitespace`` first and
    ``is_invisible`` second, so a character matching both is rendered as
    ``substitute``.

    Args:
        text: Input string
        substitute: Marker for standard whitespace (default: "␣")
        format: Output format - "unicode" for <U+200B>, "hex" for \\u200b
            (code points above U+FFFF are written as \\UXXXXXXXX, since
            \\u only takes four hex digits), "name" for ZERO WIDTH SPACE

    Returns:
        String with invisibles replaced by visible markers

    Raises:
        ValueError: If format is not recognized

    Examples:
        >>> reveal("hello\\u200Bworld")
        'hello<U+200B>world'
        >>> reveal("hello\\u200Bworld", format="hex")
        'hello\\\\u200bworld'
        >>> reveal("hello\\u200Bworld", format="name")
        'hello<ZERO WIDTH SPACE>world'
        >>> reveal("hello world", substitute="·")
        'hello·world'
    """
    if format not in ("unicode", "hex", "name"):
        raise ValueError(f"Invalid format: {format!r}. Must be 'unicode', 'hex', or 'name'")

    result: list[str] = []

    for char in text:
        if is_whitespace(char):
            # Standard whitespace - use substitute marker
            result.append(substitute)
        elif is_invisible(char):
            codepoint = ord(char)
            if format == "unicode":
                result.append(f"<U+{codepoint:04X}>")
            elif format == "hex":
                if codepoint > 0xFFFF:
                    # A \u escape is exactly four hex digits; emitting five
                    # or more would read back as a different character
                    # followed by stray digits. Use the eight-digit form.
                    result.append(f"\\U{codepoint:08x}")
                else:
                    result.append(f"\\u{codepoint:04x}")
            else:  # format == "name"
                # Fall back to the U+XXXX label when no name is available.
                name = get_unicode_name(char, f"U+{codepoint:04X}")
                result.append(f"<{name}>")
        else:
            # Regular character - keep as-is
            result.append(char)

    return ''.join(result)
def sanitize(text: str, *, schema: str = "strict", preserve_zwj: bool = False) -> str:
    """Strip invisible characters from ``text``, leaving whitespace intact.

    Args:
        text: Input string
        schema: Which sanitization rules to apply:
            - "strict": drop every invisible character; only standard
              whitespace survives
            - "permissive": additionally keep ZWJ so emoji sequences stay
              joined
        preserve_zwj: When True, ZWJ is kept regardless of ``schema``

    Returns:
        A copy of ``text`` without the removed invisible characters

    Raises:
        ValueError: If schema is not recognized

    Examples:
        >>> sanitize("hello\\u200Bworld")
        'helloworld'
        >>> sanitize("hello world")  # Preserves spaces
        'hello world'
        >>> sanitize("family👨\\u200D👩\\u200D👧", schema="permissive")
        'family👨\\u200d👩\\u200d👧'
        >>> sanitize("family👨\\u200D👩\\u200D👧", preserve_zwj=True)
        'family👨\\u200d👩\\u200d👧'
    """
    known_schemas = ("strict", "permissive")
    if schema not in known_schemas:
        raise ValueError(f"Invalid schema: {schema!r}. Must be 'strict' or 'permissive'")

    zwj_allowed = preserve_zwj or schema == "permissive"

    def _survives(ch: str) -> bool:
        """Return True when ``ch`` belongs in the sanitized output."""
        # Standard whitespace is always retained.
        if is_whitespace(ch):
            return True
        # ZWJ is retained only when the caller opted in.
        if zwj_allowed and ch == ZWJ:
            return True
        # Anything else is retained unless it is invisible.
        return not is_invisible(ch)

    return "".join(filter(_survives, text))
def detect_invisibles(text: str) -> list[tuple[int, str, str]]:
    """Locate every invisible character in ``text``.

    Args:
        text: Input string to analyze

    Returns:
        One (position, character, unicode_name) tuple per invisible
        character, in order of appearance. Positions are the indices
        produced by enumerating the string. Standard whitespace is NOT
        reported. When no name is available the ``U+XXXX`` label is used
        instead.

    Examples:
        >>> detect_invisibles("hello\\u200Bworld")
        [(5, '\\u200b', 'ZERO WIDTH SPACE')]
        >>> detect_invisibles("hello world")  # Space is not invisible
        []
        >>> text = "a\\u202Eb\\u200Bc"
        >>> invisibles = detect_invisibles(text)
        >>> len(invisibles)
        2
        >>> invisibles[0][2]  # Unicode name
        'RIGHT-TO-LEFT OVERRIDE'
    """
    return [
        (index, ch, get_unicode_name(ch, f"U+{ord(ch):04X}"))
        for index, ch in enumerate(text)
        # Whitespace is checked first and excluded even if it also counts
        # as invisible.
        if not is_whitespace(ch) and is_invisible(ch)
    ]
167def count_by_category(text: str) -> dict[str, int]:
168 """Count characters by Unicode category.
170 Useful for analyzing text composition and identifying potential issues.
172 Args:
173 text: Input string to analyze
175 Returns:
176 Dict mapping category code to count
178 Examples:
179 >>> count_by_category("hello")
180 {'Ll': 5}
181 >>> count_by_category("Hello World!")
182 {'Lu': 2, 'Ll': 8, 'Zs': 1, 'Po': 1}
183 >>> result = count_by_category("test\\u200B")
184 >>> result['Cf'] # Format category (invisible)
185 1
186 """
187 counts: dict[str, int] = {}
189 for char in text:
190 category = get_category(char)
191 counts[category] = counts.get(category, 0) + 1
193 return counts