Coverage for src/indium/spoofing.py: 94%
72 statements
coverage.py v7.10.7, created at 2026-01-08 22:34 +0000
1"""Visual spoofing detection via homoglyphs and mixed scripts.
3This module provides functions to detect and normalize visually confusable
4characters that could be used in phishing attacks, domain spoofing, or
5other security vulnerabilities.
6"""
8import bisect
9import unicodedata
10from functools import lru_cache
12from ._confusables import CONFUSABLES
13from ._scripts_data import SCRIPT_RANGES
def skeleton(text: str) -> str:
    """Reduce text to its visual skeleton (canonical confusable form).

    Characters that merely look like Latin letters are replaced by the
    Latin prototype they imitate:
    - Cyrillic 'а' (U+0430) → Latin 'a'
    - Greek 'ο' (U+03BF) → Latin 'o'
    - Fullwidth 'ａ' (U+FF41) → Latin 'a'

    The text is NFKC-normalized first; every resulting character is then
    looked up in the confusables map (~70 chars) and kept unchanged when
    it has no entry there.

    Args:
        text: Input string to normalize

    Returns:
        Normalized string with confusables replaced by Latin equivalents

    Examples:
        >>> skeleton("pаypal")  # Cyrillic 'а'
        'paypal'
        >>> skeleton("gοοgle")  # Greek 'ο'
        'google'
        >>> skeleton("ｆａｃｅｂｏｏｋ")  # Fullwidth
        'facebook'
        >>> skeleton(skeleton("test"))  # Idempotent
        'test'
    """
    # Pure-ASCII input is returned untouched: neither NFKC nor the
    # confusables lookup is applied to it.
    if text.isascii():
        return text

    # NFKC first (folds compatibility variants such as fullwidth forms),
    # then a per-character lookup for the lookalikes that survive it.
    folded = unicodedata.normalize('NFKC', text)
    return ''.join(CONFUSABLES.get(ch, ch) for ch in folded)
def is_mixed_script(text: str, *, ignore_common: bool = True) -> bool:
    """Report whether any single word of text combines several scripts.

    The text is split on whitespace and each word is examined on its own,
    so two neighbouring words written in different scripts do not count
    as mixed.

    Args:
        text: Input string to analyze
        ignore_common: If True, characters whose script is reported as
            'Common', 'Inherited' or 'Unknown' (numbers, punctuation,
            emoji, ...) are left out of the comparison. Default: True

    Returns:
        True if at least one whitespace-separated word contains more than
        one script, otherwise False

    Examples:
        >>> is_mixed_script("hello")  # Pure Latin
        False
        >>> is_mixed_script("привет")  # Pure Cyrillic
        False
        >>> is_mixed_script("helloпривет")  # Mixed Latin+Cyrillic
        True
        >>> is_mixed_script("hello123")  # Numbers are Common script
        False
        >>> is_mixed_script("hello мир")  # Different words, different scripts
        False
    """
    # Nothing to inspect, or pure ASCII while Common is ignored: ASCII is
    # only ever Latin or Common, so it cannot be mixed in that mode.
    if not text or (ignore_common and text.isascii()):
        return False

    neutral = ('Common', 'Inherited', 'Unknown')

    for token in text.split():
        seen = {_get_script_name(ch) for ch in token}
        if ignore_common:
            seen.difference_update(neutral)
        # More than one remaining script inside one word means mixing.
        if len(seen) > 1:
            return True

    return False
def get_script_blocks(text: str) -> list[tuple[str, int, int]]:
    """Split text into maximal runs of characters sharing one script.

    Args:
        text: Input string to analyze

    Returns:
        List of (script_name, start_pos, end_pos) tuples in text order.
        start_pos is the index of the run's first character and end_pos
        is one past its last character; an empty string yields [].

    Examples:
        >>> get_script_blocks("hello")
        [('Latin', 0, 5)]
        >>> get_script_blocks("helloпривет")
        [('Latin', 0, 5), ('Cyrillic', 5, 11)]
        >>> blocks = get_script_blocks("test123")
        >>> len(blocks)
        2
        >>> blocks[0][0]  # First block is Latin
        'Latin'
    """
    if not text:
        return []

    # Script of every character, in order.
    scripts = [_get_script_name(ch) for ch in text]

    # A run boundary sits at each index whose script differs from the one
    # just before it; 0 and len(text) close the first and last run.
    cuts = [0]
    cuts.extend(
        idx
        for idx, (before, after) in enumerate(zip(scripts, scripts[1:]), start=1)
        if before != after
    )
    cuts.append(len(text))

    return [(scripts[lo], lo, hi) for lo, hi in zip(cuts, cuts[1:])]
def detect_confusables(
    text: str, target_script: str = "Latin"
) -> list[tuple[int, str, str]]:
    """List the characters of text that imitate another script.

    A character is reported when it has an entry in the confusables map
    and its own script differs from target_script.

    Args:
        text: Input string to analyze
        target_script: Script to check against (default: "Latin")

    Returns:
        List of (position, character, confusable_with) tuples in text
        order, where confusable_with is the confusables-map entry for the
        character

    Examples:
        >>> detect_confusables("pаypal")  # Cyrillic 'а' looks like Latin 'a'
        [(1, 'а', 'a')]
        >>> detect_confusables("hello")  # All Latin
        []
        >>> result = detect_confusables("gοοgle")  # Greek 'ο'
        >>> len(result)
        2
        >>> result[0][2]  # Confusable with
        'o'
    """
    # ASCII text checked against Latin is skipped outright: it is already
    # Latin/Common, so nothing in it can be mimicking Latin.
    if target_script == "Latin" and text.isascii():
        return []

    # The script lookup only runs for characters present in the map.
    return [
        (index, ch, CONFUSABLES[ch])
        for index, ch in enumerate(text)
        if ch in CONFUSABLES and _get_script_name(ch) != target_script
    ]
196# Internal helper: Get script name for a character
@lru_cache(maxsize=4096)
def _get_script_name(char: str) -> str:
    """Look up the Unicode script name of a single character.

    The lookup is a binary search over SCRIPT_RANGES; results are
    memoized with an LRU cache of 4096 entries.

    Args:
        char: Single character

    Returns:
        Script name (e.g., "Latin", "Cyrillic", "Greek", "Common").
        Hiragana, Katakana, Han and Hangul are all reported as "CJK".
        "Unknown" is returned for input that is not exactly one character
        and for code points below the first table entry. A table result
        of "Unknown" is refined to "Common" or "Inherited" from the
        character's general category where possible.
    """
    if len(char) != 1:
        return "Unknown"

    # SCRIPT_RANGES is a sorted list of (start_cp, script_name) pairs.
    # bisect_right yields the insertion point, so the entry just before
    # it is the range starting at or below this code point.
    # NOTE(review): 'zzzzzz' presumably sorts after every script name so
    # that a range starting exactly at this code point lands left of the
    # insertion point — confirm against the generated table.
    probe = (ord(char), 'zzzzzz')
    slot = bisect.bisect_right(SCRIPT_RANGES, probe)

    if slot == 0:
        return "Unknown"

    # Per the original note, the generator fills gaps with "Unknown", so
    # the preceding entry also covers code points that fall in a gap.
    _, script = SCRIPT_RANGES[slot - 1]

    if script in ('Hiragana', 'Katakana', 'Han', 'Hangul'):
        return 'CJK'

    if script == "Unknown":
        # Fall back to general-category heuristics for characters the
        # table does not classify: numbers, punctuation, symbols and
        # separators count as Common, combining marks as Inherited.
        category = unicodedata.category(char)
        if category.startswith(('N', 'P', 'S', 'Z')):
            return "Common"
        if category.startswith('M'):
            return "Inherited"

    return script