Coverage for src/indium/spoofing.py: 94%

72 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2026-01-08 22:34 +0000

1"""Visual spoofing detection via homoglyphs and mixed scripts. 

2 

3This module provides functions to detect and normalize visually confusable 

4characters that could be used in phishing attacks, domain spoofing, or 

5other security vulnerabilities. 

6""" 

7 

8import bisect 

9import unicodedata 

10from functools import lru_cache 

11 

12from ._confusables import CONFUSABLES 

13from ._scripts_data import SCRIPT_RANGES 

14 

15 

def skeleton(text: str) -> str:
    """Convert text to its visual skeleton (canonical confusable form).

    Characters that merely *look* like another letter are folded onto the
    prototype stored in ``CONFUSABLES``:
    - Cyrillic 'а' (U+0430) → Latin 'a'
    - Greek 'ο' (U+03BF) → Latin 'o'
    - Fullwidth 'ａ' (U+FF41) → Latin 'a'

    Uses NFKC normalization + confusables map (~70 chars).

    Args:
        text: Input string to normalize

    Returns:
        Normalized string with confusables replaced by their mapped
        equivalents. Pure-ASCII input is returned unchanged.

    Examples:
        >>> skeleton("pаypal")  # Cyrillic 'а'
        'paypal'
        >>> skeleton("gοοgle")  # Greek 'ο'
        'google'
        >>> skeleton("ｆａｃｅｂｏｏｋ")  # Fullwidth
        'facebook'
        >>> skeleton(skeleton("test"))  # Idempotent
        'test'
    """
    # Fast path: ASCII input is returned as-is, skipping both the NFKC pass
    # and the confusables lookup entirely.
    if text.isascii():
        return text

    # Step 1: NFKC (compatibility) normalization folds fullwidth,
    # mathematical and other styled variants onto their plain forms.
    normalized = unicodedata.normalize('NFKC', text)

    # Step 2: map each remaining lookalike to its prototype; characters
    # absent from the table pass through untouched.
    return ''.join(CONFUSABLES.get(char, char) for char in normalized)

57 

58 

def is_mixed_script(text: str, *, ignore_common: bool = True) -> bool:
    """Report whether any single word in ``text`` combines several scripts.

    The text is split on whitespace and every token is inspected on its
    own, so two neighbouring words written in different scripts do not
    count as mixing.

    Args:
        text: Input string to analyze
        ignore_common: If True, characters classified as 'Common',
            'Inherited' or 'Unknown' (numbers, punctuation, emoji, ...)
            are left out of the comparison. Default: True

    Returns:
        True if at least one whitespace-delimited token contains
        characters from more than one script, otherwise False

    Examples:
        >>> is_mixed_script("hello")  # Pure Latin
        False
        >>> is_mixed_script("привет")  # Pure Cyrillic
        False
        >>> is_mixed_script("helloпривет")  # Mixed Latin+Cyrillic
        True
        >>> is_mixed_script("hello123")  # Numbers are Common script
        False
        >>> is_mixed_script("hello мир")  # Different words, different scripts
        False
    """
    # Nothing to inspect, or pure ASCII with Common ignored: ASCII is only
    # ever Latin or Common, so it cannot be mixed in that mode.
    if not text or (ignore_common and text.isascii()):
        return False

    # Script names that never contribute to the per-word tally.
    neutral = ('Common', 'Inherited', 'Unknown') if ignore_common else ()

    for token in text.split():
        distinct = {
            name
            for name in map(_get_script_name, token)
            if name not in neutral
        }
        # Two or more real scripts inside one token means spoofing risk.
        if len(distinct) > 1:
            return True

    return False

110 

111 

def get_script_blocks(text: str) -> list[tuple[str, int, int]]:
    """Split text into maximal runs of characters sharing one script.

    Args:
        text: Input string to analyze

    Returns:
        List of (script_name, start_pos, end_pos) tuples in text order.
        ``end_pos`` is exclusive, so ``text[start_pos:end_pos]`` is the run.
        An empty input yields an empty list.

    Examples:
        >>> get_script_blocks("hello")
        [('Latin', 0, 5)]
        >>> get_script_blocks("helloпривет")
        [('Latin', 0, 5), ('Cyrillic', 5, 11)]
        >>> blocks = get_script_blocks("test123")
        >>> len(blocks)
        2
        >>> blocks[0][0]  # First block is Latin
        'Latin'
    """
    if not text:
        return []

    # Classify every character once, left to right.
    scripts = [_get_script_name(char) for char in text]

    # A run begins at index 0 and at every index whose script differs
    # from the script of the character just before it.
    starts = [0]
    starts.extend(
        pos
        for pos, (before, after) in enumerate(zip(scripts, scripts[1:]), start=1)
        if before != after
    )

    # Each run ends where the next one begins; the last ends with the text.
    ends = starts[1:] + [len(text)]

    return [(scripts[begin], begin, end) for begin, end in zip(starts, ends)]

152 

153 

def detect_confusables(
    text: str, target_script: str = "Latin"
) -> list[tuple[int, str, str]]:
    """Find characters that look like target script but aren't.

    A character is reported when it has an entry in ``CONFUSABLES`` and
    its own script differs from ``target_script``. The replacement that is
    reported is always the value stored in ``CONFUSABLES``; ``target_script``
    only decides which characters are exempt from reporting.

    Args:
        text: Input string to analyze
        target_script: Script to check against (default: "Latin")

    Returns:
        List of (position, character, confusable_with) tuples, ordered by
        position in ``text``

    Examples:
        >>> detect_confusables("pаypal")  # Cyrillic 'а' looks like Latin 'a'
        [(1, 'а', 'a')]
        >>> detect_confusables("hello")  # All Latin
        []
        >>> result = detect_confusables("gοοgle")  # Greek 'ο'
        >>> len(result)
        2
        >>> result[0][2]  # Confusable with
        'o'
    """
    # Fast path: ASCII text is already Latin/Common, so it cannot contain
    # characters from other scripts that mimic Latin.
    if target_script == "Latin" and text.isascii():
        return []

    findings: list[tuple[int, str, str]] = []

    for pos, char in enumerate(text):
        # Single lookup: None means the character is not a known lookalike.
        lookalike = CONFUSABLES.get(char)
        if lookalike is None:
            continue

        # Only flag lookalikes that do not genuinely belong to the target
        # script; a real member of that script is not an impostor.
        if _get_script_name(char) != target_script:
            findings.append((pos, char, lookalike))

    return findings

194 

195 

196# Internal helper: Get script name for a character 

197@lru_cache(maxsize=4096) 

198def _get_script_name(char: str) -> str: 

199 """Get Unicode script name for character. 

200 

201 Uses binary search over generated Unicode data tables. 

202 

203 Args: 

204 char: Single character 

205 

206 Returns: 

207 Script name (e.g., "Latin", "Cyrillic", "Greek", "Common") 

208 """ 

209 if len(char) != 1: 

210 return "Unknown" 

211 

212 codepoint = ord(char) 

213 

214 # Binary search to find the script range 

215 # SCRIPT_RANGES is a sorted list of (start_cp, script_name) 

216 # bisect_right returns the insertion point to maintain order 

217 # index-1 gives the range that starts <= codepoint 

218 idx = bisect.bisect_right(SCRIPT_RANGES, (codepoint, 'zzzzzz')) 

219 

220 if idx == 0: 

221 return "Unknown" 

222 

223 start_cp, script = SCRIPT_RANGES[idx - 1] 

224 

225 # Note: Our generator fills gaps with "Unknown", so this covers 

226 # the case where codepoint is in a gap (implicit or explicit) 

227 

228 if script in ('Hiragana', 'Katakana', 'Han', 'Hangul'): 

229 return 'CJK' 

230 

231 if script == "Unknown": 

232 # Fallback to category heuristics for Common/Inherited if unknown script 

233 # This handles characters not yet in our table or special categories 

234 category = unicodedata.category(char) 

235 if category.startswith('N') or category.startswith('P') or \ 

236 category.startswith('S') or category.startswith('Z'): 

237 return "Common" 

238 if category.startswith('M'): 

239 return "Inherited" 

240 

241 return script 

242