Coverage for src / core / tool_name_extractor.py: 95%

134 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2026-01-04 04:43 +0000

"""Extract human-readable tool names from shell commands.

This module provides a utility function to extract the primary tool name from
complex shell commands for use in quality gate messaging, lint caching, and
logging. For example:
- "npx eslint ." -> "eslint"
- "uvx ruff check ." -> "ruff"
- "uv run pytest" -> "pytest"

This module is in src.core because it has no dependencies on domain/pipeline/
orchestration/infra layers and is used by multiple layers.
"""

13 

14from __future__ import annotations 

15 

16import logging 

17import shlex 

18 

logger = logging.getLogger(__name__)

# Wrappers where we skip the wrapper and use the next positional argument.
# Note: pnpm and yarn are NOT here because they have compound command handling.
_SINGLE_TOKEN_WRAPPERS: frozenset[str] = frozenset({"npx", "bunx", "uvx", "pipx"})

# Multi-token wrapper sequences where we skip the entire sequence.
# Format: (first_token, second_token) -> skip both tokens.
_MULTI_TOKEN_WRAPPERS: frozenset[tuple[str, str]] = frozenset(
    {
        ("python", "-m"),
        ("python3", "-m"),
        ("uv", "run"),
        ("poetry", "run"),
        ("pipx", "run"),
    }
)

# Compound commands where we include the subcommand in the tool name.
# Format: base_command -> set of subcommands that form compound tools.
_COMPOUND_COMMANDS: dict[str, frozenset[str]] = {
    "go": frozenset({"test", "build", "vet", "fmt", "mod", "generate"}),
    "cargo": frozenset({"clippy", "test", "build", "check", "fmt", "bench"}),
    "npm": frozenset({"test", "run"}),
    "pnpm": frozenset({"test", "run"}),
    "yarn": frozenset({"test", "run"}),
}

# Shell built-ins to skip (these appear before a real command).
_SHELL_BUILTINS: frozenset[str] = frozenset(
    {
        "export",
        "set",
        "unset",
        "source",
        ".",
        "eval",
        "exec",
        "cd",
        "pushd",
        "popd",
        "alias",
        "unalias",
        "declare",
        "local",
        "readonly",
        "typeset",
    }
)

# Built-ins treated as taking a single path/file argument, which is skipped
# together with the built-in itself.
_PATH_BUILTINS: frozenset[str] = frozenset({"source", ".", "cd", "pushd", "popd"})

# Built-ins whose arguments are not commands (skip the rest of the segment).
# They operate on variable assignments, definitions, or shell options /
# positional parameters rather than executing commands passed as arguments.
_SKIP_REST_BUILTINS: frozenset[str] = frozenset(
    {
        "export",
        "set",
        "unset",
        "alias",
        "unalias",
        "declare",
        "local",
        "readonly",
        "typeset",
    }
)

# Wrapper flags that consume a value (skip flag + value).
_WRAPPER_VALUE_FLAGS: frozenset[str] = frozenset(
    {
        "-p",
        "--package",
        "--from",
        "--extra",
        "--with",
        # uv run flags that take a value
        "--group",
        "--only-group",
        "--no-group",
        "--python",
        "-c",
        "--directory",
        "--env-file",
        "--config-file",
    }
)

# Commands that are typically setup/shell commands, not actual tools.
# When these appear in a multi-segment command (with && or ;), prefer later
# segments.
_SETUP_COMMANDS: frozenset[str] = frozenset({"cd", "echo", "install"})

112 

113 

def _is_env_assignment(token: str) -> bool:
    """Check if a token is an environment variable assignment.

    A token qualifies when the text before its first ``=`` is a valid shell
    variable name: ASCII letters, digits and underscores only, and not
    starting with a digit.

    Args:
        token: Token to check.

    Returns:
        True if token looks like a ``VAR=value`` assignment.
    """
    name, separator, _ = token.partition("=")
    # No "=" at all, or nothing before it (e.g. "=value"), is not an assignment.
    if not separator or not name:
        return False
    # Shell variable names are limited to ASCII; str.isalnum() alone would
    # also accept Unicode letters and digits such as "é" or "²".
    if not name.isascii() or name[0].isdigit():
        return False
    return all(ch.isalnum() or ch == "_" for ch in name)

131 

132 

def _strip_path_prefix(cmd: str) -> str:
    """Strip path prefixes from command names.

    Args:
        cmd: Command that might have a path prefix.

    Returns:
        The text after the last ``/`` in ``cmd``, or ``cmd`` unchanged when it
        contains no ``/``. A value ending in ``/`` therefore yields an empty
        string.
    """
    # rpartition returns ("", "", cmd) when there is no "/", so the last
    # element is the right answer in both cases.
    return cmd.rpartition("/")[2]

146 

147 

def _parse_command(command: str) -> list[str]:
    """Parse a command string into tokens.

    Uses shlex for proper shell parsing, and falls back to whitespace
    splitting (with a logged warning) if shlex raises ValueError.

    Args:
        command: Shell command string.

    Returns:
        List of parsed tokens.
    """
    try:
        return shlex.split(command)
    except ValueError:
        # Malformed input (shlex could not tokenize it): degrade to a plain
        # whitespace split so callers still get a best-effort token list.
        logger.warning(
            "shlex.split failed for command, using whitespace split: %r", command
        )
        return command.split()

168 

169 

def _skip_builtin_arguments(tokens: list[str], idx: int, builtin: str) -> int:
    """Skip arguments to a shell built-in.

    Different built-ins have different argument patterns:
    - Built-ins in _SKIP_REST_BUILTINS (export, set, ...): skip all remaining
      tokens, since their arguments are variable assignments, options, or
      positional params, not commands to execute.
    - Built-ins in _PATH_BUILTINS (source, ., cd, ...): skip one argument if
      there is one.
    - Any other built-in: skip nothing.

    Args:
        tokens: Parsed command tokens.
        idx: Current index (pointing to first token after built-in).
        builtin: The built-in command name.

    Returns:
        New index after skipping all built-in arguments.
    """
    if builtin in _SKIP_REST_BUILTINS:
        # Nothing left in this segment can be a command to execute.
        return len(tokens)
    if builtin in _PATH_BUILTINS and idx < len(tokens):
        # Consume the single path argument.
        return idx + 1
    return idx

196 

197 

def _skip_wrapper_flags(tokens: list[str], idx: int) -> int:
    """Skip wrapper flags (and their values) to find the actual tool token.

    Starting at ``idx``, every token beginning with ``-`` is skipped. Flags
    listed in _WRAPPER_VALUE_FLAGS (compared case-insensitively) also skip
    the token that follows them.

    Args:
        tokens: Parsed command tokens.
        idx: Index of the first token after the wrapper.

    Returns:
        Index of the first token that does not start with ``-``. If no such
        token exists the result is ``>= len(tokens)``; it can exceed
        ``len(tokens)`` when a value-taking flag is the last token, so callers
        must compare with ``>=``.
    """
    while idx < len(tokens):
        token = tokens[idx]
        if not token.startswith("-"):
            return idx
        # Value-taking flags consume the next token as well.
        idx += 2 if token.lower() in _WRAPPER_VALUE_FLAGS else 1
    return idx

210 

211 

def _extract_from_tokens(tokens: list[str]) -> str:
    """Extract tool name from a list of parsed tokens.

    All matching against internal token sets is case-insensitive, allowing
    commands like "CARGO clippy" or "NPX eslint" to be recognized correctly.
    The returned tool name is always lowercase to ensure consistent
    lint_type identification.

    Args:
        tokens: Parsed command tokens.

    Returns:
        Extracted tool name in lowercase, or an empty string when the tokens
        contain only env assignments and/or shell built-ins.
    """
    if not tokens:
        return ""

    # Skip leading env var assignments (FOO=bar cmd ...).
    idx = 0
    while idx < len(tokens) and _is_env_assignment(tokens[idx]):
        idx += 1
    if idx >= len(tokens):
        # Only env assignments, return empty
        return ""

    first = _strip_path_prefix(tokens[idx])
    first_lower = first.lower()

    # Skip shell built-ins and their arguments (case-insensitive check).
    while first_lower in _SHELL_BUILTINS:
        idx = _skip_builtin_arguments(tokens, idx + 1, first_lower)
        if idx >= len(tokens):
            return ""  # Only built-ins, return empty
        first = _strip_path_prefix(tokens[idx])
        first_lower = first.lower()

    # Multi-token wrapper sequences such as "uv run" / "python -m".
    if idx + 1 < len(tokens):
        second = tokens[idx + 1]
        if (first_lower, second.lower()) in _MULTI_TOKEN_WRAPPERS:
            # Skip both wrapper tokens, then any wrapper flags. If nothing is
            # left, fall back to the wrapper name itself.
            idx = _skip_wrapper_flags(tokens, idx + 2)
            if idx >= len(tokens):
                logger.warning("Wrapper %s %s without following command", first, second)
                return first_lower
            first = _strip_path_prefix(tokens[idx])
            first_lower = first.lower()

    # Compound commands are checked BEFORE single-token wrappers so that
    # npm, pnpm and yarn are recognized as compound commands.
    if first_lower in _COMPOUND_COMMANDS and idx + 1 < len(tokens):
        subcommand = tokens[idx + 1].lower()
        # npm/pnpm/yarn run has special handling: npm run lint -> npm run:lint
        if first_lower in ("npm", "pnpm", "yarn") and subcommand == "run":
            if idx + 2 < len(tokens):
                script_name = tokens[idx + 2].lower()
                return f"{first_lower} run:{script_name}"
            return f"{first_lower} run"
        # Other compound commands: go test -> go test
        if subcommand in _COMPOUND_COMMANDS[first_lower]:
            return f"{first_lower} {subcommand}"

    # Single-token wrappers (npx, uvx, ...): the tool is the next positional.
    if first_lower in _SINGLE_TOKEN_WRAPPERS:
        idx = _skip_wrapper_flags(tokens, idx + 1)
        if idx >= len(tokens):
            logger.warning("Wrapper %s without following command", first)
            return first_lower
        first = _strip_path_prefix(tokens[idx])

    return first.lower()

296 

297 

def _is_meaningful_tool(tool_name: str) -> bool:
    """Check if a tool name is meaningful for reporting.

    Some commands like 'cd', 'echo', 'npm install' are setup commands
    rather than the actual tool being run. Checks are case-insensitive.

    Args:
        tool_name: Extracted tool name.

    Returns:
        True if the tool name is meaningful. Empty and whitespace-only names
        are never meaningful.
    """
    # split() on an empty or whitespace-only name yields no words; indexing
    # [0] unconditionally would raise IndexError for input such as " ".
    words = tool_name.split() if tool_name else []
    if not words:
        return False
    # Base command (first word), normalized to lowercase for comparison.
    base = words[0].lower()
    # Skip common setup commands
    if base in _SETUP_COMMANDS:
        return False
    # npm without compound (i.e., just "npm") or anything mentioning
    # "install" is setup
    if base == "npm":
        tool_name_lower = tool_name.lower()
        if tool_name_lower == "npm" or "install" in tool_name_lower:
            return False
    return True

323 

324 

def extract_tool_name(command: str) -> str:
    """Extract human-readable tool name from a shell command.

    This function extracts the primary tool name from complex shell commands
    for use in quality gate messaging and logging.

    Algorithm:
    1. Split the command on all shell operators (&&, ||, ;, |) and try each
       segment in order
    2. Parse via shlex.split; fallback to whitespace if parsing fails
    3. Skip env var assignments (tokens with = before command)
    4. Skip shell built-ins (export, set, cd)
    5. Strip path prefixes (/usr/bin/eslint -> eslint)
    6. Apply wrapper rules:
       - Single-token: npx, bunx, uvx, pipx -> skip wrapper, use next positional
       - Multi-token: python -m, uv run, poetry run -> skip sequence
       - Compound: go test, cargo clippy, npm test -> include subcommand
       - Script: npm run lint -> npm run:lint
    7. Return the first segment result that is a meaningful tool; otherwise
       the first non-empty segment result

    Known limitation: operators are located with plain string splitting, so
    an operator inside a quoted argument is treated as a real operator.

    Args:
        command: Shell command string.

    Returns:
        Human-readable tool name. For empty/malformed commands, returns
        the best-effort result (possibly empty string) with warning logged.

    Examples:
        >>> extract_tool_name("npx eslint .")
        'eslint'
        >>> extract_tool_name("uvx ruff check .")
        'ruff'
        >>> extract_tool_name("uv run pytest")
        'pytest'
        >>> extract_tool_name("go test ./...")
        'go test'
        >>> extract_tool_name("cargo clippy")
        'cargo clippy'
        >>> extract_tool_name("npm run lint")
        'npm run:lint'
    """
    if not command or not command.strip():
        logger.warning("Empty command provided to extract_tool_name")
        return ""

    command = command.strip()

    # Split on every operator kind rather than only the first kind present,
    # so mixed commands such as "export A=1; pytest && echo done" are broken
    # into all of their segments. "||" must be split before "|" so a logical
    # OR is not mistaken for two pipes.
    segments = [command]
    for operator in ("&&", "||", ";", "|"):
        segments = [part for segment in segments for part in segment.split(operator)]

    # Each segment is parsed exactly once. The first meaningful tool wins;
    # the first non-empty result is remembered as a fallback (for a command
    # without operators this is simply the command's own result).
    fallback = ""
    for segment in segments:
        segment = segment.strip()
        if not segment:
            continue
        result = _extract_from_tokens(_parse_command(segment))
        if not result:
            continue
        # Blank results (e.g. from a quoted " " token) can never be a
        # meaningful tool; skip the check for them.
        if result.strip() and _is_meaningful_tool(result):
            return result
        if not fallback:
            fallback = result

    if not fallback:
        logger.warning("Could not extract tool name from command: %r", command)

    return fallback

398 

399 return result