Coverage for src / core / tool_name_extractor.py: 95%
134 statements
« prev ^ index » next coverage.py v7.13.0, created at 2026-01-04 04:43 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2026-01-04 04:43 +0000
1"""Extract human-readable tool names from shell commands.
3This module provides a utility function to extract the primary tool name from
4complex shell commands for use in quality gate messaging, lint caching, and
5logging. For example:
6- "npx eslint ." -> "eslint"
7- "uvx ruff check ." -> "ruff"
8- "uv run pytest" -> "pytest"
10This module is in src.core because it has no dependencies on domain/pipeline/
11orchestration/infra layers and is used by multiple layers.
12"""
14from __future__ import annotations
16import logging
17import shlex
19logger = logging.getLogger(__name__)
21# Wrappers where we skip the wrapper and use the next positional argument
22# Note: pnpm and yarn are NOT here because they have compound command handling
23_SINGLE_TOKEN_WRAPPERS: frozenset[str] = frozenset({"npx", "bunx", "uvx", "pipx"})
25# Multi-token wrapper sequences where we skip the entire sequence
26# Format: (first_token, second_token) -> skip both tokens
27_MULTI_TOKEN_WRAPPERS: frozenset[tuple[str, str]] = frozenset(
28 {
29 ("python", "-m"),
30 ("python3", "-m"),
31 ("uv", "run"),
32 ("poetry", "run"),
33 ("pipx", "run"),
34 }
35)
37# Compound commands where we include the subcommand in the tool name
38# Format: base_command -> set of subcommands that form compound tools
39_COMPOUND_COMMANDS: dict[str, frozenset[str]] = {
40 "go": frozenset({"test", "build", "vet", "fmt", "mod", "generate"}),
41 "cargo": frozenset({"clippy", "test", "build", "check", "fmt", "bench"}),
42 "npm": frozenset({"test", "run"}),
43 "pnpm": frozenset({"test", "run"}),
44 "yarn": frozenset({"test", "run"}),
45}
47# Shell built-ins to skip (these appear before a real command)
48_SHELL_BUILTINS: frozenset[str] = frozenset(
49 {
50 "export",
51 "set",
52 "unset",
53 "source",
54 ".",
55 "eval",
56 "exec",
57 "cd",
58 "pushd",
59 "popd",
60 "alias",
61 "unalias",
62 "declare",
63 "local",
64 "readonly",
65 "typeset",
66 }
67)
69# Built-ins that take path/file arguments (source, ., cd, pushd, popd)
70_PATH_BUILTINS: frozenset[str] = frozenset({"source", ".", "cd", "pushd", "popd"})
72# Built-ins whose arguments are not commands (skip the rest of the segment)
73# export and set operate on variable assignments or shell options/positional
74# parameters rather than executing commands passed as arguments.
75_SKIP_REST_BUILTINS: frozenset[str] = frozenset(
76 {
77 "export",
78 "set",
79 "unset",
80 "alias",
81 "unalias",
82 "declare",
83 "local",
84 "readonly",
85 "typeset",
86 }
87)
89# Wrapper flags that consume a value (skip flag + value)
90_WRAPPER_VALUE_FLAGS: frozenset[str] = frozenset(
91 {
92 "-p",
93 "--package",
94 "--from",
95 "--extra",
96 "--with",
97 # uv run flags that take a value
98 "--group",
99 "--only-group",
100 "--no-group",
101 "--python",
102 "-c",
103 "--directory",
104 "--env-file",
105 "--config-file",
106 }
107)
109# Commands that are typically setup/shell commands, not actual tools
110# When these appear in a multi-segment command (with && or ;), prefer later segments
111_SETUP_COMMANDS: frozenset[str] = frozenset({"cd", "echo", "install"})
114def _is_env_assignment(token: str) -> bool:
115 """Check if a token is an environment variable assignment.
117 Args:
118 token: Token to check.
120 Returns:
121 True if token looks like VAR=value assignment.
122 """
123 # Must have = and the part before = must be a valid identifier
124 if "=" not in token:
125 return False
126 name, _, _ = token.partition("=")
127 # Empty name or starts with digit or has special chars isn't a valid identifier
128 if not name or name[0].isdigit():
129 return False
130 return all(c.isalnum() or c == "_" for c in name)
133def _strip_path_prefix(cmd: str) -> str:
134 """Strip path prefixes from command names.
136 Args:
137 cmd: Command that might have a path prefix.
139 Returns:
140 Just the base command name.
141 """
142 # Split on / and take the last part
143 if "/" in cmd:
144 return cmd.rsplit("/", 1)[-1]
145 return cmd
148def _parse_command(command: str) -> list[str]:
149 """Parse a command string into tokens.
151 Uses shlex for proper shell parsing, falls back to whitespace
152 splitting if shlex fails.
154 Args:
155 command: Shell command string.
157 Returns:
158 List of parsed tokens.
159 """
160 try:
161 return shlex.split(command)
162 except ValueError:
163 # Fallback to simple whitespace splitting for malformed commands
164 logger.warning(
165 "shlex.split failed for command, using whitespace split: %r", command
166 )
167 return command.split()
def _skip_builtin_arguments(tokens: list[str], idx: int, builtin: str) -> int:
    """Advance past the arguments that belong to a shell built-in.

    The amount skipped depends on the built-in:
    - members of ``_SKIP_REST_BUILTINS`` (export, set, ...): every remaining
      token, since their arguments are assignments, options or positional
      parameters rather than commands
    - members of ``_PATH_BUILTINS`` (source, ., cd, ...): one path argument,
      if a token is present
    - anything else: nothing

    Args:
        tokens: Parsed command tokens.
        idx: Index of the first token after the built-in.
        builtin: The built-in command name.

    Returns:
        The index of the first token after the built-in's arguments.
    """
    total = len(tokens)
    if builtin in _SKIP_REST_BUILTINS:
        return total
    if builtin in _PATH_BUILTINS and idx < total:
        return idx + 1
    return idx
198def _skip_wrapper_flags(tokens: list[str], idx: int) -> int:
199 """Skip wrapper flags (and their values) to find the actual tool token."""
200 while idx < len(tokens):
201 token = tokens[idx]
202 if not token.startswith("-"):
203 return idx
204 token_lower = token.lower()
205 if token_lower in _WRAPPER_VALUE_FLAGS:
206 idx += 2 # Skip flag and its value
207 continue
208 idx += 1
209 return idx
def _extract_from_tokens(tokens: list[str]) -> str:
    """Derive the tool name from already-parsed command tokens.

    Matching against the module's wrapper, built-in and compound-command
    tables is case-insensitive, so "CARGO clippy" or "NPX eslint" are
    recognized. Every value this function returns is lowercase.

    Args:
        tokens: Parsed command tokens.

    Returns:
        The extracted tool name, or an empty string when the tokens hold only
        environment assignments and/or shell built-ins (or are empty).
    """
    total = len(tokens)

    # Leading VAR=value assignments are not the tool.
    pos = 0
    while pos < total and _is_env_assignment(tokens[pos]):
        pos += 1
    if pos >= total:
        return ""

    name = _strip_path_prefix(tokens[pos])
    key = name.lower()

    # Step over shell built-ins together with whatever arguments they consume.
    while key in _SHELL_BUILTINS:
        pos = _skip_builtin_arguments(tokens, pos + 1, key)
        if pos >= total:
            return ""
        name = _strip_path_prefix(tokens[pos])
        key = name.lower()

    # Two-token wrappers such as "uv run" or "python -m".
    if pos + 1 < total and (key, tokens[pos + 1].lower()) in _MULTI_TOKEN_WRAPPERS:
        wrapper_arg = tokens[pos + 1]
        pos = _skip_wrapper_flags(tokens, pos + 2)
        if pos >= total:
            # Nothing follows the wrapper: report the wrapper itself.
            logger.warning("Wrapper %s %s without following command", name, wrapper_arg)
            return key
        name = _strip_path_prefix(tokens[pos])
        key = name.lower()

    # Compound commands are checked before single-token wrappers so that
    # npm, pnpm and yarn are treated as compound commands.
    subcommands = _COMPOUND_COMMANDS.get(key)
    if subcommands is not None and pos + 1 < total:
        sub = tokens[pos + 1].lower()
        # "npm run lint" style invocations become "npm run:lint".
        if sub == "run" and key in ("npm", "pnpm", "yarn"):
            if pos + 2 < total:
                return f"{key} run:{tokens[pos + 2].lower()}"
            return f"{key} run"
        if sub in subcommands:
            return f"{key} {sub}"

    # Single-token wrappers such as npx: the tool is the next positional token.
    if key in _SINGLE_TOKEN_WRAPPERS:
        pos = _skip_wrapper_flags(tokens, pos + 1)
        if pos >= total:
            logger.warning("Wrapper %s without following command", name)
            return key
        name = _strip_path_prefix(tokens[pos])

    return name.lower()
def _is_meaningful_tool(tool_name: str) -> bool:
    """Decide whether an extracted tool name is worth reporting.

    Setup-style commands (those in ``_SETUP_COMMANDS``, a bare "npm", or any
    npm name containing "install") are not considered the tool being run.
    All comparisons are case-insensitive.

    Args:
        tool_name: Extracted tool name.

    Returns:
        True if the tool name is meaningful, False otherwise (including for
        an empty name).
    """
    if not tool_name:
        return False

    # The base command is the first whitespace-separated word.
    base = tool_name.split()[0].lower()
    if base in _SETUP_COMMANDS:
        return False
    if base != "npm":
        return True

    # For npm, only a compound name that is not an install counts.
    lowered = tool_name.lower()
    return lowered != "npm" and "install" not in lowered
def _split_outside_quotes(command: str, operator: str) -> list[str]:
    """Split ``command`` on ``operator`` wherever it occurs outside quotes.

    Single quotes, double quotes and backslash escapes are tracked so that an
    operator inside a quoted string (for example the ``;`` in
    ``FOO="a;b" pytest``) or directly after a backslash does not split the
    command.

    Args:
        command: Shell command string.
        operator: Operator text to split on (for example ``"&&"``).

    Returns:
        The pieces between unquoted operator occurrences, in order. When the
        command ends inside an unterminated quote, the result of a plain
        ``command.split(operator)`` is returned instead, keeping the previous
        best-effort behavior for malformed input.
    """
    segments: list[str] = []
    current: list[str] = []
    quote = ""  # The quote character currently open, or "" when unquoted.
    length = len(command)
    width = len(operator)
    i = 0
    while i < length:
        char = command[i]
        if quote:
            # Inside double quotes a backslash protects the next character;
            # inside single quotes nothing is special except the closing quote.
            if char == "\\" and quote == '"' and i + 1 < length:
                current.append(command[i : i + 2])
                i += 2
                continue
            if char == quote:
                quote = ""
            current.append(char)
            i += 1
        elif char == "\\" and i + 1 < length:
            # An escaped character outside quotes is never an operator.
            current.append(command[i : i + 2])
            i += 2
        elif char in ("'", '"'):
            quote = char
            current.append(char)
            i += 1
        elif command.startswith(operator, i):
            segments.append("".join(current))
            current = []
            i += width
        else:
            current.append(char)
            i += 1

    if quote:
        # Unbalanced quoting: quote tracking is unreliable, split naively.
        return command.split(operator)

    segments.append("".join(current))
    return segments


def extract_tool_name(command: str) -> str:
    """Extract human-readable tool name from a shell command.

    This function extracts the primary tool name from complex shell commands
    for use in quality gate messaging and logging.

    Algorithm:
    1. Handle shell operators (&&, ||, ;, |), tried in that order, by splitting
       the command on the operator wherever it appears outside quotes and
       examining each segment
    2. Parse via shlex.split; fallback to whitespace if parsing fails
    3. Skip env var assignments (tokens with = before command)
    4. Skip shell built-ins (export, set, cd)
    5. Strip path prefixes (/usr/bin/eslint -> eslint)
    6. Apply wrapper rules:
       - Single-token: npx, bunx, uvx, pipx -> skip wrapper, use next positional
       - Multi-token: python -m, uv run, poetry run -> skip sequence
       - Compound: go test, cargo clippy, npm test -> include subcommand
       - Script: npm run lint -> npm run:lint
    7. Fallback: return first token

    Args:
        command: Shell command string.

    Returns:
        Human-readable tool name. For empty/malformed commands, returns
        the best-effort result (possibly empty string) with warning logged.

    Examples:
        >>> extract_tool_name("npx eslint .")
        'eslint'
        >>> extract_tool_name("uvx ruff check .")
        'ruff'
        >>> extract_tool_name("uv run pytest")
        'pytest'
        >>> extract_tool_name("go test ./...")
        'go test'
        >>> extract_tool_name("cargo clippy")
        'cargo clippy'
        >>> extract_tool_name("npm run lint")
        'npm run:lint'
    """
    if not command or not command.strip():
        logger.warning("Empty command provided to extract_tool_name")
        return ""

    command = command.strip()

    for operator in ("&&", "||", ";", "|"):
        if operator not in command:
            continue

        # Remember the first non-empty result in case no segment turns out to
        # be meaningful; each segment is parsed only once.
        fallback = ""
        for raw_segment in _split_outside_quotes(command, operator):
            segment = raw_segment.strip()
            if not segment:
                continue
            result = _extract_from_tokens(_parse_command(segment))
            if not result:
                continue
            if _is_meaningful_tool(result):
                return result
            if not fallback:
                fallback = result
        if fallback:
            return fallback

    # No operators or operators didn't yield a result - parse directly
    result = _extract_from_tokens(_parse_command(command))

    if not result:
        logger.warning("Could not extract tool name from command: %r", command)

    return result