Coverage for little_loops / link_checker.py: 0%
183 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-18 16:18 -0500
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-18 16:18 -0500
1"""Link checker for markdown documentation.
3Provides automated verification that links in markdown files are valid.
4Supports HTTP/HTTPS URL checking and internal file reference validation.
5"""
7from __future__ import annotations
9import json
10import re
11import urllib.error
12import urllib.request
13from concurrent.futures import ThreadPoolExecutor, as_completed
14from dataclasses import dataclass, field
15from pathlib import Path
# Markdown link patterns
# Inline links/images of the form [text](target): group 1 is the link text,
# group 2 the raw target. The target may contain one level of balanced
# parentheses, so URLs such as https://en.wikipedia.org/wiki/Foo_(bar) are
# captured whole instead of being cut off at their first ")".
MARKDOWN_LINK_PATTERN = re.compile(r"\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)")
# Bare http(s) URLs preceded by start of line, whitespace, a quote, "<" or "(";
# group 1 is the URL itself.
BARE_URL_PATTERN = re.compile(r'(?:^|[\s\'"<\(])((?:https?://)[^\s\'"<>)]+)', re.MULTILINE)

# Default ignore patterns (regular expressions matched against the URL):
# localhost, 127.0.0.1 and 0.0.0.0 over both http and https
DEFAULT_IGNORE_PATTERNS = [
    r"^http://localhost",
    r"^https://localhost",
    r"^http://127\.0\.0\.1",
    r"^https://127\.0\.0\.1",
    r"^http://0\.0\.0\.0",
    r"^https://0\.0\.0\.0",
]

# Files to check by default (relative to base directory)
DEFAULT_DOC_FILES = [
    "README.md",
    "CONTRIBUTING.md",
    "docs/**/*.md",
]
@dataclass
class LinkResult:
    """Outcome recorded for a single link occurrence.

    Attributes:
        url: The link target that was examined
        file: File in which the link was found
        line: Line number of the link within that file
        status: One of "valid", "broken", "timeout", "ignored" or "internal"
        error: Description of the failure for a broken link, otherwise None
        link_text: Text part of a markdown [text](url) link, None when not recorded
    """

    # Required fields, in positional order
    url: str
    file: str
    line: int
    status: str
    # Optional details
    error: str | None = None
    link_text: str | None = None
@dataclass
class LinkCheckResult:
    """Aggregated outcome of a link-checking run.

    Attributes:
        total_links: How many links were found in total
        valid_links: How many links were checked and found valid
        broken_links: How many links were found broken
        ignored_links: How many links were skipped as ignored
        internal_links: How many links are internal file references
        results: The individual per-link results
    """

    # Counters all start at zero
    total_links: int = 0
    valid_links: int = 0
    broken_links: int = 0
    ignored_links: int = 0
    internal_links: int = 0
    # Fresh list per instance
    results: list[LinkResult] = field(default_factory=list)

    @property
    def has_errors(self) -> bool:
        """Tell whether at least one broken link was counted."""
        broken_count = self.broken_links
        return broken_count > 0
# Optional quoted title at the end of a link target: [text](url "Title")
_LINK_TITLE_SUFFIX = re.compile(r"""\s+(?:"[^"]*"|'[^']*')$""")


def _link_destination(target: str) -> str:
    """Return the destination of a link target, without title or <...> wrapper."""
    destination = _LINK_TITLE_SUFFIX.sub("", target.strip())
    if len(destination) >= 2 and destination.startswith("<") and destination.endswith(">"):
        destination = destination[1:-1]
    return destination


def extract_links_from_markdown(content: str, file_path: str) -> list[tuple[str, str | None, int]]:
    """Extract links from markdown content.

    Two kinds of links are collected, line by line: markdown links
    ``[text](url)`` and bare http(s) URLs. For markdown links, a trailing
    quoted title (``[text](url "Title")``) and an enclosing ``<...>`` wrapper
    are removed so that only the destination itself is returned.

    Args:
        content: Markdown file content
        file_path: Path to the file (for context; not used by the extraction)

    Returns:
        List of (url, link_text, line_number) tuples; link_text is None for
        bare URLs and line numbers start at 1
    """
    links: list[tuple[str, str | None, int]] = []

    for line_num, line in enumerate(content.splitlines(), start=1):
        # Extract markdown links [text](url)
        for match in MARKDOWN_LINK_PATTERN.finditer(line):
            links.append((_link_destination(match.group(2)), match.group(1), line_num))

        # Extract bare URLs from what is left once the markdown links are
        # removed, so a URL already captured above is not reported twice
        line_without_md_links = MARKDOWN_LINK_PATTERN.sub("", line)
        for match in BARE_URL_PATTERN.finditer(line_without_md_links):
            # Trailing punctuation belongs to the sentence, not to the URL
            url = match.group(1).strip().rstrip(".,;:!?)]")
            links.append((url, None, line_num))

    return links
def is_internal_reference(url: str) -> bool:
    """Check if URL is an internal file reference.

    Internal references are in-page anchors (``#...``), explicit relative
    paths (``./...``, ``../...``) and markdown files, optionally followed by
    an anchor (``GUIDE.md``, ``GUIDE.md#section``). Absolute http(s) URLs are
    never internal, even when they point at a ``.md`` file.

    Args:
        url: URL to check

    Returns:
        True if internal reference, False otherwise
    """
    # A web URL that merely ends in .md (or contains ".md#") must still be
    # checked over HTTP, so it is ruled out before the suffix tests below
    if url.lower().startswith(("http://", "https://")):
        return False
    # Internal references start with # or ./ or ../
    if url.startswith(("#", "./", "../")):
        return True
    # Plain markdown file reference, with or without an anchor
    # (e.g. TROUBLESHOOTING.md#getting-help)
    return url.endswith(".md") or ".md#" in url
def should_ignore_url(url: str, ignore_patterns: list[str]) -> bool:
    """Decide whether a URL is covered by any of the ignore patterns.

    Args:
        url: URL to test
        ignore_patterns: Regex patterns; each is searched for within the URL

    Returns:
        True when at least one pattern matches, False otherwise
    """
    return any(re.search(candidate, url) is not None for candidate in ignore_patterns)
def check_url(url: str, timeout: int = 10) -> tuple[bool, str | None]:
    """Check if a URL is reachable.

    A HEAD request is tried first for efficiency. If the server rejects the
    HEAD method itself (HTTP 405 or 501), the check is repeated with GET
    before the link is declared broken.

    Args:
        url: URL to check
        timeout: Request timeout in seconds

    Returns:
        Tuple of (is_valid, error_message); error_message is None when the
        URL is valid. No exception is propagated: every failure is reported
        through the returned tuple.
    """
    headers = {"User-Agent": "little-loops-link-checker/1.0"}

    try:
        for method in ("HEAD", "GET"):
            req = urllib.request.Request(url, headers=headers, method=method)
            try:
                with urllib.request.urlopen(req, timeout=timeout) as response:
                    # Accept 2xx and 3xx status codes
                    if 200 <= response.status < 400:
                        return True, None
                    return False, f"HTTP {response.status}"
            except urllib.error.HTTPError as e:
                # "Method not allowed" / "not implemented" says nothing about
                # the resource itself, so fall back to GET once
                if method == "HEAD" and e.code in (405, 501):
                    continue
                return False, f"HTTP {e.code}"

    except urllib.error.URLError as e:
        # A timeout while connecting arrives wrapped in URLError; report it
        # the same way as a timeout raised directly
        if isinstance(e.reason, TimeoutError):
            return False, "Timeout"
        return False, f"Connection error: {e.reason}"
    except TimeoutError:
        return False, "Timeout"
    except Exception as e:
        # e.g. ValueError for a malformed URL
        return False, str(e)

    # Not reached: the GET attempt above always returns or raises
    return False, "No response"
def check_markdown_links(
    base_dir: Path,
    ignore_patterns: list[str] | None = None,
    timeout: int = 10,
    verbose: bool = False,
    max_workers: int = 10,
) -> LinkCheckResult:
    """Check all markdown files for broken links.

    Every ``*.md`` file below ``base_dir`` is scanned. Each link is classified
    as ignored, internal or HTTP(S); only HTTP(S) links are requested. Each
    distinct URL is requested once, however often it occurs, and the outcome
    is recorded for every occurrence. A file that cannot be processed is
    recorded as one broken entry (empty url, line 0) and counted in
    ``broken_links`` so that ``has_errors`` reflects it.

    Args:
        base_dir: Base directory to search
        ignore_patterns: List of regex patterns to ignore; defaults to
            DEFAULT_IGNORE_PATTERNS when None
        timeout: Request timeout in seconds
        verbose: Whether to show progress (accepted but currently unused)
        max_workers: Maximum concurrent HTTP requests

    Returns:
        LinkCheckResult with all findings
    """
    if ignore_patterns is None:
        ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy()

    result = LinkCheckResult()

    # Find all markdown files; sorted so they are processed in a stable order
    md_files = sorted(base_dir.rglob("*.md"))

    # Pass 1: Classify links, collect HTTP URLs for concurrent checking
    http_checks: list[tuple[str, str | None, int, str]] = []  # (url, link_text, line, file)

    for md_file in md_files:
        file_str = str(md_file.relative_to(base_dir))
        try:
            # Explicit encoding: the platform default is not reliable for docs
            content = md_file.read_text(encoding="utf-8")

            for url, link_text, line_num in extract_links_from_markdown(content, file_str):
                result.total_links += 1

                # Check if should ignore
                if should_ignore_url(url, ignore_patterns):
                    result.ignored_links += 1
                    status = "ignored"
                # Internal references, and anything that is not HTTP/HTTPS,
                # are recorded as internal and not requested
                elif is_internal_reference(url) or not url.startswith(("http://", "https://")):
                    result.internal_links += 1
                    status = "internal"
                else:
                    # Collect HTTP/HTTPS URLs for concurrent checking
                    http_checks.append((url, link_text, line_num, file_str))
                    continue

                result.results.append(
                    LinkResult(
                        url=url,
                        file=file_str,
                        line=line_num,
                        status=status,
                        link_text=link_text,
                    )
                )

        except Exception as e:
            # File read error - log as broken entry for this file and count
            # it, otherwise has_errors would stay False and hide the failure
            result.broken_links += 1
            result.results.append(
                LinkResult(
                    url="",
                    file=file_str,
                    line=0,
                    status="broken",
                    error=f"File read error: {e}",
                )
            )

    # Pass 2: Check HTTP URLs concurrently
    if http_checks:
        # One request per distinct URL, keeping first-seen order
        unique_urls = list(dict.fromkeys(url for url, _, _, _ in http_checks))
        outcomes: dict[str, tuple[bool, str | None]] = {}

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(check_url, url, timeout): url for url in unique_urls
            }
            for future in as_completed(future_to_url):
                outcomes[future_to_url[future]] = future.result()

        # Record in discovery order rather than completion order
        for url, link_text, line_num, file_str in http_checks:
            is_valid, error = outcomes[url]
            if is_valid:
                result.valid_links += 1
            else:
                result.broken_links += 1
            result.results.append(
                LinkResult(
                    url=url,
                    file=file_str,
                    line=line_num,
                    status="valid" if is_valid else "broken",
                    error=None if is_valid else error,
                    link_text=link_text,
                )
            )

    return result
def load_ignore_patterns(base_dir: Path) -> list[str]:
    """Load ignore patterns from .mlc.config.json.

    The default patterns are always included. Entries of the config's
    ``ignorePatterns`` list are added when they are a non-empty string, or an
    object whose ``pattern`` value is a non-empty string, and compile as a
    regular expression. Anything else is skipped. A missing, unreadable or
    malformed config file yields the defaults only.

    Args:
        base_dir: Base directory path

    Returns:
        List of ignore patterns
    """
    patterns = DEFAULT_IGNORE_PATTERNS.copy()

    config_file = base_dir / ".mlc.config.json"
    if not config_file.exists():
        return patterns

    try:
        with open(config_file, encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        # If config is invalid, use defaults. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return patterns

    # Valid JSON need not have the expected shape
    if not isinstance(config, dict):
        return patterns
    ignore_list = config.get("ignorePatterns", [])
    if not isinstance(ignore_list, list):
        return patterns

    # Extract ignore patterns
    for item in ignore_list:
        pattern = item.get("pattern", "") if isinstance(item, dict) else item
        if not isinstance(pattern, str) or not pattern:
            continue
        try:
            # A pattern that does not compile would make every later
            # re.search with it raise, so it is dropped here
            re.compile(pattern)
        except re.error:
            continue
        patterns.append(pattern)

    return patterns
def format_result_text(result: LinkCheckResult) -> str:
    """Render a link check result as plain text.

    Without broken links a single success line follows the header. Otherwise
    each broken link is listed with its location and error, followed by a
    summary of all counters.

    Args:
        result: Link check result

    Returns:
        Formatted text output
    """
    out = ["Documentation Link Check", "=" * 40]

    # Success case first: one line and done
    if not result.has_errors:
        out.append(
            f"✓ All {result.total_links} link(s) valid! "
            f"({result.internal_links} internal, {result.ignored_links} ignored)"
        )
        return "\n".join(out)

    out += [f"✗ Found {result.broken_links} broken link(s):", ""]

    broken_entries = [entry for entry in result.results if entry.status == "broken"]
    for entry in broken_entries:
        label = f"[{entry.link_text}]" if entry.link_text else ""
        out.append(f" {label}({entry.url})")
        out.append(f" at {entry.file}:{entry.line}")
        if entry.error:
            out.append(f" Error: {entry.error}")
        # Blank line separates one entry from the next
        out.append("")

    # Summary
    out.extend(
        [
            "Summary:",
            f" Total links: {result.total_links}",
            f" Valid: {result.valid_links}",
            f" Broken: {result.broken_links}",
            f" Internal refs: {result.internal_links}",
            f" Ignored: {result.ignored_links}",
        ]
    )

    return "\n".join(out)
def format_result_json(result: LinkCheckResult) -> str:
    """Serialize a link check result to a JSON document.

    The document holds the five counters, the ``has_errors`` flag and one
    object per individual result, indented by two spaces.

    Args:
        result: Link check result

    Returns:
        JSON string
    """
    counter_names = (
        "total_links",
        "valid_links",
        "broken_links",
        "ignored_links",
        "internal_links",
    )
    entry_names = ("url", "file", "line", "status", "error", "link_text")

    # Keys are inserted in the order they appear in the output
    payload = {name: getattr(result, name) for name in counter_names}
    payload["has_errors"] = result.has_errors
    payload["results"] = [
        {name: getattr(entry, name) for name in entry_names} for entry in result.results
    ]

    return json.dumps(payload, indent=2)
def format_result_markdown(result: LinkCheckResult) -> str:
    """Render a link check result as a Markdown report.

    The report starts with a summary of all counters, followed by either a
    table of broken links or a confirmation that all links are valid.

    Args:
        result: Link check result

    Returns:
        Markdown formatted string
    """
    doc = [
        "# Documentation Link Check",
        "",
        "## Summary",
        "",
        f"- **Total links**: {result.total_links}",
        f"- **Valid**: {result.valid_links}",
        f"- **Broken**: {result.broken_links}",
        f"- **Internal references**: {result.internal_links}",
        f"- **Ignored**: {result.ignored_links}",
        "",
    ]

    if result.has_errors:
        doc += [
            "## ❌ Broken Links",
            "",
            "| URL | File | Line | Error |",
            "|-----|------|------|-------|",
        ]
        for entry in result.results:
            if entry.status != "broken":
                continue
            # URLs longer than 60 characters are shortened for the table
            shown_url = entry.url if len(entry.url) <= 60 else entry.url[:60] + "..."
            shown_error = entry.error or "Unknown"
            doc.append(f"| `{shown_url}` | `{entry.file}` | {entry.line} | {shown_error} |")
    else:
        doc += [
            "## ✅ All Links Valid",
            "",
            f"All {result.total_links} links are valid "
            f"({result.internal_links} internal references, "
            f"{result.ignored_links} ignored).",
        ]

    return "\n".join(doc)