Coverage for little_loops / link_checker.py: 0%

183 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-03-18 16:18 -0500

1"""Link checker for markdown documentation. 

2 

3Provides automated verification that links in markdown files are valid. 

4Supports HTTP/HTTPS URL checking and internal file reference validation. 

5""" 

6 

7from __future__ import annotations 

8 

9import json 

10import re 

11import urllib.error 

12import urllib.request 

13from concurrent.futures import ThreadPoolExecutor, as_completed 

14from dataclasses import dataclass, field 

15from pathlib import Path 

16 

# Regular expressions used to find links within a single line of markdown.
# Inline links of the form [text](target): group 1 is the text, group 2 the target.
MARKDOWN_LINK_PATTERN = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Bare http(s) URLs at the start of a line or right after whitespace, a quote,
# "<" or "(": group 1 is the URL.
BARE_URL_PATTERN = re.compile(
    r'(?:^|[\s\'"<\(])((?:https?://)[^\s\'"<>)]+)',
    re.MULTILINE,
)

# Regexes for URLs that are skipped when the caller supplies no ignore list.
DEFAULT_IGNORE_PATTERNS = [
    r"^http://localhost",
    r"^https://localhost",
    r"^http://127\.0\.0\.1",
    r"^https://127\.0\.0\.1",
    r"^http://0\.0\.0\.0",
    r"^https://0\.0\.0\.0",
]

# Documentation files (glob patterns relative to the base directory) that are
# checked by default.
DEFAULT_DOC_FILES = [
    "README.md",
    "CONTRIBUTING.md",
    "docs/**/*.md",
]

37 

38 

@dataclass
class LinkResult:
    """Outcome recorded for a single link occurrence.

    Attributes:
        url: The link target that was examined.
        file: File in which the link was found.
        line: Line number of the link inside that file.
        status: Classification of the link ("valid", "broken", "timeout",
            "ignored", "internal").
        error: Description of the failure when the link is broken, else None.
        link_text: Text part of a markdown ``[text](url)`` link, when recorded.
    """

    url: str
    file: str
    line: int
    status: str
    error: str | None = None
    link_text: str | None = None

58 

59 

@dataclass
class LinkCheckResult:
    """Aggregate counters and per-link details for one link-check run.

    Attributes:
        total_links: How many links were found.
        valid_links: How many links were confirmed valid.
        broken_links: How many links were found broken.
        ignored_links: How many links were skipped via ignore patterns.
        internal_links: How many links are internal file references.
        results: The individual ``LinkResult`` entries.
    """

    total_links: int = 0
    valid_links: int = 0
    broken_links: int = 0
    ignored_links: int = 0
    internal_links: int = 0
    # default_factory gives every instance its own list.
    results: list[LinkResult] = field(default_factory=list)

    @property
    def has_errors(self) -> bool:
        """Return True when at least one broken link has been counted."""
        return self.broken_links > 0

84 

85 

def extract_links_from_markdown(content: str, file_path: str) -> list[tuple[str, str | None, int]]:
    """Extract links from markdown content.

    Each line is scanned twice: once for inline ``[text](target)`` links and
    once, with those links removed, for bare ``http(s)://`` URLs.

    Args:
        content: Markdown file content
        file_path: Path to the file (for context; not used by the extraction)

    Returns:
        List of (url, link_text, line_number) tuples. ``link_text`` is None
        for bare URLs; line numbers start at 1.
    """
    links: list[tuple[str, str | None, int]] = []

    for line_num, line in enumerate(content.splitlines(), start=1):
        # Inline markdown links: [text](target)
        for match in MARKDOWN_LINK_PATTERN.finditer(line):
            target = match.group(2).strip()
            # A link may carry an optional quoted title after the destination,
            # e.g. [docs](https://example.com "Docs"). The title is not part of
            # the URL, so drop it; otherwise the link is checked with the title
            # attached and reported as broken.
            target = re.sub(r"""\s+(?:"[^"]*"|'[^']*')$""", "", target)
            links.append((target, match.group(1), line_num))

        # Bare URLs: blank out the markdown links first so their targets are
        # not reported a second time.
        remainder = MARKDOWN_LINK_PATTERN.sub("", line)
        for match in BARE_URL_PATTERN.finditer(remainder):
            # Sentence punctuation directly after a URL is not part of it.
            bare_url = match.group(1).strip().rstrip(".,;:!?)]")
            links.append((bare_url, None, line_num))

    return links

116 

117 

def is_internal_reference(url: str) -> bool:
    """Check if URL is an internal file reference.

    A reference is internal when it is an anchor (``#...``), an explicitly
    relative path (``./`` or ``../``), or a markdown file reference with or
    without an anchor (``GUIDE.md``, ``GUIDE.md#section``).

    Args:
        url: URL to check

    Returns:
        True if internal reference, False otherwise
    """
    # An absolute http(s) URL is never internal, even when its path ends in
    # ".md" (for example a README on a code-hosting site); without this guard
    # such links would be skipped instead of checked.
    if url.startswith(("http://", "https://")):
        return False
    # Anchors and explicitly relative paths
    if url.startswith(("#", "./", "../")):
        return True
    # Plain markdown file reference, optionally followed by an anchor
    return url.endswith(".md") or ".md#" in url

137 

138 

def should_ignore_url(url: str, ignore_patterns: list[str]) -> bool:
    """Tell whether a URL matches any of the given ignore patterns.

    Args:
        url: URL to test
        ignore_patterns: Regex patterns, each searched for anywhere in the URL

    Returns:
        True when at least one pattern matches, False otherwise
    """
    return any(re.search(candidate, url) for candidate in ignore_patterns)

153 

154 

def check_url(url: str, timeout: int = 10) -> tuple[bool, str | None]:
    """Check if a URL is reachable.

    Sends a single HEAD request and never raises: every failure is turned
    into an error message.

    Args:
        url: URL to check
        timeout: Request timeout in seconds

    Returns:
        Tuple of (is_valid, error_message); error_message is None when the
        URL is valid.
    """
    try:
        # HEAD avoids downloading the body; the user agent identifies the checker.
        req = urllib.request.Request(
            url,
            headers={"User-Agent": "little-loops-link-checker/1.0"},
            method="HEAD",
        )

        with urllib.request.urlopen(req, timeout=timeout) as response:
            # Accept 2xx and 3xx status codes
            if 200 <= response.status < 400:
                return True, None
            return False, f"HTTP {response.status}"

    except urllib.error.HTTPError as e:
        # Must come before URLError: HTTPError is a subclass of it.
        return False, f"HTTP {e.code}"
    except urllib.error.URLError as e:
        # A timeout can arrive wrapped in URLError; report it the same way as
        # a timeout that is raised directly.
        if isinstance(e.reason, TimeoutError):
            return False, "Timeout"
        return False, f"Connection error: {e.reason}"
    except TimeoutError:
        return False, "Timeout"
    except Exception as e:
        # Per-URL boundary: anything else (e.g. a malformed URL) is reported
        # as that link's error instead of aborting the whole run.
        return False, str(e)

185 

186 

def check_markdown_links(
    base_dir: Path,
    ignore_patterns: list[str] | None = None,
    timeout: int = 10,
    verbose: bool = False,
    max_workers: int = 10,
) -> LinkCheckResult:
    """Check all markdown files for broken links.

    Runs in two passes: first every ``*.md`` file under ``base_dir`` is read
    and its links are classified as ignored, internal, or HTTP(S); then the
    HTTP(S) URLs are checked concurrently. Internal references are only
    recorded, they are not verified here.

    Args:
        base_dir: Base directory to search
        ignore_patterns: List of regex patterns to ignore; defaults to
            DEFAULT_IGNORE_PATTERNS when None
        timeout: Request timeout in seconds
        verbose: Whether to show progress (currently not used by this function)
        max_workers: Maximum concurrent HTTP requests

    Returns:
        LinkCheckResult with all findings
    """
    if ignore_patterns is None:
        ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy()

    result = LinkCheckResult()

    # Sorted so the report does not depend on filesystem traversal order.
    md_files = sorted(base_dir.rglob("*.md"))

    # Pass 1: Classify links, collect HTTP URLs for concurrent checking
    http_checks: list[tuple[str, str | None, int, str]] = []  # (url, link_text, line, file)

    for md_file in md_files:
        file_str = str(md_file.relative_to(base_dir))
        try:
            # Decode explicitly as UTF-8 rather than with the platform default;
            # undecodable bytes are replaced so one stray byte does not hide
            # the rest of the file's links.
            content = md_file.read_text(encoding="utf-8", errors="replace")

            for url, link_text, line_num in extract_links_from_markdown(content, file_str):
                result.total_links += 1

                if should_ignore_url(url, ignore_patterns):
                    result.ignored_links += 1
                    status = "ignored"
                elif is_internal_reference(url) or not url.startswith(("http://", "https://")):
                    # Only HTTP/HTTPS URLs are checked; everything else
                    # (mailto:, relative paths, ...) is treated as internal.
                    result.internal_links += 1
                    status = "internal"
                else:
                    http_checks.append((url, link_text, line_num, file_str))
                    continue

                result.results.append(
                    LinkResult(
                        url=url,
                        file=file_str,
                        line=line_num,
                        status=status,
                        link_text=link_text,
                    )
                )

        except Exception as e:
            # Best effort per file: record the failure and carry on with the
            # remaining files. It is counted as broken so that has_errors and
            # the formatters surface it.
            result.broken_links += 1
            result.results.append(
                LinkResult(
                    url="",
                    file=file_str,
                    line=0,
                    status="broken",
                    error=f"File read error: {e}",
                )
            )

    # Pass 2: Check HTTP URLs concurrently
    if http_checks:
        # Request each distinct URL once, however many times it is linked.
        unique_urls = list(dict.fromkeys(url for url, _, _, _ in http_checks))
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {url: executor.submit(check_url, url, timeout) for url in unique_urls}
        # Leaving the with-block waits for every request to finish.

        # Report in discovery order so repeated runs give the same output.
        for url, link_text, line_num, file_str in http_checks:
            is_valid, error = futures[url].result()

            if is_valid:
                result.valid_links += 1
                result.results.append(
                    LinkResult(
                        url=url,
                        file=file_str,
                        line=line_num,
                        status="valid",
                        link_text=link_text,
                    )
                )
            else:
                result.broken_links += 1
                result.results.append(
                    LinkResult(
                        url=url,
                        file=file_str,
                        line=line_num,
                        status="broken",
                        error=error,
                        link_text=link_text,
                    )
                )

    return result

321 

322 

def load_ignore_patterns(base_dir: Path) -> list[str]:
    """Load ignore patterns from .mlc.config.json.

    The defaults are always included. Entries of the config's
    ``ignorePatterns`` list may be plain strings or objects with a
    ``pattern`` key. A missing, unreadable or malformed config and any
    unusable entry are skipped silently, so the result can always be used
    for matching.

    Args:
        base_dir: Base directory path

    Returns:
        List of ignore patterns
    """
    patterns = DEFAULT_IGNORE_PATTERNS.copy()

    config_file = base_dir / ".mlc.config.json"
    try:
        with open(config_file, encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        # Missing or unreadable file, invalid JSON or invalid UTF-8: use defaults.
        # (JSONDecodeError and UnicodeDecodeError are both ValueError.)
        return patterns

    # Valid JSON can still have the wrong shape (e.g. a top-level list).
    ignore_list = config.get("ignorePatterns") if isinstance(config, dict) else None
    if not isinstance(ignore_list, list):
        return patterns

    for item in ignore_list:
        pattern = item.get("pattern") if isinstance(item, dict) else item
        if not isinstance(pattern, str) or not pattern:
            continue
        try:
            # Patterns are later passed to re.search; drop any that do not
            # compile so a bad entry cannot break the link check.
            re.compile(pattern)
        except re.error:
            continue
        patterns.append(pattern)

    return patterns

360 

361 

def format_result_text(result: LinkCheckResult) -> str:
    """Render a link check result as a plain-text report.

    Args:
        result: Link check result

    Returns:
        Report text: a one-line success message, or the list of broken links
        followed by a summary of the counters
    """
    report = ["Documentation Link Check", "=" * 40]

    # Success: a single line is all that is needed.
    if not result.has_errors:
        report.append(
            f"✓ All {result.total_links} link(s) valid! "
            f"({result.internal_links} internal, {result.ignored_links} ignored)"
        )
        return "\n".join(report)

    report += [f"✗ Found {result.broken_links} broken link(s):", ""]

    for entry in result.results:
        if entry.status != "broken":
            continue
        label = f"[{entry.link_text}]" if entry.link_text else ""
        report.append(f" {label}({entry.url})")
        report.append(f" at {entry.file}:{entry.line}")
        if entry.error:
            report.append(f" Error: {entry.error}")
        report.append("")

    # Summary
    report.extend(
        [
            "Summary:",
            f" Total links: {result.total_links}",
            f" Valid: {result.valid_links}",
            f" Broken: {result.broken_links}",
            f" Internal refs: {result.internal_links}",
            f" Ignored: {result.ignored_links}",
        ]
    )

    return "\n".join(report)

401 

402 

def format_result_json(result: LinkCheckResult) -> str:
    """Serialize a link check result to an indented JSON document.

    Args:
        result: Link check result

    Returns:
        JSON string holding the counters, the ``has_errors`` flag and one
        object per recorded link
    """
    summary_keys = (
        "total_links",
        "valid_links",
        "broken_links",
        "ignored_links",
        "internal_links",
        "has_errors",
    )
    entry_keys = ("url", "file", "line", "status", "error", "link_text")

    # Dicts keep insertion order, so the key order of the output is fixed.
    payload = {key: getattr(result, key) for key in summary_keys}
    payload["results"] = [
        {key: getattr(entry, key) for key in entry_keys} for entry in result.results
    ]

    return json.dumps(payload, indent=2)

433 

434 

def format_result_markdown(result: LinkCheckResult) -> str:
    """Format link check result as Markdown.

    The report has a summary list of the counters, followed by either a
    table of the broken links or a short confirmation that all links are
    valid.

    Args:
        result: Link check result

    Returns:
        Markdown formatted string
    """

    def _cell(text: str) -> str:
        # Keep a value inside one table cell: an unescaped "|" would start a
        # new column and a line break would end the row.
        return " ".join(text.replace("|", "\\|").splitlines())

    lines = ["# Documentation Link Check", ""]
    lines.append("## Summary")
    lines.append("")
    lines.append(f"- **Total links**: {result.total_links}")
    lines.append(f"- **Valid**: {result.valid_links}")
    lines.append(f"- **Broken**: {result.broken_links}")
    lines.append(f"- **Internal references**: {result.internal_links}")
    lines.append(f"- **Ignored**: {result.ignored_links}")
    lines.append("")

    if result.has_errors:
        lines.append("## ❌ Broken Links")
        lines.append("")
        lines.append("| URL | File | Line | Error |")
        lines.append("|-----|------|------|-------|")

        for r in result.results:
            if r.status == "broken":
                # Long URLs are cut to 60 characters to keep the table readable.
                url_display = r.url[:60] + "..." if len(r.url) > 60 else r.url
                error_display = r.error or "Unknown"
                lines.append(
                    f"| `{_cell(url_display)}` | `{_cell(r.file)}` "
                    f"| {r.line} | {_cell(error_display)} |"
                )
    else:
        lines.append("## ✅ All Links Valid")
        lines.append("")
        lines.append(
            f"All {result.total_links} links are valid "
            f"({result.internal_links} internal references, "
            f"{result.ignored_links} ignored)."
        )

    return "\n".join(lines)