Coverage for little_loops / issue_history / doc_synthesis.py: 0%

150 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-03-18 16:18 -0500

1"""Documentation synthesis from completed issue history. 

2 

3Synthesizes architecture documentation from completed issues by scoring 

4relevance to a given topic, ordering chronologically by completion date, 

5and constructing a structured markdown document. 

6""" 

7 

8from __future__ import annotations 

9 

10import re 

11from datetime import date 

12from pathlib import Path 

13 

14from little_loops.issue_history.models import CompletedIssue 

15from little_loops.text_utils import extract_words, score_bm25 

16 

17 

def score_relevance(
    topic: str,
    issue: CompletedIssue,
    content: str,
    corpus_stats: dict | None = None,
    scoring: str = "intersection",
) -> float:
    """Rate how well a completed issue matches a search topic.

    The issue is represented by the words extracted from its ID, its
    filename stem (hyphens turned into spaces) and its raw content.

    Scoring modes:
        - "intersection" (default): share of topic words that also occur in
          the issue text.
        - "bm25": BM25 score squashed with ``x / (x + 1)``.
        - "hybrid": ``0.5 * intersection + 0.5 * squashed BM25``. Any mode
          string other than "intersection" or "bm25" also ends up here.

    Args:
        topic: Search topic string.
        issue: Completed issue to score; ``issue_id`` and ``path.stem`` are read.
        content: Raw file content of the issue.
        corpus_stats: Keyword arguments forwarded to ``score_bm25``. When
            missing or empty, the intersection score is returned regardless
            of ``scoring``.
        scoring: Scoring mode, see above.

    Returns:
        The relevance score; 0.0 when the topic yields no words or when a
        BM25-based mode finds no shared word.
    """
    wanted = extract_words(topic)
    if not wanted:
        return 0.0

    # The searchable text is the ID, the de-hyphenated filename stem and the body.
    stem_text = issue.path.stem.replace("-", " ")
    haystack = extract_words(f"{issue.issue_id} {stem_text} {content}")

    overlap = len(wanted & haystack) / len(wanted)

    # Without corpus statistics BM25 cannot be computed, so fall back to overlap.
    if not corpus_stats or scoring == "intersection":
        return overlap

    # No shared word at all: skip the BM25 call entirely.
    if overlap == 0.0:
        return 0.0

    raw = score_bm25(wanted, haystack, **corpus_stats)
    # x / (x + 1) maps positive scores into [0, 1) while preserving their order.
    squashed = raw / (raw + 1) if raw > 0 else 0.0

    if scoring == "bm25":
        return squashed
    return overlap * 0.5 + squashed * 0.5

70 

71 

def _compute_corpus_stats(candidates: list[tuple[CompletedIssue, str]]) -> dict:
    """Gather the corpus-level numbers that BM25 scoring needs.

    Each candidate is reduced to the word collection returned by
    ``extract_words`` for "<issue id> <filename stem> <content>", the same
    text that ``score_relevance`` builds for a single issue.

    Args:
        candidates: (issue, content) pairs that make up the corpus.

    Returns:
        A dict with three keys:
            - "doc_freq": word -> number of candidates containing that word
            - "avg_doc_len": mean size of the per-candidate word collections
            - "total_docs": number of candidates
        For an empty corpus these are ``{}``, ``0.0`` and ``0``.
    """
    vocabularies: list[set[str]] = [
        extract_words(f"{issue.issue_id} {issue.path.stem.replace('-', ' ')} {content}")
        for issue, content in candidates
    ]

    doc_count = len(vocabularies)
    if doc_count == 0:
        return {"doc_freq": {}, "avg_doc_len": 0.0, "total_docs": 0}

    doc_freq: dict[str, int] = {}
    for vocabulary in vocabularies:
        for term in vocabulary:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    combined_len = sum(len(vocabulary) for vocabulary in vocabularies)
    return {
        "doc_freq": doc_freq,
        "avg_doc_len": combined_len / doc_count,
        "total_docs": doc_count,
    }

102 

103 

104def _extract_section(content: str, heading: str) -> str: 

105 """Extract content under a markdown heading. 

106 

107 Args: 

108 content: Full markdown content 

109 heading: Heading text to find (without ##) 

110 

111 Returns: 

112 Section content (empty string if not found) 

113 """ 

114 pattern = rf"^##\s+{re.escape(heading)}\s*$" 

115 match = re.search(pattern, content, re.MULTILINE) 

116 if not match: 

117 return "" 

118 

119 start = match.end() 

120 # Find next heading of same or higher level 

121 next_heading = re.search(r"^##\s", content[start:], re.MULTILINE) 

122 if next_heading: 

123 end = start + next_heading.start() 

124 else: 

125 end = len(content) 

126 

127 return content[start:end].strip() 

128 

129 

130def _extract_title(content: str) -> str: 

131 """Extract the H1 title from issue content. 

132 

133 Args: 

134 content: Issue file content 

135 

136 Returns: 

137 Title text, or empty string if not found 

138 """ 

139 match = re.search(r"^#\s+(.+)$", content, re.MULTILINE) 

140 return match.group(1).strip() if match else "" 

141 

142 

def synthesize_docs(
    topic: str,
    issues: list[CompletedIssue],
    contents: dict[Path, str],
    format: str = "narrative",
    min_relevance: float = 0.5,
    since: date | None = None,
    issue_type: str | None = None,
    scoring: str = "intersection",
) -> str:
    """Build a markdown document from the completed issues relevant to a topic.

    Pipeline: drop issues without content or failing the type/date filters,
    score the remainder against the topic, keep those at or above
    ``min_relevance``, sort oldest-first (ties broken by higher score) and
    hand the result to the narrative or structured builder.

    Args:
        topic: Topic to search for
        issues: List of completed issues
        contents: Pre-loaded issue file contents (path -> content); issues
            with no entry or empty content are skipped
        format: "structured" selects the structured layout; any other value
            produces the narrative layout
        min_relevance: Minimum relevance score (inclusive) to keep an issue
        since: Drop issues whose completion date is earlier than this date;
            issues without a completion date are kept
        issue_type: When given, keep only issues whose ``issue_type`` equals it
        scoring: Relevance scoring mode passed to ``score_relevance``:
            "intersection" (default), "hybrid", or "bm25"

    Returns:
        The synthesized markdown, or a one-line "no issues found" message
        when nothing passes the filters
    """

    def _passes_filters(issue: CompletedIssue, text: str) -> bool:
        # Cheap metadata checks that run before any relevance scoring.
        if not text:
            return False
        if issue_type and issue.issue_type != issue_type:
            return False
        completed_before_cutoff = bool(
            since and issue.completed_date and issue.completed_date < since
        )
        return not completed_before_cutoff

    with_text = ((issue, contents.get(issue.path, "")) for issue in issues)
    candidates: list[tuple[CompletedIssue, str]] = [
        (issue, text) for issue, text in with_text if _passes_filters(issue, text)
    ]

    # Corpus statistics are only needed by the BM25-based modes.
    corpus_stats: dict | None = None
    if candidates and scoring != "intersection":
        corpus_stats = _compute_corpus_stats(candidates)

    scored: list[tuple[CompletedIssue, float, str]] = []
    for issue, text in candidates:
        relevance = score_relevance(
            topic, issue, text, corpus_stats=corpus_stats, scoring=scoring
        )
        if relevance >= min_relevance:
            scored.append((issue, relevance, text))

    if not scored:
        return f"No completed issues found matching topic: {topic}"

    # Oldest first; undated issues sort to the front via date.min, and
    # within a day the more relevant issue comes first.
    scored.sort(key=lambda entry: (entry[0].completed_date or date.min, -entry[1]))

    builder = build_structured_doc if format == "structured" else build_narrative_doc
    return builder(topic, scored)

209 

210 

def build_narrative_doc(
    topic: str,
    scored_issues: list[tuple[CompletedIssue, float, str]],
) -> str:
    """Render the issues as a chronological narrative in markdown.

    The document opens with the topic as H1 and a provenance line. Each issue
    then gets an H2 section (its H1 title, or its ID when it has none), a
    metadata line, and whichever of its Summary, Motivation and
    Implementation Notes / Resolution sections exist, closed by a horizontal
    rule. The relevance score is not shown.

    Args:
        topic: The topic being documented
        scored_issues: (issue, score, content) tuples in the order they
            should appear

    Returns:
        Markdown document string
    """
    out: list[str] = [
        f"# {topic}",
        "",
        f"*Synthesized from {len(scored_issues)} completed issue(s). Generated from issue history.*",
        "",
    ]

    for issue, _relevance, text in scored_issues:
        section_title = _extract_title(text) or issue.issue_id
        completed = issue.completed_date.isoformat() if issue.completed_date else "unknown"

        out += [
            f"## {section_title}",
            "",
            f"*{issue.issue_id} | Completed: {completed} | "
            f"Type: {issue.issue_type} | Priority: {issue.priority}*",
            "",
        ]

        summary = _extract_section(text, "Summary")
        if summary:
            out += [summary, ""]

        motivation = _extract_section(text, "Motivation")
        if motivation:
            out += [f"**Motivation:** {motivation}", ""]

        # "Resolution" is only consulted when "Implementation Notes" is absent or empty.
        implementation = _extract_section(text, "Implementation Notes") or _extract_section(
            text, "Resolution"
        )
        if implementation:
            out += [f"**Implementation:** {implementation}", ""]

        out += ["---", ""]

    return "\n".join(out)

274 

275 

def build_structured_doc(
    topic: str,
    scored_issues: list[tuple[CompletedIssue, float, str]],
) -> str:
    """Build a structured documentation document.

    Organized with a summary table followed by detailed sections,
    focusing on technical content rather than narrative flow.

    Layout:
        - H1 with the topic and a provenance line
        - "## Overview": one table row per issue (title, type, priority,
          completion date, relevance as a percentage)
        - "## Details": an H3 per issue with its Summary, its Expected
          Behavior (or Proposed Solution) and its Implementation Notes
          (or Resolution), each only when present

    Titles in the overview table are truncated to 60 characters and have
    ``|`` escaped, because an unescaped pipe would be read as a column
    separator and break the table. Titles in the details headings are
    emitted unchanged.

    Args:
        topic: The topic being documented
        scored_issues: List of (issue, score, content) tuples, pre-sorted

    Returns:
        Markdown document string
    """
    lines: list[str] = []

    lines.append(f"# {topic}")
    lines.append("")
    lines.append(
        f"*Synthesized from {len(scored_issues)} completed issue(s). Generated from issue history.*"
    )
    lines.append("")

    # Summary table
    lines.append("## Overview")
    lines.append("")
    lines.append("| Issue | Type | Priority | Completed | Relevance |")
    lines.append("|-------|------|----------|-----------|-----------|")
    for issue, score, content in scored_issues:
        title = _extract_title(content) or issue.issue_id
        # Truncate long titles for the table
        if len(title) > 60:
            title = title[:57] + "..."
        # Escape pipes that are not already escaped so the title stays in
        # one cell; done after truncation so the limit applies to the
        # visible text rather than the escaped form.
        cell_title = re.sub(r"(?<!\\)\|", r"\\|", title)
        date_str = issue.completed_date.isoformat() if issue.completed_date else "N/A"
        lines.append(
            f"| {cell_title} | {issue.issue_type} | {issue.priority} | {date_str} | {score:.0%} |"
        )
    lines.append("")

    # Detailed sections
    lines.append("## Details")
    lines.append("")

    for issue, _score, content in scored_issues:
        title = _extract_title(content) or issue.issue_id
        date_str = issue.completed_date.isoformat() if issue.completed_date else "unknown"

        lines.append(f"### {title}")
        lines.append("")
        lines.append(f"**ID:** {issue.issue_id} | **Completed:** {date_str}")
        lines.append("")

        summary = _extract_section(content, "Summary")
        if summary:
            lines.append(summary)
            lines.append("")

        # Expected behavior or proposed solution
        expected = _extract_section(content, "Expected Behavior")
        if not expected:
            expected = _extract_section(content, "Proposed Solution")
        if expected:
            lines.append(f"**Solution:** {expected}")
            lines.append("")

        # Implementation notes, falling back to the resolution section
        impl_notes = _extract_section(content, "Implementation Notes")
        if not impl_notes:
            impl_notes = _extract_section(content, "Resolution")
        if impl_notes:
            lines.append(f"**Implementation:** {impl_notes}")
            lines.append("")

    return "\n".join(lines)