Coverage for little_loops / issue_history / doc_synthesis.py: 0%
150 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-18 16:18 -0500
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-18 16:18 -0500
1"""Documentation synthesis from completed issue history.
3Synthesizes architecture documentation from completed issues by scoring
4relevance to a given topic, ordering chronologically by completion date,
5and constructing a structured markdown document.
6"""
8from __future__ import annotations
10import re
11from datetime import date
12from pathlib import Path
14from little_loops.issue_history.models import CompletedIssue
15from little_loops.text_utils import extract_words, score_bm25
def score_relevance(
    topic: str,
    issue: CompletedIssue,
    content: str,
    corpus_stats: dict | None = None,
    scoring: str = "intersection",
) -> float:
    """Rate how closely a completed issue matches a topic.

    The issue is represented by the words extracted from its ID, its filename
    stem (hyphens turned into spaces) and its raw content. Three modes exist:

    - "intersection" (default): share of topic words found in the issue.
    - "bm25": BM25 score squashed with ``x / (x + 1)``.
    - "hybrid": half intersection, half squashed BM25.

    Any ``scoring`` value other than "intersection" or "bm25" is handled as
    "hybrid". Whenever ``corpus_stats`` is missing or empty, the plain
    intersection score is returned regardless of ``scoring``.

    Args:
        topic: Search topic string
        issue: Completed issue to score
        content: Raw file content of the issue
        corpus_stats: Keyword arguments forwarded to ``score_bm25``
            (doc_freq, avg_doc_len, total_docs)
        scoring: Scoring mode — "intersection", "hybrid", or "bm25"

    Returns:
        Relevance score; 0.0 when the topic yields no words or, in the
        BM25-based modes, when no topic word occurs in the issue
    """
    wanted = extract_words(topic)
    if not wanted:
        return 0.0

    # Searchable text: issue ID + de-hyphenated filename stem + body
    stem_text = issue.path.stem.replace("-", " ")
    haystack = extract_words(f"{issue.issue_id} {stem_text} {content}")

    overlap = len(wanted & haystack) / len(wanted)

    uses_bm25 = scoring != "intersection" and bool(corpus_stats)
    if not uses_bm25:
        return overlap

    # Nothing in common: skip the BM25 computation entirely
    if overlap == 0.0:
        return 0.0

    bm25 = score_bm25(wanted, haystack, **corpus_stats)
    # x / (x + 1) maps positive scores into [0, 1) monotonically
    squashed = bm25 / (bm25 + 1) if bm25 > 0 else 0.0

    if scoring == "bm25":
        return squashed
    # Everything else is the hybrid blend with equal weights
    return overlap * 0.5 + squashed * 0.5
72def _compute_corpus_stats(candidates: list[tuple[CompletedIssue, str]]) -> dict:
73 """Compute BM25 corpus statistics over a set of candidate issues.
75 Args:
76 candidates: List of (issue, content) tuples
78 Returns:
79 dict with keys: doc_freq (dict[str, int]), avg_doc_len (float), total_docs (int)
80 """
81 doc_words_list: list[set[str]] = []
82 for issue, content in candidates:
83 issue_text = f"{issue.issue_id} {issue.path.stem.replace('-', ' ')} {content}"
84 doc_words_list.append(extract_words(issue_text))
86 total_docs = len(doc_words_list)
87 if total_docs == 0:
88 return {"doc_freq": {}, "avg_doc_len": 0.0, "total_docs": 0}
90 doc_freq: dict[str, int] = {}
91 total_len = 0
92 for words in doc_words_list:
93 total_len += len(words)
94 for word in words:
95 doc_freq[word] = doc_freq.get(word, 0) + 1
97 return {
98 "doc_freq": doc_freq,
99 "avg_doc_len": total_len / total_docs,
100 "total_docs": total_docs,
101 }
104def _extract_section(content: str, heading: str) -> str:
105 """Extract content under a markdown heading.
107 Args:
108 content: Full markdown content
109 heading: Heading text to find (without ##)
111 Returns:
112 Section content (empty string if not found)
113 """
114 pattern = rf"^##\s+{re.escape(heading)}\s*$"
115 match = re.search(pattern, content, re.MULTILINE)
116 if not match:
117 return ""
119 start = match.end()
120 # Find next heading of same or higher level
121 next_heading = re.search(r"^##\s", content[start:], re.MULTILINE)
122 if next_heading:
123 end = start + next_heading.start()
124 else:
125 end = len(content)
127 return content[start:end].strip()
130def _extract_title(content: str) -> str:
131 """Extract the H1 title from issue content.
133 Args:
134 content: Issue file content
136 Returns:
137 Title text, or empty string if not found
138 """
139 match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
140 return match.group(1).strip() if match else ""
def synthesize_docs(
    topic: str,
    issues: list[CompletedIssue],
    contents: dict[Path, str],
    format: str = "narrative",
    min_relevance: float = 0.5,
    since: date | None = None,
    issue_type: str | None = None,
    scoring: str = "intersection",
) -> str:
    """Produce a markdown document from completed issues that match a topic.

    Issues are pre-filtered by content availability, type and date, scored for
    relevance, ordered oldest-first, and handed to one of the document
    builders.

    Args:
        topic: Topic to search for
        issues: List of completed issues
        contents: Pre-loaded issue file contents (path -> content); issues
            with missing or empty content are skipped
        format: Output format - "structured" selects the structured builder,
            any other value the narrative builder
        min_relevance: Minimum relevance score (inclusive) an issue must reach
        since: Drop issues whose completion date is earlier than this date;
            issues without a completion date are kept
        issue_type: Filter by issue type (BUG, FEAT, ENH)
        scoring: Relevance scoring mode — "intersection" (default), "hybrid", or "bm25"

    Returns:
        Synthesized markdown document, or a one-line "no issues found"
        message when nothing passes the filters
    """

    def _passes_prefilter(candidate: CompletedIssue, text: str) -> bool:
        # Cheap checks that run before any relevance scoring
        if not text:
            return False
        if issue_type and candidate.issue_type != issue_type:
            return False
        if since and candidate.completed_date and candidate.completed_date < since:
            return False
        return True

    eligible: list[tuple[CompletedIssue, str]] = []
    for issue in issues:
        text = contents.get(issue.path, "")
        if _passes_prefilter(issue, text):
            eligible.append((issue, text))

    # Corpus statistics are only needed by the BM25-based scoring modes
    stats: dict | None = (
        _compute_corpus_stats(eligible) if scoring != "intersection" and eligible else None
    )

    matches: list[tuple[CompletedIssue, float, str]] = []
    for issue, text in eligible:
        relevance = score_relevance(topic, issue, text, corpus_stats=stats, scoring=scoring)
        if relevance >= min_relevance:
            matches.append((issue, relevance, text))

    if not matches:
        return f"No completed issues found matching topic: {topic}"

    # Oldest first (undated issues sort earliest); ties broken by higher score
    matches = sorted(
        matches,
        key=lambda entry: (entry[0].completed_date or date.min, -entry[1]),
    )

    builder = build_structured_doc if format == "structured" else build_narrative_doc
    return builder(topic, matches)
def build_narrative_doc(
    topic: str,
    scored_issues: list[tuple[CompletedIssue, float, str]],
) -> str:
    """Render the matched issues as a chronological narrative.

    The document opens with the topic as H1 and a one-line provenance note.
    Every issue then gets an H2 section: a metadata line, followed by its
    Summary, Motivation and implementation text when those sections exist,
    closed by a horizontal rule. The order of ``scored_issues`` is kept.

    Args:
        topic: The topic being documented
        scored_issues: List of (issue, score, content) tuples, pre-sorted

    Returns:
        Markdown document string
    """
    out: list[str] = [
        f"# {topic}",
        "",
        f"*Synthesized from {len(scored_issues)} completed issue(s). Generated from issue history.*",
        "",
    ]

    for issue, _relevance, text in scored_issues:
        # Fall back to the issue ID when the file has no H1 title
        heading = _extract_title(text) or issue.issue_id
        completed = issue.completed_date.isoformat() if issue.completed_date else "unknown"

        out.extend(
            [
                f"## {heading}",
                "",
                f"*{issue.issue_id} | Completed: {completed} | "
                f"Type: {issue.issue_type} | Priority: {issue.priority}*",
                "",
            ]
        )

        overview = _extract_section(text, "Summary")
        if overview:
            out.extend([overview, ""])

        rationale = _extract_section(text, "Motivation")
        if rationale:
            out.extend([f"**Motivation:** {rationale}", ""])

        # "Resolution" is only consulted when "Implementation Notes" is absent or empty
        how = _extract_section(text, "Implementation Notes") or _extract_section(
            text, "Resolution"
        )
        if how:
            out.extend([f"**Implementation:** {how}", ""])

        out.extend(["---", ""])

    return "\n".join(out)
def build_structured_doc(
    topic: str,
    scored_issues: list[tuple[CompletedIssue, float, str]],
) -> str:
    """Build a structured documentation document.

    Organized with a summary table followed by detailed sections,
    focusing on technical content rather than narrative flow.

    The overview table shows each issue's title (truncated to 60 characters,
    with literal ``|`` characters escaped so they cannot split the row into
    extra columns), type, priority, completion date and relevance as a
    percentage. The details part repeats every issue under an H3 heading with
    its Summary, solution and implementation text when present.

    Args:
        topic: The topic being documented
        scored_issues: List of (issue, score, content) tuples, pre-sorted

    Returns:
        Markdown document string
    """
    lines: list[str] = []

    lines.append(f"# {topic}")
    lines.append("")
    lines.append(
        f"*Synthesized from {len(scored_issues)} completed issue(s). Generated from issue history.*"
    )
    lines.append("")

    # Resolve each display title once; both the table and the details use it.
    # The issue ID stands in when the file has no H1 title.
    entries = [
        (issue, score, content, _extract_title(content) or issue.issue_id)
        for issue, score, content in scored_issues
    ]

    # Summary table
    lines.append("## Overview")
    lines.append("")
    lines.append("| Issue | Type | Priority | Completed | Relevance |")
    lines.append("|-------|------|----------|-----------|-----------|")
    for issue, score, _content, title in entries:
        # Truncate long titles for the table
        cell = title[:57] + "..." if len(title) > 60 else title
        # Escape after truncating so an escape sequence is never cut in half;
        # an unescaped "|" would be parsed as a column separator.
        cell = cell.replace("|", "\\|")
        date_str = issue.completed_date.isoformat() if issue.completed_date else "N/A"
        lines.append(
            f"| {cell} | {issue.issue_type} | {issue.priority} | {date_str} | {score:.0%} |"
        )
    lines.append("")

    # Detailed sections
    lines.append("## Details")
    lines.append("")

    for issue, _score, content, title in entries:
        date_str = issue.completed_date.isoformat() if issue.completed_date else "unknown"

        # Headings are not table cells, so the title is used verbatim here
        lines.append(f"### {title}")
        lines.append("")
        lines.append(f"**ID:** {issue.issue_id} | **Completed:** {date_str}")
        lines.append("")

        summary = _extract_section(content, "Summary")
        if summary:
            lines.append(summary)
            lines.append("")

        # Expected behavior or proposed solution
        expected = _extract_section(content, "Expected Behavior")
        if not expected:
            expected = _extract_section(content, "Proposed Solution")
        if expected:
            lines.append(f"**Solution:** {expected}")
            lines.append("")

        # "Resolution" is the fallback when "Implementation Notes" is missing
        impl_notes = _extract_section(content, "Implementation Notes")
        if not impl_notes:
            impl_notes = _extract_section(content, "Resolution")
        if impl_notes:
            lines.append(f"**Implementation:** {impl_notes}")
            lines.append("")

    return "\n".join(lines)