Coverage for little_loops / text_utils.py: 0%
44 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-18 16:18 -0500
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-18 16:18 -0500
1"""Text extraction utilities for issue content.
3Provides shared functions for extracting file paths from markdown issue
4content. Used by dependency_mapper, issue_history, and other modules that
5need to identify file references in issue text.
6"""
8from __future__ import annotations
10import math
11import re
12from pathlib import Path
# File path patterns for extraction from issue content.
# Every pattern captures only the path itself in group 1; an optional
# ``:<line>`` suffix is matched *outside* the group so it never leaks into
# the extracted path.
_BACKTICK_PATH = re.compile(r"`([^`\s]+\.[a-z]{2,4})(?::\d+)?`")
_BOLD_FILE_PATH = re.compile(r"\*\*File\*\*:\s*`?([^`\n]+\.[a-z]{2,4})`?")
# Lookarounds (instead of consuming groups) delimit standalone paths, so the
# single whitespace character between two adjacent paths can act as the right
# boundary of one match and the left boundary of the next.
_STANDALONE_PATH = re.compile(
    r"(?<!\S)([a-zA-Z_][\w/.-]*\.[a-z]{2,4})(?::\d+)?(?=\s|$|:|\))",
    re.MULTILINE,
)
_CODE_FENCE = re.compile(r"```[\s\S]*?```")

# File extensions that indicate real source file paths
SOURCE_EXTENSIONS = frozenset(
    {
        ".py",
        ".ts",
        ".js",
        ".tsx",
        ".jsx",
        ".md",
        ".json",
        ".yaml",
        ".yml",
        ".toml",
        ".cfg",
        ".ini",
        ".html",
        ".css",
        ".scss",
        ".sh",
        ".bash",
        ".sql",
        ".go",
        ".rs",
        ".java",
        ".kt",
        ".rb",
        ".php",
    }
)


def extract_file_paths(content: str) -> set[str]:
    """Extract file paths from issue content.

    Searches for file paths in:
    - Backtick-quoted paths: `path/to/file.py`
    - Location section bold paths: **File**: `path/to/file.py`
    - Standalone paths delimited by whitespace, ``:`` or ``)``

    Code fence blocks are stripped before extraction to avoid
    matching paths inside example code. Line number suffixes
    (e.g., ``path.py:123``) are dropped: the patterns match the
    suffix outside the captured path.

    Only paths whose extension is listed in ``SOURCE_EXTENSIONS``
    are returned; a directory separator is not required.

    Args:
        content: Issue file content

    Returns:
        Set of file paths found in the content (empty for empty input)
    """
    if not content:
        return set()

    # Strip code fences to avoid matching example paths
    stripped = _CODE_FENCE.sub("", content)

    paths: set[str] = set()
    for pattern in (_BOLD_FILE_PATH, _BACKTICK_PATH, _STANDALONE_PATH):
        for match in pattern.finditer(stripped):
            # The bold pattern may capture surrounding spaces; trim them.
            path = match.group(1).strip()
            # Keep only paths with a recognized source extension.
            if Path(path).suffix.lower() in SOURCE_EXTENSIONS:
                paths.add(path)
    return paths
# =============================================================================
# Word Extraction and Overlap Scoring
# =============================================================================

# Stop words that extract_words() removes from its result.
_COMMON_WORDS = frozenset(
    [
        # function words
        "the", "and", "for", "this", "that", "with", "from", "not", "but",
        # forms of "be" / "have"
        "are", "was", "were", "been", "have", "has", "had",
        # modal verbs
        "can", "will", "should", "would", "could", "may", "might", "must",
        # words that appear in nearly every issue
        "file", "code", "issue",
    ]
)


def extract_words(text: str) -> set[str]:
    """Return the significant words contained in ``text``.

    The text is lowercased, then every run of three or more ASCII
    letters bounded by word boundaries is collected. Stop words listed
    in ``_COMMON_WORDS`` are left out. The result is suitable for
    set-based relevance measures such as Jaccard similarity.

    Args:
        text: Input text

    Returns:
        Set of lowercase words (3+ chars, excluding common words)
    """
    lowered = text.lower()
    candidates = re.findall(r"\b[a-z]{3,}\b", lowered)
    return {word for word in candidates if word not in _COMMON_WORDS}
def calculate_word_overlap(words1: set[str], words2: set[str]) -> float:
    """Return the Jaccard similarity of two word sets.

    The similarity is the number of shared words divided by the number
    of distinct words across both sets. If either set is empty the
    result is 0.0.

    Args:
        words1: First word set
        words2: Second word set

    Returns:
        Similarity score from 0.0 to 1.0
    """
    if not (words1 and words2):
        return 0.0
    shared_count = len(words1 & words2)
    combined_count = len(words1 | words2)
    return shared_count / combined_count
def score_bm25(
    query_words: set[str],
    doc_words: set[str],
    doc_freq: dict[str, int],
    avg_doc_len: float,
    total_docs: int,
    k1: float = 1.5,
    b: float = 0.75,
) -> float:
    """Compute BM25 relevance score for a document against a query.

    Uses the Robertson BM25 formula with IDF smoothing. Since doc_words
    is a set (unique terms only), term frequency within the document is
    always 1 for matching terms.

    Document frequencies are clamped to the range ``[0, total_docs]``
    before use, so a stale or inconsistent ``doc_freq`` table cannot
    produce a negative IDF.

    Args:
        query_words: Set of query terms
        doc_words: Set of document terms (unique words, from extract_words)
        doc_freq: Document frequency per term (number of docs containing each
            term); terms missing from the mapping count as frequency 0
        avg_doc_len: Average document length in unique words across corpus
        total_docs: Total number of documents in corpus
        k1: Term frequency saturation parameter (default: 1.5)
        b: Length normalization parameter (default: 0.75)

    Returns:
        BM25 score (unbounded above; non-negative for ``k1 >= 0`` and
        ``0 <= b <= 1``). Returns 0.0 when either word set is empty or
        when ``total_docs`` or ``avg_doc_len`` is not positive.
    """
    if not query_words or not doc_words or total_docs <= 0 or avg_doc_len <= 0:
        return 0.0

    doc_len = len(doc_words)
    # TF = 1 for every matching term, so the length-normalized TF factor is
    # the same for all terms and is computed once, outside the loop.
    tf_norm = (k1 + 1) / (1 + k1 * (1 - b + b * doc_len / avg_doc_len))

    score = 0.0
    for term in query_words & doc_words:
        # Clamp so that (total_docs - df + 0.5) stays positive even if the
        # frequency table is out of sync with total_docs.
        df = min(max(doc_freq.get(term, 0), 0), total_docs)
        # Robertson IDF with +1 smoothing to keep score non-negative
        idf = math.log((total_docs - df + 0.5) / (df + 0.5) + 1)
        score += idf * tf_norm

    return score