Coverage for little_loops / text_utils.py: 0%

44 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-03-18 16:18 -0500

1"""Text extraction utilities for issue content. 

2 

3Provides shared functions for extracting file paths from markdown issue 

4content. Used by dependency_mapper, issue_history, and other modules that 

5need to identify file references in issue text. 

6""" 

7 

8from __future__ import annotations 

9 

10import math 

11import re 

12from pathlib import Path 

13 

# File path patterns for extraction from issue content.
# In every pattern, capture group 1 ends at the file extension, so a trailing
# ``:<line>`` suffix is never part of the captured path.
_BACKTICK_PATH = re.compile(r"`([^`\s]+\.[a-z]{2,4})(?::\d+)?`")
_BOLD_FILE_PATH = re.compile(r"\*\*File\*\*:\s*`?([^`\n]+\.[a-z]{2,4})`?")
_STANDALONE_PATH = re.compile(
    r"(?:^|\s)([a-zA-Z_][\w/.-]*\.[a-z]{2,4})(?::\d+)?(?:\s|$|:|\))",
    re.MULTILINE,
)
_CODE_FENCE = re.compile(r"```[\s\S]*?```", re.MULTILINE)

# File extensions that indicate real source file paths
SOURCE_EXTENSIONS = frozenset(
    {
        ".py",
        ".ts",
        ".js",
        ".tsx",
        ".jsx",
        ".md",
        ".json",
        ".yaml",
        ".yml",
        ".toml",
        ".cfg",
        ".ini",
        ".html",
        ".css",
        ".scss",
        ".sh",
        ".bash",
        ".sql",
        ".go",
        ".rs",
        ".java",
        ".kt",
        ".rb",
        ".php",
    }
)


def extract_file_paths(content: str) -> set[str]:
    """Extract file paths from issue content.

    Searches for file paths in:
    - Backtick-quoted paths: `path/to/file.py`
    - Location section bold paths: **File**: `path/to/file.py`
    - Standalone paths with recognized extensions

    Code fence blocks are stripped before extraction to avoid
    matching paths inside example code. Line number suffixes
    (e.g., ``path.py:123``) are dropped for all three forms,
    including backtick-quoted references such as
    ```path.py:123```; the patterns keep the suffix outside the
    captured group, so no post-processing is needed here.

    Args:
        content: Issue file content

    Returns:
        Set of file paths found in the content whose extension is
        listed in ``SOURCE_EXTENSIONS``. Empty set for empty content.
    """
    if not content:
        return set()

    # Strip code fences to avoid matching example paths
    stripped = _CODE_FENCE.sub("", content)

    paths: set[str] = set()
    for pattern in (_BOLD_FILE_PATH, _BACKTICK_PATH, _STANDALONE_PATH):
        for match in pattern.finditer(stripped):
            path = match.group(1).strip()
            # Keep only candidates with a recognized source extension; the
            # comparison is case-insensitive on the suffix.
            if Path(path).suffix.lower() in SOURCE_EXTENSIONS:
                paths.add(path)
    return paths

91 

92 

93# ============================================================================= 

94# Word Extraction and Overlap Scoring 

95# ============================================================================= 

96 

# Stop words (plus a few domain-generic terms) that carry no topical signal
# and are therefore dropped by extract_words().
_COMMON_WORDS = frozenset(
    (
        "the and for this that with from are was were been have has had "
        "not but can will should would could may might must file code issue"
    ).split()
)


def extract_words(text: str) -> set[str]:
    """Return the significant words found in ``text``.

    The text is lowercased, then every run of three or more ASCII
    letters delimited by word boundaries is collected. Entries in
    ``_COMMON_WORDS`` are discarded. The result is suitable as input
    for set-based relevance measures such as Jaccard similarity.

    Args:
        text: Input text

    Returns:
        Set of lowercase words (3+ chars, excluding common words)
    """
    return {
        candidate
        for candidate in re.findall(r"\b[a-z]{3,}\b", text.lower())
        if candidate not in _COMMON_WORDS
    }

146 

147 

def calculate_word_overlap(words1: set[str], words2: set[str]) -> float:
    """Return the Jaccard similarity of two word sets.

    The score is the size of the intersection divided by the size of
    the union. If either set is empty the score is defined as 0.0.

    Args:
        words1: First word set
        words2: Second word set

    Returns:
        Similarity score from 0.0 to 1.0
    """
    if words1 and words2:
        shared = len(words1 & words2)
        # Inclusion-exclusion: |A | B| == |A| + |B| - |A & B|
        combined = len(words1) + len(words2) - shared
        return shared / combined
    return 0.0

163 

164 

def score_bm25(
    query_words: set[str],
    doc_words: set[str],
    doc_freq: dict[str, int],
    avg_doc_len: float,
    total_docs: int,
    k1: float = 1.5,
    b: float = 0.75,
) -> float:
    """Compute BM25 relevance score for a document against a query.

    Uses the Robertson BM25 formula with IDF smoothing. Since doc_words
    is a set (unique terms only), term frequency within the document is
    always 1 for matching terms.

    Args:
        query_words: Set of query terms
        doc_words: Set of document terms (unique words, from extract_words)
        doc_freq: Document frequency per term (number of docs containing each
            term). Terms missing from the mapping are treated as df = 0.
        avg_doc_len: Average document length in unique words across corpus
        total_docs: Total number of documents in corpus
        k1: Term frequency saturation parameter (default: 1.5)
        b: Length normalization parameter (default: 0.75)

    Returns:
        BM25 score. 0.0 when either word set is empty or when
        ``total_docs`` / ``avg_doc_len`` is zero or negative; otherwise the
        sum of per-term contributions over the terms shared by query and
        document (non-negative as long as each df <= total_docs).
    """
    # Degenerate corpus statistics (including negative values, which would
    # otherwise reach math.log with a non-positive argument) score as zero.
    if not query_words or not doc_words or total_docs <= 0 or avg_doc_len <= 0:
        return 0.0

    doc_len = len(doc_words)

    # TF = 1 (term present in doc), with length normalization. This factor is
    # the same for every term, so compute it once rather than per iteration.
    tf_norm = (k1 + 1) / (1 + k1 * (1 - b + b * doc_len / avg_doc_len))

    score = 0.0
    for term in query_words & doc_words:
        df = doc_freq.get(term, 0)
        # Robertson IDF with +1 smoothing to keep score non-negative
        idf = math.log((total_docs - df + 0.5) / (df + 0.5) + 1)
        score += idf * tf_norm

    return score