Coverage for little_loops / issue_discovery / matching.py: 0%

70 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-03-18 16:18 -0500

1"""Issue matching types and text similarity helpers.""" 

2 

3from __future__ import annotations 

4 

5import re 

6from dataclasses import dataclass, field 

7from enum import Enum 

8from pathlib import Path 

9from typing import TYPE_CHECKING 

10 

11# Promoted to text_utils.py as public functions; aliased here for backward compat 

12from little_loops.text_utils import calculate_word_overlap as _calculate_word_overlap # noqa: F401 

13from little_loops.text_utils import extract_words as _extract_words # noqa: F401 

14 

15if TYPE_CHECKING: 

16 from little_loops.config import BRConfig 

17 

18 

19# ============================================================================= 

20# Enums 

21# ============================================================================= 

22 

23 

class MatchClassification(Enum):
    """Outcome of comparing a finding against the existing issues.

    When a finding matches a completed issue, the members separate true
    duplicates, regressions, and fixes that never worked.
    """

    # No existing issue matches.
    NEW_ISSUE = "new_issue"
    # An active issue exists.
    DUPLICATE = "duplicate"
    # Completed; files were modified AFTER the fix (the fix broke).
    REGRESSION = "regression"
    # Completed; files were NOT modified after the fix (it never worked).
    INVALID_FIX = "invalid_fix"
    # Completed; no fix commit tracked (cannot be determined).
    UNVERIFIED = "unverified"

36 

37 

38# ============================================================================= 

39# Data Classes 

40# ============================================================================= 

41 

42 

43@dataclass 

44class RegressionEvidence: 

45 """Evidence for regression vs invalid fix classification. 

46 

47 Attributes: 

48 fix_commit_sha: SHA of the commit that fixed the original issue 

49 fix_commit_exists: Whether the fix commit exists in current history 

50 files_modified_since_fix: Files from the fix that were modified after fix 

51 days_since_fix: Number of days since the fix was applied 

52 related_commits: Commits that modified the relevant files after fix 

53 """ 

54 

55 fix_commit_sha: str | None = None 

56 fix_commit_exists: bool = True 

57 files_modified_since_fix: list[str] = field(default_factory=list) 

58 days_since_fix: int = 0 

59 related_commits: list[str] = field(default_factory=list) 

60 

61 

@dataclass
class FindingMatch:
    """Outcome of comparing one finding against an existing issue.

    Attributes:
        issue_path: Path to the matched issue file, or None if no match
        match_type: Type of match ("exact", "similar", "content", "none")
        match_score: Confidence score from 0.0 to 1.0
        is_completed: Whether the matched issue is in completed/
        matched_terms: Terms that matched (for debugging)
        classification: How to classify this match (regression, duplicate, etc.)
        regression_evidence: Evidence supporting regression classification
    """

    issue_path: Path | None
    match_type: str
    match_score: float
    is_completed: bool = False
    matched_terms: list[str] = field(default_factory=list)
    classification: MatchClassification = MatchClassification.NEW_ISSUE
    regression_evidence: RegressionEvidence | None = None

    def _completed_match_classified_as(self, expected: MatchClassification) -> bool:
        """Return True for a completed match scoring >= 0.5 with ``expected`` classification."""
        return (
            self.is_completed
            and self.match_score >= 0.5
            and self.classification == expected
        )

    @property
    def should_skip(self) -> bool:
        """Whether the score is high enough (>= 0.8) to skip the finding as a duplicate."""
        score = self.match_score
        return score >= 0.8

    @property
    def should_update(self) -> bool:
        """Whether the score falls in the update band (0.5 inclusive to 0.8 exclusive)."""
        score = self.match_score
        return 0.5 <= score < 0.8

    @property
    def should_create(self) -> bool:
        """Whether the score is low enough (< 0.5) that a new issue should be created."""
        score = self.match_score
        return score < 0.5

    @property
    def should_reopen(self) -> bool:
        """Whether a completed issue was matched with a score of at least 0.5."""
        return self.is_completed and self.match_score >= 0.5

    @property
    def should_reopen_as_regression(self) -> bool:
        """Whether the completed issue should be reopened as a regression.

        A regression means the fix was applied but later code changes broke it.
        """
        return self._completed_match_classified_as(MatchClassification.REGRESSION)

    @property
    def should_reopen_as_invalid_fix(self) -> bool:
        """Whether the completed issue should be reopened because its fix was invalid.

        An invalid fix means the original fix never actually resolved the issue.
        """
        return self._completed_match_classified_as(MatchClassification.INVALID_FIX)

    @property
    def is_unverified(self) -> bool:
        """Whether the regression status of the completed issue cannot be determined.

        Unverified means the completed issue has no fix commit tracked, so it
        cannot be decided whether this is a regression or an invalid fix.
        """
        return self._completed_match_classified_as(MatchClassification.UNVERIFIED)

140 

141 

142# ============================================================================= 

143# Text Matching Helpers 

144# ============================================================================= 

145 

146 

147def _normalize_text(text: str) -> str: 

148 """Normalize text for comparison. 

149 

150 Args: 

151 text: Input text 

152 

153 Returns: 

154 Lowercase text with normalized whitespace 

155 """ 

156 return re.sub(r"\s+", " ", text.lower().strip()) 

157 

158 

159def _extract_line_numbers(text: str) -> set[int]: 

160 """Extract line numbers from text. 

161 

162 Args: 

163 text: Input text 

164 

165 Returns: 

166 Set of line numbers found 

167 """ 

168 numbers: set[int] = set() 

169 # Match line number patterns 

170 patterns = [ 

171 r"\*\*Line(?:\(s\))?\*\*:\s*(\d+)(?:-(\d+))?", # **Line(s)**: 42-45 

172 r":(\d+)(?:-(\d+))?", # :42-45 (in paths) 

173 r"line\s+(\d+)", # line 42 

174 ] 

175 for pattern in patterns: 

176 for match in re.finditer(pattern, text, re.IGNORECASE): 

177 numbers.add(int(match.group(1))) 

178 if match.lastindex and match.lastindex >= 2 and match.group(2): 

179 numbers.add(int(match.group(2))) 

180 return numbers 

181 

182 

183def _matches_issue_type( 

184 finding_type: str, 

185 issue_path: Path, 

186 config: BRConfig, 

187 is_completed: bool, 

188) -> bool: 

189 """Check if finding type matches issue path using configured categories. 

190 

191 Args: 

192 finding_type: The type of finding (e.g., 'BUG', 'ENH', 'FEAT') 

193 issue_path: Path to the issue file 

194 config: Configuration with category definitions 

195 is_completed: Whether the issue is in the completed directory 

196 

197 Returns: 

198 True if the finding type matches the issue path's category 

199 """ 

200 if is_completed: 

201 return True 

202 

203 path_str = str(issue_path) 

204 for category in config.issues.categories.values(): 

205 if finding_type == category.prefix and f"/{category.dir}/" in path_str: 

206 return True 

207 return False