Coverage for src/dataknobs_llm/prompts/versioning/types.py: 97%

137 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-31 16:07 -0600

1"""Core type definitions for prompt versioning and A/B testing. 

2 

3This module defines: 

4- Version data structures 

5- Experiment configurations 

6- Metrics tracking types 

7- Custom exceptions 

8""" 

9 

10from dataclasses import dataclass, field 

11from datetime import datetime 

12from typing import Any, Dict, List, Optional 

13from enum import Enum 

14 

15 

class VersioningError(Exception):
    """Common base class for versioning-related errors.

    Carries no extra state or behavior beyond ``Exception``.
    """

19 

20 

class VersionStatus(Enum):
    """Lifecycle state of a prompt version.

    Each member's value is the lowercase string form of its name.
    """

    # Version is still in development.
    DRAFT = "draft"
    # Version is active and can be used.
    ACTIVE = "active"
    # Version is deployed in production.
    PRODUCTION = "production"
    # Version is deprecated but remains available.
    DEPRECATED = "deprecated"
    # Version is archived and should not be used.
    ARCHIVED = "archived"

36 

37 

@dataclass
class PromptVersion:
    """A single versioned prompt together with its bookkeeping data.

    Attributes:
        version_id: Unique identifier for this version (auto-generated).
        name: Name of the prompt.
        prompt_type: Kind of prompt ("system", "user", "message").
        version: Semantic version string, e.g. "1.2.3".
        template: The prompt template content.
        defaults: Default parameter values.
        validation: Validation configuration, if any.
        metadata: Additional metadata (author, description, etc.).
        created_at: Timestamp when the version was created.
        created_by: Username/ID of the creator.
        parent_version: ID of the previous version (for history tracking).
        tags: List of tags, e.g. ["production", "experiment-A"].
        status: Current status of this version.
    """
    version_id: str
    name: str
    prompt_type: str
    version: str
    template: str
    defaults: Dict[str, Any] = field(default_factory=dict)
    validation: Optional[Dict[str, Any]] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    # NOTE(review): utcnow() yields a naive datetime and is deprecated since
    # Python 3.12; left as-is because switching to an aware value would change
    # the serialized form.
    created_at: datetime = field(default_factory=datetime.utcnow)
    created_by: Optional[str] = None
    parent_version: Optional[str] = None
    tags: List[str] = field(default_factory=list)
    status: VersionStatus = VersionStatus.ACTIVE

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this version into a plain dictionary for storage.

        ``created_at`` is emitted as an ISO-8601 string and ``status`` as the
        enum's string value. All other fields are passed through unchanged
        (containers are not copied).
        """
        return dict(
            version_id=self.version_id,
            name=self.name,
            prompt_type=self.prompt_type,
            version=self.version,
            template=self.template,
            defaults=self.defaults,
            validation=self.validation,
            metadata=self.metadata,
            created_at=self.created_at.isoformat(),
            created_by=self.created_by,
            parent_version=self.parent_version,
            tags=self.tags,
            status=self.status.value,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptVersion":
        """Build a version from a dictionary such as ``to_dict`` produces.

        String values for ``created_at`` and ``status`` are parsed into a
        ``datetime`` and a ``VersionStatus``; values already of another type
        are used as given. The input dictionary is not modified.
        """
        kwargs = data.copy()  # shallow copy so the caller's dict stays intact

        raw_created = kwargs.get("created_at")
        if isinstance(raw_created, str):
            kwargs["created_at"] = datetime.fromisoformat(raw_created)

        raw_status = kwargs.get("status")
        if isinstance(raw_status, str):
            kwargs["status"] = VersionStatus(raw_status)

        return cls(**kwargs)

100 

101 

@dataclass
class PromptVariant:
    """A variant in an A/B test experiment.

    Attributes:
        version: Version string of this variant.
        weight: Relative traffic-allocation weight. Must be a positive,
            finite number; weights are normalized to sum to 1.0 when the
            experiment is created.
        description: Human-readable description.
        metadata: Additional variant metadata.
    """
    version: str
    weight: float
    description: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Validate that the weight is positive and finite.

        Raises:
            ValueError: If ``weight`` is zero, negative, NaN, or infinite.
        """
        if self.weight <= 0.0:
            raise ValueError(f"Variant weight must be positive, got {self.weight}")
        # NaN compares False against everything, so it passes the check above;
        # NaN and +inf both fail ``< inf``. Either would poison normalization.
        if not self.weight < float("inf"):
            raise ValueError(f"Variant weight must be finite, got {self.weight}")

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary for storage."""
        return {
            "version": self.version,
            "weight": self.weight,
            "description": self.description,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptVariant":
        """Create a variant from a dictionary of constructor fields.

        Raises:
            ValueError: If the contained ``weight`` fails validation.
        """
        return cls(**data)

136 

137 

@dataclass
class PromptExperiment:
    """Configuration for an A/B test experiment.

    Attributes:
        experiment_id: Unique identifier for this experiment.
        name: Name of the prompt being tested.
        prompt_type: Type of prompt ("system", "user", "message").
        variants: List of variants in this experiment.
        traffic_split: Mapping of version to its share of traffic, expressed
            as non-negative fractions that sum to 1.0.
        start_date: When the experiment started.
        end_date: When the experiment ended (None if still running).
        status: Current status ("running", "paused", "completed").
        metrics: Aggregated metrics for the experiment.
        metadata: Additional experiment metadata.
    """
    experiment_id: str
    name: str
    prompt_type: str
    variants: List["PromptVariant"]
    traffic_split: Dict[str, float]
    start_date: datetime = field(default_factory=datetime.utcnow)
    end_date: Optional[datetime] = None
    status: str = "running"
    metrics: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Validate the traffic split.

        Raises:
            ValueError: If the split does not sum to 1.0 (within +/-0.01), or
                if any individual allocation is negative.
        """
        total = sum(self.traffic_split.values())
        if not (0.99 <= total <= 1.01):  # Allow small floating point error
            raise ValueError(
                f"Traffic split must sum to 1.0, got {total}. "
                f"Split: {self.traffic_split}"
            )
        # A correct sum alone is not enough: {"a": 1.5, "b": -0.5} sums to 1.0
        # but is not a meaningful allocation.
        negative = {
            version: share
            for version, share in self.traffic_split.items()
            if share < 0
        }
        if negative:
            raise ValueError(
                f"Traffic split values must be non-negative, got {negative}. "
                f"Split: {self.traffic_split}"
            )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a plain dictionary for storage.

        Variants are serialized through their own ``to_dict``; dates become
        ISO-8601 strings (``end_date`` stays ``None`` when unset).
        """
        return {
            "experiment_id": self.experiment_id,
            "name": self.name,
            "prompt_type": self.prompt_type,
            "variants": [v.to_dict() for v in self.variants],
            "traffic_split": self.traffic_split,
            "start_date": self.start_date.isoformat(),
            "end_date": self.end_date.isoformat() if self.end_date else None,
            "status": self.status,
            "metrics": self.metrics,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptExperiment":
        """Create an experiment from a dictionary such as ``to_dict`` produces.

        String dates are parsed with ``datetime.fromisoformat`` and variant
        dictionaries are turned into ``PromptVariant`` objects; values already
        of the target type are used as given. The input is not modified.

        Raises:
            ValueError: If the resulting traffic split fails validation.
        """
        data = data.copy()
        # Parse datetimes
        for key in ("start_date", "end_date"):
            if isinstance(data.get(key), str):
                data[key] = datetime.fromisoformat(data[key])
        # Parse variants
        if data.get("variants"):
            data["variants"] = [
                PromptVariant.from_dict(v) if isinstance(v, dict) else v
                for v in data["variants"]
            ]
        return cls(**data)

205 

206 

@dataclass
class PromptMetrics:
    """Aggregated usage and performance counters for one prompt version.

    Attributes:
        version_id: ID of the version these metrics belong to.
        total_uses: How many times the version was used.
        success_count: Number of successful uses.
        error_count: Number of errors/failures.
        total_response_time: Summed response time over all uses (seconds).
        total_tokens: Summed token usage over all uses.
        user_ratings: Collected user ratings (1-5 scale).
        last_used: Timestamp of the most recent use.
        metadata: Additional custom metrics.
    """
    version_id: str
    total_uses: int = 0
    success_count: int = 0
    error_count: int = 0
    total_response_time: float = 0.0
    total_tokens: int = 0
    user_ratings: List[float] = field(default_factory=list)
    last_used: Optional[datetime] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def _per_use(self, total: float) -> float:
        """Return ``total / total_uses``, or 0.0 when there are no uses."""
        return 0.0 if self.total_uses == 0 else total / self.total_uses

    @property
    def success_rate(self) -> float:
        """Fraction of uses that succeeded (0.0 when never used)."""
        return self._per_use(self.success_count)

    @property
    def avg_response_time(self) -> float:
        """Mean response time per use (0.0 when never used)."""
        return self._per_use(self.total_response_time)

    @property
    def avg_tokens(self) -> float:
        """Mean token count per use (0.0 when never used)."""
        return self._per_use(self.total_tokens)

    @property
    def avg_rating(self) -> float:
        """Mean of the collected user ratings (0.0 when there are none)."""
        ratings = self.user_ratings
        return sum(ratings) / len(ratings) if ratings else 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the counters, plus the derived averages, for storage.

        ``last_used`` is emitted as an ISO-8601 string, or ``None`` if unset.
        """
        last_used_iso = self.last_used.isoformat() if self.last_used else None
        return dict(
            version_id=self.version_id,
            total_uses=self.total_uses,
            success_count=self.success_count,
            error_count=self.error_count,
            total_response_time=self.total_response_time,
            total_tokens=self.total_tokens,
            user_ratings=self.user_ratings,
            last_used=last_used_iso,
            metadata=self.metadata,
            # Derived values, included for convenience when reading storage.
            success_rate=self.success_rate,
            avg_response_time=self.avg_response_time,
            avg_tokens=self.avg_tokens,
            avg_rating=self.avg_rating,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptMetrics":
        """Rebuild metrics from a dictionary such as ``to_dict`` produces.

        A string ``last_used`` is parsed with ``datetime.fromisoformat``.
        Derived keys are discarded because they are recomputed from the raw
        counters. The input dictionary is not modified.
        """
        kwargs = data.copy()

        stamp = kwargs.get("last_used")
        if isinstance(stamp, str):
            kwargs["last_used"] = datetime.fromisoformat(stamp)

        # Derived values are properties, not constructor arguments.
        for derived in ("success_rate", "avg_response_time", "avg_tokens", "avg_rating"):
            kwargs.pop(derived, None)

        return cls(**kwargs)

290 

291 

@dataclass
class MetricEvent:
    """A single recorded use of a prompt version, for metrics tracking.

    Attributes:
        version_id: ID of the version this event belongs to.
        timestamp: When the event occurred.
        success: Whether the use was successful.
        response_time: Response time in seconds (None if not applicable).
        tokens: Number of tokens used (None if not applicable).
        user_rating: User rating 1-5 (None if not provided).
        metadata: Additional event metadata.
    """
    version_id: str
    timestamp: datetime = field(default_factory=datetime.utcnow)
    success: bool = True
    response_time: Optional[float] = None
    tokens: Optional[int] = None
    user_rating: Optional[float] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the event for storage; ``timestamp`` becomes ISO-8601."""
        return dict(
            version_id=self.version_id,
            timestamp=self.timestamp.isoformat(),
            success=self.success,
            response_time=self.response_time,
            tokens=self.tokens,
            user_rating=self.user_rating,
            metadata=self.metadata,
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MetricEvent":
        """Rebuild an event from a dictionary such as ``to_dict`` produces.

        A string ``timestamp`` is parsed with ``datetime.fromisoformat``; any
        other value is used as given. The input dictionary is not modified.
        """
        kwargs = data.copy()
        when = kwargs.get("timestamp")
        if isinstance(when, str):
            kwargs["timestamp"] = datetime.fromisoformat(when)
        return cls(**kwargs)