Coverage for src/dataknobs_llm/prompts/versioning/types.py: 97%

137 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-08 13:51 -0700

1"""Core type definitions for prompt versioning and A/B testing. 

2 

3This module defines: 

4- Version data structures 

5- Experiment configurations 

6- Metrics tracking types 

7- Custom exceptions 

8""" 

9 

from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List

14 

class VersioningError(Exception):
    """Base class for errors raised by prompt-versioning code.

    It adds nothing to ``Exception``; it exists so callers can catch
    versioning failures with a single ``except VersioningError`` clause.
    """

18 

19 

class VersionStatus(Enum):
    """Lifecycle state of a prompt version.

    Each member's value is the lowercase string used when a status is
    serialized (see ``PromptVersion.to_dict``), so a stored value can be
    turned back into a member with ``VersionStatus(value)``.

    Members:
        DRAFT: The version is still in development.
        ACTIVE: The version is active and may be used.
        PRODUCTION: The version is deployed in production.
        DEPRECATED: The version is deprecated but remains available.
        ARCHIVED: The version is archived and should not be used.
    """

    DRAFT = "draft"
    ACTIVE = "active"
    PRODUCTION = "production"
    DEPRECATED = "deprecated"
    ARCHIVED = "archived"

35 

36 

@dataclass
class PromptVersion:
    """Represents a versioned prompt.

    Attributes:
        version_id: Unique identifier for this version.
        name: Name of the prompt.
        prompt_type: Type of prompt ("system", "user", "message").
        version: Semantic version string (e.g., "1.2.3").
        template: The prompt template content.
        defaults: Default parameter values.
        validation: Validation configuration, or None.
        metadata: Additional metadata (author, description, etc.).
        created_at: Timestamp when the version was created. Defaults to the
            current time as a naive datetime expressed in UTC.
        created_by: Username/ID of creator, or None.
        parent_version: Previous version ID (for history tracking), or None.
        tags: List of tags (e.g., ["production", "experiment-A"]).
        status: Current status of this version; defaults to ACTIVE.
    """

    version_id: str
    name: str
    prompt_type: str
    version: str
    template: str
    defaults: Dict[str, Any] = field(default_factory=dict)
    validation: Dict[str, Any] | None = None
    metadata: Dict[str, Any] = field(default_factory=dict)
    # ``datetime.utcnow`` is deprecated since Python 3.12. Build the same
    # naive-UTC value from an aware "now" so that serialized timestamps keep
    # their existing offset-less ISO format and stay comparable with
    # previously stored (naive) values.
    created_at: datetime = field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    created_by: str | None = None
    parent_version: str | None = None
    tags: List[str] = field(default_factory=list)
    status: VersionStatus = VersionStatus.ACTIVE

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for storage.

        ``created_at`` is written as an ISO-8601 string and ``status`` as the
        enum's string value; every other field is passed through as-is (the
        container fields are the same objects held by this instance, not
        copies).

        Returns:
            A dict with one entry per dataclass field.
        """
        return {
            "version_id": self.version_id,
            "name": self.name,
            "prompt_type": self.prompt_type,
            "version": self.version,
            "template": self.template,
            "defaults": self.defaults,
            "validation": self.validation,
            "metadata": self.metadata,
            "created_at": self.created_at.isoformat(),
            "created_by": self.created_by,
            "parent_version": self.parent_version,
            "tags": self.tags,
            "status": self.status.value,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptVersion":
        """Create an instance from a dictionary such as ``to_dict`` produces.

        Args:
            data: Mapping of field names to values. ``created_at`` may be an
                ISO-8601 string or a datetime; ``status`` may be a status
                string or a ``VersionStatus``. The mapping itself is not
                modified (a shallow copy is taken first).

        Returns:
            The reconstructed ``PromptVersion``.

        Raises:
            ValueError: If ``status`` is a string that is not a valid
                ``VersionStatus`` value, or ``created_at`` is a string that
                is not valid ISO format.
            TypeError: If ``data`` has keys that are not dataclass fields or
                lacks a required field.
        """
        data = data.copy()
        # Parse datetime
        if isinstance(data.get("created_at"), str):
            data["created_at"] = datetime.fromisoformat(data["created_at"])
        # Parse status enum
        if isinstance(data.get("status"), str):
            data["status"] = VersionStatus(data["status"])
        return cls(**data)

99 

100 

@dataclass
class PromptVariant:
    """A variant in an A/B test experiment.

    Attributes:
        version: Version string of this variant.
        weight: Relative traffic allocation weight. Must be a finite number
            greater than 0.0. Normalizing weights so they sum to 1.0 is left
            to the code that creates the experiment; it is not done here.
        description: Human-readable description.
        metadata: Additional variant metadata.
    """

    version: str
    weight: float
    description: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Validate that the weight is a positive, finite number.

        Raises:
            ValueError: If ``weight`` is zero, negative, NaN or infinite.
        """
        if self.weight <= 0.0:
            raise ValueError(f"Variant weight must be positive, got {self.weight}")
        # NaN compares false against everything, so it slips past the check
        # above; ``x != x`` is true only for NaN. An infinite weight is
        # rejected too because it cannot be normalized into a traffic share.
        if self.weight != self.weight or self.weight == float("inf"):
            raise ValueError(
                f"Variant weight must be a finite number, got {self.weight}"
            )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for storage.

        Returns:
            A dict with the keys "version", "weight", "description" and
            "metadata".
        """
        return {
            "version": self.version,
            "weight": self.weight,
            "description": self.description,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptVariant":
        """Create an instance from a dictionary such as ``to_dict`` produces.

        Args:
            data: Mapping of field names to values; passed to the constructor
                as keyword arguments.

        Returns:
            The reconstructed ``PromptVariant``.

        Raises:
            ValueError: If the weight fails validation.
            TypeError: If ``data`` has unknown keys or lacks a required field.
        """
        return cls(**data)

135 

136 

@dataclass
class PromptExperiment:
    """Configuration for an A/B test experiment.

    Attributes:
        experiment_id: Unique identifier for this experiment.
        name: Name of the prompt being tested.
        prompt_type: Type of prompt ("system", "user", "message").
        variants: List of variants in this experiment.
        traffic_split: Mapping of version to its share of traffic. Shares
            must be non-negative and sum to 1.0 (within +/- 0.01).
        start_date: When the experiment started. Defaults to the current
            time as a naive datetime expressed in UTC.
        end_date: When the experiment ended (None if still running).
        status: Current status ("running", "paused", "completed"). Stored as
            a plain string and not validated here.
        metrics: Aggregated metrics for the experiment.
        metadata: Additional experiment metadata.
    """

    experiment_id: str
    name: str
    prompt_type: str
    variants: List[PromptVariant]
    traffic_split: Dict[str, float]
    # ``datetime.utcnow`` is deprecated since Python 3.12. Build the same
    # naive-UTC value from an aware "now" so that serialized timestamps keep
    # their existing offset-less ISO format.
    start_date: datetime = field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None)
    )
    end_date: datetime | None = None
    status: str = "running"
    metrics: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Validate the traffic split.

        Raises:
            ValueError: If the shares do not sum to 1.0 (within +/- 0.01), or
                if any individual share is negative.
        """
        total = sum(self.traffic_split.values())
        if not (0.99 <= total <= 1.01):  # Allow small floating point error
            raise ValueError(
                f"Traffic split must sum to 1.0, got {total}. "
                f"Split: {self.traffic_split}"
            )
        # The sum alone is not enough: {"a": 1.5, "b": -0.5} adds up to 1.0
        # but is not a usable allocation. Checked after the sum so inputs
        # that were already rejected keep their original error message.
        negative = {
            version: share
            for version, share in self.traffic_split.items()
            if share < 0
        }
        if negative:
            raise ValueError(
                f"Traffic split values must be non-negative, got {negative}"
            )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for storage.

        Variants are serialized with their own ``to_dict``; dates are written
        as ISO-8601 strings (``end_date`` as None when unset).

        Returns:
            A dict with one entry per dataclass field.
        """
        return {
            "experiment_id": self.experiment_id,
            "name": self.name,
            "prompt_type": self.prompt_type,
            "variants": [v.to_dict() for v in self.variants],
            "traffic_split": self.traffic_split,
            "start_date": self.start_date.isoformat(),
            "end_date": self.end_date.isoformat() if self.end_date else None,
            "status": self.status,
            "metrics": self.metrics,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PromptExperiment":
        """Create an instance from a dictionary such as ``to_dict`` produces.

        Args:
            data: Mapping of field names to values. Dates may be ISO-8601
                strings or datetimes; each entry of ``variants`` may be a
                dict or an existing ``PromptVariant``. The mapping itself is
                not modified (a shallow copy is taken first).

        Returns:
            The reconstructed ``PromptExperiment``.

        Raises:
            ValueError: If the traffic split or a variant fails validation,
                or a date string is not valid ISO format.
            TypeError: If ``data`` has unknown keys or lacks a required field.
        """
        data = data.copy()
        # Parse datetimes
        if isinstance(data.get("start_date"), str):
            data["start_date"] = datetime.fromisoformat(data["start_date"])
        if isinstance(data.get("end_date"), str):
            data["end_date"] = datetime.fromisoformat(data["end_date"])
        # Parse variants
        if data.get("variants"):
            data["variants"] = [
                PromptVariant.from_dict(v) if isinstance(v, dict) else v
                for v in data["variants"]
            ]
        return cls(**data)

204 

205 

206@dataclass 

207class PromptMetrics: 

208 """Performance metrics for a prompt version. 

209 

210 Attributes: 

211 version_id: Version ID these metrics belong to 

212 total_uses: Total number of times this version was used 

213 success_count: Number of successful uses 

214 error_count: Number of errors/failures 

215 total_response_time: Total response time across all uses (seconds) 

216 total_tokens: Total tokens used across all uses 

217 user_ratings: List of user ratings (1-5 scale) 

218 last_used: Timestamp of last use 

219 metadata: Additional custom metrics 

220 """ 

221 version_id: str 

222 total_uses: int = 0 

223 success_count: int = 0 

224 error_count: int = 0 

225 total_response_time: float = 0.0 

226 total_tokens: int = 0 

227 user_ratings: List[float] = field(default_factory=list) 

228 last_used: datetime | None = None 

229 metadata: Dict[str, Any] = field(default_factory=dict) 

230 

231 @property 

232 def success_rate(self) -> float: 

233 """Calculate success rate.""" 

234 if self.total_uses == 0: 

235 return 0.0 

236 return self.success_count / self.total_uses 

237 

238 @property 

239 def avg_response_time(self) -> float: 

240 """Calculate average response time.""" 

241 if self.total_uses == 0: 

242 return 0.0 

243 return self.total_response_time / self.total_uses 

244 

245 @property 

246 def avg_tokens(self) -> float: 

247 """Calculate average tokens per use.""" 

248 if self.total_uses == 0: 

249 return 0.0 

250 return self.total_tokens / self.total_uses 

251 

252 @property 

253 def avg_rating(self) -> float: 

254 """Calculate average user rating.""" 

255 if not self.user_ratings: 

256 return 0.0 

257 return sum(self.user_ratings) / len(self.user_ratings) 

258 

259 def to_dict(self) -> Dict[str, Any]: 

260 """Convert to dictionary for storage.""" 

261 return { 

262 "version_id": self.version_id, 

263 "total_uses": self.total_uses, 

264 "success_count": self.success_count, 

265 "error_count": self.error_count, 

266 "total_response_time": self.total_response_time, 

267 "total_tokens": self.total_tokens, 

268 "user_ratings": self.user_ratings, 

269 "last_used": self.last_used.isoformat() if self.last_used else None, 

270 "metadata": self.metadata, 

271 # Include computed properties 

272 "success_rate": self.success_rate, 

273 "avg_response_time": self.avg_response_time, 

274 "avg_tokens": self.avg_tokens, 

275 "avg_rating": self.avg_rating, 

276 } 

277 

278 @classmethod 

279 def from_dict(cls, data: Dict[str, Any]) -> "PromptMetrics": 

280 """Create from dictionary.""" 

281 data = data.copy() 

282 # Parse datetime 

283 if isinstance(data.get("last_used"), str): 

284 data["last_used"] = datetime.fromisoformat(data["last_used"]) 

285 # Remove computed properties (they're recalculated) 

286 for key in ["success_rate", "avg_response_time", "avg_tokens", "avg_rating"]: 

287 data.pop(key, None) 

288 return cls(**data) 

289 

290 

291@dataclass 

292class MetricEvent: 

293 """Single event for metrics tracking. 

294 

295 Attributes: 

296 version_id: Version ID this event belongs to 

297 timestamp: When the event occurred 

298 success: Whether the use was successful 

299 response_time: Response time in seconds (None if not applicable) 

300 tokens: Number of tokens used (None if not applicable) 

301 user_rating: User rating 1-5 (None if not provided) 

302 metadata: Additional event metadata 

303 """ 

304 version_id: str 

305 timestamp: datetime = field(default_factory=datetime.utcnow) 

306 success: bool = True 

307 response_time: float | None = None 

308 tokens: int | None = None 

309 user_rating: float | None = None 

310 metadata: Dict[str, Any] = field(default_factory=dict) 

311 

312 def to_dict(self) -> Dict[str, Any]: 

313 """Convert to dictionary for storage.""" 

314 return { 

315 "version_id": self.version_id, 

316 "timestamp": self.timestamp.isoformat(), 

317 "success": self.success, 

318 "response_time": self.response_time, 

319 "tokens": self.tokens, 

320 "user_rating": self.user_rating, 

321 "metadata": self.metadata, 

322 } 

323 

324 @classmethod 

325 def from_dict(cls, data: Dict[str, Any]) -> "MetricEvent": 

326 """Create from dictionary.""" 

327 data = data.copy() 

328 if isinstance(data.get("timestamp"), str): 

329 data["timestamp"] = datetime.fromisoformat(data["timestamp"]) 

330 return cls(**data)