Coverage for llm_dataset_engine/utils/metrics_exporter.py: 0%

32 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-15 18:04 +0200

1""" 

2Prometheus metrics export for monitoring. 

3 

4Provides instrumentation for external monitoring systems. 

5""" 

6 

7from prometheus_client import Counter, Gauge, Histogram, start_http_server 

8 

9from llm_dataset_engine.utils import get_logger 

10 

11logger = get_logger(__name__) 

12 

13 

class PrometheusMetrics:
    """
    Prometheus metrics exporter.

    Follows Single Responsibility: only handles metrics export.

    Owns one set of LLM pipeline metrics (request count, request duration,
    cost, errors, rows processed, throughput) and can expose them over HTTP
    via ``start_server``.

    By default the metrics are registered on prometheus_client's
    process-wide default registry, so only one default-constructed instance
    can exist per process; prometheus_client rejects a second registration
    of the same metric names. Pass a dedicated ``registry`` to create
    additional, isolated instances (e.g. in tests).
    """

    def __init__(self, port: int = 9090, registry=None):
        """
        Initialize Prometheus metrics.

        Args:
            port: Port for metrics HTTP server
            registry: Optional prometheus_client ``CollectorRegistry`` to
                register the metrics on and to serve from. ``None`` (the
                default) keeps the library's global default registry.
        """
        # NOTE(review): 9090 is also the default listen port of the
        # Prometheus server itself -- confirm this default is intended.
        self.port = port
        self._registry = registry
        self._server_started = False

        # Only forward ``registry`` when the caller supplied one, so the
        # default path stays exactly the library's default registration.
        metric_kwargs = self._registry_kwargs()

        # Define metrics
        self.requests_total = Counter(
            "llm_requests_total",
            "Total LLM requests",
            ["provider", "model", "stage"],
            **metric_kwargs,
        )

        self.request_duration = Histogram(
            "llm_request_duration_seconds",
            "LLM request duration in seconds",
            ["provider", "stage"],
            **metric_kwargs,
        )

        self.cost_total = Gauge(
            "llm_cost_total_usd",
            "Total cost in USD",
            ["provider"],
            **metric_kwargs,
        )

        self.errors_total = Counter(
            "llm_errors_total",
            "Total errors",
            ["stage", "error_type"],
            **metric_kwargs,
        )

        self.rows_processed = Gauge(
            "llm_rows_processed_total",
            "Total rows processed",
            ["stage"],
            **metric_kwargs,
        )

        self.rows_per_second = Gauge(
            "llm_rows_per_second",
            "Processing throughput",
            **metric_kwargs,
        )

    def _registry_kwargs(self) -> dict:
        """Return the ``registry=`` keyword to forward, or ``{}`` for the default."""
        if self._registry is None:
            return {}
        return {"registry": self._registry}

    def start_server(self) -> None:
        """
        Start HTTP server for metrics endpoint.

        Does nothing if this instance already started the server. A failure
        to start is logged rather than raised (best-effort: metrics export
        must not take the pipeline down), and the instance stays "not
        started" so a later call can retry.
        """
        if self._server_started:
            return

        try:
            start_http_server(self.port, **self._registry_kwargs())
        except Exception as e:
            logger.error(f"Failed to start metrics server: {e}")
        else:
            # Success bookkeeping lives outside the ``try`` so that only a
            # real start-up failure is reported as one.
            self._server_started = True
            logger.info(
                f"Prometheus metrics server started on port {self.port}"
            )

    def record_request(
        self, provider: str, model: str, stage: str, duration: float
    ) -> None:
        """
        Record LLM request metrics.

        Increments the request counter and adds ``duration`` to the
        duration histogram.

        Args:
            provider: Provider name
            model: Model name (labels the counter only, not the histogram)
            stage: Stage name
            duration: Request duration in seconds
        """
        self.requests_total.labels(
            provider=provider, model=model, stage=stage
        ).inc()

        self.request_duration.labels(provider=provider, stage=stage).observe(
            duration
        )

    def record_cost(self, provider: str, cost: float) -> None:
        """
        Record cost metric.

        The gauge is set to ``cost`` (overwritten, not incremented), so the
        caller passes the running total for the provider.

        Args:
            provider: Provider name
            cost: Cost in USD
        """
        self.cost_total.labels(provider=provider).set(cost)

    def record_error(self, stage: str, error_type: str) -> None:
        """
        Record error metric.

        Increments the error counter by one.

        Args:
            stage: Stage name
            error_type: Error type
        """
        self.errors_total.labels(stage=stage, error_type=error_type).inc()

    def record_rows_processed(self, stage: str, count: int) -> None:
        """
        Record rows processed.

        The gauge is set to ``count`` (overwritten, not incremented), so the
        caller passes the running total for the stage.

        Args:
            stage: Stage name
            count: Number of rows
        """
        self.rows_processed.labels(stage=stage).set(count)

    def record_throughput(self, rows_per_second: float) -> None:
        """
        Record processing throughput.

        Args:
            rows_per_second: Throughput metric
        """
        self.rows_per_second.set(rows_per_second)

137