{% extends "base.html" %} {% block title %}Evaluation #{{ evaluation.id }} — evalkit{% endblock %} {% block content %}
{# Evaluation detail page. Inherits from base.html and overrides its `title`
   and `content` blocks; the `content` block opened above stays open for the
   rest of this template. Everything rendered here is read from the
   `evaluation` context variable. #}

{# Breadcrumb-style trail: parent project id, then this evaluation's id. #}
Projects / Project #{{ evaluation.project_id }} / Evaluation #{{ evaluation.id }}

Evaluation #{{ evaluation.id }}

Download JSON

{# Summary figures for the evaluation: suite name, status, average score and
   the number of test-case results. #}
Suite

{{ evaluation.suite_name }}

Status

{{ evaluation.status.value }}

Average Score

{# average_score is multiplied by 100 to display it as a percentage.
   Round to the nearest whole percent *before* converting to int: converting
   directly truncates, so binary floating-point artefacts show through
   (0.29 * 100 is 28.999999999999996 and would render as 28%).
   Nothing is printed when the score is none. #}
{% if evaluation.average_score is not none %} {% set pct = (evaluation.average_score * 100) | round | int %}

{{ pct }}%

{% else %}

{% endif %}

Test Cases

{{ evaluation.results | length }}

{# Fold the tag objects into a key -> value mapping. A key that occurs more
   than once keeps the value of its last occurrence. #}
{% set tag_dict = {} %}
{% for tag in evaluation.tags %}{% set _ = tag_dict.update({tag.key: tag.value}) %}{% endfor %}

Tags

{# for/else: the else branch is rendered only when the mapping is empty. #}
{% for tag_key in tag_dict %} {{ tag_key }}: {{ tag_dict[tag_key] }} {% else %}

No tags

{% endfor %}
{# Shown only when a previous evaluation was passed to the template. #}
{% if prev_evaluation %}
Compare with previous (#{{ prev_evaluation.id }})
{% endif %} {% if evaluation.results %}
{# Group every metric score by metric name in a single pass over the results.
   metric_stats maps metric name -> {'scores': [...], 'passed': [...]}; dicts
   keep insertion order, so metrics are listed in the order they are first
   seen. 'passed' gets one entry per score whose `passed` flag is truthy. #}
{% set metric_stats = {} %}
{% for r in evaluation.results %}{% for ms in r.metric_scores %}
{% set entry = metric_stats.setdefault(ms.metric_name, {'scores': [], 'passed': []}) %}
{% set _ = entry['scores'].append(ms.score) %}
{% if ms.passed %}{% set _ = entry['passed'].append(1) %}{% endif %}
{% endfor %}{% endfor %}
{% if metric_stats %}

Metric Breakdown

{% for mn, entry in metric_stats.items() %}
{# Mean score for this metric (0 when there are no scores), shown as a whole
   percent. Rounded before the int conversion so float artefacts such as
   28.999999999999996 are not truncated down to 28. #}
{% set avg = (entry['scores'] | sum / entry['scores'] | length) if entry['scores'] else 0 %}
{% set pct = (avg * 100) | round | int %}
{{ mn }} {{ entry['passed'] | length }}/{{ entry['scores'] | length }} passed • avg {{ pct }}%
{% endfor %}
{% endif %}

Test Cases

{# One entry per result. `tc` is the dict returned by
   result.get_test_case_dict(). #}
{% for result in evaluation.results %} {% set tc = result.get_test_case_dict() %}
{# Per-test-case score. `pct` must be assigned inside this loop: without a
   loop-local assignment the name resolves to whatever page-level `pct` was
   set earlier (the evaluation-wide average), so every test case would print
   the same number, or nothing at all when that average is absent.
   NOTE(review): the score is taken as the mean of this result's metric
   scores, 0 when it has none; if the result object exposes its own
   aggregate score, use that instead. #}
{% set case_scores = result.metric_scores | map(attribute='score') | list %}
{% set pct = (((case_scores | sum / case_scores | length) * 100) | round | int) if case_scores else 0 %}
{{ pct }}

{{ tc.question }}

{# Short metric labels: drop the '_metric' part, turn underscores into spaces,
   then truncate.
   NOTE(review): Jinja's truncate has a default leeway, so names slightly
   longer than 8 characters are left untouched; confirm that is intended. #}
{% for ms in result.metric_scores %} {{ ms.metric_name | replace('_metric', '') | replace('_', ' ') | truncate(8, True, '') }} {% endfor %}
{# Optional per-case details: each section is rendered only when the
   corresponding `tc` entry is truthy. #}
{% if tc.answer %}
Answer: {{ tc.answer }}
{% endif %} {% if tc.context %}
Context:
    {# Exactly one bullet per context chunk. #}
    {% for chunk in tc.context %}
  • {{ chunk }}
    {% endfor %}
{% endif %} {% if tc.expected_answer %}
Expected: {{ tc.expected_answer }}
{% endif %} {% if tc.expected_tool_calls or tc.actual_tool_calls %}
{# Either list may be empty or missing; normalise both once so the loops
   below can iterate and index them without further checks. #}
{% set expected_list = tc.expected_tool_calls or [] %}
{% set actual_list = tc.actual_tool_calls or [] %}
Tool Calls:

Expected

{% for tool in expected_list %}
{{ tool.name }} {% if tool.parameters %}
{{ tool.parameters | tojson(indent=2) }}
{% endif %}
{% endfor %}

Actual

{% for tool in actual_list %}
{# name_match is true when an expected call exists at the same position and
   has the same name as this actual call.
   NOTE(review): nothing in this section reads name_match; either surface it
   in the output or drop it. #}
{% set idx = loop.index0 %}
{% set name_match = idx < expected_list | length and expected_list[idx].name == tool.name %}
{{ tool.name }} {% if tool.parameters %}
{{ tool.parameters | tojson(indent=2) }}
{% endif %}
{% endfor %}
{% endif %}
{# One line per metric of the current result: the score formatted to two
   decimals, followed by the metric name and its reason text. #}
{% for metric in result.metric_scores %}
{% set score_text = "%.2f" | format(metric.score) %}
{{ score_text }} {{ metric.metric_name }}: {{ metric.reason }}
{% endfor %}
{# Closes the per-result loop. #}
{% endfor %}
{% else %}
{# No results to list. The message depends on whether the evaluation status
   is 'pending' or 'running', or anything else. #}
{% set still_working = evaluation.status.value in ('pending', 'running') %}
{% if still_working %}

Evaluation in progress

Results will appear here once complete.

{% else %}

📭

No results available

{% endif %}
{% endif %} {% endblock %}