{% extends "base.html" %} {% block title %}Compare — {{ project.name }} — evalkit{% endblock %} {% block content %}

Projects / {{ project.name }} / Compare

Compare Evaluations

{# Baseline card: evaluation id, average score as a whole percent, suite name and tags. #}
BASELINE #{{ comparison.baseline.id }}
{% if comparison.baseline.average_score is not none %}
{# Round before the integer cast: `| int` alone truncates, so 0.29 * 100 = 28.999... would be shown as 28%. #}
{% set pct = (comparison.baseline.average_score * 100) | round | int %}

{{ pct }}%

{% endif %}

{{ comparison.baseline.suite_name }}

{# Tags are optional; each one is rendered as "key: value". #}
{% if comparison.baseline.tags %}
{% for k, v in comparison.baseline.tags.items() %} {{ k }}: {{ v }} {% endfor %}
{% endif %}
{# One card per compared evaluation: id, change versus the baseline average, own average, suite name and tags. #}
{% for comp in comparison.comparisons %}
#{{ comp.id }}
{% if comp.average_score is not none and comparison.baseline.average_score is not none %}
{# Difference in whole percentage points. Rounded before the cast so float noise such as 9.999... is not truncated to 9. #}
{% set delta = ((comp.average_score - comparison.baseline.average_score) * 100) | round | int %}
{% if delta > 0 %} +{{ delta }}% {% elif delta < 0 %} {{ delta }}% {% else %} 0% {% endif %}
{% endif %}
{% if comp.average_score is not none %}
{# Same rounding as the delta above, so the card's own score is not shown one point low. #}
{% set pct = (comp.average_score * 100) | round | int %}

{{ pct }}%

{% endif %}

{{ comp.suite_name }}

{# Tags are optional; each one is rendered as "key: value". #}
{% if comp.tags %}
{% for k, v in comp.tags.items() %} {{ k }}: {{ v }} {% endfor %}
{% endif %}
{% endfor %}

{# Summary counters: each value from comparison.summary is followed by its label. #}
{{ comparison.summary.improved }}

Improved

{{ comparison.summary.regressed }}

Regressed

{{ comparison.summary.unchanged }}

Unchanged

{% if comparison.questions %}

Questions

{# One entry per question: baseline score, question text, a delta per compared evaluation, then the detailed answers and metrics. #}
{% for q in comparison.questions %}
{# Whole-percent baseline score. Rounded before the cast because `| int` alone truncates (0.29 * 100 would give 28). #}
{# NOTE(review): unlike average_score elsewhere in this template, overall_score is not checked for none here; confirm it is always set. #}
{% set bpct = (q.baseline_result.overall_score * 100) | round | int %} {{ bpct }}

{{ q.question }}

{# Summary row: a delta per compared evaluation, or "--" when the entry in comparison_results is empty. #}
{% for cr in q.comparison_results %}
{% if cr %}
{# Rounded so that a real 6-point change stored as 5.999... still counts as more than 5. #}
{% set delta = (cr.score_delta * 100) | round | int %}
{# Only changes above +5 get an explicit plus sign. NOTE(review): the last two branches emit the same text. #}
{% if delta > 5 %} +{{ delta }}% {% elif delta < -5 %} {{ delta }}% {% else %} {{ delta }}% {% endif %}
{% else %} -- {% endif %}
{% endfor %}

Baseline #{{ comparison.baseline.id }}

Answer: {{ q.baseline_result.answer }}
{% if q.baseline_result.expected_answer %}
Expected: {{ q.baseline_result.expected_answer }}
{% endif %}
{# Baseline metrics: score with two decimals, then name and reason. #}
{% for m in q.baseline_result.metrics %}
{{ "%.2f"|format(m.score) }} {{ m.name }}: {{ m.reason }}
{% endfor %}
{# Detail section per compared evaluation; empty entries render "Not evaluated". #}
{% for cr in q.comparison_results %}
{% if cr %}

{# The delta is printed only when it is outside the -5..+5 range. #}
#{{ cr.evaluation_id }} {% set delta = (cr.score_delta * 100) | round | int %} {% if delta > 5 %} +{{ delta }}% {% elif delta < -5 %} {{ delta }}% {% endif %}

Answer: {{ cr.answer }}
{% if cr.expected_answer %}
Expected: {{ cr.expected_answer }}
{% endif %}
{% for m in cr.metrics %}
{# Per-metric change, shown in parentheses only when outside the -5..+5 range. #}
{{ "%.2f"|format(m.score) }} {% set md = (m.score_delta * 100) | round | int %} {% if md > 5 %} (+{{ md }}%) {% elif md < -5 %} ({{ md }}%) {% endif %} {{ m.name }}: {{ m.reason }}
{% endfor %}
{% else %}
Not evaluated
{% endif %}
{% endfor %}
{% endfor %}
{% endif %} {% if comparison.new_questions %}

New Questions (not in baseline)

{# Print the text of each entry in comparison.new_questions. #}
{% for added in comparison.new_questions %}

{{ added.question }}

{% endfor %}
{% endif %} {% endblock %}