{% extends "base.html" %}
{# Child template: overrides the `title` and `content` blocks of base.html. #}

{% block title %}Evalground - Memorizz{% endblock %}

{% block content %}
{# Fixed notice, rendered only when `warnings` is truthy. #}
{% if warnings %}
Heads up
{% endif %}
{# Error text taken from the context, rendered only when `error` is truthy. #}
{% if error %}
! {{ error }}
{% endif %}

Dataset Library

{# One entry per element of `dataset_status`: upper-cased variant, filename, #}
{# and a Ready/Missing flag driven by `item.exists`. #}
{% for item in dataset_status %}
LongMemEval {{ item.variant|upper }}
{{ item.filename }} {{ 'Ready' if item.exists else 'Missing' }}
{% endfor %}
{# Optional message and error strings; each is rendered only when truthy. #}
{% if download_message %}
{{ download_message }}
{% endif %}
{% if download_error %}
! {{ download_error }}
{% endif %}
{# Hint rendered only while `missing_variants` is truthy. #}
{% if missing_variants %}

Uses the bundled LongMemEval downloader to fetch dataset files.

{% endif %}

Benchmark Run

{# Empty state is handled first; the run description is rendered whenever #}
{# `agents` is truthy. #}
{% if not agents %}
EVAL

No agents available

Create an agent to run benchmarks.

{% else %}

Uses the connected Oracle provider and OpenAI for scoring. Results saved to {{ eval_results_dir }}.

{% endif %}

Run History

{% if runs_history %}
{# Column headings are emitted once, ahead of the per-run rows. #}
Run ID Status Benchmark Agent Dataset Samples Accuracy Created Finished Actions
{% for run in runs_history %}
  {# Loop variables and `set` assignments are scoped to the loop body, so the #}
  {# row below must stay inside the loop to see `run` and `run_status`. #}
  {% set run_status = run.status or 'unknown' %}
  {# Badge class defaults to the warning style and is overridden for terminal states. #}
  {# NOTE(review): `badge_class` is not referenced in the visible text — confirm where it is consumed. #}
  {% set badge_class = 'badge-warning' %}
  {% if run_status == 'completed' %}
    {% set badge_class = 'badge-success' %}
  {% elif run_status in ['failed', 'canceled'] %}
    {% set badge_class = 'badge-danger' %}
  {% endif %}
  {# One row per run: the run id is truncated to its first 8 characters; #}
  {# benchmark and dataset fall back to 'longmemeval' / 'oracle' when falsy; #}
  {# accuracy is shown only when it is not none. #}
  {# NOTE(review): the conditional for queued/running/canceling runs has no visible body. #}
{{ run.run_id[:8] }}... {{ run_status }} {{ run.benchmark or 'longmemeval' }} {{ run.agent_name }} {{ run.dataset_variant or 'oracle' }} {{ run.num_samples }} {% if run.overall_accuracy is not none %} {{ run.overall_accuracy }}% {% else %} - {% endif %} {{ run.created_at or '-' }} {{ run.finished_at or '-' }} View {% if run_status in ['queued', 'running', 'canceling'] %} {% endif %}
{% endfor %}
{% else %}

No benchmark runs yet.

{% endif %}
{# Summary of the selected agent; rendered only when `selected_agent` is truthy. #}
{% if selected_agent %}
{# Persona name when present, otherwise the literal fallback 'Agent'. #}
Agent {% if selected_agent.persona and selected_agent.persona.name %} {{ selected_agent.persona.name }} {% else %} Agent {% endif %}
Agent ID {{ selected_agent.agent_id }}
Mode {{ selected_agent.application_mode or 'assistant' }}
{% if selected_agent.memory_ids %}
Memory IDs {{ selected_agent.memory_ids|length }}
{% endif %}
{% endif %}
{# Results of an evaluation when available; otherwise an empty-state message #}
{# is rendered, but only if an agent is selected. #}
{% if eval_results %}

Results

Overall Accuracy
{# Overall accuracy is multiplied by 100 before rounding and shown with a % sign. #}
{{ (eval_results.overall_accuracy * 100)|round(2) }}%
Overall Score
{{ eval_results.overall_score|round(3) }}
Samples
{{ eval_results.metadata.num_samples }}
Processing Time
{{ eval_results.metadata.total_processing_time|round(2) }}s
{# NOTE(review): this conditional has no visible body — confirm what it should render. #}
{% if eval_output_path %} {% endif %}
Dataset {{ eval_results.metadata.dataset_variant }}
{# The second argument `true` makes the fallback apply to None/empty values as #}
{# well as undefined ones, matching the `or 'assistant'` fallback used for the agent. #}
Mode {{ eval_results.metadata.application_mode|default('assistant', true) }}
Timestamp {{ eval_results.metadata.timestamp }}
Output {{ eval_output_path }}
{# Per-category breakdown; hyphens in the category key become spaces and the #}
{# result is title-cased for display. #}
{# NOTE(review): category accuracy is rounded but not scaled by 100, unlike the #}
{# overall accuracy — confirm the units are intentional. #}
{% for category, metrics in eval_results.category_results.items() %}
{{ category|replace('-', ' ')|title }}
Accuracy {{ metrics.accuracy|round(2) }}
Avg Score {{ metrics.average_score|round(2) }}
Samples {{ metrics.num_samples }}
{% endfor %}
{% elif selected_agent %}
EVAL

No results yet

Run a benchmark to see results.

{% endif %}
{# Raw run output, rendered only when `run_output` is truthy. #}
{% if run_output %}

Run Output

{{ run_output }}
{% endif %}
{% endblock %}

{# NOTE(review): this override has no visible body; if base.html defines a #}
{# `scripts` block, its content is replaced by whitespace — confirm intent. #}
{% block scripts %} {% endblock %}