{% load eval_tags %}
{% comment %}
Cross-config × case matrix grid.

Variables:
  matrix (required): dict[case_id -> dict[config_run_id -> ExperimentCaseResult]]
    Looked up with the `get_item` filter, first by case_id, then by config_run_id
    (filter presumably supplied by eval_tags — confirm).
    Result attributes read by this template: status, passed, error_message,
    assertion_results, input_tokens, output_tokens, total_latency_seconds.
  cases (required): list of dicts with case_id, domain
    `input` is also read from each entry; `domain` is only shown when truthy.
  config_runs (required): list of ExperimentConfigRun
    Attributes read by this template: config_run_id, config_name, is_baseline.
  scoring_enabled (required): bool
    When true, PASS/FAIL and per-assertion results are shown for non-error results.
{% endcomment %}
{# Case-results matrix: per case, one summary row (a status cell per config run) and one detail row. #}
{# NOTE(review): the previous revision had no element markup and its loops had empty bodies, so `cr`, `case` and `cell` were read outside the tags that define them. The table below is a reconstruction — confirm element choices and re-apply any class/JS hooks. #}
<table>
  <caption>Case Results</caption>
  <thead>
    <tr>
      <th scope="col">Case</th>
      <th scope="col">Input</th>
      {% for cr in config_runs %}
        <th scope="col">{{ cr.config_name }}</th>
      {% endfor %}
    </tr>
  </thead>
  <tbody>
    {% for case in cases %}
      {# The row lookup depends only on the case, so resolve it once per case instead of once per cell. #}
      {% with row=matrix|get_item:case.case_id %}
        <tr>
          <th scope="row">
            {{ case.case_id }}
            {% if case.domain %}({{ case.domain }}){% endif %}
          </th>
          <td>{{ case.input }}</td>
          {% for cr in config_runs %}
            {% with cell=row|get_item:cr.config_run_id %}
              <td>
                {% if cell %}
                  {% if cell.status == 'error' %}
                    ERROR
                  {% elif scoring_enabled %}
                    {# Tri-state, matching the detail view: a result whose `passed` is neither true nor False gets no badge rather than FAIL. #}
                    {% if cell.passed %}PASS{% elif cell.passed == False %}FAIL{% endif %}
                  {% else %}
                    {# NOTE(review): nothing was rendered here for an unscored, non-error result — presumably a non-text marker; confirm. #}
                  {% endif %}
                {% endif %}
              </td>
            {% endwith %}
          {% endfor %}
        </tr>
        <tr>
          {# Detail row spans the two fixed columns plus one column per config run. #}
          <td colspan="{{ config_runs|length|add:2 }}">
            {% for cr in config_runs %}
              <div>
                <p>{{ cr.config_name }} {% if cr.is_baseline %} baseline {% endif %}</p>
                {% with cell=row|get_item:cr.config_run_id %}
                  {% if cell %}
                    {% if cell.status == 'error' %}
                      <p>Error: {{ cell.error_message|default:"Unknown error" }}</p>
                    {% else %}
                      <p>Output</p>
                      {# NOTE(review): the output body was not present under this label — restore it from the original template. #}
                      {% if scoring_enabled %}
                        {% if cell.passed %}
                          <p>PASS</p>
                        {% elif cell.passed == False %}
                          <p>FAIL</p>
                        {% endif %}
                        {% if cell.assertion_results %}
                          <ul>
                            {% for a in cell.assertion_results %}
                              <li>
                                {% if a.passed %}✓{% else %}✗{% endif %}
                                {% firstof a.type a.assertion "—" %}
                                {% if not a.passed and a.failure %} — {{ a.failure }} {% endif %}
                              </li>
                            {% endfor %}
                          </ul>
                        {% endif %}
                      {% endif %}
                      <dl>
                        <dt>Tokens in</dt>
                        <dd>{{ cell.input_tokens|default:"—" }}</dd>
                        <dt>Tokens out</dt>
                        <dd>{{ cell.output_tokens|default:"—" }}</dd>
                        <dt>Latency</dt>
                        {# Unit suffix only when there is a value; previously a missing latency rendered as "—s". #}
                        <dd>{% if cell.total_latency_seconds %}{{ cell.total_latency_seconds }}s{% else %}—{% endif %}</dd>
                      </dl>
                    {% endif %}
                  {% endif %}
                {% endwith %}
              </div>
            {% endfor %}
          </td>
        </tr>
      {% endwith %}
    {% endfor %}
  </tbody>
</table>
{# Empty state: shown only when `cases` is empty or missing. #}
{% if not cases %}
  No case results available.
{% endif %}