MeteoSwiss · dnerini · May 28, 2026 · May 1, 2026 · May 4, 2026 · May 4, 2026
diff --git a/README.md b/README.md
@@ -99,6 +99,22 @@ experiment:
     # Stratification dimensions to include in the experiment dashboard (any of season, region, init_hour).
     stratification:
       - season
+  # Optional: named scorecards comparing each forecaster against a chosen baseline.
+  scorecards:
+    enabled: true
+    sections:
+      short_range:
+        # Baseline label — must match the `label` field of a baseline entry in `runs`.
+        baseline: COSMO-E
+        # Lead-time range as start/stop/step (hours).
+        lead_times: "0/120/6"
+        # Stratification dimension to use as scorecard columns (e.g. region, season).
+        stratification: region
+        # Variables and metrics as scorecard rows. Format: VAR:METRIC1,METRIC2,...
+        # Supported metrics: RMSE, R2, ETS, POD, FAR (ETS, POD and FAR are categorical and need `thresholds` for the variable).
+        variables:
+          - "T_2M:RMSE,R2"
+          - "TOT_PREC:RMSE,ETS"
 
 locations:
   # All workflow outputs are written under this root.
@@ -121,7 +137,7 @@ profile:
     plot_forecast_frame: 32
 ```
 
-The `runs` list accepts `forecaster`, `interpolator`, and `baseline` entries. For `dates`, you can either provide a `start` / `end` / `frequency` block as above or an explicit list of ISO timestamps for case-study style runs. Stratification, thresholds, and dashboard settings are all grouped under the `experiment` key.
+The `runs` list accepts `forecaster`, `interpolator`, and `baseline` entries. For `dates`, you can either provide a `start` / `end` / `frequency` block as above or an explicit list of ISO timestamps for case-study style runs. Stratification, thresholds, dashboard, and scorecard settings are all grouped under the `experiment` key.
 
 You can then run it with:
 
@@ -244,7 +260,8 @@ All outputs are rooted at `OUT_ROOT` (from `locations.output_root` in the config
 ├── logs/                                      # one sub-directory per rule
 └── results/{experiment_name}/               # final products
     ├── dashboard/
-    └── plots/
+    ├── plots/
+    └── scorecards/
 ```
 
 ### Wildcard conventions

diff --git a/config/forecasters-ich1-oper-fixed.yaml b/config/forecasters-ich1-oper-fixed.yaml
@@ -11,7 +11,6 @@ dates:
   - 2024-02-01T18:00
   - 2025-03-01T00:00
 
-
 runs:
   - forecaster:
       checkpoint: https://service.meteoswiss.ch/mlstore#/experiments/602/runs/9f4626a7562a4eb49700aaaaa8607230

diff --git a/config/interpolators-ich1.yaml b/config/interpolators-ich1.yaml
@@ -11,8 +11,6 @@ dates:
 
 runs:
   - interpolator:
-      inference_resources:
-        slurm_partition: normal-shared
       # for checkpoints trained on a different HPC, using mlflow doesn't work due to difference in
       # paths, so we directly specify the checkpoint path here
       checkpoint: /store_new/mch/msopr/ml/tmp/inference-last.ckpt
@@ -25,10 +23,18 @@ runs:
         steps: 0/120/6
         extra_requirements:
           - git+https://github.com/ecmwf/anemoi-inference.git@e369b1a90313e9701db13f63364a467aa281cf36
+          - eccodes==2.39.1
+          - eccodes-cosmo-resources-python==2.38.3.1
       extra_requirements:
         - git+https://github.com/ecmwf/anemoi-inference.git@e369b1a90313e9701db13f63364a467aa281cf36
         # pinned anemoi-datasets because of ecmwf/anemoi-utils#284, can be removed when fixed
         - anemoi-datasets==0.5.35
+        - eccodes==2.39.1
+        - eccodes-cosmo-resources-python==2.38.3.1
+  - baseline:
+      label: INCA
+      root: /store_new/mch/msclim/INCA
+      steps: 0/6/1
   - baseline:
       label: ICON-CH2-ctrl
       root: /store_new/mch/msopr/osm/ICON-CH2-EPS
@@ -39,8 +45,8 @@ runs:
       steps: 0/33/1
 
 truth:
-  label: KENDA-CH1
-  root: /store_new/mch/msopr/ml/datasets/mch-ich1-1km-2024-2025-1h-pl13-v1.0.zarr
+  label: SwissMetNet
+  root: output/data/observations/peakweather
 
 experiment:
   params:
@@ -51,12 +57,11 @@ experiment:
     - TOT_PREC
   stratification:
     regions:
-      - jura
       - mittelland
-      - voralpen
-      - alpennordhang
-      - innerealpentaeler
+      - berge
+      - alpennordseite
       - alpensuedseite
+      - jura
     root: /scratch/mch/bhendj/regions/Prognoseregionen_LV95_20220517
   thresholds:
     TOT_PREC:
@@ -73,6 +78,36 @@ experiment:
       # - region
       # - init_hour
       - season
+  scorecards:
+    enabled: true
+    sections:
+      nowcasting:
+        baseline: INCA
+        lead_times: "0/6/1"
+        stratification: region
+        variables:
+          - "U_10M:RMSE,R2,ETS"
+          - "V_10M:RMSE,R2,ETS"
+          - "T_2M:RMSE,R2,ETS"
+          - "TOT_PREC:RMSE,R2,ETS"
+      short_range:
+        baseline: ICON-CH1-ctrl
+        lead_times: "6/33/6"
+        stratification: region
+        variables:
+          - "U_10M:RMSE,R2,ETS"
+          - "V_10M:RMSE,R2,ETS"
+          - "T_2M:RMSE,R2,ETS"
+          - "TOT_PREC:RMSE,R2,ETS"
+      medium_range:
+        baseline: ICON-CH2-ctrl
+        lead_times: "24/120/24"
+        stratification: region
+        variables:
+          - "U_10M:RMSE,R2,ETS"
+          - "V_10M:RMSE,R2,ETS"
+          - "T_2M:RMSE,R2,ETS"
+          - "TOT_PREC:RMSE,R2,ETS"
 
 showcase:
   params:

diff --git a/src/evalml/config.py b/src/evalml/config.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Dict, List, Any, ClassVar, FrozenSet
+from typing import Dict, List, Any, ClassVar, FrozenSet, Optional
 
 from pydantic import BaseModel, Field, RootModel, field_validator
 
@@ -259,6 +259,49 @@ class AnimationsConfig(BaseModel):
     )
 
 
+class ScorecardConfig(BaseModel):
+    """Settings for one named scorecard section; every field is required."""
+
+    # Unknown keys are rejected rather than silently ignored.
+    model_config = {"extra": "forbid"}
+
+    # Which baseline the scorecard is computed against.
+    baseline: str = Field(
+        description="Baseline label to compare against (must match the `label` field of a baseline entry in `runs`).",
+    )
+    # Kept as the raw "start/stop/step" string; no parsing happens in this model.
+    lead_times: str = Field(
+        description="Lead-time range as start/stop/step (hours).",
+    )
+    stratification: str = Field(
+        description="Dimension to use as scorecard columns (e.g. 'region').",
+    )
+    # Scorecard rows, one `VAR:METRICS` spec per list entry.
+    variables: List[str] = Field(
+        description=(
+            "Variables and metrics as scorecard rows (VAR:M1,M2 format). "
+            "An empty list [] is accepted by the schema but falls back to a "
+            "hard-coded RMSE-only set (U_10M, V_10M, T_2M, PMSL, TD_2M, TOT_PREC). "
+            "Omit ':...' after a variable name to include all available metrics for it."
+        ),
+    )
+
+
+class ExperimentScorecardConfig(BaseModel):
+    """The `scorecards` block: one global on/off switch and the named sections."""
+
+    # Unknown keys are rejected rather than silently ignored.
+    model_config = {"extra": "forbid"}
+
+    # Defaults to true when the key is omitted.
+    enabled: bool = Field(True, description="Whether to generate scorecards.")
+    # Maps a section name to its settings; defaults to no sections.
+    sections: Dict[str, ScorecardConfig] = Field(
+        default_factory=dict,
+        description="Named scorecard configurations (e.g. nowcasting, short_range, medium_range).",
+    )
+
+
 class ShowcaseConfig(BaseModel):
     """Configuration for the showcase workflow."""
 
@@ -326,6 +369,10 @@ class ExperimentConfig(BaseModel):
         ...,
         description="Settings for the experiment dashboard.",
     )
+    scorecards: Optional[ExperimentScorecardConfig] = Field(
+        default=None,
+        description="Scorecard generation configuration. Omit or set enabled: false to disable.",
+    )
 
     @field_validator("thresholds")
     @classmethod

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -131,6 +131,21 @@ rule experiment_all:
             rules.verification_metrics_plot.output,
             experiment=EXPERIMENT_NAME,
         ),
+        # Scorecard PNGs: one per candidate run and configured scorecard section.
+        # SCORECARD_CONFIGS is already empty when scorecards are disabled (see
+        # rules/common.smk), so `expand` yields nothing then; re-reading `config`
+        # here would also crash on `scorecards: null` (None has no `.get`).
+        [
+            path
+            for c in CANDIDATES
+            for path in expand(
+                rules.report_scorecard.output,
+                env_id=c.split("/")[0],
+                config_hash=c.split("/")[1],
+                experiment=EXPERIMENT_NAME,
+                scorecard_name=list(SCORECARD_CONFIGS),
+            )
+        ],
 
 
 rule showcase_all:

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -245,14 +245,14 @@ def collect_all_baselines():
         if "baseline" not in run_entry:
             continue
         baseline_config = run_entry["baseline"]
-        baseline_id = Path(baseline_config["root"]).stem
+        baseline_id = baseline_config.get("label", Path(baseline_config["root"]).stem)
         baselines[baseline_id] = baseline_config
 
     # Backward compatibility with legacy top-level `baselines` block.
     for baseline_entry in copy.deepcopy(config.get("baselines", [])):
         baseline_type = next(iter(baseline_entry))
         baseline_config = baseline_entry[baseline_type]
-        baseline_id = Path(baseline_config["root"]).stem
+        baseline_id = baseline_config.get("label", Path(baseline_config["root"]).stem)
         baseline_config.pop("baseline_id", None)
         baselines[baseline_id] = baseline_config
 
@@ -332,3 +332,7 @@ RUN_CONFIGS = collect_all_runs()
 ENV_CONFIGS = collect_all_envs()
 BASELINE_CONFIGS = collect_all_baselines()
 EXPERIMENT_PARTICIPANTS = collect_experiment_participants()
+# `scorecards` may be absent or null (Optional in the schema): treat both as empty.
+_scorecard = config.get("experiment", {}).get("scorecards") or {}
+_sections = _scorecard.get("sections") or {}
+SCORECARD_CONFIGS = _sections if _scorecard.get("enabled", True) else {}
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
@@ -50,3 +50,52 @@ rule report_experiment_dashboard:
             --stratification {params.stratification} \
             --output {output} >{log} 2>&1
         """
+
+
+# Render one scorecard PNG per (section, candidate run) via report_scorecard.py.
+rule report_scorecard:
+    input:
+        script="workflow/scripts/report_scorecard.py",
+        verif_run=lambda wc: EXPERIMENT_PARTICIPANTS[f"{wc.env_id}/{wc.config_hash}"],
+        verif_baseline=lambda wc: EXPERIMENT_PARTICIPANTS[
+            SCORECARD_CONFIGS[wc.scorecard_name]["baseline"]
+        ],
+    output:
+        report(
+            OUT_ROOT
+            / "results/{experiment}/scorecards/{scorecard_name}/scorecard_{scorecard_name}_{env_id}_{config_hash}.png",
+        ),
+    log:
+        OUT_ROOT
+        / "logs/report_scorecard/{experiment}/{scorecard_name}/{env_id}/{config_hash}.log",
+    wildcard_constraints:
+        env_id="[^/]+",  # no slashes
+        config_hash="[^/]+",
+        scorecard_name="[^/]+",
+    localrule: True
+    params:
+        lead_times=lambda wc: SCORECARD_CONFIGS[wc.scorecard_name]["lead_times"],
+        stratification=lambda wc: SCORECARD_CONFIGS[wc.scorecard_name]["stratification"],
+        variable_args=lambda wc: [
+            arg
+            for spec in SCORECARD_CONFIGS[wc.scorecard_name]["variables"]
+            for arg in ("--variable", spec)  # one repeated flag per row spec
+        ],
+        run_source=lambda wc: RUN_CONFIGS[f"{wc.env_id}/{wc.config_hash}"].get(
+            "label", f"{wc.env_id}/{wc.config_hash}"
+        ),
+        baseline_source=lambda wc: BASELINE_CONFIGS[
+            SCORECARD_CONFIGS[wc.scorecard_name]["baseline"]
+        ].get("label", SCORECARD_CONFIGS[wc.scorecard_name]["baseline"]),
+    shell:
+        """
+        python {input.script} \
+            --verif_run {input.verif_run:q} \
+            --verif_baseline {input.verif_baseline:q} \
+            --run_source {params.run_source:q} \
+            --baseline_source {params.baseline_source:q} \
+            --lead_times {params.lead_times:q} \
+            --stratification {params.stratification:q} \
+            {params.variable_args:q} \
+            --output {output:q} >{log:q} 2>&1
+        """