ci(hw): Generate error reports if no runner with tags

lucasssvaz · lucasssvaz · commit cd21f770c66c · 2025-10-10T22:24:00.000-03:00
diff --git a/.gitlab/scripts/gen_hw_jobs.py b/.gitlab/scripts/gen_hw_jobs.py
@@ -8,6 +8,10 @@
 import copy
 import traceback
 from pathlib import Path
+from typing import Iterable
+from urllib.parse import urlencode
+import urllib.request
+import urllib.error
 
 # Resolve repository root from this script location
 SCRIPT_DIR = Path(__file__).resolve().parent
@@ -184,6 +188,109 @@ def parse_list_arg(s: str) -> list[str]:
     return [part.strip() for part in txt.split(",") if part.strip()]
 
 
+def _gitlab_auth_header() -> tuple[str, str]:
+    """Pick the HTTP header used to authenticate against the GitLab API.
+
+    A token from ``GITLAB_API_TOKEN`` (or, failing that, ``PRIVATE_TOKEN``) wins and is
+    sent as ``PRIVATE-TOKEN``; otherwise ``CI_JOB_TOKEN`` is sent as ``JOB-TOKEN``.
+
+    Returns:
+        A ``(header_name, header_value)`` pair, or ``("", "")`` when no token is set.
+    """
+    env = os.environ
+    personal_token = env.get("GITLAB_API_TOKEN") or env.get("PRIVATE_TOKEN")
+    if personal_token:
+        return ("PRIVATE-TOKEN", personal_token)
+    job_token = env.get("CI_JOB_TOKEN")
+    return ("JOB-TOKEN", job_token) if job_token else ("", "")
+
+
+def _gitlab_api_get(path: str) -> tuple[int, dict | list | None]:
+    """Issue a GET against the GitLab v4 API and decode the JSON reply.
+
+    The API root is read from ``CI_API_V4_URL`` and joined to ``path`` with exactly one
+    slash. The auth header, if any, comes from ``_gitlab_auth_header()``. The request
+    uses a 15 second timeout.
+
+    Returns:
+        ``(status, payload)``. ``payload`` is the decoded JSON body, or ``None`` when the
+        body is empty, is not valid JSON, or the request failed. ``status`` is the HTTP
+        status code, ``0`` when ``CI_API_V4_URL`` is unset (no request is made), or ``-1``
+        for failures that carry no HTTP status. Failures are logged to stderr.
+    """
+    api_root = os.environ.get("CI_API_V4_URL")
+    if not api_root:
+        return 0, None
+
+    url = f"{api_root.rstrip('/')}/{path.lstrip('/')}"
+    header_name, header_value = _gitlab_auth_header()
+    request = urllib.request.Request(url)
+    if header_name:
+        request.add_header(header_name, header_value)
+
+    try:
+        with urllib.request.urlopen(request, timeout=15) as response:
+            status_code = response.getcode()
+            raw = response.read()
+            payload = None
+            if raw:
+                try:
+                    payload = json.loads(raw.decode("utf-8"))
+                except Exception:
+                    # An undecodable body is reported as "no payload", not as a failure.
+                    payload = None
+            return status_code, payload
+    except urllib.error.HTTPError as http_err:
+        try:
+            detail = http_err.read().decode("utf-8")
+        except Exception:
+            detail = str(http_err)
+        sys.stderr.write(f"[WARN] GitLab API GET {url} failed: {http_err} body={detail}\n")
+        return http_err.code, None
+    except Exception as err:
+        sys.stderr.write(f"[WARN] GitLab API GET {url} error: {err}\n")
+        sys.stderr.write(traceback.format_exc() + "\n")
+        return -1, None
+
+
+def list_project_runners() -> list[dict]:
+    """List the runners available to this project via the GitLab API.
+
+    Reads ``CI_PROJECT_ID`` and pages through ``projects/<id>/runners`` (100 entries per
+    page) using ``_gitlab_api_get``, which in turn needs ``CI_API_V4_URL`` and a token
+    (``GITLAB_API_TOKEN``/``PRIVATE_TOKEN`` or ``CI_JOB_TOKEN``).
+
+    Returns:
+        Every runner object (dict) reported by the API, or an empty list when
+        ``CI_PROJECT_ID`` is unset or any page cannot be fetched. The result is
+        all-or-nothing on purpose: a partial listing would make runners that live on
+        a page that failed to load look like they do not exist.
+    """
+    project_id = os.environ.get("CI_PROJECT_ID")
+    if not project_id:
+        return []
+
+    runners: list[dict] = []
+    page = 1
+    per_page = 100
+    while True:
+        q = urlencode({"per_page": per_page, "page": page})
+        status, obj = _gitlab_api_get(f"projects/{project_id}/runners?{q}")
+        if status != 200 or not isinstance(obj, list):
+            # Project-scoped listing might be restricted for JOB-TOKEN in some instances.
+            # Drop anything collected so far: an incomplete list is indistinguishable
+            # from "these runners do not exist", whereas an empty list reads as "unknown".
+            return []
+        runners.extend(x for x in obj if isinstance(x, dict))
+        if len(obj) < per_page:
+            # A short page is the last page.
+            return runners
+        page += 1
+
+
+def runner_supports_tags(runner: dict, required_tags: Iterable[str]) -> bool:
+    """Tell whether ``runner`` is usable and advertises every tag in ``required_tags``.
+
+    A runner is rejected when its ``tag_list`` is missing, is not a list, or holds no
+    non-blank string tags, and also when it is flagged ``paused: true`` or
+    ``active: false``. Runner tags are compared after stripping surrounding whitespace.
+    """
+    raw_tags = runner.get("tag_list") or []
+    if not isinstance(raw_tags, list):
+        return False
+
+    offered = {str(tag).strip() for tag in raw_tags if isinstance(tag, str) and tag.strip()}
+    # Paused or inactive runners never count as a match.
+    unusable = runner.get("paused") is True or runner.get("active") is False
+    if not offered or unusable:
+        return False
+
+    for needed in required_tags:
+        if needed not in offered:
+            return False
+    return True
+
+
+def any_runner_matches(required_tags: Iterable[str], runners: list[dict]) -> bool:
+    """Report whether at least one of ``runners`` can serve ``required_tags``.
+
+    Falsy entries in ``required_tags`` are dropped before matching. Each runner is
+    checked with ``runner_supports_tags``; a runner whose check raises is skipped.
+    """
+    wanted = list(filter(None, required_tags))
+    for candidate in runners:
+        try:
+            matched = runner_supports_tags(candidate, wanted)
+        except Exception:
+            # Malformed runner payloads count as "no match" instead of aborting the scan.
+            matched = False
+        if matched:
+            return True
+    return False
+
+
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--chips", required=True, help="Comma-separated or JSON array list of SoCs")
@@ -249,25 +356,72 @@ def main():
 
     # Build child pipeline YAML in deterministic order
     jobs_entries = []  # list of (sort_key, job_name, job_dict)
+
+    # Discover available runners (best-effort)
+    available_runners = list_project_runners()
+    if not available_runners:
+        print("[WARN] Could not enumerate project runners or none found; skipping runner-tag availability checks.")
+
+    # Accumulate all missing-runner groups to emit a single stub job
+    missing_groups: list[dict] = []
+
     for (chip, tagset, test_type), test_dirs in group_map.items():
         tag_list = sorted(tagset)
         # Build name suffix excluding the SOC itself to avoid duplication
         non_soc_tags = [t for t in tag_list if t != chip]
         tag_suffix = "-".join(non_soc_tags) if non_soc_tags else "generic"
-        job_name = f"hw-{chip}-{test_type}-{tag_suffix}"[:255]
 
-        # Clone base job and adjust (preserve key order using deepcopy)
+        # Determine if any runner can serve this job
+        can_schedule = True
+        if available_runners:
+            can_schedule = any_runner_matches(tag_list, available_runners)
+
+        if can_schedule:
+            job_name = f"hw-{chip}-{test_type}-{tag_suffix}"[:255]
+
+            # Clone base job and adjust (preserve key order using deepcopy)
+            job = copy.deepcopy(base_job)
+            # Ensure tags include SOC+extras
+            job["tags"] = tag_list
+            vars_block = job.get("variables", {})
+            vars_block["TEST_CHIP"] = chip
+            vars_block["TEST_TYPE"] = test_type
+            # Provide list of test directories for this job
+            vars_block["TEST_LIST"] = "\n".join(sorted(test_dirs))
+            job["variables"] = vars_block
+
+            sort_key = (chip, test_type, tag_suffix)
+            jobs_entries.append((sort_key, job_name, job))
+        else:
+            # Accumulate for a single combined missing-runner job
+            missing_groups.append(
+                {
+                    "chip": chip,
+                    "test_type": test_type,
+                    "required_tags": tag_list,
+                    "test_dirs": sorted(test_dirs),
+                }
+            )
+
+    # If any groups are missing runners, create one combined stub job to emit all JUnit errors
+    if missing_groups:
+        job_name = "hw-missing-runners"
         job = copy.deepcopy(base_job)
-        # Ensure tags include SOC+extras
-        job["tags"] = tag_list
+        if "tags" in job:
+            del job["tags"]
+        job["before_script"] = [
+            "echo 'No suitable hardware runners found for some groups; generating combined JUnit error stubs.'"
+        ]
         vars_block = job.get("variables", {})
-        vars_block["TEST_CHIP"] = chip
-        vars_block["TEST_TYPE"] = test_type
-        # Provide list of test directories for this job
-        vars_block["TEST_LIST"] = "\n".join(sorted(test_dirs))
+        # Store as JSON string for the generator script to process
+        vars_block["MISSING_GROUPS_JSON"] = json.dumps(missing_groups)
         job["variables"] = vars_block
-
-        sort_key = (chip, test_type, tag_suffix)
+        job["script"] = [
+            "python3 .gitlab/scripts/generate_missing_runner_junit.py",
+            "exit 1",
+        ]
+        # Ensure it sorts after normal jobs
+        sort_key = ("zzz", "zzz", "zzz")
         jobs_entries.append((sort_key, job_name, job))
 
     # Order jobs by (chip, type, tag_suffix)
diff --git a/.gitlab/scripts/generate_missing_runner_junit.py b/.gitlab/scripts/generate_missing_runner_junit.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import json
+from pathlib import Path
+import xml.etree.ElementTree as ET
+from typing import Optional
+
+
+def read_env_list(name: str) -> list[str]:
+    """Read environment variable ``name`` as a newline-separated list.
+
+    Each line is stripped of surrounding whitespace and blank lines are dropped. An unset
+    variable yields an empty list.
+    """
+    stripped_lines = (line.strip() for line in os.environ.get(name, "").splitlines())
+    return [entry for entry in stripped_lines if entry]
+
+
+def write_single_suite(out_path: Path, suite_name: str, testcase_name: str, error_message: str) -> None:
+    """Write a JUnit XML file holding one suite with a single errored test case.
+
+    Args:
+        out_path: Destination file; missing parent directories are created.
+        suite_name: Value of the ``<testsuite name=...>`` attribute.
+        testcase_name: Value of the ``<testcase name=...>`` attribute.
+        error_message: Value of the ``<error message=...>`` attribute.
+    """
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    suite_attrs = {"name": suite_name, "tests": "1", "errors": "1", "failures": "0"}
+    case_attrs = {"classname": "hardware.missing_runner", "name": testcase_name}
+
+    suite = ET.Element("testsuite", attrib=suite_attrs)
+    case = ET.SubElement(suite, "testcase", attrib=case_attrs)
+    error = ET.SubElement(case, "error", attrib={"message": error_message})
+    error.text = (
+        "The hardware test could not be scheduled because no runner with the "
+        "required tag combination is online/available."
+    )
+
+    ET.ElementTree(suite).write(out_path, encoding="utf-8", xml_declaration=True)
+
+
+def _leading_spaces_count(s: str) -> int:
+    """Return how many space characters ``s`` starts with (tabs are not counted)."""
+    count = 0
+    for ch in s:
+        if ch != " ":
+            break
+        count += 1
+    return count
+
+
+def _manual_parse_fqbn_length(ci_text: str, chip: str) -> Optional[int]:
+    """Count the list entries under ``fqbn:`` -> ``<chip>:`` without a YAML parser.
+
+    This is a best-effort, indentation-based scan of block-style YAML. It uses the
+    first line that starts with ``fqbn:``, then the first more-indented line below it that
+    starts with ``<chip>:``, and counts the lines starting with ``-`` that belong to
+    that chip entry. Blank lines and full-line ``#`` comments are ignored.
+
+    Args:
+        ci_text: Raw contents of a ``ci.yml`` file.
+        chip: Chip key to look for inside the ``fqbn`` mapping.
+
+    Returns:
+        ``None`` when no ``fqbn:`` line or no ``<chip>:`` line inside it is found;
+        otherwise the number of list items, with a minimum of 1.
+    """
+    # Blank lines and full-line comments carry no structure; drop them up front so a
+    # low-indented comment cannot be mistaken for the end of a section.
+    rows = [
+        (_leading_spaces_count(line), line.strip())
+        for line in ci_text.splitlines()
+        if line.strip() and not line.strip().startswith("#")
+    ]
+
+    fqbn_pos = next((i for i, (_, text) in enumerate(rows) if text.startswith("fqbn:")), None)
+    if fqbn_pos is None:
+        return None
+    fqbn_indent = rows[fqbn_pos][0]
+
+    chip_pos = None
+    for i in range(fqbn_pos + 1, len(rows)):
+        indent, text = rows[i]
+        if indent <= fqbn_indent:
+            # Dedented back to (or past) the fqbn key: the mapping is over.
+            break
+        if text.startswith(f"{chip}:"):
+            chip_pos = i
+            break
+    if chip_pos is None:
+        return None
+    chip_indent = rows[chip_pos][0]
+
+    count = 0
+    for indent, text in rows[chip_pos + 1:]:
+        is_item = text.startswith("-")
+        # YAML allows a sequence's items to sit at the same indent as the key that owns
+        # them, so an equally-indented "-" line still belongs to this chip; any other
+        # line at that indent (a sibling key) or any shallower line ends the entry.
+        if indent < chip_indent or (indent == chip_indent and not is_item):
+            break
+        if is_item:
+            count += 1
+    return count if count > 0 else 1
+
+
+def detect_fqbn_count(test_dir: Path, chip: str) -> int:
+    """Count the FQBN configurations that ``<test_dir>/ci.yml`` lists for ``chip``.
+
+    PyYAML is used when it can be imported and parses the file; on any failure there
+    the file is re-read and scanned with ``_manual_parse_fqbn_length`` instead.
+
+    Returns:
+        The number of entries in the ``fqbn.<chip>`` list, or 1 when the file is missing,
+        unreadable, or has no non-empty list for ``chip``.
+    """
+    ci_file = test_dir / "ci.yml"
+    if not ci_file.exists():
+        return 1
+    try:
+        import yaml  # type: ignore
+
+        document = yaml.safe_load(ci_file.read_text(encoding="utf-8")) or {}
+        per_chip = document.get("fqbn", {})
+        entries = per_chip.get(chip) if isinstance(per_chip, dict) else None
+        if isinstance(entries, list) and entries:
+            return len(entries)
+        return 1
+    except Exception:
+        # PyYAML is unavailable or the document could not be handled: fall back to the
+        # indentation-based scan.
+        try:
+            raw_text = ci_file.read_text(encoding="utf-8", errors="ignore")
+        except Exception:
+            return 1
+        manual_count = _manual_parse_fqbn_length(raw_text, chip)
+        return 1 if manual_count is None else manual_count
+
+
+def _as_str_list(value) -> list[str]:
+    """Coerce a decoded JSON value into a list of strings (falsy -> empty, scalar -> one item)."""
+    if not value:
+        return []
+    if isinstance(value, (list, tuple)):
+        return [str(item) for item in value if item is not None]
+    return [str(value)]
+
+
+def _write_missing_runner_reports(test_dir: str, chip: str, test_type: str, required_tags: str) -> None:
+    """Write one JUnit error report per FQBN configuration of ``test_dir`` for ``chip``."""
+    sketchdir = Path(test_dir)
+    sketchname = sketchdir.name
+    count = detect_fqbn_count(sketchdir, chip)
+    # A single configuration keeps the bare sketch name; several get an index suffix.
+    suffixes = [""] if count <= 1 else [str(i) for i in range(count)]
+    msg = f"No available runner matches required tags: {required_tags} (chip={chip})"
+    for suffix in suffixes:
+        case_name = f"{sketchname}{suffix}"
+        out_path = sketchdir / chip / f"{case_name}.xml"
+        suite_name = f"{test_type}_hardware_{chip}_{case_name}"
+        write_single_suite(out_path, suite_name, case_name, msg)
+        print(f"Wrote JUnit error report to {out_path}")
+
+
+def main() -> int:
+    """Emit JUnit error reports for hardware tests that have no matching runner.
+
+    Input is taken from the environment, in this order of precedence:
+
+    1. ``MISSING_GROUPS_JSON``: a JSON array of objects with the keys ``chip``,
+       ``test_type``, ``required_tags`` and ``test_dirs``. Reports are written for every
+       test directory of every group. Entries that are not objects are skipped; a value
+       that is not a JSON array produces a warning on stderr and no reports.
+    2. Legacy single-group variables ``TEST_LIST`` (newline-separated directories),
+       ``TEST_CHIP``, ``TEST_TYPE`` and ``REQUIRED_TAGS``.
+    3. If ``TEST_LIST`` is empty, a single generic report is written to
+       ``tests/<TEST_TYPE>/<TEST_CHIP>/missing_runner.xml``.
+
+    Returns:
+        Always 0; this function only writes the report files.
+    """
+    groups_json = os.environ.get("MISSING_GROUPS_JSON")
+    if groups_json:
+        try:
+            groups = json.loads(groups_json)
+        except (ValueError, RecursionError) as exc:
+            sys.stderr.write(f"[WARN] MISSING_GROUPS_JSON is not valid JSON: {exc}\n")
+            groups = []
+        if not isinstance(groups, list):
+            sys.stderr.write("[WARN] MISSING_GROUPS_JSON is not a JSON array; no reports written.\n")
+            groups = []
+        for g in groups:
+            if not isinstance(g, dict):
+                continue
+            chip = str(g.get("chip", "unknown"))
+            test_type = str(g.get("test_type", "unknown"))
+            # Coerce before joining/iterating so a bare string is one entry, not a run
+            # of single characters, and non-string items cannot break the join.
+            required_tags = " ".join(_as_str_list(g.get("required_tags")))
+            for test_dir in _as_str_list(g.get("test_dirs")):
+                _write_missing_runner_reports(test_dir, chip, test_type, required_tags)
+        return 0
+
+    # Legacy single-group envs
+    tests = read_env_list("TEST_LIST")
+    chip = os.environ.get("TEST_CHIP", "unknown")
+    test_type = os.environ.get("TEST_TYPE", "unknown")
+    required_tags = os.environ.get("REQUIRED_TAGS", "").strip()
+
+    if tests:
+        for test_dir in tests:
+            _write_missing_runner_reports(test_dir, chip, test_type, required_tags)
+        return 0
+
+    # Fallback: produce a generic suite so the pipeline reports an error
+    out_path = Path("tests") / test_type / chip / "missing_runner.xml"
+    suite_name = f"{test_type}_hardware_{chip}_missing"
+    msg = f"No available runner matches required tags: {required_tags} (chip={chip})"
+    write_single_suite(out_path, suite_name, "missing_runner", msg)
+    print(f"Wrote JUnit error report to {out_path}")
+    return 0
+
+
+# Script entry point: the process exit status is whatever main() returns.
+if __name__ == "__main__":
+    sys.exit(main())
+
+
diff --git a/tests/validation/wifi/ci.yml b/tests/validation/wifi/ci.yml