diff --git a/torchci/clickhouse_queries/vllm/queue_per_build_windowed/params.json b/torchci/clickhouse_queries/vllm/queue_per_build_windowed/params.json new file mode 100644 index 0000000000..4c17e282b2 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/queue_per_build_windowed/params.json @@ -0,0 +1,12 @@ +{ + "params": { + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)" + }, + "tests": [ + { + "startTime": "2025-10-17T00:00:00.000", + "stopTime": "2025-10-18T00:00:00.000" + } + ] +} diff --git a/torchci/clickhouse_queries/vllm/queue_per_build_windowed/query.sql b/torchci/clickhouse_queries/vllm/queue_per_build_windowed/query.sql new file mode 100644 index 0000000000..e5ab54d256 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/queue_per_build_windowed/query.sql @@ -0,0 +1,210 @@ +/* Windowed per-build table (UTC), incl. PR & main builds, with queue totals, cost, and is_main_branch. + WAIT: only attempts with started_at IS NOT NULL contribute wait (runnable → started). + RUN: clip to [w_start, w_end]; 1-day zombie guard for open 'running' attempts. + COST: 1.3232 * gpu_1_queue_run_hours + 4.602 * gpu_4_queue_run_hours (fixed). +*/ + +WITH + parseDateTime64BestEffort({startTime:String}, 3) AS w_start, -- inclusive (UTC) + parseDateTime64BestEffort({stopTime:String}, 3) AS w_end, -- exclusive (UTC) + toDateTime64(now(), 3) AS now64, + (w_end - INTERVAL 1 DAY) AS zombie_cutoff, + toDateTime64('2100-01-01 00:00:00', 3) AS FAR_FUTURE, + ['gpu_1_queue','gpu_4_queue', 'cpu_queue_premerge_us_east_1'] AS QUEUES + +/* 1) All builds created within the window (+ branch/PR context) */ +, builds_window AS ( + SELECT + tupleElement(build,'id') AS build_id, + + argMax(tupleElement(build,'number'), tupleElement(job,'created_at')) AS build_number, + argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')) AS build_url, + concat(argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')), '/steps/table') AS steps_table_url, + argMax(tupleElement(build,'commit'), tupleElement(job,'created_at')) AS commit_sha, + + /* robust start/finish (fallback to job min/max if build-level fields are NULL) */ + coalesce(argMax(tupleElement(build,'started_at'), tupleElement(job,'created_at')), + min(tupleElement(job,'started_at'))) AS robust_start, + coalesce(argMax(tupleElement(build,'finished_at'), tupleElement(job,'created_at')), + max(tupleElement(job,'finished_at'))) AS robust_finish, + + countDistinct(tupleElement(job,'id')) AS steps_count, + argMax(tupleElement(build,'state'), tupleElement(job,'created_at')) AS latest_build_state, + + /* repo + PR mapping (repo_slug may come from pipeline or PR repo) */ + coalesce( + nullIf(extract(argMax(tupleElement(pipeline,'repository'), tupleElement(job,'created_at')), 'github\\.com[:/]+([^/]+/[^/.]+)'), ''), + nullIf(extract(argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')), 'github\\.com[:/]+([^/]+/[^/.]+)'), ''), + nullIf(extract(argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')), '([^/]+/[^/.]+)'), '') + ) AS repo_slug, + coalesce( + toInt64OrNull(argMax(tupleElement(build,'pull_request').id, tupleElement(job,'created_at'))), + toInt64OrNull(extract(argMax(tupleElement(build,'branch'), tupleElement(job,'created_at')), 'pull/([0-9]+)')) + ) AS pr_number, + + argMax(tupleElement(build,'created_at'), tupleElement(job,'created_at')) AS build_created_at_utc, + argMax(tupleElement(build,'branch'), tupleElement(job,'created_at')) AS branch_name + FROM 
vllm.vllm_buildkite_jobs + GROUP BY tupleElement(build,'id') + HAVING build_created_at_utc >= w_start AND build_created_at_utc < w_end +) + +/* 2) Agent-run attempts for those builds that can overlap the window */ +, base_agent AS ( + SELECT + tupleElement(build,'id') AS build_id, + tupleElement(job,'id') AS job_id, + tupleElement(job,'created_at') AS created_at, + tupleElement(job,'state') AS state, + tupleElement(job,'runnable_at') AS runnable_at, + tupleElement(job,'started_at') AS started_at, + tupleElement(job,'finished_at') AS finished_at, + replaceOne(arrayFirst(x -> startsWith(x,'queue='), + tupleElement(job,'agent_query_rules')), 'queue=', '') AS queue_key + FROM vllm.vllm_buildkite_jobs + INNER JOIN builds_window b ON tupleElement(build,'id') = b.build_id + WHERE tupleElement(job,'type') IN ('script','command') + AND ( + tupleElement(job,'runnable_at') < w_end OR + tupleElement(job,'started_at') < w_end OR + ifNull(tupleElement(job,'finished_at'), FAR_FUTURE) >= w_start + ) +) + +/* 3) Collapse to (build_id, job_id) and collect attempts keyed by queue */ +, jobs_by_build AS ( + SELECT + build_id, + job_id, + argMax(state, created_at) AS latest_state, + max(created_at) AS last_event_at, + + /* RUN attempts: (queue, start, finish) */ + arrayDistinct(arrayFilter(t -> t.2 IS NOT NULL, + groupArray((queue_key, started_at, finished_at)) + )) AS run_triplets, + + /* WAIT attempts: (queue, runnable, start) — ONLY attempts that actually started */ + arrayDistinct(arrayFilter(t -> t.2 IS NOT NULL AND t.3 IS NOT NULL, + groupArray((queue_key, runnable_at, started_at)) + )) AS wait_triplets + FROM base_agent + GROUP BY build_id, job_id +) + +/* 4) RUN attempts → per build × queue (clip to window; zombie guard for open runs) */ +, runs_scored AS ( + SELECT + build_id, + tupleElement(rt, 1) AS queue_key, + greatest(tupleElement(rt, 2), w_start) AS s_clip, + least( + ifNull( + tupleElement(rt, 3), + if(latest_state = 'running' AND last_event_at < zombie_cutoff, + least(last_event_at + INTERVAL 1 MINUTE, w_end), + w_end) + ), + w_end + ) AS e_clip + FROM jobs_by_build + ARRAY JOIN run_triplets AS rt + WHERE tupleElement(rt, 1) IN QUEUES +) +, run_by_build AS ( + SELECT + build_id, queue_key, + sumIf(dateDiff('second', s_clip, e_clip), e_clip > s_clip) AS total_run_s + FROM runs_scored + GROUP BY build_id, queue_key +) + +/* 5) WAIT attempts (runnable → started) → per build × queue (clip to window) */ +, waits_scored AS ( + SELECT + build_id, + tupleElement(wt, 1) AS queue_key, + greatest(tupleElement(wt, 2), w_start) AS ra_clip, + least(tupleElement(wt, 3), w_end) AS st_clip, + greatest(0, dateDiff('second', greatest(tupleElement(wt, 2), w_start), least(tupleElement(wt, 3), w_end))) AS wait_s + FROM jobs_by_build + ARRAY JOIN wait_triplets AS wt + WHERE tupleElement(wt, 1) IN QUEUES +) +, waits_p90_pivot AS ( + SELECT + build_id, + /* P90 per queue (approximate quantile; broadly supported) */ + quantileIf(0.9)(toFloat64(wait_s), queue_key = 'gpu_1_queue') AS gpu1_p90_s, + quantileIf(0.9)(toFloat64(wait_s), queue_key = 'gpu_4_queue') AS gpu4_p90_s, + quantileIf(0.9)(toFloat64(wait_s), queue_key = 'cpu_queue_premerge_us_east_1') AS cpu_p90_s, + /* Combined P90 across both queues */ + quantile(0.9)(toFloat64(wait_s)) AS p90_combined_s + FROM waits_scored + WHERE wait_s > 0 + GROUP BY build_id +) + +/* 6) Pivot per-build totals to hour columns */ +, run_totals_by_build AS ( + SELECT + build_id, + round(sumIf(total_run_s, queue_key = 'gpu_1_queue') / 3600.0, 2) AS gpu_1_queue_run_hours, + 
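-- Illustrative example (assumed numbers): a build with 4,680 clipped run-seconds in gpu_1_queue pivots to round(4680 / 3600.0, 2) = 1.30 h here, which the final SELECT prices at 1.3232 * 1.30 ≈ $1.72. +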
round(sumIf(total_run_s, queue_key = 'gpu_4_queue') / 3600.0, 2) AS gpu_4_queue_run_hours, + round(sumIf(total_run_s, queue_key = 'cpu_queue_premerge_us_east_1') / 3600.0, 2) AS cpu_queue_run_hours + FROM run_by_build + GROUP BY build_id +) + +/* 7) Final table (UTC) — includes both PR and main builds */ +SELECT + /* PR URL (NULL for non-PR builds) */ + if((b.pr_number IS NULL) OR (b.repo_slug IS NULL), + NULL, + concat('https://github.com/', b.repo_slug, '/pull/', toString(b.pr_number)) + ) AS pr_url, + + b.build_number AS build_number, + b.build_id AS build_id, + b.build_url AS build_url, + b.steps_table_url AS steps_table_url, + b.commit_sha AS commit_sha, + + b.robust_start AS build_started_at, + b.robust_finish AS build_finished_at, + + /* duration (hours) = finish − start (UTC) */ + multiIf( + b.robust_start IS NULL OR b.robust_finish IS NULL, + NULL, + round(dateDiff('second', b.robust_start, b.robust_finish) / 3600.0, 2) + ) AS duration_hours, + + b.steps_count AS steps_count, + b.latest_build_state AS latest_build_state, + + /* Keep run hours for cost */ + ifNull(rt.gpu_1_queue_run_hours, 0) AS gpu_1_queue_run_hours, + ifNull(rt.gpu_4_queue_run_hours, 0) AS gpu_4_queue_run_hours, + ifNull(rt.cpu_queue_run_hours, 0) AS cpu_queue_run_hours, + + /* NEW: P90 wait hours (by queue + combined) */ + round(ifNull(wp.gpu1_p90_s, 0) / 3600.0, 2) AS gpu_1_queue_wait_p90_hours, + round(ifNull(wp.gpu4_p90_s, 0) / 3600.0, 2) AS gpu_4_queue_wait_p90_hours, + round(ifNull(wp.cpu_p90_s, 0) / 3600.0, 2) AS cpu_queue_wait_p90_hours, + round(ifNull(wp.p90_combined_s, 0) / 3600.0, 2) AS wait_p90_hours, + + /* Fixed-rate cost */ + round( + 1.3232 * ifNull(rt.gpu_1_queue_run_hours, 0) + + 4.602 * ifNull(rt.gpu_4_queue_run_hours, 0), + 2 + ) AS cost, + + /* Mark if the build branch is literally 'main' */ + toUInt8(b.branch_name = 'main') AS is_main_branch + +FROM builds_window AS b +LEFT JOIN run_totals_by_build AS rt ON rt.build_id = b.build_id +LEFT JOIN waits_p90_pivot AS wp ON wp.build_id = b.build_id +ORDER BY b.build_created_at_utc ASC; diff --git a/torchci/components/metrics/vllm/QueueWaitPerBuildPanel.tsx b/torchci/components/metrics/vllm/QueueWaitPerBuildPanel.tsx new file mode 100644 index 0000000000..78ae52cc8c --- /dev/null +++ b/torchci/components/metrics/vllm/QueueWaitPerBuildPanel.tsx @@ -0,0 +1,283 @@ +import { Box, Stack, Switch, Tooltip, Typography } from "@mui/material"; +import { useDarkMode } from "lib/DarkModeContext"; +import { useCallback, useMemo, useState } from "react"; +import { ChartPaper } from "./chartUtils"; + +// Helper: extract pipeline slug from Buildkite URL (e.g., /vllm/ci/builds/...) +function pipelineFromUrl(url: string | null): string { + try { + if (!url) return "unknown"; + const u = new URL(url); + const parts = u.pathname.split("/").filter(Boolean); + // ['', 'vllm', 'ci', 'builds', '35431', ...] => ['vllm','ci','builds','35431'] -> 'ci' + return (parts[1] || "unknown").toLowerCase(); + } catch { + const m = url?.match(/buildkite\.com\/[^/]+\/([^/]+)/i); + return (m?.[1] ?? 
"unknown").toLowerCase(); + } +} + +type Row = { + pr_url: string | null; + build_number: number; + build_id: string; + build_url: string | null; // may be NULL in ClickHouse + steps_table_url: string | null; // SQL always builds this + commit_sha: string; + build_started_at: string | null; // UTC + build_finished_at: string | null; // UTC + duration_hours: number | null; + steps_count: number; + latest_build_state: string; + + // P90 wait columns + wait_p90_hours: number; + gpu_1_queue_wait_p90_hours: number; + gpu_4_queue_wait_p90_hours: number; + cpu_queue_wait_p90_hours: number; + + is_main_branch: number; // 0/1 +}; + +export default function QueueWaitPerBuildPanel({ + data, +}: { + data: Row[] | undefined; +}) { + const { darkMode } = useDarkMode(); + const [mainOnly, setMainOnly] = useState(true); + + // Filter & sort; drop rows without a start time (time axis needs x) + const rows = useMemo(() => { + const r = (data ?? []) + .filter((x) => (mainOnly ? x.is_main_branch === 1 : true)) + .filter((x) => !!x.build_started_at); + + return r.sort((a, b) => { + const ta = a.build_started_at + ? new Date(a.build_started_at).getTime() + : 0; + const tb = b.build_started_at + ? new Date(b.build_started_at).getTime() + : 0; + return ta - tb || a.build_number - b.build_number; + }); + }, [data, mainOnly]); + + // Group rows by pipeline, but derive pipeline from the *link url* (build_url || steps_table_url) + const grouped = useMemo(() => { + const g = new Map(); + for (const r of rows) { + const linkUrl = r.build_url || r.steps_table_url || null; + const p = pipelineFromUrl(linkUrl); + if (!g.has(p)) g.set(p, []); + g.get(p)!.push(r); + } + return g; + }, [rows]); + + // Click → open Buildkite (always read from data.link we attach below) + const onPointClick = useCallback((p: any) => { + const url: string | null = + p?.data?.link ?? p?.data?.build_url ?? p?.data?.row?.build_url ?? null; + if (url) window.open(url, "_blank", "noopener,noreferrer"); + }, []); + + const option = useMemo(() => { + const series: any[] = []; + + // Build scatter series per pipeline, split by main vs PR + // We pass each point as an OBJECT (not array) so tooltip/click + // can read stable fields regardless of how ECharts wraps values. + for (const [pipeline, arr] of grouped.entries()) { + // Skip release pipeline (all zeros) + if (pipeline === "release") continue; + + // Split into main and PR builds + const mainBuilds = arr.filter((r) => Number(r.is_main_branch ?? 0) === 1); + const prBuilds = arr.filter((r) => Number(r.is_main_branch ?? 0) !== 1); + + // Main branch builds - circles + if (mainBuilds.length > 0) { + series.push({ + name: `${pipeline} (main)`, + type: "scatter", + symbol: "circle", + symbolSize: 6, + cursor: "pointer", + data: mainBuilds.map((r) => { + const link = r.build_url || r.steps_table_url || null; + return { + // ECharts uses value[0] for x and value[1] for y on a scatter + // Convert hours to minutes for display + value: [r.build_started_at, Number(r.wait_p90_hours ?? 0) * 60], + link, // <— used for click and pipeline fallback + bn: r.build_number ?? null, + pr: r.pr_url ?? null, + main: true, + w1: Number(r.gpu_1_queue_wait_p90_hours ?? 0) * 60, + w4: Number(r.gpu_4_queue_wait_p90_hours ?? 0) * 60, + wc: Number(r.cpu_queue_wait_p90_hours ?? 
0) * 60,
+            };
+          }),
+        });
+      }
+
+      // PR builds - triangles (only show if mainOnly is off)
+      if (!mainOnly && prBuilds.length > 0) {
+        series.push({
+          name: `${pipeline} (PR)`,
+          type: "scatter",
+          symbol: "triangle",
+          symbolSize: 7,
+          cursor: "pointer",
+          data: prBuilds.map((r) => {
+            const link = r.build_url || r.steps_table_url || null;
+            return {
+              // ECharts uses value[0] for x and value[1] for y on a scatter
+              // Convert hours to minutes for display
+              value: [r.build_started_at, Number(r.wait_p90_hours ?? 0) * 60],
+              link, // used for click and pipeline fallback
+              bn: r.build_number ?? null,
+              pr: r.pr_url ?? null,
+              main: false,
+              w1: Number(r.gpu_1_queue_wait_p90_hours ?? 0) * 60,
+              w4: Number(r.gpu_4_queue_wait_p90_hours ?? 0) * 60,
+              wc: Number(r.cpu_queue_wait_p90_hours ?? 0) * 60,
+            };
+          }),
+        });
+      }
+    }
+
+    // Calculate daily average P90 wait time (CI pipeline only)
+    const dailyAvg = new Map<string, { sum: number; count: number }>();
+    for (const r of rows) {
+      const linkUrl = r.build_url || r.steps_table_url || null;
+      const pipeline = pipelineFromUrl(linkUrl);
+      if (pipeline !== "ci") continue; // Only include CI pipeline
+      const day = (r.build_started_at ?? "").slice(0, 10); // 'YYYY-MM-DD'
+      if (!day) continue;
+      const val = Number(r.wait_p90_hours ?? 0) * 60; // Convert to minutes
+      const cur = dailyAvg.get(day) ?? { sum: 0, count: 0 };
+      cur.sum += val;
+      cur.count += 1;
+      dailyAvg.set(day, cur);
+    }
+
+    const dailyAvgData = Array.from(dailyAvg.entries())
+      .map(([day, { sum, count }]) => ({
+        value: [
+          new Date(`${day}T12:00:00Z`).toISOString(), // Midday for centering
+          sum / Math.max(1, count),
+        ],
+      }))
+      .sort(
+        (a, b) =>
+          Date.parse(String(a.value[0])) - Date.parse(String(b.value[0]))
+      );
+
+    if (dailyAvgData.length > 0) {
+      series.push({
+        name: "Daily avg P90",
+        type: "line",
+        symbol: "circle",
+        symbolSize: 6,
+        itemStyle: {
+          color: "#ff7f0e",
+          borderColor: "#ff7f0e",
+          borderWidth: 2,
+        },
+        lineStyle: {
+          width: 2,
+          color: "#ff7f0e",
+        },
+        emphasis: { focus: "series" },
+        data: dailyAvgData,
+        z: 3,
+      });
+    }
+
+    return {
+      tooltip: {
+        trigger: "item",
+        confine: true,
+        formatter: (p: any) => {
+          const d = p?.data ?? {};
+          const ts = d?.value?.[0] ?? "";
+          const y = Number(d?.value?.[1] ?? 0);
+          const url: string | null = d?.link ?? null;
+          const num = d?.bn ?? "—";
+          const pr: string | null = d?.pr ?? null;
+          const isM: boolean = !!d?.main;
+          const w1 = Number(d?.w1 ?? 0);
+          const w4 = Number(d?.w4 ?? 0);
+          const wc = Number(d?.wc ?? 0);
+          const pipe = (p?.seriesName as string) || pipelineFromUrl(url);
+
+          const buildLink = url
+            ? `<a href="${url}" target="_blank" rel="noopener noreferrer">#${num}</a>`
+            : `#${num}`;
+          const prLine = pr
+            ? `
+                <div>PR: <a href="${pr}" target="_blank" rel="noopener noreferrer">${pr
+                  .replace("https://github.com/", "")
+                  .replace("/pull/", "#")}</a></div>`
+            : "";
+
+          return `
+            <div>
+              <div><b>${ts}</b></div>
+              <div>Pipeline: <b>${pipe}</b></div>
+              <div>Build: ${buildLink}</div>
+              ${prLine}
+              <div>P90 wait GPU1: <b>${w1.toFixed(1)} min</b></div>
+              <div>P90 wait GPU4: <b>${w4.toFixed(1)} min</b></div>
+              <div>P90 wait CPU: <b>${wc.toFixed(1)} min</b></div>
+              <div>P90 wait (combined): <b>${y.toFixed(1)} min</b></div>
+              <div>Branch: <b>${isM ? "main" : "PR/other"}</b></div>
+            </div>
+ `; + }, + }, + legend: { top: 0 }, + grid: { left: 40, right: 50, bottom: 40, top: 40 }, + xAxis: { type: "time", name: "Build start (UTC)" }, + yAxis: [{ type: "value", name: "P90 Wait (min)" }], + series, + }; + }, [grouped, mainOnly, rows]); + + return ( + + + + Queue Wait (per build) + + + + Main only + setMainOnly((s) => !s)} + /> + + + + + + + + ); +} diff --git a/torchci/components/metrics/vllm/RunCostPerBuildPanel.tsx b/torchci/components/metrics/vllm/RunCostPerBuildPanel.tsx new file mode 100644 index 0000000000..10e822dc7f --- /dev/null +++ b/torchci/components/metrics/vllm/RunCostPerBuildPanel.tsx @@ -0,0 +1,276 @@ +import { Box, Stack, Switch, Tooltip, Typography } from "@mui/material"; +import { useDarkMode } from "lib/DarkModeContext"; +import { useCallback, useMemo, useState } from "react"; +import { ChartPaper } from "./chartUtils"; + +// Helper: extract pipeline slug from Buildkite URL (e.g., /vllm/ci/builds/...) +function pipelineFromUrl(url: string | null): string { + try { + if (!url) return "unknown"; + const u = new URL(url); + const parts = u.pathname.split("/").filter(Boolean); + return (parts[1] || "unknown").toLowerCase(); + } catch { + const m = url?.match(/buildkite\.com\/[^/]+\/([^/]+)/i); + return (m?.[1] ?? "unknown").toLowerCase(); + } +} + +type Row = { + pr_url: string | null; + build_number: number; + build_id: string; + build_url: string; + steps_table_url: string; + commit_sha: string; + build_started_at: string | null; // UTC + build_finished_at: string | null; // UTC + duration_hours: number | null; + steps_count: number; + latest_build_state: string; + gpu_1_queue_wait_hours: number; + gpu_1_queue_run_hours: number; + gpu_4_queue_wait_hours: number; + gpu_4_queue_run_hours: number; + cost: number; // dollars + is_main_branch: number; // 0/1 +}; + +export default function RunCostPerBuildPanel({ + data, +}: { + data: Row[] | undefined; +}) { + const { darkMode } = useDarkMode(); + const [mainOnly, setMainOnly] = useState(true); + + const rows = useMemo(() => { + const r = (data ?? []) + .filter((x) => (mainOnly ? x.is_main_branch === 1 : true)) + .filter((x) => !!x.build_started_at); + return r.sort((a, b) => { + const ta = a.build_started_at + ? new Date(a.build_started_at).getTime() + : 0; + const tb = b.build_started_at + ? new Date(b.build_started_at).getTime() + : 0; + return ta - tb || a.build_number - b.build_number; + }); + }, [data, mainOnly]); + + // Group rows by pipeline to build one series per pipeline + const grouped = useMemo(() => { + const g = new Map(); + for (const r of rows) { + const p = pipelineFromUrl(r.build_url); + if (!g.has(p)) g.set(p, []); + g.get(p)!.push(r); + } + return g; + }, [rows]); + + const onPointClick = useCallback((e: any) => { + const url = + (e?.data?.value?.[2] as string | undefined) ?? + (e?.value?.[2] as string | undefined) ?? + e?.data?.build_url; + if (url) window.open(url, "_blank", "noopener,noreferrer"); + }, []); + + const option = useMemo(() => { + const series: any[] = []; + for (const [pipeline, arr] of grouped.entries()) { + // Skip release pipeline (all zeros) + if (pipeline === "release") continue; + + // Split into main and PR builds + const mainBuilds = arr.filter((r) => Number(r.is_main_branch ?? 0) === 1); + const prBuilds = arr.filter((r) => Number(r.is_main_branch ?? 
0) !== 1); + + // Main branch builds - circles + if (mainBuilds.length > 0) { + series.push({ + name: `${pipeline} (main)`, + type: "scatter", + symbol: "circle", + symbolSize: 6, + cursor: "pointer", + data: mainBuilds.map((r) => ({ + // [0]=ts, [1]=cost, [2]=build_url, [3]=build_number, [4]=pr_url, [5]=is_main, [6]=pipeline, [7]=gpu1_run_h, [8]=gpu4_run_h + value: [ + r.build_started_at, + Number(r.cost ?? 0), + r.build_url ?? null, + r.build_number ?? null, + r.pr_url ?? null, + 1, + pipeline, + r.gpu_1_queue_run_hours ?? 0, + r.gpu_4_queue_run_hours ?? 0, + ], + })), + }); + } + + // PR builds - triangles (only show if mainOnly is off) + if (!mainOnly && prBuilds.length > 0) { + series.push({ + name: `${pipeline} (PR)`, + type: "scatter", + symbol: "triangle", + symbolSize: 7, + cursor: "pointer", + data: prBuilds.map((r) => ({ + // [0]=ts, [1]=cost, [2]=build_url, [3]=build_number, [4]=pr_url, [5]=is_main, [6]=pipeline, [7]=gpu1_run_h, [8]=gpu4_run_h + value: [ + r.build_started_at, + Number(r.cost ?? 0), + r.build_url ?? null, + r.build_number ?? null, + r.pr_url ?? null, + 0, + pipeline, + r.gpu_1_queue_run_hours ?? 0, + r.gpu_4_queue_run_hours ?? 0, + ], + })), + }); + } + } + + // CI-only daily average cost (UTC day bucket) + const acc = new Map(); + for (const r of rows) { + if (pipelineFromUrl(r.build_url) !== "ci") continue; + const day = (r.build_started_at ?? "").slice(0, 10); // 'YYYY-MM-DD' + if (!day) continue; + const val = Number(r.cost ?? 0); + const cur = acc.get(day) ?? { sum: 0, count: 0 }; + cur.sum += val; + cur.count += 1; + acc.set(day, cur); + } + const ciDailyAvg = Array.from(acc.entries()) + .map(([day, { sum, count }]) => ({ + value: [ + new Date(`${day}T12:00:00Z`).toISOString(), // Midday for centering + sum / Math.max(1, count), + ], + })) + .sort( + (a, b) => + Date.parse(String(a.value[0])) - Date.parse(String(b.value[0])) + ); + if (ciDailyAvg.length > 0) { + series.push({ + name: "Daily avg cost", + type: "line", + symbol: "circle", + symbolSize: 6, + itemStyle: { + color: "#ff7f0e", + borderColor: "#ff7f0e", + borderWidth: 2, + }, + lineStyle: { + width: 2, + color: "#ff7f0e", + }, + emphasis: { focus: "series" }, + data: ciDailyAvg, + z: 3, + }); + } + + return { + tooltip: { + trigger: "item", + confine: true, + formatter: (p: any) => { + // Daily-average line hover + if (p?.seriesType === "line") { + const ts = p?.data?.value?.[0] ?? p?.value?.[0]; + const v = Number(p?.data?.value?.[1] ?? p?.value?.[1] ?? 0); + return `
+            <div>
+              <div><b>${ts ?? ""}</b></div>
+              <div>CI daily avg cost: <b>$${v.toFixed(2)}</b></div>
+            </div>
`;
+          }
+          const v = p?.data?.value ?? p?.value ?? [];
+          const ts = v[0] ?? "";
+          const cost = Number(v[1] ?? 0);
+          const url = v[2] as string | null;
+          const num = v[3] ?? "—";
+          const pr = v[4] as string | null;
+          const isM = Number(v[5] ?? 0) === 1;
+          const pipe =
+            (p?.seriesName as string) || pipelineFromUrl(url ?? null);
+          const h1 = Number(v[7] ?? 0);
+          const h4 = Number(v[8] ?? 0);
+          const c1 = 1.3232 * h1;
+          const c4 = 4.602 * h4;
+          const total = Number(cost ?? c1 + c4);
+          const buildLink = url
+            ? `<a href="${url}" target="_blank" rel="noopener noreferrer">#${num}</a>`
+            : `#${num}`;
+          const prLine = pr
+            ? `
+                <div>PR: <a href="${pr}" target="_blank" rel="noopener noreferrer">${pr
+                  .replace("https://github.com/", "")
+                  .replace("/pull/", "#")}</a></div>`
+            : "";
+          return `
+            <div>
+              <div><b>${ts}</b></div>
+              <div>Pipeline: <b>${pipe}</b></div>
+              <div>Build: ${buildLink}</div>
+              ${prLine}
+              <div>Cost GPU1: <b>$${c1.toFixed(2)}</b></div>
+              <div>Cost GPU4: <b>$${c4.toFixed(2)}</b></div>
+              <div>Cost (total): <b>$${total.toFixed(2)}</b></div>
+              <div>Branch: <b>${isM ? "main" : "PR/other"}</b></div>
+            </div>
+ `; + }, + }, + legend: { top: 0 }, + grid: { left: 40, right: 50, bottom: 40, top: 40 }, + xAxis: { type: "time", name: "Build start (UTC)" }, + yAxis: [{ type: "value", name: "Cost ($)" }], + series, + }; + }, [grouped, mainOnly, rows]); + + return ( + + + + Run Cost (per build) + + + + Main only + setMainOnly((s) => !s)} + /> + + + + + + + + ); +} diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index a5e140ae75..5ab03bd87c 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -18,9 +18,11 @@ import JobGroupFilter, { import JobReliabilityPanel from "components/metrics/vllm/JobReliabilityPanel"; import MergesPanel from "components/metrics/vllm/MergesPanel"; import MostRetriedJobsTable from "components/metrics/vllm/MostRetriedJobsTable"; +import QueueWaitPerBuildPanel from "components/metrics/vllm/QueueWaitPerBuildPanel"; import ReliabilityPanel from "components/metrics/vllm/ReliabilityPanel"; import ReliabilityTrendPanel from "components/metrics/vllm/ReliabilityTrendPanel"; import RetryTrendPanel from "components/metrics/vllm/RetryTrendPanel"; +import RunCostPerBuildPanel from "components/metrics/vllm/RunCostPerBuildPanel"; import TimeToSignalTrendPanel from "components/metrics/vllm/TimeToSignalTrendPanel"; import TrunkHealthPanel from "components/metrics/vllm/TrunkHealthPanel"; import TrunkHealthTrendPanel from "components/metrics/vllm/TrunkHealthTrendPanel"; @@ -307,6 +309,15 @@ export default function Page() { ? undefined : qFrom(prevSuccessDurations, 0.9); + const { data: queuePerBuild } = useClickHouseAPIImmutable( + "vllm/queue_per_build_windowed", + { + ...timeParams, + } + ); + + const isQueueLoading = queuePerBuild === undefined; + const { data: prCycleData } = useClickHouseAPIImmutable( "vllm/pr_cycle_time_breakdown", { @@ -961,6 +972,33 @@ export default function Page() { + {/* Section 3b: Queue Utilization & Cost */} + + + Queue Utilization & Cost + + + + {isQueueLoading ? ( + <> + + + + + + + + ) : ( + <> + + + + + + + + )} + {/* Section 4: PR Cycle Metrics */}
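Note: the fixed-rate cost model appears twice in this diff (the `cost` column in query.sql and the per-point c1/c4 math in the RunCost tooltip). Below is a minimal standalone TypeScript sketch of that shared formula, for reference only; `estimateBuildCost` and the sample inputs are hypothetical names and values, not part of the diff.

    // Fixed hourly rates hard-coded in query.sql:
    //   gpu_1_queue: $1.3232/hour, gpu_4_queue: $4.602/hour
    const GPU_1_RATE = 1.3232;
    const GPU_4_RATE = 4.602;

    // Hypothetical helper mirroring the SQL expression
    // round(1.3232 * gpu_1_queue_run_hours + 4.602 * gpu_4_queue_run_hours, 2).
    function estimateBuildCost(gpu1RunHours: number, gpu4RunHours: number): number {
      const raw = GPU_1_RATE * gpu1RunHours + GPU_4_RATE * gpu4RunHours;
      return Math.round(raw * 100) / 100; // two decimals, like round(..., 2) in SQL
    }

    // Example: 2 h on gpu_1_queue plus 0.5 h on gpu_4_queue
    // gives 1.3232 * 2 + 4.602 * 0.5 = 4.9474, rounded to 4.95
    console.log(estimateBuildCost(2, 0.5)); // 4.95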