From 00000ade451ead8d5ab48a5648b4073b28a33e4f Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 3 Nov 2025 17:27:44 -0800 Subject: [PATCH 1/9] Add per job runtime metrics and docker build metrics --- .../vllm/docker_build_runtime/params.json | 15 + .../vllm/docker_build_runtime/query.sql | 30 ++ .../vllm/job_runtime_trends/params.json | 17 + .../vllm/job_runtime_trends/query.sql | 58 +++ .../metrics/vllm/CiDurationsPanel.tsx | 19 +- .../metrics/vllm/DockerBuildRuntimePanel.tsx | 165 ++++++++ .../vllm/DurationDistributionPanel.tsx | 19 +- .../metrics/vllm/JobRuntimePanel.tsx | 392 ++++++++++++++++++ torchci/pages/metrics/vllm.tsx | 29 +- 9 files changed, 707 insertions(+), 37 deletions(-) create mode 100644 torchci/clickhouse_queries/vllm/docker_build_runtime/params.json create mode 100644 torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql create mode 100644 torchci/clickhouse_queries/vllm/job_runtime_trends/params.json create mode 100644 torchci/clickhouse_queries/vllm/job_runtime_trends/query.sql create mode 100644 torchci/components/metrics/vllm/DockerBuildRuntimePanel.tsx create mode 100644 torchci/components/metrics/vllm/JobRuntimePanel.tsx diff --git a/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json b/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json new file mode 100644 index 0000000000..e16edc9fce --- /dev/null +++ b/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json @@ -0,0 +1,15 @@ +{ + "params": { + "repo": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)" + }, + "tests": [ + { + "repo": "https://github.com/vllm-project/vllm.git", + "startTime": "2025-10-01T00:00:00.000", + "stopTime": "2025-11-01T00:00:00.000" + } + ] +} + diff --git a/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql b/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql new file mode 100644 index 0000000000..8d16cd611a --- /dev/null +++ b/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql @@ -0,0 +1,30 @@ +-- vLLM Docker Build Image Runtime Trends (main branch only) +-- Tracks runtime for the ":docker: build image" job specifically +-- This is a critical job for build speed monitoring + +WITH jobs AS ( + SELECT + tupleElement(job, 'name') AS job_name, + tupleElement(job, 'started_at') AS job_started_at, + tupleElement(job, 'finished_at') AS job_finished_at, + tupleElement(job, 'state') AS job_state, + tupleElement(build, 'number') AS build_number + FROM vllm.vllm_buildkite_jobs + WHERE + tupleElement(pipeline, 'repository') = {repo: String } + AND tupleElement(build, 'branch') = 'main' + AND tupleElement(job, 'name') = ':docker: build image' + AND tupleElement(job, 'started_at') IS NOT NULL + AND tupleElement(job, 'finished_at') IS NOT NULL + AND tupleElement(job, 'started_at') >= {startTime: DateTime64(3) } + AND tupleElement(job, 'started_at') < {stopTime: DateTime64(3) } + AND lowerUTF8(tupleElement(job, 'state')) IN ('passed', 'finished', 'success', 'failed') +) + +SELECT + job_started_at AS timestamp, + build_number, + round(dateDiff('second', job_started_at, job_finished_at) / 60.0, 2) AS runtime_minutes +FROM jobs +ORDER BY job_started_at ASC + diff --git a/torchci/clickhouse_queries/vllm/job_runtime_trends/params.json b/torchci/clickhouse_queries/vllm/job_runtime_trends/params.json new file mode 100644 index 0000000000..d8b1145a47 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/job_runtime_trends/params.json @@ -0,0 +1,17 @@ +{ + "params": { + "repo": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)", + "jobGroups": "Array(String)" + }, + "tests": [ + { + "repo": "https://github.com/vllm-project/vllm.git", + "startTime": "2025-10-01T00:00:00.000", + "stopTime": "2025-10-08T00:00:00.000", + "jobGroups": ["main", "amd", "torch_nightly"] + } + ] +} + diff --git a/torchci/clickhouse_queries/vllm/job_runtime_trends/query.sql b/torchci/clickhouse_queries/vllm/job_runtime_trends/query.sql new file mode 100644 index 0000000000..722cdcf9d3 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/job_runtime_trends/query.sql @@ -0,0 +1,58 @@ +-- vLLM Job Runtime Trends (main branch only) +-- Aggregates per-job runtime statistics by day +-- Shows count, mean, p90, and max runtime for each job per day +-- Supports filtering by job groups: AMD, Torch Nightly, or Main + +WITH jobs AS ( + SELECT + tupleElement(job, 'name') AS job_name, + tupleElement(job, 'started_at') AS job_started_at, + tupleElement(job, 'finished_at') AS job_finished_at, + tupleElement(job, 'state') AS job_state, + tupleElement(build, 'branch') AS branch + FROM vllm.vllm_buildkite_jobs + WHERE + tupleElement(pipeline, 'repository') = {repo: String } + AND tupleElement(build, 'branch') = 'main' + AND tupleElement(job, 'started_at') IS NOT NULL + AND tupleElement(job, 'finished_at') IS NOT NULL + AND tupleElement(job, 'started_at') >= {startTime: DateTime64(3) } + AND tupleElement(job, 'started_at') < {stopTime: DateTime64(3) } + AND lowerUTF8(tupleElement(job, 'state')) IN ('passed', 'finished', 'success', 'failed') + -- Job group filtering: AMD, Torch Nightly, or Main + AND ( + ( + has({jobGroups: Array(String)}, 'amd') + AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD') + > 0 + ) + OR ( + has({jobGroups: Array(String)}, 'torch_nightly') + AND positionCaseInsensitive( + tupleElement(job, 'name'), 'Torch Nightly' + ) + > 0 + ) + OR ( + has({jobGroups: Array(String)}, 'main') + AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD') + = 0 + AND positionCaseInsensitive( + tupleElement(job, 'name'), 'Torch Nightly' + ) + = 0 + ) + ) +) + +SELECT + job_name, + toDate(job_started_at) AS date, + count() AS count, + round(avg(dateDiff('second', job_started_at, job_finished_at) / 60.0), 2) AS mean_runtime_minutes, + round(quantile(0.9)(dateDiff('second', job_started_at, job_finished_at) / 60.0), 2) AS p90_runtime_minutes, + round(max(dateDiff('second', job_started_at, job_finished_at) / 60.0), 2) AS max_runtime_minutes +FROM jobs +GROUP BY job_name, date +ORDER BY job_name ASC, date ASC + diff --git a/torchci/components/metrics/vllm/CiDurationsPanel.tsx b/torchci/components/metrics/vllm/CiDurationsPanel.tsx index 63b8c49425..30e5db0462 100644 --- a/torchci/components/metrics/vllm/CiDurationsPanel.tsx +++ b/torchci/components/metrics/vllm/CiDurationsPanel.tsx @@ -230,28 +230,11 @@ export default function CiDurationsPanel({ ...getLineSeries(dailyMeanSuccess, dailyMeanNonCanceled), ...getScatterSeriesByState(source), ], - dataZoom: [ - { - type: "slider", - show: true, - xAxisIndex: 0, - bottom: 0, - start: 0, - end: 100, - height: 25, - }, - { - type: "inside", - xAxisIndex: 0, - start: 0, - end: 100, - }, - ], }; return ( Build #${buildNumber}
` + : `Daily Average
`; + result += `Time: ${formattedTime}
`; + result += `Runtime: ${runtime.toFixed(1)} min`; + + return result; +} + +// Helper function to handle click events +function handleBuildClick(params: any) { + if (params?.componentType === "series") { + const data = Array.isArray(params.data) ? params.data : [params.data]; + const buildNumber = data[2]; + if (buildNumber !== undefined && buildNumber !== null) { + const url = `https://buildkite.com/vllm/ci/builds/${buildNumber}/`; + if (typeof window !== "undefined") { + window.open(url, "_blank"); + } + } + } +} + +export default function DockerBuildRuntimePanel({ + data, +}: { + data: DockerBuildData[] | undefined; +}) { + const { darkMode } = useDarkMode(); + + // Process data for chart + const chartData = (data || []).map((d) => [ + dayjs(d.timestamp).toISOString(), + d.runtime_minutes, + d.build_number, + ]); + + // Calculate daily average for trend line + const groupedByDay = _.groupBy(data || [], (d) => + dayjs(d.timestamp).format("YYYY-MM-DD") + ); + + const dailyAvg = Object.entries(groupedByDay) + .map(([day, records]) => { + const avgRuntime = _.meanBy(records, "runtime_minutes"); + return { + day, + value: Number(avgRuntime.toFixed(1)), + }; + }) + .sort((a, b) => (a.day < b.day ? -1 : 1)); + + // Calculate statistics + const runtimes = (data || []).map((d) => d.runtime_minutes); + const avgRuntime = runtimes.length ? _.mean(runtimes).toFixed(1) : "N/A"; + const p90Runtime = runtimes.length + ? runtimes.sort((a, b) => a - b)[ + Math.floor(runtimes.length * 0.9) + ].toFixed(1) + : "N/A"; + + const options: EChartsOption = { + title: { + text: "Docker Build Image Runtime", + subtext: `Avg: ${avgRuntime}m | P90: ${p90Runtime}m | Total builds: ${runtimes.length}`, + textStyle: { + fontSize: 14, + }, + }, + legend: { + top: 24, + data: ["Individual Builds", "Daily Average"], + }, + grid: { top: 60, right: 20, bottom: 80, left: 60 }, + dataset: [{ source: chartData }, { source: dailyAvg }], + xAxis: { + type: "time", + axisLabel: { + hideOverlap: true, + formatter: (value: number) => dayjs(value).format("M/D"), + }, + }, + yAxis: { + type: "value", + name: "Runtime (minutes)", + nameLocation: "middle", + nameGap: 45, + nameRotate: 90, + axisLabel: { + formatter: (value: number) => `${value}m`, + }, + }, + series: [ + { + name: "Individual Builds", + type: "scatter", + datasetIndex: 0, + symbolSize: 6, + itemStyle: { color: COLOR_SUCCESS, opacity: 0.6 }, + }, + { + name: "Daily Average", + type: "line", + datasetIndex: 1, + smooth: true, + encode: { x: "day", y: "value" }, + lineStyle: { color: COLOR_WARNING, width: 2 }, + itemStyle: { color: COLOR_WARNING }, + showSymbol: true, + symbolSize: 4, + }, + ], + tooltip: { + trigger: "item", + formatter: formatTooltip, + }, + }; + + return ( + + ); +} + diff --git a/torchci/components/metrics/vllm/DurationDistributionPanel.tsx b/torchci/components/metrics/vllm/DurationDistributionPanel.tsx index b4f9e4ba65..bf261e365d 100644 --- a/torchci/components/metrics/vllm/DurationDistributionPanel.tsx +++ b/torchci/components/metrics/vllm/DurationDistributionPanel.tsx @@ -164,28 +164,11 @@ export default function DurationDistributionPanel({ axisPointer: { type: "shadow" }, formatter: formatDistributionTooltip, }, - dataZoom: [ - { - type: "slider", - show: true, - xAxisIndex: 0, - bottom: 0, - start: 0, - end: 100, - height: 25, - }, - { - type: "inside", - xAxisIndex: 0, - start: 0, - end: 100, - }, - ], }; return ( diff --git a/torchci/components/metrics/vllm/JobRuntimePanel.tsx b/torchci/components/metrics/vllm/JobRuntimePanel.tsx new file mode 100644 index 0000000000..3a1963164b --- /dev/null +++ b/torchci/components/metrics/vllm/JobRuntimePanel.tsx @@ -0,0 +1,392 @@ +import { + Box, + Paper, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + TableSortLabel, + TextField, +} from "@mui/material"; +import dayjs from "dayjs"; +import { EChartsOption } from "echarts"; +import ReactECharts from "echarts-for-react"; +import { useDarkMode } from "lib/DarkModeContext"; +import React, { useState } from "react"; +import { + getChartTitle, + getCrosshairTooltipConfig, + getReactEChartsProps, + GRID_DEFAULT, +} from "./chartUtils"; +import { COLOR_SUCCESS, COLOR_WARNING } from "./constants"; + +interface JobRuntimeData { + job_name: string; + date: string; + count: number; + mean_runtime_minutes: number; + p90_runtime_minutes: number; + max_runtime_minutes: number; +} + +interface JobAggregatedStats { + job_name: string; + count: number; + mean: number; + p90: number; + max: number; +} + +type SortField = "job_name" | "count" | "mean" | "p90" | "max"; +type SortOrder = "asc" | "desc"; + +// Helper function to aggregate job statistics across all dates +function aggregateJobStats(data: JobRuntimeData[]): JobAggregatedStats[] { + const jobMap = new Map(); + + // Group by job name + data.forEach((row) => { + if (!jobMap.has(row.job_name)) { + jobMap.set(row.job_name, []); + } + jobMap.get(row.job_name)!.push(row); + }); + + // Aggregate statistics + const result: JobAggregatedStats[] = []; + jobMap.forEach((rows, jobName) => { + const totalCount = rows.reduce((sum, r) => sum + r.count, 0); + const avgMean = + rows.reduce((sum, r) => sum + r.mean_runtime_minutes * r.count, 0) / + totalCount; + const avgP90 = + rows.reduce((sum, r) => sum + r.p90_runtime_minutes * r.count, 0) / + totalCount; + const overallMax = Math.max(...rows.map((r) => r.max_runtime_minutes)); + + result.push({ + job_name: jobName, + count: totalCount, + mean: avgMean, + p90: avgP90, + max: overallMax, + }); + }); + + return result; +} + +// Helper function to format runtime with unit +function formatRuntime(minutes: number | null | undefined): string { + if (minutes === null || minutes === undefined) return "-"; + return minutes.toFixed(1) + "m"; +} + +// Helper function to format tooltip +function formatChartTooltip(params: any): string { + if (!Array.isArray(params) || params.length === 0) return ""; + + const date = params[0].axisValue; + let result = `${date}
`; + + params.forEach((p: any) => { + if (p.value !== undefined && p.value !== null) { + result += `${p.marker} ${p.seriesName}: ${p.value.toFixed(1)}m
`; + } + }); + + return result; +} + +// Helper function to get line chart series +function getLineSeries( + dates: string[], + meanData: number[], + p90Data: number[] +): any[] { + return [ + { + name: "Mean Runtime", + type: "line", + data: meanData, + smooth: true, + symbol: "circle", + symbolSize: 6, + itemStyle: { color: COLOR_SUCCESS }, + lineStyle: { width: 2 }, + emphasis: { focus: "series" }, + }, + { + name: "P90 Runtime", + type: "line", + data: p90Data, + smooth: true, + symbol: "diamond", + symbolSize: 7, + itemStyle: { color: COLOR_WARNING }, + lineStyle: { width: 2, type: "dashed" }, + emphasis: { focus: "series" }, + }, + ]; +} + +export default function JobRuntimePanel({ + data, +}: { + data: JobRuntimeData[] | undefined; +}) { + const { darkMode } = useDarkMode(); + const [sortField, setSortField] = useState("mean"); + const [sortOrder, setSortOrder] = useState("desc"); + const [searchQuery, setSearchQuery] = useState(""); + const [selectedJob, setSelectedJob] = useState(null); + + // Aggregate statistics for the table + const aggregatedStats = aggregateJobStats(data || []); + + // Filter by search query + const filteredStats = aggregatedStats.filter((job) => + job.job_name.toLowerCase().includes(searchQuery.toLowerCase()) + ); + + // Sort the filtered data + const sortedStats = [...filteredStats].sort((a, b) => { + let aValue: number | string = a[sortField]; + let bValue: number | string = b[sortField]; + + if (sortField === "job_name") { + aValue = (aValue as string).toLowerCase(); + bValue = (bValue as string).toLowerCase(); + return sortOrder === "asc" + ? aValue < bValue + ? -1 + : 1 + : aValue > bValue + ? -1 + : 1; + } + + return sortOrder === "asc" + ? (aValue as number) - (bValue as number) + : (bValue as number) - (aValue as number); + }); + + // Auto-select first job if nothing is selected or if selected job is no longer in the list + React.useEffect(() => { + if (sortedStats.length > 0) { + if (!selectedJob || !sortedStats.some((s) => s.job_name === selectedJob)) { + setSelectedJob(sortedStats[0].job_name); + } + } + }, [sortedStats, selectedJob]); + + // Handle sort request + function handleSort(field: SortField) { + if (sortField === field) { + setSortOrder(sortOrder === "asc" ? "desc" : "asc"); + } else { + setSortField(field); + setSortOrder("desc"); + } + } + + // Handle row click + function handleRowClick(jobName: string) { + setSelectedJob(jobName); + } + + // Prepare chart data for selected job + const selectedJobData = + selectedJob && data + ? data + .filter((d) => d.job_name === selectedJob) + .sort((a, b) => a.date.localeCompare(b.date)) + : []; + + const chartDates = selectedJobData.map((d) => + dayjs(d.date).format("MMM D") + ); + const chartMeanData = selectedJobData.map((d) => d.mean_runtime_minutes); + const chartP90Data = selectedJobData.map((d) => d.p90_runtime_minutes); + + const chartOptions: EChartsOption = { + title: { + text: selectedJob ? "Runtime Trend" : "Select a job to view", + subtext: selectedJob || "Click a row in the table", + textStyle: { + fontSize: 14, + }, + subtextStyle: { + fontSize: 16, + fontWeight: "bold", + color: darkMode ? "#fff" : "#333", + }, + }, + legend: { + top: 40, + data: ["Mean Runtime", "P90 Runtime"], + }, + grid: { top: 80, right: 20, bottom: 60, left: 60 }, + xAxis: { + type: "category", + data: chartDates, + name: "Date", + nameLocation: "middle", + nameGap: 35, + axisLabel: { + rotate: 45, + fontSize: 10, + }, + }, + yAxis: { + type: "value", + name: "Runtime (minutes)", + nameLocation: "middle", + nameGap: 45, + axisLabel: { + formatter: (value: number) => `${value}m`, + }, + }, + series: + selectedJobData.length > 0 + ? getLineSeries(chartDates, chartMeanData, chartP90Data) + : [], + tooltip: getCrosshairTooltipConfig(darkMode, formatChartTooltip), + }; + + return ( + + + {/* Table on the left */} + + setSearchQuery(e.target.value)} + sx={{ mb: 1 }} + fullWidth + /> + + + + + + handleSort("job_name")} + > + Job Name + + + + handleSort("count")} + > + Count + + + + handleSort("mean")} + > + Mean + + + + handleSort("p90")} + > + P90 + + + + handleSort("max")} + > + Max + + + + + + {sortedStats.map((job) => ( + handleRowClick(job.job_name)} + selected={selectedJob === job.job_name} + sx={{ + cursor: "pointer", + "&.Mui-selected": { + backgroundColor: darkMode + ? "rgba(144, 202, 249, 0.16)" + : "rgba(25, 118, 210, 0.12)", + }, + "&.Mui-selected:hover": { + backgroundColor: darkMode + ? "rgba(144, 202, 249, 0.24)" + : "rgba(25, 118, 210, 0.18)", + }, + }} + > + + {job.job_name} + + {job.count} + + {formatRuntime(job.mean)} + + + {formatRuntime(job.p90)} + + + {formatRuntime(job.max)} + + + ))} + +
+
+
+ + {/* Chart on the right */} + + + +
+
+ ); +} diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index 5ab03bd87c..2c769855b1 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -11,11 +11,13 @@ import { } from "@mui/material"; import CiDurationsPanel from "components/metrics/vllm/CiDurationsPanel"; import CommitsOnRedTrendPanel from "components/metrics/vllm/CommitsOnRedTrendPanel"; +import DockerBuildRuntimePanel from "components/metrics/vllm/DockerBuildRuntimePanel"; import DurationDistributionPanel from "components/metrics/vllm/DurationDistributionPanel"; import JobGroupFilter, { JobGroup, } from "components/metrics/vllm/JobGroupFilter"; import JobReliabilityPanel from "components/metrics/vllm/JobReliabilityPanel"; +import JobRuntimePanel from "components/metrics/vllm/JobRuntimePanel"; import MergesPanel from "components/metrics/vllm/MergesPanel"; import MostRetriedJobsTable from "components/metrics/vllm/MostRetriedJobsTable"; import QueueWaitPerBuildPanel from "components/metrics/vllm/QueueWaitPerBuildPanel"; @@ -375,6 +377,23 @@ export default function Page() { } ); + const { data: jobRuntimeTrendsData } = useClickHouseAPIImmutable( + "vllm/job_runtime_trends", + { + ...timeParams, + repo: "https://github.com/vllm-project/vllm.git", + jobGroups: selectedJobGroups, + } + ); + + const { data: dockerBuildRuntimeData } = useClickHouseAPIImmutable( + "vllm/docker_build_runtime", + { + ...timeParams, + repo: "https://github.com/vllm-project/vllm.git", + } + ); + const { data: trunkHealthData } = useClickHouseAPIImmutable( "vllm/trunk_health", { @@ -968,9 +987,17 @@ export default function Page() { - + + + + + + + + + {/* Section 3b: Queue Utilization & Cost */} From 98ca56afe6ac31da190bb4d8175fbe1cd34219e9 Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 3 Nov 2025 19:24:15 -0800 Subject: [PATCH 2/9] Replace sections with tabs --- torchci/pages/metrics/vllm.tsx | 516 ++++++++++++++++++--------------- 1 file changed, 282 insertions(+), 234 deletions(-) diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index 2c769855b1..c46b67ac8d 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -7,6 +7,8 @@ import { Link, Skeleton, Stack, + Tab, + Tabs, Typography, } from "@mui/material"; import CiDurationsPanel from "components/metrics/vllm/CiDurationsPanel"; @@ -220,6 +222,11 @@ export default function Page() { "torch_nightly", "main", ]); + const [selectedTab, setSelectedTab] = useState(0); + + const handleTabChange = (_: React.SyntheticEvent, newValue: number) => { + setSelectedTab(newValue); + }; const timeParams = { startTime: startTime.utc().format("YYYY-MM-DDTHH:mm:ss.SSS"), @@ -692,7 +699,7 @@ export default function Page() { : _.meanBy(recoveryTimes, "recovery_hours"); return ( -
+ - {/* Section 1: Key Metrics Summary Cards */} + {/* Overview - Always Visible */} Key Metrics Overview @@ -852,249 +859,290 @@ export default function Page() { /> - {/* Section 2: CI Reliability */} - - - CI Reliability - - - - (v ?? 1) < 0.85, - tooltip: - "Percentage of main branch builds with zero hard test failures. Builds with only soft failures (flaky tests) count as passed. Canceled builds excluded from calculation.", - delta: overallSuccessRateDelta, - }, - ]} - /> - (v ?? 0) > 10, - tooltip: - "Count of main branch CI runs with hard test failures (soft failures excluded) in selected time period.", - delta: totalFailedDelta, + {/* Tabs for detailed sections */} + + - (v ?? 0) > 0.01, - tooltip: - "Percentage of jobs that were manually or automatically retried. Low values (<1%) indicate stable infrastructure. High values may indicate flaky tests or infrastructure issues.", - delta: null, // TODO: Add delta when we have previous retry data + "& .Mui-selected": { + fontWeight: 700, }, - ]} - /> - - - - - - - - - - - - - - - - - - - - - - - - - - - - Trunk Health - - - - (v ?? 0) > 12, - tooltip: - "Average time trunk stays broken before being fixed. Measured from when trunk first breaks (success→failure) to when it's fixed (failure→success). Includes nights, weekends, and investigation time. Lower is better.", - delta: null, // TODO: Calculate when we have previous recovery data + }} + TabIndicatorProps={{ + sx: { + height: 3, + borderRadius: "3px 3px 0 0", }, - ]} - /> - - - - - - - - - - - - - - - - - + }} + > + + + + + + - {/* Section 3: CI Duration Analysis */} - - - CI Duration Analysis - - - - - - - - - - - - - - - - - - - - - - - - {/* Section 3b: Queue Utilization & Cost */} - - - Queue Utilization & Cost - - - - {isQueueLoading ? ( - <> + {/* Tab 0: Reliability */} + {selectedTab === 0 && ( + <> + + (v ?? 1) < 0.85, + tooltip: + "Percentage of main branch builds with zero hard test failures. Builds with only soft failures (flaky tests) count as passed. Canceled builds excluded from calculation.", + delta: overallSuccessRateDelta, + }, + ]} + /> + (v ?? 0) > 10, + tooltip: + "Count of main branch CI runs with hard test failures (soft failures excluded) in selected time period.", + delta: totalFailedDelta, + }, + ]} + /> + (v ?? 0) > 0.01, + tooltip: + "Percentage of jobs that were manually or automatically retried. Low values (<1%) indicate stable infrastructure. High values may indicate flaky tests or infrastructure issues.", + delta: null, // TODO: Add delta when we have previous retry data + }, + ]} + /> + (v ?? 0) > 12, + tooltip: + "Average time trunk stays broken before being fixed. Measured from when trunk first breaks (success→failure) to when it's fixed (failure→success). Includes nights, weekends, and investigation time. Lower is better.", + delta: null, // TODO: Calculate when we have previous recovery data + }, + ]} + /> + + - + - + - - ) : ( - <> + + - + - + - - )} - - - {/* Section 4: PR Cycle Metrics */} - - - PR Cycle Metrics - - - - (v ?? 0) > 0.5, - tooltip: - "Percentage of merged PRs where a human clicked 'Merge' button instead of using GitHub auto-merge. Includes both clean manual merges AND force merges. High values may indicate slow merge queues or low CI trust.", - delta: manualMergedPctDelta, - }, - ]} - /> - (v ?? 0) > 24, - badThreshold2: (v) => (v ?? 0) > 72, - tooltip: - "Time from PR ready (labeled 'ready' or created) to first human review comment. P50 = median, P90 = 90th percentile. Excludes bot reviews.", - delta: timeToReviewP50Delta, - delta2: timeToReviewP90Delta, - }, - ]} - /> - (v ?? 0) > 48, - badThreshold2: (v) => (v ?? 0) > 120, - tooltip: - "Time from first human review to first approval from a maintainer (MEMBER/OWNER/COLLABORATOR). P50 = median, P90 = 90th percentile.", - delta: timeToApprovalP50Delta, - delta2: timeToApprovalP90Delta, - }, - ]} - /> - - - - - - -
+ + + + + + + + + + + + + + + + + + + + + + + + + + + )} + + {/* Tab 1: Duration Analysis */} + {selectedTab === 1 && ( + <> + + + + + + + + + + + + + + + + + + + + + + + )} + + {/* Tab 2: Source Control */} + {selectedTab === 2 && ( + <> + + (v ?? 0) > 0.5, + tooltip: + "Percentage of merged PRs where a human clicked 'Merge' button instead of using GitHub auto-merge. Includes both clean manual merges AND force merges. High values may indicate slow merge queues or low CI trust.", + delta: manualMergedPctDelta, + }, + ]} + /> + (v ?? 0) > 24, + badThreshold2: (v) => (v ?? 0) > 72, + tooltip: + "Time from PR ready (labeled 'ready' or created) to first human review comment. P50 = median, P90 = 90th percentile. Excludes bot reviews.", + delta: timeToReviewP50Delta, + delta2: timeToReviewP90Delta, + }, + ]} + /> + (v ?? 0) > 48, + badThreshold2: (v) => (v ?? 0) > 120, + tooltip: + "Time from first human review to first approval from a maintainer (MEMBER/OWNER/COLLABORATOR). P50 = median, P90 = 90th percentile.", + delta: timeToApprovalP50Delta, + delta2: timeToApprovalP90Delta, + }, + ]} + /> + + + + + + + + )} + + {/* Tab 3: Utilization & Cost */} + {selectedTab === 3 && ( + <> + + {isQueueLoading ? ( + <> + + + + + + + + ) : ( + <> + + + + + + + + )} + + + )} + ); } From 6641870aa7ed41942875b1fa6d965e598a7f46d6 Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 3 Nov 2025 19:37:09 -0800 Subject: [PATCH 3/9] Lint --- .../vllm/docker_build_runtime/params.json | 1 - .../vllm/docker_build_runtime/query.sql | 8 +- .../vllm/job_runtime_trends/params.json | 1 - .../vllm/job_runtime_trends/query.sql | 18 ++- .../metrics/vllm/DockerBuildRuntimePanel.tsx | 11 +- .../metrics/vllm/JobRuntimePanel.tsx | 33 +++--- torchci/components/metrics/vllm/constants.ts | 57 ++++++++- torchci/pages/metrics/vllm.tsx | 109 +++++++----------- 8 files changed, 140 insertions(+), 98 deletions(-) diff --git a/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json b/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json index e16edc9fce..7f2c8af8ca 100644 --- a/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json +++ b/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json @@ -12,4 +12,3 @@ } ] } - diff --git a/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql b/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql index 8d16cd611a..8041cdb538 100644 --- a/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql +++ b/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql @@ -18,13 +18,15 @@ WITH jobs AS ( AND tupleElement(job, 'finished_at') IS NOT NULL AND tupleElement(job, 'started_at') >= {startTime: DateTime64(3) } AND tupleElement(job, 'started_at') < {stopTime: DateTime64(3) } - AND lowerUTF8(tupleElement(job, 'state')) IN ('passed', 'finished', 'success', 'failed') + AND lowerUTF8(tupleElement(job, 'state')) IN ( + 'passed', 'finished', 'success', 'failed' + ) ) SELECT job_started_at AS timestamp, build_number, - round(dateDiff('second', job_started_at, job_finished_at) / 60.0, 2) AS runtime_minutes + round(dateDiff('second', job_started_at, job_finished_at) / 60.0, 2) + AS runtime_minutes FROM jobs ORDER BY job_started_at ASC - diff --git a/torchci/clickhouse_queries/vllm/job_runtime_trends/params.json b/torchci/clickhouse_queries/vllm/job_runtime_trends/params.json index d8b1145a47..5f8714d676 100644 --- a/torchci/clickhouse_queries/vllm/job_runtime_trends/params.json +++ b/torchci/clickhouse_queries/vllm/job_runtime_trends/params.json @@ -14,4 +14,3 @@ } ] } - diff --git a/torchci/clickhouse_queries/vllm/job_runtime_trends/query.sql b/torchci/clickhouse_queries/vllm/job_runtime_trends/query.sql index 722cdcf9d3..48c347009a 100644 --- a/torchci/clickhouse_queries/vllm/job_runtime_trends/query.sql +++ b/torchci/clickhouse_queries/vllm/job_runtime_trends/query.sql @@ -18,7 +18,9 @@ WITH jobs AS ( AND tupleElement(job, 'finished_at') IS NOT NULL AND tupleElement(job, 'started_at') >= {startTime: DateTime64(3) } AND tupleElement(job, 'started_at') < {stopTime: DateTime64(3) } - AND lowerUTF8(tupleElement(job, 'state')) IN ('passed', 'finished', 'success', 'failed') + AND lowerUTF8(tupleElement(job, 'state')) IN ( + 'passed', 'finished', 'success', 'failed' + ) -- Job group filtering: AMD, Torch Nightly, or Main AND ( ( @@ -49,10 +51,16 @@ SELECT job_name, toDate(job_started_at) AS date, count() AS count, - round(avg(dateDiff('second', job_started_at, job_finished_at) / 60.0), 2) AS mean_runtime_minutes, - round(quantile(0.9)(dateDiff('second', job_started_at, job_finished_at) / 60.0), 2) AS p90_runtime_minutes, - round(max(dateDiff('second', job_started_at, job_finished_at) / 60.0), 2) AS max_runtime_minutes + round(avg(dateDiff('second', job_started_at, job_finished_at) / 60.0), 2) + AS mean_runtime_minutes, + round( + quantile(0.9) ( + dateDiff('second', job_started_at, job_finished_at) / 60.0 + ), + 2 + ) AS p90_runtime_minutes, + round(max(dateDiff('second', job_started_at, job_finished_at) / 60.0), 2) + AS max_runtime_minutes FROM jobs GROUP BY job_name, date ORDER BY job_name ASC, date ASC - diff --git a/torchci/components/metrics/vllm/DockerBuildRuntimePanel.tsx b/torchci/components/metrics/vllm/DockerBuildRuntimePanel.tsx index d404a545f7..0124f6c617 100644 --- a/torchci/components/metrics/vllm/DockerBuildRuntimePanel.tsx +++ b/torchci/components/metrics/vllm/DockerBuildRuntimePanel.tsx @@ -16,10 +16,10 @@ function formatTooltip(params: any): string { if (!params || !params.data) return ""; const data = params.data; - + // Handle both scatter (array) and line (object) series let timestamp, runtime, buildNumber; - + if (Array.isArray(data)) { timestamp = data[0]; runtime = data[1]; @@ -91,9 +91,9 @@ export default function DockerBuildRuntimePanel({ const runtimes = (data || []).map((d) => d.runtime_minutes); const avgRuntime = runtimes.length ? _.mean(runtimes).toFixed(1) : "N/A"; const p90Runtime = runtimes.length - ? runtimes.sort((a, b) => a - b)[ - Math.floor(runtimes.length * 0.9) - ].toFixed(1) + ? runtimes + .sort((a, b) => a - b) + [Math.floor(runtimes.length * 0.9)].toFixed(1) : "N/A"; const options: EChartsOption = { @@ -162,4 +162,3 @@ export default function DockerBuildRuntimePanel({ /> ); } - diff --git a/torchci/components/metrics/vllm/JobRuntimePanel.tsx b/torchci/components/metrics/vllm/JobRuntimePanel.tsx index 3a1963164b..80ed0c5dc1 100644 --- a/torchci/components/metrics/vllm/JobRuntimePanel.tsx +++ b/torchci/components/metrics/vllm/JobRuntimePanel.tsx @@ -15,12 +15,7 @@ import { EChartsOption } from "echarts"; import ReactECharts from "echarts-for-react"; import { useDarkMode } from "lib/DarkModeContext"; import React, { useState } from "react"; -import { - getChartTitle, - getCrosshairTooltipConfig, - getReactEChartsProps, - GRID_DEFAULT, -} from "./chartUtils"; +import { getCrosshairTooltipConfig, getReactEChartsProps } from "./chartUtils"; import { COLOR_SUCCESS, COLOR_WARNING } from "./constants"; interface JobRuntimeData { @@ -94,7 +89,9 @@ function formatChartTooltip(params: any): string { params.forEach((p: any) => { if (p.value !== undefined && p.value !== null) { - result += `${p.marker} ${p.seriesName}: ${p.value.toFixed(1)}m
`; + result += `${p.marker} ${p.seriesName}: ${p.value.toFixed( + 1 + )}m
`; } }); @@ -165,8 +162,8 @@ export default function JobRuntimePanel({ ? -1 : 1 : aValue > bValue - ? -1 - : 1; + ? -1 + : 1; } return sortOrder === "asc" @@ -177,7 +174,10 @@ export default function JobRuntimePanel({ // Auto-select first job if nothing is selected or if selected job is no longer in the list React.useEffect(() => { if (sortedStats.length > 0) { - if (!selectedJob || !sortedStats.some((s) => s.job_name === selectedJob)) { + if ( + !selectedJob || + !sortedStats.some((s) => s.job_name === selectedJob) + ) { setSelectedJob(sortedStats[0].job_name); } } @@ -206,9 +206,7 @@ export default function JobRuntimePanel({ .sort((a, b) => a.date.localeCompare(b.date)) : []; - const chartDates = selectedJobData.map((d) => - dayjs(d.date).format("MMM D") - ); + const chartDates = selectedJobData.map((d) => dayjs(d.date).format("MMM D")); const chartMeanData = selectedJobData.map((d) => d.mean_runtime_minutes); const chartP90Data = selectedJobData.map((d) => d.p90_runtime_minutes); @@ -379,7 +377,14 @@ export default function JobRuntimePanel({ {/* Chart on the right */} - + ({ + borderBottom: 2, + borderColor: "divider", + mb: 3, + mt: 2, + bgcolor: darkMode ? "rgba(255, 255, 255, 0.05)" : "rgba(0, 0, 0, 0.02)", + borderRadius: "8px 8px 0 0", + px: 2, + }), + tabsSx: { + "& .MuiTab-root": { + fontSize: "1rem", + fontWeight: 600, + minHeight: 56, + textTransform: "none", + px: 3, + }, + "& .Mui-selected": { + fontWeight: 700, + }, + }, + indicatorSx: { + height: 3, + borderRadius: "3px 3px 0 0", + }, +}; + +// ============================================================================ +// Data Visualization Colors +// ============================================================================ export const COLOR_SUCCESS = "#3ba272"; // Green - for successful/passing states export const COLOR_ERROR = "#ee6666"; // Red - for failures/errors export const COLOR_WARNING = "#fc9403"; // Orange - for warnings/manual actions diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index c46b67ac8d..027231c400 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -36,6 +36,17 @@ import { VllmDualScalarPanel, VllmScalarPanel, } from "components/metrics/vllm/VllmScalarPanel"; +import { + DEFAULT_MIN_RUNS_JOB_RELIABILITY, + DEFAULT_MIN_RUNS_RETRY_STATS, + JOB_RUNTIME_PANEL_HEIGHT, + METRIC_CARD_HEIGHT, + PIPELINE_NAME, + ROW_HEIGHT, + TAB_CONFIG, + VLLM_REPO_SHORT, + VLLM_REPO_URL, +} from "components/metrics/vllm/constants"; import dayjs from "dayjs"; import { useDarkMode } from "lib/DarkModeContext"; import { useClickHouseAPIImmutable } from "lib/GeneralUtils"; @@ -43,11 +54,6 @@ import _ from "lodash"; import React, { useState } from "react"; import { TimeRangePicker } from "../metrics"; -const ROW_HEIGHT = 375; -const METRIC_CARD_HEIGHT = 200; // Height for key metric cards (reduced by ~20% from default) - -// moved MergesPanel and CiDurationsPanel to components - // Helper function to safely extract PR cycle data values function getPrCycleValue( data: any[] | undefined, @@ -248,7 +254,7 @@ export default function Page() { { ...timeParams, granularity: "day", - repo: "vllm-project/vllm", + repo: VLLM_REPO_SHORT, } ); @@ -256,9 +262,8 @@ export default function Page() { "vllm/ci_run_duration", { ...timeParams, - // Buildkite uses full repo URL with .git in vLLM dataset - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, } ); @@ -266,8 +271,8 @@ export default function Page() { "vllm/ci_run_duration", { ...prevTimeParams, - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, } ); @@ -331,7 +336,7 @@ export default function Page() { "vllm/pr_cycle_time_breakdown", { ...timeParams, - repo: "vllm-project/vllm", + repo: VLLM_REPO_SHORT, } ); @@ -339,7 +344,7 @@ export default function Page() { "vllm/pr_cycle_time_breakdown", { ...prevTimeParams, - repo: "vllm-project/vllm", + repo: VLLM_REPO_SHORT, } ); @@ -348,8 +353,8 @@ export default function Page() { { ...timeParams, granularity: "day", - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, jobGroups: selectedJobGroups, } ); @@ -357,8 +362,8 @@ export default function Page() { const { data: retryData } = useClickHouseAPIImmutable("vllm/rebuild_rate", { ...timeParams, granularity: "day", - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, jobGroups: selectedJobGroups, }); @@ -366,9 +371,9 @@ export default function Page() { "vllm/job_retry_stats", { ...timeParams, - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", - minRuns: 5, + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, + minRuns: DEFAULT_MIN_RUNS_RETRY_STATS, jobGroups: selectedJobGroups, } ); @@ -377,9 +382,9 @@ export default function Page() { "vllm/job_reliability", { ...timeParams, - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", - minRuns: 3, + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, + minRuns: DEFAULT_MIN_RUNS_JOB_RELIABILITY, jobGroups: selectedJobGroups, } ); @@ -388,7 +393,7 @@ export default function Page() { "vllm/job_runtime_trends", { ...timeParams, - repo: "https://github.com/vllm-project/vllm.git", + repo: VLLM_REPO_URL, jobGroups: selectedJobGroups, } ); @@ -397,7 +402,7 @@ export default function Page() { "vllm/docker_build_runtime", { ...timeParams, - repo: "https://github.com/vllm-project/vllm.git", + repo: VLLM_REPO_URL, } ); @@ -406,8 +411,8 @@ export default function Page() { { ...timeParams, granularity: "day", - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, jobGroups: selectedJobGroups, } ); @@ -416,8 +421,8 @@ export default function Page() { "vllm/trunk_recovery_time", { ...timeParams, - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, jobGroups: selectedJobGroups, } ); @@ -428,8 +433,8 @@ export default function Page() { { ...prevTimeParams, granularity: "day", - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, jobGroups: selectedJobGroups, } ); @@ -439,8 +444,8 @@ export default function Page() { { ...prevTimeParams, granularity: "day", - repo: "https://github.com/vllm-project/vllm.git", - pipelineName: "CI", + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, jobGroups: selectedJobGroups, } ); @@ -450,7 +455,7 @@ export default function Page() { { ...prevTimeParams, granularity: "day", - repo: "vllm-project/vllm", + repo: VLLM_REPO_SHORT, } ); @@ -860,41 +865,13 @@ export default function Page() { {/* Tabs for detailed sections */} - + @@ -1036,7 +1013,7 @@ export default function Page() { - + From cc8256a01212f68be61d88bac04c05f82c5fbb69 Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 24 Nov 2025 08:56:09 -0800 Subject: [PATCH 4/9] Add recent build tracker for jobs --- .../vllm/job_list/params.json | 19 + .../vllm/job_list/query.sql | 38 ++ .../vllm/recent_job_builds/params.json | 19 + .../vllm/recent_job_builds/query.sql | 57 +++ .../metrics/vllm/JobBuildsPanel.tsx | 428 ++++++++++++++++++ torchci/components/metrics/vllm/constants.ts | 1 + torchci/pages/metrics/vllm.tsx | 18 + 7 files changed, 580 insertions(+) create mode 100644 torchci/clickhouse_queries/vllm/job_list/params.json create mode 100644 torchci/clickhouse_queries/vllm/job_list/query.sql create mode 100644 torchci/clickhouse_queries/vllm/recent_job_builds/params.json create mode 100644 torchci/clickhouse_queries/vllm/recent_job_builds/query.sql create mode 100644 torchci/components/metrics/vllm/JobBuildsPanel.tsx diff --git a/torchci/clickhouse_queries/vllm/job_list/params.json b/torchci/clickhouse_queries/vllm/job_list/params.json new file mode 100644 index 0000000000..a5515c2859 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/job_list/params.json @@ -0,0 +1,19 @@ +{ + "params": { + "repo": "String", + "pipelineName": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)", + "jobGroups": "Array(String)" + }, + "tests": [ + { + "repo": "https://github.com/vllm-project/vllm.git", + "pipelineName": "CI", + "startTime": "2025-11-17T00:00:00.000", + "stopTime": "2025-11-24T00:00:00.000", + "jobGroups": ["amd", "torch_nightly", "main"] + } + ] +} + diff --git a/torchci/clickhouse_queries/vllm/job_list/query.sql b/torchci/clickhouse_queries/vllm/job_list/query.sql new file mode 100644 index 0000000000..2e83f31f50 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/job_list/query.sql @@ -0,0 +1,38 @@ +-- vLLM job list for build exploration +-- Returns a list of all jobs in the time period with basic stats +-- Used for the job selector in the JobBuildsPanel component +-- Only tracks main branch to focus on production CI + +SELECT + tupleElement(job, 'name') AS job_name, + COUNT(*) AS total_runs, + countIf(lowerUTF8(tupleElement(job, 'state')) IN ('passed', 'finished', 'success')) AS passed_count, + countIf(lowerUTF8(tupleElement(job, 'state')) = 'failed') AS failed_count, + max(tupleElement(job, 'finished_at')) AS last_run_at +FROM vllm.vllm_buildkite_jobs +WHERE + tupleElement(pipeline, 'repository') = {repo: String} + AND tupleElement(pipeline, 'name') = {pipelineName: String} + AND tupleElement(build, 'branch') = 'main' + AND tupleElement(job, 'finished_at') IS NOT NULL + AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3)} + AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3)} + -- Job group filtering: AMD, Torch Nightly, or Main + AND ( + ( + has({jobGroups: Array(String)}, 'amd') + AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD') > 0 + ) + OR ( + has({jobGroups: Array(String)}, 'torch_nightly') + AND positionCaseInsensitive(tupleElement(job, 'name'), 'Torch Nightly') > 0 + ) + OR ( + has({jobGroups: Array(String)}, 'main') + AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD') = 0 + AND positionCaseInsensitive(tupleElement(job, 'name'), 'Torch Nightly') = 0 + ) + ) +GROUP BY job_name +ORDER BY last_run_at DESC, total_runs DESC + diff --git a/torchci/clickhouse_queries/vllm/recent_job_builds/params.json b/torchci/clickhouse_queries/vllm/recent_job_builds/params.json new file mode 100644 index 0000000000..2f50f3cf2e --- /dev/null +++ b/torchci/clickhouse_queries/vllm/recent_job_builds/params.json @@ -0,0 +1,19 @@ +{ + "params": { + "repo": "String", + "pipelineName": "String", + "jobName": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)" + }, + "tests": [ + { + "repo": "https://github.com/vllm-project/vllm.git", + "pipelineName": "CI", + "jobName": "Test Example Job", + "startTime": "2025-11-17T00:00:00.000", + "stopTime": "2025-11-24T00:00:00.000" + } + ] +} + diff --git a/torchci/clickhouse_queries/vllm/recent_job_builds/query.sql b/torchci/clickhouse_queries/vllm/recent_job_builds/query.sql new file mode 100644 index 0000000000..a1914ad769 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/recent_job_builds/query.sql @@ -0,0 +1,57 @@ +-- vLLM recent builds for a specific job +-- Returns all builds within the time range for a given job name +-- Shows build details: number, state, duration, timestamps, etc. +-- Only tracks main branch + +WITH job_builds AS ( + SELECT + toUInt32(tupleElement(build, 'number')) AS build_number, + tupleElement(build, 'id') AS build_id, + tupleElement(build, 'state') AS build_state, + tupleElement(build, 'web_url') AS build_url, + tupleElement(build, 'started_at') AS build_started_at, + tupleElement(build, 'finished_at') AS build_finished_at, + tupleElement(build, 'commit') AS commit, + tupleElement(build, 'message') AS commit_message, + tupleElement(job, 'name') AS job_name, + tupleElement(job, 'state') AS job_state, + tupleElement(job, 'soft_failed') AS soft_failed, + tupleElement(job, 'started_at') AS job_started_at, + tupleElement(job, 'finished_at') AS job_finished_at, + tupleElement(job, 'web_url') AS job_url, + -- Calculate duration in hours + dateDiff( + 'second', + tupleElement(job, 'started_at'), + tupleElement(job, 'finished_at') + ) / 3600.0 AS duration_hours + FROM vllm.vllm_buildkite_jobs + WHERE + tupleElement(pipeline, 'repository') = {repo: String} + AND tupleElement(pipeline, 'name') = {pipelineName: String} + AND tupleElement(build, 'branch') = 'main' + AND tupleElement(job, 'name') = {jobName: String} + AND tupleElement(job, 'finished_at') IS NOT NULL + AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3)} + AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3)} +) + +SELECT + build_number, + build_id, + build_state, + build_url, + build_started_at, + build_finished_at, + commit, + commit_message, + job_name, + job_state, + soft_failed, + job_started_at, + job_finished_at, + job_url, + duration_hours +FROM job_builds +ORDER BY job_finished_at DESC + diff --git a/torchci/components/metrics/vllm/JobBuildsPanel.tsx b/torchci/components/metrics/vllm/JobBuildsPanel.tsx new file mode 100644 index 0000000000..d124761164 --- /dev/null +++ b/torchci/components/metrics/vllm/JobBuildsPanel.tsx @@ -0,0 +1,428 @@ +import OpenInNewIcon from "@mui/icons-material/OpenInNew"; +import { + Box, + Chip, + Link, + Paper, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + TableSortLabel, + TextField, + Tooltip, +} from "@mui/material"; +import dayjs from "dayjs"; +import { useDarkMode } from "lib/DarkModeContext"; +import { useClickHouseAPIImmutable } from "lib/GeneralUtils"; +import React, { useState } from "react"; +import { + COLOR_ERROR, + COLOR_GRAY, + COLOR_SUCCESS, + COLOR_WARNING, + PIPELINE_NAME, + VLLM_REPO_URL, +} from "./constants"; + +interface JobListData { + job_name: string; + total_runs: number; + passed_count: number; + failed_count: number; + last_run_at: string; +} + +interface RecentBuildData { + build_number: number; + build_id: string; + build_state: string; + build_url: string; + build_started_at: string | null; + build_finished_at: string | null; + commit: string; + commit_message: string; + job_name: string; + job_state: string; + soft_failed: boolean; + job_started_at: string | null; + job_finished_at: string | null; + job_url: string; + duration_hours: number | null; +} + +type JobSortField = "job_name" | "total_runs" | "passed_count" | "failed_count"; +type SortOrder = "asc" | "desc"; + +// Helper function to format duration +function formatDuration(hours: number | null | undefined): string { + if (hours === null || hours === undefined) return "-"; + if (hours < 1) { + return `${(hours * 60).toFixed(0)}m`; + } + return `${hours.toFixed(2)}h`; +} + +// Helper function to get state color +function getStateColor( + state: string, + softFailed: boolean +): { bg: string; text: string } { + const stateLower = state.toLowerCase(); + if (stateLower === "passed" || stateLower === "finished" || stateLower === "success") { + return { bg: COLOR_SUCCESS, text: "#fff" }; + } + if (stateLower === "failed") { + if (softFailed) { + return { bg: COLOR_WARNING, text: "#fff" }; + } + return { bg: COLOR_ERROR, text: "#fff" }; + } + if (stateLower === "canceled" || stateLower === "cancelled") { + return { bg: COLOR_GRAY, text: "#fff" }; + } + return { bg: "#999", text: "#fff" }; +} + +// Helper function to get state label +function getStateLabel(state: string, softFailed: boolean): string { + const stateLower = state.toLowerCase(); + if (stateLower === "failed" && softFailed) { + return "Soft Failed"; + } + return state.charAt(0).toUpperCase() + state.slice(1).toLowerCase(); +} + +export default function JobBuildsPanel({ + data, + timeParams, + jobGroups, +}: { + data: JobListData[] | undefined; + timeParams: { startTime: string; stopTime: string }; + jobGroups: string[]; +}) { + const { darkMode } = useDarkMode(); + const [sortField, setSortField] = useState("last_run_at"); + const [sortOrder, setSortOrder] = useState("desc"); + const [searchQuery, setSearchQuery] = useState(""); + const [selectedJob, setSelectedJob] = useState(null); + + // Fetch recent builds for selected job + const { data: recentBuildsData } = useClickHouseAPIImmutable( + "vllm/recent_job_builds", + selectedJob + ? { + ...timeParams, + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, + jobName: selectedJob, + } + : null, + selectedJob !== null + ); + + // Filter by search query + const filteredJobs = (data || []).filter((job) => + job.job_name.toLowerCase().includes(searchQuery.toLowerCase()) + ); + + // Sort the filtered data + const sortedJobs = [...filteredJobs].sort((a, b) => { + let aValue: number | string = a[sortField]; + let bValue: number | string = b[sortField]; + + if (sortField === "job_name") { + aValue = (aValue as string).toLowerCase(); + bValue = (bValue as string).toLowerCase(); + return sortOrder === "asc" + ? aValue < bValue + ? -1 + : 1 + : aValue > bValue + ? -1 + : 1; + } + + return sortOrder === "asc" + ? (aValue as number) - (bValue as number) + : (bValue as number) - (aValue as number); + }); + + // Auto-select first job if nothing is selected or if selected job is no longer in the list + React.useEffect(() => { + if (sortedJobs.length > 0) { + if ( + !selectedJob || + !sortedJobs.some((j) => j.job_name === selectedJob) + ) { + setSelectedJob(sortedJobs[0].job_name); + } + } + }, [sortedJobs, selectedJob]); + + // Handle sort request + function handleSort(field: JobSortField) { + if (sortField === field) { + setSortOrder(sortOrder === "asc" ? "desc" : "asc"); + } else { + setSortField(field); + setSortOrder("desc"); + } + } + + // Handle row click + function handleRowClick(jobName: string) { + setSelectedJob(jobName); + } + + const recentBuilds = (recentBuildsData || []) as RecentBuildData[]; + + return ( + + + {/* Jobs table on the left */} + + setSearchQuery(e.target.value)} + sx={{ mb: 1 }} + fullWidth + /> + + + + + + handleSort("job_name")} + > + Job Name + + + + handleSort("total_runs")} + > + Runs + + + + handleSort("passed_count")} + > + ✓ + + + + handleSort("failed_count")} + > + ✗ + + + + + + {sortedJobs.map((job) => ( + handleRowClick(job.job_name)} + selected={selectedJob === job.job_name} + sx={{ + cursor: "pointer", + "&.Mui-selected": { + backgroundColor: darkMode + ? "rgba(144, 202, 249, 0.16)" + : "rgba(25, 118, 210, 0.12)", + }, + "&.Mui-selected:hover": { + backgroundColor: darkMode + ? "rgba(144, 202, 249, 0.24)" + : "rgba(25, 118, 210, 0.18)", + }, + }} + > + + {job.job_name} + + {job.total_runs} + + {job.passed_count} + + + {job.failed_count} + + + ))} + +
+
+
+ + {/* Recent builds table on the right */} + + + {selectedJob && ( + + Recent Builds: {selectedJob} + + )} + + + + + + Build # + Status + Duration + Finished At + Commit + Links + + + + {recentBuilds.length === 0 && ( + + + {selectedJob + ? "No recent builds found" + : "Select a job to view builds"} + + + )} + {recentBuilds.map((build) => { + const stateColors = getStateColor( + build.job_state, + build.soft_failed + ); + return ( + + + {build.build_number} + + + + + + {formatDuration(build.duration_hours)} + + + {build.job_finished_at + ? dayjs(build.job_finished_at).format("M/D/YY h:mm A") + : "-"} + + + + + {build.commit.slice(0, 7)} + + + + + + + } + size="small" + clickable + sx={{ fontSize: "0.7rem", height: 22 }} + /> + + + } + size="small" + clickable + sx={{ fontSize: "0.7rem", height: 22 }} + /> + + + + + ); + })} + +
+
+
+
+
+ ); +} + diff --git a/torchci/components/metrics/vllm/constants.ts b/torchci/components/metrics/vllm/constants.ts index 390d8c8622..c5a5409904 100644 --- a/torchci/components/metrics/vllm/constants.ts +++ b/torchci/components/metrics/vllm/constants.ts @@ -6,6 +6,7 @@ export const ROW_HEIGHT = 375; export const METRIC_CARD_HEIGHT = 200; export const JOB_RUNTIME_PANEL_HEIGHT = ROW_HEIGHT + 150; +export const JOB_BUILDS_PANEL_HEIGHT = 600; // ============================================================================ // Repository Constants diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index 027231c400..2f1c69e167 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -18,6 +18,7 @@ import DurationDistributionPanel from "components/metrics/vllm/DurationDistribut import JobGroupFilter, { JobGroup, } from "components/metrics/vllm/JobGroupFilter"; +import JobBuildsPanel from "components/metrics/vllm/JobBuildsPanel"; import JobReliabilityPanel from "components/metrics/vllm/JobReliabilityPanel"; import JobRuntimePanel from "components/metrics/vllm/JobRuntimePanel"; import MergesPanel from "components/metrics/vllm/MergesPanel"; @@ -39,6 +40,7 @@ import { import { DEFAULT_MIN_RUNS_JOB_RELIABILITY, DEFAULT_MIN_RUNS_RETRY_STATS, + JOB_BUILDS_PANEL_HEIGHT, JOB_RUNTIME_PANEL_HEIGHT, METRIC_CARD_HEIGHT, PIPELINE_NAME, @@ -398,6 +400,13 @@ export default function Page() { } ); + const { data: jobListData } = useClickHouseAPIImmutable("vllm/job_list", { + ...timeParams, + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, + jobGroups: selectedJobGroups, + }); + const { data: dockerBuildRuntimeData } = useClickHouseAPIImmutable( "vllm/docker_build_runtime", { @@ -990,6 +999,15 @@ export default function Page() { + + + + + )} From 36db924610f981015cc06b17b11067ba65953510 Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 24 Nov 2025 09:09:54 -0800 Subject: [PATCH 5/9] Add nightly builds tracker --- .../vllm/build_failed_jobs/params.json | 15 + .../vllm/build_failed_jobs/query.sql | 30 ++ .../vllm/continuous_builds/params.json | 17 + .../vllm/continuous_builds/query.sql | 56 +++ .../metrics/vllm/ContinuousBuildTracker.tsx | 384 ++++++++++++++++++ torchci/pages/metrics/vllm.tsx | 43 +- 6 files changed, 536 insertions(+), 9 deletions(-) create mode 100644 torchci/clickhouse_queries/vllm/build_failed_jobs/params.json create mode 100644 torchci/clickhouse_queries/vllm/build_failed_jobs/query.sql create mode 100644 torchci/clickhouse_queries/vllm/continuous_builds/params.json create mode 100644 torchci/clickhouse_queries/vllm/continuous_builds/query.sql create mode 100644 torchci/components/metrics/vllm/ContinuousBuildTracker.tsx diff --git a/torchci/clickhouse_queries/vllm/build_failed_jobs/params.json b/torchci/clickhouse_queries/vllm/build_failed_jobs/params.json new file mode 100644 index 0000000000..20bea144da --- /dev/null +++ b/torchci/clickhouse_queries/vllm/build_failed_jobs/params.json @@ -0,0 +1,15 @@ +{ + "params": { + "repo": "String", + "pipelineName": "String", + "buildNumber": "UInt32" + }, + "tests": [ + { + "repo": "https://github.com/vllm-project/vllm.git", + "pipelineName": "CI", + "buildNumber": 12345 + } + ] +} + diff --git a/torchci/clickhouse_queries/vllm/build_failed_jobs/query.sql b/torchci/clickhouse_queries/vllm/build_failed_jobs/query.sql new file mode 100644 index 0000000000..c2281b634f --- /dev/null +++ b/torchci/clickhouse_queries/vllm/build_failed_jobs/query.sql @@ -0,0 +1,30 @@ +-- vLLM failed jobs for a specific build +-- Returns all jobs that hard-failed (soft failures excluded) for a given build number +-- Shows job details: name, state, duration, timestamps, etc. + +SELECT + tupleElement(job, 'name') AS job_name, + tupleElement(job, 'state') AS job_state, + tupleElement(job, 'soft_failed') AS soft_failed, + tupleElement(job, 'started_at') AS job_started_at, + tupleElement(job, 'finished_at') AS job_finished_at, + tupleElement(job, 'web_url') AS job_url, + tupleElement(job, 'exit_status') AS exit_status, + -- Calculate duration in hours + dateDiff( + 'second', + tupleElement(job, 'started_at'), + tupleElement(job, 'finished_at') + ) / 3600.0 AS duration_hours, + toUInt32(tupleElement(build, 'number')) AS build_number, + tupleElement(build, 'web_url') AS build_url +FROM vllm.vllm_buildkite_jobs +WHERE + tupleElement(pipeline, 'repository') = {repo: String} + AND tupleElement(pipeline, 'name') = {pipelineName: String} + AND tupleElement(build, 'branch') = 'main' + AND toUInt32(tupleElement(build, 'number')) = {buildNumber: UInt32} + AND lowerUTF8(tupleElement(job, 'state')) = 'failed' + AND tupleElement(job, 'soft_failed') = FALSE +ORDER BY job_name ASC + diff --git a/torchci/clickhouse_queries/vllm/continuous_builds/params.json b/torchci/clickhouse_queries/vllm/continuous_builds/params.json new file mode 100644 index 0000000000..442b31f243 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/continuous_builds/params.json @@ -0,0 +1,17 @@ +{ + "params": { + "repo": "String", + "pipelineName": "String", + "startTime": "DateTime64(3)", + "stopTime": "DateTime64(3)" + }, + "tests": [ + { + "repo": "https://github.com/vllm-project/vllm.git", + "pipelineName": "CI", + "startTime": "2025-11-17T00:00:00.000", + "stopTime": "2025-11-24T00:00:00.000" + } + ] +} + diff --git a/torchci/clickhouse_queries/vllm/continuous_builds/query.sql b/torchci/clickhouse_queries/vllm/continuous_builds/query.sql new file mode 100644 index 0000000000..4a0b9416c4 --- /dev/null +++ b/torchci/clickhouse_queries/vllm/continuous_builds/query.sql @@ -0,0 +1,56 @@ +-- vLLM continuous builds list (daily and nightly scheduled runs) +-- Returns recent builds that are part of scheduled CI runs +-- Filters by specific BUILDKITE_MESSAGE patterns +-- Only tracks main branch + +SELECT DISTINCT + toUInt32(tupleElement(build, 'number')) AS build_number, + tupleElement(build, 'id') AS build_id, + tupleElement(build, 'state') AS build_state, + tupleElement(build, 'web_url') AS build_url, + tupleElement(build, 'started_at') AS build_started_at, + tupleElement(build, 'finished_at') AS build_finished_at, + tupleElement(build, 'message') AS build_message, + tupleElement(build, 'commit') AS commit, + -- Determine build type + if( + positionCaseInsensitive(tupleElement(build, 'message'), 'Full CI run - daily') > 0, + 'Daily', + if( + positionCaseInsensitive(tupleElement(build, 'message'), 'Nightly run - All tests') > 0, + 'Nightly', + 'Other' + ) + ) AS build_type, + -- Count jobs for this build + ( + SELECT count(*) + FROM vllm.vllm_buildkite_jobs AS j + WHERE + tupleElement(j.build, 'number') = tupleElement(build, 'number') + AND tupleElement(j.pipeline, 'repository') = {repo: String} + ) AS total_jobs, + -- Count failed jobs for this build + ( + SELECT count(*) + FROM vllm.vllm_buildkite_jobs AS j + WHERE + tupleElement(j.build, 'number') = tupleElement(build, 'number') + AND tupleElement(j.pipeline, 'repository') = {repo: String} + AND lowerUTF8(tupleElement(j.job, 'state')) = 'failed' + AND tupleElement(j.job, 'soft_failed') = FALSE + ) AS failed_jobs_count +FROM vllm.vllm_buildkite_builds +WHERE + tupleElement(pipeline, 'repository') = {repo: String} + AND tupleElement(pipeline, 'name') = {pipelineName: String} + AND tupleElement(build, 'branch') = 'main' + AND tupleElement(build, 'finished_at') IS NOT NULL + AND tupleElement(build, 'finished_at') >= {startTime: DateTime64(3)} + AND tupleElement(build, 'finished_at') < {stopTime: DateTime64(3)} + AND ( + positionCaseInsensitive(tupleElement(build, 'message'), 'Full CI run - daily') > 0 + OR positionCaseInsensitive(tupleElement(build, 'message'), 'Nightly run - All tests') > 0 + ) +ORDER BY build_finished_at DESC + diff --git a/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx b/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx new file mode 100644 index 0000000000..13d3414904 --- /dev/null +++ b/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx @@ -0,0 +1,384 @@ +import OpenInNewIcon from "@mui/icons-material/OpenInNew"; +import { + Box, + Chip, + Link, + Paper, + Table, + TableBody, + TableCell, + TableContainer, + TableHead, + TableRow, + Tooltip, + Typography, +} from "@mui/material"; +import dayjs from "dayjs"; +import { useDarkMode } from "lib/DarkModeContext"; +import { useClickHouseAPIImmutable } from "lib/GeneralUtils"; +import React, { useState } from "react"; +import { + COLOR_ERROR, + COLOR_GRAY, + COLOR_SUCCESS, + COLOR_WARNING, + PIPELINE_NAME, + VLLM_REPO_URL, +} from "./constants"; + +interface ContinuousBuildData { + build_number: number; + build_id: string; + build_state: string; + build_url: string; + build_started_at: string | null; + build_finished_at: string | null; + build_message: string; + commit: string; + build_type: string; + total_jobs: number; + failed_jobs_count: number; +} + +interface FailedJobData { + job_name: string; + job_state: string; + soft_failed: boolean; + job_started_at: string | null; + job_finished_at: string | null; + job_url: string; + exit_status: number | null; + duration_hours: number | null; + build_number: number; + build_url: string; +} + +// Helper function to format duration +function formatDuration(hours: number | null | undefined): string { + if (hours === null || hours === undefined) return "-"; + if (hours < 1) { + return `${(hours * 60).toFixed(0)}m`; + } + return `${hours.toFixed(2)}h`; +} + +// Helper function to get state color +function getStateColor(state: string): { bg: string; text: string } { + const stateLower = state.toLowerCase(); + if ( + stateLower === "passed" || + stateLower === "finished" || + stateLower === "success" + ) { + return { bg: COLOR_SUCCESS, text: "#fff" }; + } + if (stateLower === "failed" || stateLower === "failing") { + return { bg: COLOR_ERROR, text: "#fff" }; + } + if (stateLower === "canceled" || stateLower === "cancelled") { + return { bg: COLOR_GRAY, text: "#fff" }; + } + if (stateLower === "running") { + return { bg: COLOR_WARNING, text: "#fff" }; + } + return { bg: "#999", text: "#fff" }; +} + +// Helper function to get build type chip color +function getBuildTypeColor(buildType: string): string { + if (buildType === "Daily") { + return COLOR_SUCCESS; + } + if (buildType === "Nightly") { + return COLOR_WARNING; + } + return COLOR_GRAY; +} + +export default function ContinuousBuildTracker({ + data, + timeParams, +}: { + data: ContinuousBuildData[] | undefined; + timeParams: { startTime: string; stopTime: string }; +}) { + const { darkMode } = useDarkMode(); + const [selectedBuildNumber, setSelectedBuildNumber] = useState( + null + ); + + // Fetch failed jobs for selected build + const { data: failedJobsData } = useClickHouseAPIImmutable( + "vllm/build_failed_jobs", + selectedBuildNumber !== null + ? { + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, + buildNumber: selectedBuildNumber, + } + : null, + selectedBuildNumber !== null + ); + + const builds = (data || []) as ContinuousBuildData[]; + + // Auto-select first build if nothing is selected or if selected build is no longer in the list + React.useEffect(() => { + if (builds.length > 0) { + if ( + selectedBuildNumber === null || + !builds.some((b) => b.build_number === selectedBuildNumber) + ) { + setSelectedBuildNumber(builds[0].build_number); + } + } + }, [builds, selectedBuildNumber]); + + // Handle row click + function handleRowClick(buildNumber: number) { + setSelectedBuildNumber(buildNumber); + } + + const failedJobs = (failedJobsData || []) as FailedJobData[]; + const selectedBuild = builds.find( + (b) => b.build_number === selectedBuildNumber + ); + + return ( + + + {/* Builds table on the left */} + + + Continuous Builds + + + + + + Build # + Type + Status + Failed Jobs + Finished At + Link + + + + {builds.length === 0 && ( + + + No continuous builds found in selected time range + + + )} + {builds.map((build) => { + const stateColors = getStateColor(build.build_state); + return ( + handleRowClick(build.build_number)} + selected={selectedBuildNumber === build.build_number} + sx={{ + cursor: "pointer", + "&.Mui-selected": { + backgroundColor: darkMode + ? "rgba(144, 202, 249, 0.16)" + : "rgba(25, 118, 210, 0.12)", + }, + "&.Mui-selected:hover": { + backgroundColor: darkMode + ? "rgba(144, 202, 249, 0.24)" + : "rgba(25, 118, 210, 0.18)", + }, + }} + > + + {build.build_number} + + + + + + + + 0 + ? COLOR_ERROR + : COLOR_SUCCESS, + fontWeight: "bold", + }} + > + {build.failed_jobs_count} / {build.total_jobs} + + + {build.build_finished_at + ? dayjs(build.build_finished_at).format("M/D/YY h:mm A") + : "-"} + + + e.stopPropagation()} + > + } + size="small" + clickable + sx={{ fontSize: "0.7rem", height: 22 }} + /> + + + + ); + })} + +
+
+
+ + {/* Failed jobs table on the right */} + + + {selectedBuild && ( + + + Failed Jobs - Build #{selectedBuild.build_number} + + + {selectedBuild.build_type} build from{" "} + {dayjs(selectedBuild.build_finished_at).format("M/D/YY h:mm A")} + + + )} + + + + + + Job Name + Duration + Exit Code + Finished At + Link + + + + {failedJobs.length === 0 && ( + + + {selectedBuild + ? selectedBuild.failed_jobs_count === 0 + ? "No failed jobs - build passed! ✓" + : "Loading failed jobs..." + : "Select a build to view failed jobs"} + + + )} + {failedJobs.map((job, idx) => ( + + + {job.job_name} + + + {formatDuration(job.duration_hours)} + + + {job.exit_status ?? "-"} + + + {job.job_finished_at + ? dayjs(job.job_finished_at).format("M/D/YY h:mm A") + : "-"} + + + + } + size="small" + clickable + sx={{ fontSize: "0.7rem", height: 22 }} + /> + + + + ))} + +
+
+
+
+
+ ); +} + diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index 2f1c69e167..4accbbcde3 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -13,6 +13,7 @@ import { } from "@mui/material"; import CiDurationsPanel from "components/metrics/vllm/CiDurationsPanel"; import CommitsOnRedTrendPanel from "components/metrics/vllm/CommitsOnRedTrendPanel"; +import ContinuousBuildTracker from "components/metrics/vllm/ContinuousBuildTracker"; import DockerBuildRuntimePanel from "components/metrics/vllm/DockerBuildRuntimePanel"; import DurationDistributionPanel from "components/metrics/vllm/DurationDistributionPanel"; import JobGroupFilter, { @@ -407,6 +408,15 @@ export default function Page() { jobGroups: selectedJobGroups, }); + const { data: continuousBuildsData } = useClickHouseAPIImmutable( + "vllm/continuous_builds", + { + ...timeParams, + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, + } + ); + const { data: dockerBuildRuntimeData } = useClickHouseAPIImmutable( "vllm/docker_build_runtime", { @@ -886,6 +896,7 @@ export default function Page() { +
@@ -999,15 +1010,6 @@ export default function Page() { - - - - - )} @@ -1138,6 +1140,29 @@ export default function Page() { )} + + {/* Tab 4: CI Builds */} + {selectedTab === 4 && ( + <> + + + + + + + + + + + + )} ); } From 457061779319c593e8980605a74195f7323cbe82 Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 24 Nov 2025 09:13:36 -0800 Subject: [PATCH 6/9] lint --- .../vllm/build_failed_jobs/params.json | 1 - .../vllm/build_failed_jobs/query.sql | 1 - .../vllm/continuous_builds/params.json | 1 - .../vllm/continuous_builds/query.sql | 21 +++++++--- .../vllm/job_list/params.json | 1 - .../vllm/job_list/query.sql | 17 ++++++-- .../vllm/recent_job_builds/params.json | 1 - .../vllm/recent_job_builds/query.sql | 1 - .../metrics/vllm/ContinuousBuildTracker.tsx | 14 ++++--- .../metrics/vllm/JobBuildsPanel.tsx | 42 +++++++++++++------ torchci/pages/metrics/vllm.tsx | 2 +- 11 files changed, 69 insertions(+), 33 deletions(-) diff --git a/torchci/clickhouse_queries/vllm/build_failed_jobs/params.json b/torchci/clickhouse_queries/vllm/build_failed_jobs/params.json index 20bea144da..f71304d0f1 100644 --- a/torchci/clickhouse_queries/vllm/build_failed_jobs/params.json +++ b/torchci/clickhouse_queries/vllm/build_failed_jobs/params.json @@ -12,4 +12,3 @@ } ] } - diff --git a/torchci/clickhouse_queries/vllm/build_failed_jobs/query.sql b/torchci/clickhouse_queries/vllm/build_failed_jobs/query.sql index c2281b634f..e0a0df49d0 100644 --- a/torchci/clickhouse_queries/vllm/build_failed_jobs/query.sql +++ b/torchci/clickhouse_queries/vllm/build_failed_jobs/query.sql @@ -27,4 +27,3 @@ WHERE AND lowerUTF8(tupleElement(job, 'state')) = 'failed' AND tupleElement(job, 'soft_failed') = FALSE ORDER BY job_name ASC - diff --git a/torchci/clickhouse_queries/vllm/continuous_builds/params.json b/torchci/clickhouse_queries/vllm/continuous_builds/params.json index 442b31f243..f186818776 100644 --- a/torchci/clickhouse_queries/vllm/continuous_builds/params.json +++ b/torchci/clickhouse_queries/vllm/continuous_builds/params.json @@ -14,4 +14,3 @@ } ] } - diff --git a/torchci/clickhouse_queries/vllm/continuous_builds/query.sql b/torchci/clickhouse_queries/vllm/continuous_builds/query.sql index 4a0b9416c4..f6fe5739e8 100644 --- a/torchci/clickhouse_queries/vllm/continuous_builds/query.sql +++ b/torchci/clickhouse_queries/vllm/continuous_builds/query.sql @@ -14,10 +14,16 @@ SELECT DISTINCT tupleElement(build, 'commit') AS commit, -- Determine build type if( - positionCaseInsensitive(tupleElement(build, 'message'), 'Full CI run - daily') > 0, + positionCaseInsensitive( + tupleElement(build, 'message'), 'Full CI run - daily' + ) + > 0, 'Daily', if( - positionCaseInsensitive(tupleElement(build, 'message'), 'Nightly run - All tests') > 0, + positionCaseInsensitive( + tupleElement(build, 'message'), 'Nightly run - All tests' + ) + > 0, 'Nightly', 'Other' ) @@ -49,8 +55,13 @@ WHERE AND tupleElement(build, 'finished_at') >= {startTime: DateTime64(3)} AND tupleElement(build, 'finished_at') < {stopTime: DateTime64(3)} AND ( - positionCaseInsensitive(tupleElement(build, 'message'), 'Full CI run - daily') > 0 - OR positionCaseInsensitive(tupleElement(build, 'message'), 'Nightly run - All tests') > 0 + positionCaseInsensitive( + tupleElement(build, 'message'), 'Full CI run - daily' + ) + > 0 + OR positionCaseInsensitive( + tupleElement(build, 'message'), 'Nightly run - All tests' + ) + > 0 ) ORDER BY build_finished_at DESC - diff --git a/torchci/clickhouse_queries/vllm/job_list/params.json b/torchci/clickhouse_queries/vllm/job_list/params.json index a5515c2859..d81f11f2a2 100644 --- a/torchci/clickhouse_queries/vllm/job_list/params.json +++ b/torchci/clickhouse_queries/vllm/job_list/params.json @@ -16,4 +16,3 @@ } ] } - diff --git a/torchci/clickhouse_queries/vllm/job_list/query.sql b/torchci/clickhouse_queries/vllm/job_list/query.sql index 2e83f31f50..ed0fe1d2b4 100644 --- a/torchci/clickhouse_queries/vllm/job_list/query.sql +++ b/torchci/clickhouse_queries/vllm/job_list/query.sql @@ -6,7 +6,11 @@ SELECT tupleElement(job, 'name') AS job_name, COUNT(*) AS total_runs, - countIf(lowerUTF8(tupleElement(job, 'state')) IN ('passed', 'finished', 'success')) AS passed_count, + countIf( + lowerUTF8(tupleElement(job, 'state')) IN ( + 'passed', 'finished', 'success' + ) + ) AS passed_count, countIf(lowerUTF8(tupleElement(job, 'state')) = 'failed') AS failed_count, max(tupleElement(job, 'finished_at')) AS last_run_at FROM vllm.vllm_buildkite_jobs @@ -25,14 +29,19 @@ WHERE ) OR ( has({jobGroups: Array(String)}, 'torch_nightly') - AND positionCaseInsensitive(tupleElement(job, 'name'), 'Torch Nightly') > 0 + AND positionCaseInsensitive( + tupleElement(job, 'name'), 'Torch Nightly' + ) + > 0 ) OR ( has({jobGroups: Array(String)}, 'main') AND positionCaseInsensitive(tupleElement(job, 'name'), 'AMD') = 0 - AND positionCaseInsensitive(tupleElement(job, 'name'), 'Torch Nightly') = 0 + AND positionCaseInsensitive( + tupleElement(job, 'name'), 'Torch Nightly' + ) + = 0 ) ) GROUP BY job_name ORDER BY last_run_at DESC, total_runs DESC - diff --git a/torchci/clickhouse_queries/vllm/recent_job_builds/params.json b/torchci/clickhouse_queries/vllm/recent_job_builds/params.json index 2f50f3cf2e..26bfd65387 100644 --- a/torchci/clickhouse_queries/vllm/recent_job_builds/params.json +++ b/torchci/clickhouse_queries/vllm/recent_job_builds/params.json @@ -16,4 +16,3 @@ } ] } - diff --git a/torchci/clickhouse_queries/vllm/recent_job_builds/query.sql b/torchci/clickhouse_queries/vllm/recent_job_builds/query.sql index a1914ad769..fd53902a8b 100644 --- a/torchci/clickhouse_queries/vllm/recent_job_builds/query.sql +++ b/torchci/clickhouse_queries/vllm/recent_job_builds/query.sql @@ -54,4 +54,3 @@ SELECT duration_hours FROM job_builds ORDER BY job_finished_at DESC - diff --git a/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx b/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx index 13d3414904..5a2f56a536 100644 --- a/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx +++ b/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx @@ -10,7 +10,6 @@ import { TableContainer, TableHead, TableRow, - Tooltip, Typography, } from "@mui/material"; import dayjs from "dayjs"; @@ -217,7 +216,9 @@ export default function ContinuousBuildTracker({ label={build.build_type} size="small" sx={{ - backgroundColor: getBuildTypeColor(build.build_type), + backgroundColor: getBuildTypeColor( + build.build_type + ), color: "#fff", fontSize: "0.7rem", height: 20, @@ -259,7 +260,9 @@ export default function ContinuousBuildTracker({ {build.build_finished_at - ? dayjs(build.build_finished_at).format("M/D/YY h:mm A") + ? dayjs(build.build_finished_at).format( + "M/D/YY h:mm A" + ) : "-"} @@ -305,7 +308,9 @@ export default function ContinuousBuildTracker({ {selectedBuild.build_type} build from{" "} - {dayjs(selectedBuild.build_finished_at).format("M/D/YY h:mm A")} + {dayjs(selectedBuild.build_finished_at).format( + "M/D/YY h:mm A" + )}
)} @@ -381,4 +386,3 @@ export default function ContinuousBuildTracker({ ); } - diff --git a/torchci/components/metrics/vllm/JobBuildsPanel.tsx b/torchci/components/metrics/vllm/JobBuildsPanel.tsx index d124761164..90c4d2902a 100644 --- a/torchci/components/metrics/vllm/JobBuildsPanel.tsx +++ b/torchci/components/metrics/vllm/JobBuildsPanel.tsx @@ -71,7 +71,11 @@ function getStateColor( softFailed: boolean ): { bg: string; text: string } { const stateLower = state.toLowerCase(); - if (stateLower === "passed" || stateLower === "finished" || stateLower === "success") { + if ( + stateLower === "passed" || + stateLower === "finished" || + stateLower === "success" + ) { return { bg: COLOR_SUCCESS, text: "#fff" }; } if (stateLower === "failed") { @@ -154,10 +158,7 @@ export default function JobBuildsPanel({ // Auto-select first job if nothing is selected or if selected job is no longer in the list React.useEffect(() => { if (sortedJobs.length > 0) { - if ( - !selectedJob || - !sortedJobs.some((j) => j.job_name === selectedJob) - ) { + if (!selectedJob || !sortedJobs.some((j) => j.job_name === selectedJob)) { setSelectedJob(sortedJobs[0].job_name); } } @@ -224,7 +225,9 @@ export default function JobBuildsPanel({ handleSort("total_runs")} > Runs @@ -233,7 +236,9 @@ export default function JobBuildsPanel({ handleSort("passed_count")} > ✓ @@ -242,7 +247,9 @@ export default function JobBuildsPanel({ handleSort("failed_count")} > ✗ @@ -339,13 +346,19 @@ export default function JobBuildsPanel({ build.soft_failed ); return ( - + {build.build_number} - + ); } - diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index 4accbbcde3..24e0b24443 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -16,10 +16,10 @@ import CommitsOnRedTrendPanel from "components/metrics/vllm/CommitsOnRedTrendPan import ContinuousBuildTracker from "components/metrics/vllm/ContinuousBuildTracker"; import DockerBuildRuntimePanel from "components/metrics/vllm/DockerBuildRuntimePanel"; import DurationDistributionPanel from "components/metrics/vllm/DurationDistributionPanel"; +import JobBuildsPanel from "components/metrics/vllm/JobBuildsPanel"; import JobGroupFilter, { JobGroup, } from "components/metrics/vllm/JobGroupFilter"; -import JobBuildsPanel from "components/metrics/vllm/JobBuildsPanel"; import JobReliabilityPanel from "components/metrics/vllm/JobReliabilityPanel"; import JobRuntimePanel from "components/metrics/vllm/JobRuntimePanel"; import MergesPanel from "components/metrics/vllm/MergesPanel"; From 9f0205814ebfd278d395f3e9f35d4f6f9e67d010 Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 24 Nov 2025 09:27:20 -0800 Subject: [PATCH 7/9] Add CI Health stability metrics --- .../metrics/vllm/CiStabilityTrendPanel.tsx | 229 ++++++++++++++++++ torchci/pages/metrics/vllm.tsx | 162 ++++++++++++- 2 files changed, 383 insertions(+), 8 deletions(-) create mode 100644 torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx diff --git a/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx b/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx new file mode 100644 index 0000000000..25c0549b85 --- /dev/null +++ b/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx @@ -0,0 +1,229 @@ +import dayjs from "dayjs"; +import { EChartsOption } from "echarts"; +import { useDarkMode } from "lib/DarkModeContext"; +import _ from "lodash"; +import { + ChartPaper, + getCrosshairTooltipConfig, + GRID_DEFAULT, +} from "./chartUtils"; +import { COLOR_SUCCESS, COLOR_ERROR, COLOR_WARNING } from "./constants"; + +interface TrunkHealthData { + build_started_at: string; + is_green: number; +} + +// Helper function to calculate stability score for a window of days +function calculateStabilityScore(healthValues: number[]): number { + if (healthValues.length === 0) return 0; + + // Calculate volatility (standard deviation) + const mean = _.mean(healthValues); + const squaredDiffs = healthValues.map((x) => Math.pow(x - mean, 2)); + const variance = _.mean(squaredDiffs); + const volatility = Math.sqrt(variance); + + // Count state transitions + const transitions = healthValues.reduce((count, current, index) => { + if (index === 0) return 0; + const previous = healthValues[index - 1]; + return current !== previous ? count + 1 : count; + }, 0); + + // Calculate penalties + const volatilityPenalty = volatility * 50; + const transitionPenalty = + Math.min(transitions / healthValues.length, 1) * 50; + + // Return score as percentage (0-1) + return Math.max(0, 100 - volatilityPenalty - transitionPenalty) / 100; +} + +// Helper function to format tooltip +function formatTooltip(params: any, stabilityData: any[]): string { + if (!Array.isArray(params) || params.length === 0) return ""; + + const date = params[0].axisValue; + const dataIndex = params[0].dataIndex; + const data = stabilityData[dataIndex]; + + if (!data) return ""; + + let result = `${date}
`; + result += `${params[0].marker} Stability Score: ${(data.score * 100).toFixed(1)}%
`; + result += ``; + result += `Volatility: ${(data.volatility * 100).toFixed(1)}% | `; + result += `Transitions: ${data.transitions}`; + result += ``; + + return result; +} + +// Helper function to get line series +function getLineSeries(data: any[]): any[] { + return [ + { + name: "Stability Score", + type: "line", + data: data.map((d) => d.score), + smooth: true, + symbol: "circle", + symbolSize: 6, + lineStyle: { width: 2 }, + itemStyle: { + color: (params: any) => { + const score = params.data; + if (score >= 0.7) return COLOR_SUCCESS; + if (score >= 0.5) return COLOR_WARNING; + return COLOR_ERROR; + }, + }, + areaStyle: { + opacity: 0.2, + color: { + type: "linear", + x: 0, + y: 0, + x2: 0, + y2: 1, + colorStops: [ + { offset: 0, color: COLOR_SUCCESS }, + { offset: 0.5, color: COLOR_WARNING }, + { offset: 1, color: COLOR_ERROR }, + ], + }, + }, + markLine: { + silent: true, + symbol: "none", + lineStyle: { + type: "dashed", + color: COLOR_WARNING, + width: 1, + }, + label: { + formatter: "Target: 70%", + position: "end", + }, + data: [{ yAxis: 0.7 }], + }, + }, + ]; +} + +export default function CiStabilityTrendPanel({ + data, +}: { + data: TrunkHealthData[] | undefined; +}) { + const { darkMode } = useDarkMode(); + + // Group builds by day and determine daily health status + const buildsByDay = _.groupBy( + data || [], + (d) => d.build_started_at?.slice(0, 10) || "" + ); + + const dailyHealth = Object.entries(buildsByDay) + .map(([day, builds]) => { + if (!day) return null; + const sortedBuilds = _.sortBy(builds, "build_started_at"); + const mostRecent = sortedBuilds[sortedBuilds.length - 1]; + return { + date: day, + isGreen: mostRecent?.is_green === 1 ? 1 : 0, + }; + }) + .filter((d) => d !== null) + .sort((a, b) => a!.date.localeCompare(b!.date)) as { + date: string; + isGreen: number; + }[]; + + // Calculate rolling stability score (7-day window) + const windowSize = 7; + const stabilityData = dailyHealth + .map((day, index) => { + if (index < windowSize - 1) return null; // Not enough data for window + + // Get window of health values + const windowData = dailyHealth + .slice(Math.max(0, index - windowSize + 1), index + 1) + .map((d) => d.isGreen); + + // Calculate volatility + const mean = _.mean(windowData); + const squaredDiffs = windowData.map((x) => Math.pow(x - mean, 2)); + const variance = _.mean(squaredDiffs); + const volatility = Math.sqrt(variance); + + // Count transitions + const transitions = windowData.reduce((count, current, idx) => { + if (idx === 0) return 0; + const previous = windowData[idx - 1]; + return current !== previous ? count + 1 : count; + }, 0); + + const score = calculateStabilityScore(windowData); + + return { + date: day.date, + score, + volatility, + transitions, + }; + }) + .filter((d) => d !== null) as { + date: string; + score: number; + volatility: number; + transitions: number; + }[]; + + const dates = stabilityData.map((d) => dayjs(d.date).format("MMM D")); + + const options: EChartsOption = { + title: { + text: "CI Stability Score Over Time", + subtext: `7-day rolling window (target: ≥70%)`, + left: "center", + }, + grid: GRID_DEFAULT, + xAxis: { + type: "category", + data: dates, + name: "Date", + nameLocation: "middle", + nameGap: 35, + axisLabel: { + rotate: 45, + fontSize: 10, + }, + }, + yAxis: { + type: "value", + name: "Stability Score", + nameLocation: "middle", + nameGap: 45, + min: 0, + max: 1, + axisLabel: { + formatter: (value: number) => `${(value * 100).toFixed(0)}%`, + }, + }, + series: stabilityData.length > 0 ? getLineSeries(stabilityData) : [], + tooltip: getCrosshairTooltipConfig(darkMode, (params: any) => + formatTooltip(params, stabilityData) + ), + }; + + return ( + + ); +} + diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index 24e0b24443..cb19e62933 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -12,14 +12,15 @@ import { Typography, } from "@mui/material"; import CiDurationsPanel from "components/metrics/vllm/CiDurationsPanel"; +import CiStabilityTrendPanel from "components/metrics/vllm/CiStabilityTrendPanel"; import CommitsOnRedTrendPanel from "components/metrics/vllm/CommitsOnRedTrendPanel"; import ContinuousBuildTracker from "components/metrics/vllm/ContinuousBuildTracker"; import DockerBuildRuntimePanel from "components/metrics/vllm/DockerBuildRuntimePanel"; import DurationDistributionPanel from "components/metrics/vllm/DurationDistributionPanel"; -import JobBuildsPanel from "components/metrics/vllm/JobBuildsPanel"; import JobGroupFilter, { JobGroup, } from "components/metrics/vllm/JobGroupFilter"; +import JobBuildsPanel from "components/metrics/vllm/JobBuildsPanel"; import JobReliabilityPanel from "components/metrics/vllm/JobReliabilityPanel"; import JobRuntimePanel from "components/metrics/vllm/JobRuntimePanel"; import MergesPanel from "components/metrics/vllm/MergesPanel"; @@ -571,6 +572,58 @@ export default function Page() { ? null : 1 - trunkHealthPct; + // Calculate CI health volatility metrics + // Volatility = standard deviation of daily trunk health percentages + const dailyHealthPercentages = + trunkHealthData === undefined + ? undefined + : Object.entries(buildsByDay).map(([day, builds]) => { + const sortedBuilds = _.sortBy(builds, "build_started_at"); + const mostRecent = sortedBuilds[sortedBuilds.length - 1]; + return mostRecent?.is_green === 1 ? 1.0 : 0.0; + }); + + const ciHealthVolatility = + dailyHealthPercentages === undefined + ? undefined + : dailyHealthPercentages.length === 0 + ? null + : (() => { + const mean = _.mean(dailyHealthPercentages); + const squaredDiffs = dailyHealthPercentages.map((x) => + Math.pow(x - mean, 2) + ); + const variance = _.mean(squaredDiffs); + return Math.sqrt(variance); + })(); + + // Count state transitions (green->red or red->green) + const stateTransitions = + dailyHealthPercentages === undefined + ? undefined + : dailyHealthPercentages.length <= 1 + ? 0 + : dailyHealthPercentages.reduce((count, current, index) => { + if (index === 0) return 0; + const previous = dailyHealthPercentages[index - 1]; + return current !== previous ? count + 1 : count; + }, 0); + + // Calculate stability score (lower volatility + fewer transitions = higher score) + // Score from 0-100, where 100 is perfect stability + const ciStabilityScore = + ciHealthVolatility === undefined || stateTransitions === undefined + ? undefined + : ciHealthVolatility === null || stateTransitions === null + ? null + : (() => { + const volatilityPenalty = ciHealthVolatility * 50; // 0-50 penalty + const transitionPenalty = + Math.min(stateTransitions / (dailyHealthPercentages?.length || 1), 1) * + 50; // 0-50 penalty + return Math.max(0, 100 - volatilityPenalty - transitionPenalty) / 100; + })(); + // Calculate previous period metrics for deltas const prevReliabilityPoints = (prevReliabilityData || []) as any[]; const prevTotalPassed = _.sumBy(prevReliabilityPoints, "passed_count"); @@ -610,6 +663,56 @@ export default function Page() { ? null : 1 - prevTrunkHealthPct; + // Calculate previous period volatility metrics + const prevDailyHealthPercentages = + prevTrunkHealthData === undefined + ? undefined + : Object.entries(prevBuildsByDay).map(([day, builds]) => { + const sortedBuilds = _.sortBy(builds, "build_started_at"); + const mostRecent = sortedBuilds[sortedBuilds.length - 1]; + return mostRecent?.is_green === 1 ? 1.0 : 0.0; + }); + + const prevCiHealthVolatility = + prevDailyHealthPercentages === undefined + ? undefined + : prevDailyHealthPercentages.length === 0 + ? null + : (() => { + const mean = _.mean(prevDailyHealthPercentages); + const squaredDiffs = prevDailyHealthPercentages.map((x) => + Math.pow(x - mean, 2) + ); + const variance = _.mean(squaredDiffs); + return Math.sqrt(variance); + })(); + + const prevStateTransitions = + prevDailyHealthPercentages === undefined + ? undefined + : prevDailyHealthPercentages.length <= 1 + ? 0 + : prevDailyHealthPercentages.reduce((count, current, index) => { + if (index === 0) return 0; + const previous = prevDailyHealthPercentages[index - 1]; + return current !== previous ? count + 1 : count; + }, 0); + + const prevCiStabilityScore = + prevCiHealthVolatility === undefined || prevStateTransitions === undefined + ? undefined + : prevCiHealthVolatility === null || prevStateTransitions === null + ? null + : (() => { + const volatilityPenalty = prevCiHealthVolatility * 50; + const transitionPenalty = + Math.min( + prevStateTransitions / (prevDailyHealthPercentages?.length || 1), + 1 + ) * 50; + return Math.max(0, 100 - volatilityPenalty - transitionPenalty) / 100; + })(); + const prevManualMergedFailures = prevMergesData === undefined || prevMergesData.length === 0 ? 0 @@ -677,6 +780,12 @@ export default function Page() { prevManualMergedPct ); + // Calculate deltas for volatility metrics + const ciStabilityScoreDelta = calculateDelta( + ciStabilityScore, + prevCiStabilityScore + ); + // Calculate deltas for time to first review const prevTimeToReviewP50 = getPrCycleValue( prevPrCycleData, @@ -810,6 +919,21 @@ export default function Page() { }, ]} /> + (v ?? 1) < 0.7, + tooltip: + "Measures consistency of trunk health over time (0-100%). Penalizes both volatility (daily health swings) and frequent state changes (green↔red flips). Higher is better. Low scores indicate unpredictable CI that frequently oscillates between passing and failing.", + delta: ciStabilityScoreDelta, + }, + ]} + /> + + (v ?? 0) > (dailyHealthPercentages?.length || 1) * 0.3, + tooltip: + "Number of times trunk flipped between green (healthy) and red (broken) states. Lower is better. High values indicate frequent CI instability. Calculated over the selected time period.", + delta: null, + }, + ]} + /> - + - + - + - + - + - + - + + + + + + From 02a3d89b3eb2b2d647865ef9432ec5e8c337cfde Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 24 Nov 2025 09:30:41 -0800 Subject: [PATCH 8/9] Add jobName as query parameter --- .../vllm/docker_build_runtime/params.json | 2 ++ .../vllm/docker_build_runtime/query.sql | 8 ++++---- torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx | 1 - torchci/pages/metrics/vllm.tsx | 1 + 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json b/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json index 7f2c8af8ca..187cdc41eb 100644 --- a/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json +++ b/torchci/clickhouse_queries/vllm/docker_build_runtime/params.json @@ -1,12 +1,14 @@ { "params": { "repo": "String", + "jobName": "String", "startTime": "DateTime64(3)", "stopTime": "DateTime64(3)" }, "tests": [ { "repo": "https://github.com/vllm-project/vllm.git", + "jobName": ":docker: build image", "startTime": "2025-10-01T00:00:00.000", "stopTime": "2025-11-01T00:00:00.000" } diff --git a/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql b/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql index 8041cdb538..e63b7853d4 100644 --- a/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql +++ b/torchci/clickhouse_queries/vllm/docker_build_runtime/query.sql @@ -1,6 +1,6 @@ --- vLLM Docker Build Image Runtime Trends (main branch only) --- Tracks runtime for the ":docker: build image" job specifically --- This is a critical job for build speed monitoring +-- vLLM Single Job Runtime Trends (main branch only) +-- Tracks runtime for a specific job (parameterized for reusability) +-- Default use case: ":docker: build image" job for build speed monitoring WITH jobs AS ( SELECT @@ -13,7 +13,7 @@ WITH jobs AS ( WHERE tupleElement(pipeline, 'repository') = {repo: String } AND tupleElement(build, 'branch') = 'main' - AND tupleElement(job, 'name') = ':docker: build image' + AND tupleElement(job, 'name') = {jobName: String} AND tupleElement(job, 'started_at') IS NOT NULL AND tupleElement(job, 'finished_at') IS NOT NULL AND tupleElement(job, 'started_at') >= {startTime: DateTime64(3) } diff --git a/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx b/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx index 25c0549b85..c3e5c584b6 100644 --- a/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx +++ b/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx @@ -226,4 +226,3 @@ export default function CiStabilityTrendPanel({ /> ); } - diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index cb19e62933..0d1ee2d658 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -423,6 +423,7 @@ export default function Page() { { ...timeParams, repo: VLLM_REPO_URL, + jobName: ":docker: build image", } ); From a5098b9f15b7741fe159381249e21320719ee0df Mon Sep 17 00:00:00 2001 From: Reza Barazesh Date: Mon, 24 Nov 2025 09:34:10 -0800 Subject: [PATCH 9/9] format --- .../metrics/vllm/CiStabilityTrendPanel.tsx | 9 ++++---- .../metrics/vllm/ContinuousBuildTracker.tsx | 12 +++++------ .../metrics/vllm/JobBuildsPanel.tsx | 21 +++++++++++-------- torchci/pages/metrics/vllm.tsx | 16 ++++++++------ 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx b/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx index c3e5c584b6..4617b56317 100644 --- a/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx +++ b/torchci/components/metrics/vllm/CiStabilityTrendPanel.tsx @@ -7,7 +7,7 @@ import { getCrosshairTooltipConfig, GRID_DEFAULT, } from "./chartUtils"; -import { COLOR_SUCCESS, COLOR_ERROR, COLOR_WARNING } from "./constants"; +import { COLOR_ERROR, COLOR_SUCCESS, COLOR_WARNING } from "./constants"; interface TrunkHealthData { build_started_at: string; @@ -33,8 +33,7 @@ function calculateStabilityScore(healthValues: number[]): number { // Calculate penalties const volatilityPenalty = volatility * 50; - const transitionPenalty = - Math.min(transitions / healthValues.length, 1) * 50; + const transitionPenalty = Math.min(transitions / healthValues.length, 1) * 50; // Return score as percentage (0-1) return Math.max(0, 100 - volatilityPenalty - transitionPenalty) / 100; @@ -51,7 +50,9 @@ function formatTooltip(params: any, stabilityData: any[]): string { if (!data) return ""; let result = `${date}
`; - result += `${params[0].marker} Stability Score: ${(data.score * 100).toFixed(1)}%
`; + result += `${params[0].marker} Stability Score: ${( + data.score * 100 + ).toFixed(1)}%
`; result += ``; result += `Volatility: ${(data.volatility * 100).toFixed(1)}% | `; result += `Transitions: ${data.transitions}`; diff --git a/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx b/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx index 5a2f56a536..d869ba7e06 100644 --- a/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx +++ b/torchci/components/metrics/vllm/ContinuousBuildTracker.tsx @@ -109,13 +109,11 @@ export default function ContinuousBuildTracker({ // Fetch failed jobs for selected build const { data: failedJobsData } = useClickHouseAPIImmutable( "vllm/build_failed_jobs", - selectedBuildNumber !== null - ? { - repo: VLLM_REPO_URL, - pipelineName: PIPELINE_NAME, - buildNumber: selectedBuildNumber, - } - : null, + { + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, + buildNumber: selectedBuildNumber || 0, + }, selectedBuildNumber !== null ); diff --git a/torchci/components/metrics/vllm/JobBuildsPanel.tsx b/torchci/components/metrics/vllm/JobBuildsPanel.tsx index 90c4d2902a..a19f2b799d 100644 --- a/torchci/components/metrics/vllm/JobBuildsPanel.tsx +++ b/torchci/components/metrics/vllm/JobBuildsPanel.tsx @@ -53,7 +53,12 @@ interface RecentBuildData { duration_hours: number | null; } -type JobSortField = "job_name" | "total_runs" | "passed_count" | "failed_count"; +type JobSortField = + | "job_name" + | "total_runs" + | "passed_count" + | "failed_count" + | "last_run_at"; type SortOrder = "asc" | "desc"; // Helper function to format duration @@ -117,14 +122,12 @@ export default function JobBuildsPanel({ // Fetch recent builds for selected job const { data: recentBuildsData } = useClickHouseAPIImmutable( "vllm/recent_job_builds", - selectedJob - ? { - ...timeParams, - repo: VLLM_REPO_URL, - pipelineName: PIPELINE_NAME, - jobName: selectedJob, - } - : null, + { + ...timeParams, + repo: VLLM_REPO_URL, + pipelineName: PIPELINE_NAME, + jobName: selectedJob || "", + }, selectedJob !== null ); diff --git a/torchci/pages/metrics/vllm.tsx b/torchci/pages/metrics/vllm.tsx index 0d1ee2d658..d93972f792 100644 --- a/torchci/pages/metrics/vllm.tsx +++ b/torchci/pages/metrics/vllm.tsx @@ -17,10 +17,10 @@ import CommitsOnRedTrendPanel from "components/metrics/vllm/CommitsOnRedTrendPan import ContinuousBuildTracker from "components/metrics/vllm/ContinuousBuildTracker"; import DockerBuildRuntimePanel from "components/metrics/vllm/DockerBuildRuntimePanel"; import DurationDistributionPanel from "components/metrics/vllm/DurationDistributionPanel"; +import JobBuildsPanel from "components/metrics/vllm/JobBuildsPanel"; import JobGroupFilter, { JobGroup, } from "components/metrics/vllm/JobGroupFilter"; -import JobBuildsPanel from "components/metrics/vllm/JobBuildsPanel"; import JobReliabilityPanel from "components/metrics/vllm/JobReliabilityPanel"; import JobRuntimePanel from "components/metrics/vllm/JobRuntimePanel"; import MergesPanel from "components/metrics/vllm/MergesPanel"; @@ -604,7 +604,7 @@ export default function Page() { ? undefined : dailyHealthPercentages.length <= 1 ? 0 - : dailyHealthPercentages.reduce((count, current, index) => { + : dailyHealthPercentages.reduce((count: number, current, index) => { if (index === 0) return 0; const previous = dailyHealthPercentages[index - 1]; return current !== previous ? count + 1 : count; @@ -620,8 +620,10 @@ export default function Page() { : (() => { const volatilityPenalty = ciHealthVolatility * 50; // 0-50 penalty const transitionPenalty = - Math.min(stateTransitions / (dailyHealthPercentages?.length || 1), 1) * - 50; // 0-50 penalty + Math.min( + stateTransitions / (dailyHealthPercentages?.length || 1), + 1 + ) * 50; // 0-50 penalty return Math.max(0, 100 - volatilityPenalty - transitionPenalty) / 100; })(); @@ -693,7 +695,7 @@ export default function Page() { ? undefined : prevDailyHealthPercentages.length <= 1 ? 0 - : prevDailyHealthPercentages.reduce((count, current, index) => { + : prevDailyHealthPercentages.reduce((count: number, current, index) => { if (index === 0) return 0; const previous = prevDailyHealthPercentages[index - 1]; return current !== previous ? count + 1 : count; @@ -1067,7 +1069,9 @@ export default function Page() { { title: "State Transitions", value: - stateTransitions === undefined ? undefined : stateTransitions, + stateTransitions === undefined + ? undefined + : stateTransitions, valueRenderer: formatCount, badThreshold: (v) => (v ?? 0) > (dailyHealthPercentages?.length || 1) * 0.3,