Skip to content

Commit bbf6dcb

Browse files
dmitshurtoothrotodeke-em
committed
cmd/coordinator: migrate to OpenCensus for metrics
Replace low-level Stackdriver monitoring API usage with OpenCensus and a Stackdriver exporter. To benefit local development, expose metrics at a /metrics endpoint (to be picked up with Prometheus). This makes it much easier to add new metrics and to test them locally, and brings our metrics solution in sync with what's currently in use in x/playground (see CL 302769). It's expected to be preferable to migrate to OpenTelemetry in the future when a good migration path becomes available, and both x/build and x/playground can be updated at that time. This CL is based on work in CL 229679 and CL 138522. For golang/go#26779. For golang/go#44406. For golang/go#17104. Co-authored-by: Alexander Rakoczy <[email protected]> Co-authored-by: Emmanuel T Odeke <[email protected]> Change-Id: Iad45730feace471db1668e828b7c9775377be8a9 Reviewed-on: https://go-review.googlesource.com/c/build/+/303669 Run-TryBot: Dmitri Shuralyov <[email protected]> TryBot-Result: Go Bot <[email protected]> Trust: Dmitri Shuralyov <[email protected]> Reviewed-by: Alexander Rakoczy <[email protected]> Reviewed-by: Emmanuel Odeke <[email protected]>
1 parent 61bfccf commit bbf6dcb

File tree

12 files changed

+577
-462
lines changed

12 files changed

+577
-462
lines changed

cmd/coordinator/Dockerfile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ RUN go install \
2222
cloud.google.com/go/compute/metadata \
2323
cloud.google.com/go/datastore \
2424
cloud.google.com/go/errorreporting \
25-
cloud.google.com/go/monitoring/apiv3 \
2625
cloud.google.com/go/storage \
2726
github.com/gliderlabs/ssh \
2827
github.com/golang/protobuf/ptypes \

cmd/coordinator/coordinator.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ import (
5757
"golang.org/x/build/autocertcache"
5858
"golang.org/x/build/buildenv"
5959
"golang.org/x/build/buildlet"
60+
"golang.org/x/build/cmd/coordinator/internal/metrics"
6061
"golang.org/x/build/dashboard"
6162
"golang.org/x/build/gerrit"
6263
"golang.org/x/build/internal/buildgo"
@@ -317,6 +318,17 @@ func main() {
317318

318319
addHealthCheckers(context.Background())
319320

321+
gr, err := metrics.GCEResource("go-build-coordinator")
322+
if err != nil && metadata.OnGCE() {
323+
log.Println("metrics.GCEResource:", err)
324+
}
325+
if ms, err := metrics.NewService(gr, views); err != nil {
326+
log.Println("failed to initialize metrics:", err)
327+
} else {
328+
http.Handle("/metrics", ms)
329+
defer ms.Stop()
330+
}
331+
320332
cc, err := grpc4.NewClient(http.DefaultClient, "https://maintner.golang.org")
321333
if err != nil {
322334
log.Fatal(err)
@@ -374,7 +386,7 @@ func main() {
374386
go listenAndServeInternalModuleProxy()
375387
go findWorkLoop()
376388
go findTryWorkLoop()
377-
go reportMetrics(context.Background())
389+
go reportReverseCountMetrics()
378390
// TODO(cmang): gccgo will need its own findWorkLoop
379391
}
380392

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<!-- Auto-generated by x/build/update-readmes.go -->
2+
3+
[![Go Reference](https://pkg.go.dev/badge/golang.org/x/build/cmd/coordinator/internal/metrics.svg)](https://pkg.go.dev/golang.org/x/build/cmd/coordinator/internal/metrics)
4+
5+
# golang.org/x/build/cmd/coordinator/internal/metrics
6+
7+
Package metrics provides a service for reporting metrics to Stackdriver, or locally during development.
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Package metrics provides a service for reporting metrics to
6+
// Stackdriver, or locally during development.
7+
package metrics
8+
9+
import (
10+
"errors"
11+
"fmt"
12+
"net/http"
13+
"path"
14+
"time"
15+
16+
"cloud.google.com/go/compute/metadata"
17+
"contrib.go.opencensus.io/exporter/prometheus"
18+
"contrib.go.opencensus.io/exporter/stackdriver"
19+
"go.opencensus.io/stats/view"
20+
mrpb "google.golang.org/genproto/googleapis/api/monitoredres"
21+
)
22+
23+
// NewService initializes a *Service.
24+
//
25+
// The Service returned is configured to send metric data to
26+
// StackDriver. When not running on GCE, it will host metrics through
27+
// a prometheus HTTP handler.
28+
//
29+
// views will be passed to view.Register for export to the metric
30+
// service.
31+
func NewService(resource *MonitoredResource, views []*view.View) (*Service, error) {
32+
err := view.Register(views...)
33+
if err != nil {
34+
return nil, err
35+
}
36+
37+
if !metadata.OnGCE() {
38+
view.SetReportingPeriod(5 * time.Second)
39+
pe, err := prometheus.NewExporter(prometheus.Options{})
40+
if err != nil {
41+
return nil, fmt.Errorf("prometheus.NewExporter: %w", err)
42+
}
43+
view.RegisterExporter(pe)
44+
return &Service{pExporter: pe}, nil
45+
}
46+
47+
projID, err := metadata.ProjectID()
48+
if err != nil {
49+
return nil, err
50+
}
51+
if resource == nil {
52+
return nil, errors.New("resource is required, got nil")
53+
}
54+
sde, err := stackdriver.NewExporter(stackdriver.Options{
55+
ProjectID: projID,
56+
MonitoredResource: resource,
57+
ReportingInterval: time.Minute, // Minimum interval for Stackdriver is 1 minute.
58+
})
59+
if err != nil {
60+
return nil, err
61+
}
62+
63+
// Minimum interval for Stackdriver is 1 minute.
64+
view.SetReportingPeriod(time.Minute)
65+
// Start the metrics exporter.
66+
if err := sde.StartMetricsExporter(); err != nil {
67+
return nil, err
68+
}
69+
70+
return &Service{sdExporter: sde}, nil
71+
}
72+
73+
// Service controls metric exporters.
74+
type Service struct {
75+
sdExporter *stackdriver.Exporter
76+
pExporter *prometheus.Exporter
77+
}
78+
79+
func (m *Service) ServeHTTP(w http.ResponseWriter, r *http.Request) {
80+
if m.pExporter != nil {
81+
m.pExporter.ServeHTTP(w, r)
82+
return
83+
}
84+
http.Error(w, http.StatusText(http.StatusNotFound), http.StatusNotFound)
85+
}
86+
87+
// Stop flushes metrics and stops exporting. Stop should be called
88+
// before exiting.
89+
func (m *Service) Stop() {
90+
if sde := m.sdExporter; sde != nil {
91+
// Flush any unsent data before exiting.
92+
sde.Flush()
93+
94+
sde.StopMetricsExporter()
95+
}
96+
}
97+
98+
// MonitoredResource wraps a *mrpb.MonitoredResource to implement the
99+
// monitoredresource.MonitoredResource interface.
100+
type MonitoredResource mrpb.MonitoredResource
101+
102+
func (r *MonitoredResource) MonitoredResource() (resType string, labels map[string]string) {
103+
return r.Type, r.Labels
104+
}
105+
106+
// GCEResource populates a MonitoredResource with GCE Metadata.
107+
//
108+
// The returned MonitoredResource will have the type set to "generic_task".
109+
func GCEResource(jobName string) (*MonitoredResource, error) {
110+
projID, err := metadata.ProjectID()
111+
if err != nil {
112+
return nil, err
113+
}
114+
zone, err := metadata.Zone()
115+
if err != nil {
116+
return nil, err
117+
}
118+
inst, err := metadata.InstanceName()
119+
if err != nil {
120+
return nil, err
121+
}
122+
group, err := instanceGroupName()
123+
if err != nil {
124+
return nil, err
125+
} else if group == "" {
126+
group = projID
127+
}
128+
129+
return (*MonitoredResource)(&mrpb.MonitoredResource{
130+
Type: "generic_task", // See: https://cloud.google.com/monitoring/api/resources#tag_generic_task
131+
Labels: map[string]string{
132+
"project_id": projID,
133+
"location": zone,
134+
"namespace": group,
135+
"job": jobName,
136+
"task_id": inst,
137+
},
138+
}), nil
139+
}
140+
141+
// instanceGroupName fetches the instanceGroupName from the instance
142+
// metadata.
143+
//
144+
// The instance group manager applies a custom "created-by" attribute
145+
// to the instance, which is not part of the metadata package API, and
146+
// must be queried separately.
147+
//
148+
// An empty string will be returned if a metadata.NotDefinedError is
149+
// returned when fetching metadata. An error will be returned if other
150+
// errors occur when fetching metadata.
151+
func instanceGroupName() (string, error) {
152+
ig, err := metadata.InstanceAttributeValue("created-by")
153+
if errors.As(err, new(metadata.NotDefinedError)) {
154+
return "", nil
155+
} else if err != nil {
156+
return "", err
157+
}
158+
// "created-by" format: "projects/{{InstanceID}}/zones/{{Zone}}/instanceGroupManagers/{{Instance Group Name}}
159+
return path.Base(ig), nil
160+
}

cmd/coordinator/metrics.go

Lines changed: 29 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -10,66 +10,44 @@ package main
1010

1111
import (
1212
"context"
13-
"log"
1413
"time"
1514

16-
"golang.org/x/build/cmd/coordinator/metrics"
15+
"go.opencensus.io/stats"
16+
"go.opencensus.io/stats/view"
17+
"go.opencensus.io/tag"
1718
"golang.org/x/build/internal/coordinator/pool"
19+
)
1820

19-
"github.com/golang/protobuf/ptypes"
20-
metpb "google.golang.org/genproto/googleapis/api/metric"
21-
monpb "google.golang.org/genproto/googleapis/monitoring/v3"
21+
var (
22+
kHostType = tag.MustNewKey("go-build/coordinator/host_type")
23+
mReverseBuildlets = stats.Int64("go-build/coordinator/reverse_buildlets_count", "number of reverse buildlets", stats.UnitDimensionless)
2224
)
2325

24-
// reportMetrics gathers and reports buildlet metrics to Stackdriver.
25-
// It currently only reports count of running reverse buildlets per type.
26-
func reportMetrics(ctx context.Context) {
26+
// views should contain all measurements. All *view.View added to this
27+
// slice will be registered and exported to the metric service.
28+
var views = []*view.View{
29+
{
30+
Name: "go-build/coordinator/reverse_buildlets_count",
31+
Description: "Number of reverse buildlets that are up",
32+
Measure: mReverseBuildlets,
33+
TagKeys: []tag.Key{kHostType},
34+
Aggregation: view.LastValue(),
35+
},
36+
}
37+
38+
// reportReverseCountMetrics gathers and reports
39+
// a count of running reverse buildlets per type.
40+
func reportReverseCountMetrics() {
2741
for {
28-
err := reportReverseCountMetrics(ctx)
29-
if err != nil {
30-
log.Printf("error reporting %q metrics: %v\n",
31-
metrics.ReverseCount.Name, err)
42+
// 1. Gather # buildlets up per reverse builder type.
43+
totals := pool.ReversePool().HostTypeCount()
44+
// 2. Write counts out to the metrics recorder, grouped by hostType.
45+
for hostType, n := range totals {
46+
stats.RecordWithTags(context.Background(),
47+
[]tag.Mutator{tag.Upsert(kHostType, hostType)},
48+
mReverseBuildlets.M(int64(n)))
3249
}
3350

3451
time.Sleep(5 * time.Minute)
3552
}
36-
37-
}
38-
39-
func reportReverseCountMetrics(ctx context.Context) error {
40-
m := metrics.ReverseCount
41-
// 1. Gather # buildlets up per reverse builder type
42-
totals := pool.ReversePool().HostTypeCount()
43-
// 2. Write counts to Stackdriver
44-
ts := []*monpb.TimeSeries{}
45-
now := ptypes.TimestampNow()
46-
for hostType, n := range totals {
47-
labels, err := m.Labels(hostType)
48-
if err != nil {
49-
return err
50-
}
51-
tv, err := m.TypedValue(n)
52-
if err != nil {
53-
return err
54-
}
55-
ts = append(ts, &monpb.TimeSeries{
56-
Metric: &metpb.Metric{
57-
Type: m.Descriptor.Type,
58-
Labels: labels,
59-
},
60-
Points: []*monpb.Point{
61-
{
62-
Interval: &monpb.TimeInterval{
63-
EndTime: now,
64-
},
65-
Value: tv,
66-
},
67-
},
68-
})
69-
}
70-
71-
return pool.NewGCEConfiguration().MetricsClient().CreateTimeSeries(ctx, &monpb.CreateTimeSeriesRequest{
72-
Name: m.DescriptorPath(pool.NewGCEConfiguration().BuildEnv().ProjectName),
73-
TimeSeries: ts,
74-
})
7553
}

cmd/coordinator/metrics/README.md

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)