cmd/coordinator, cmd/buildstats: start of using logs to schedule tests

bradfitz · bradfitz · commit da19ce050fb1 · 2017-04-24T15:37:51.000Z
We've been logging event spans to datastore for years, but I'd lost this CL and only just found it again. This CL does two things: it syncs the datastore logs to BigQuery, and it starts to use the timing info from BigQuery in the coordinator for scheduling sharded tests. The plan was to have a job occasionally do a BigQuery query and write out the results to a CSV file on GCS. The code to read that CSV file is in this CL, but that code path is disabled, so this CL should be a no-op. A future change will periodically do the query and write the CSV file, and then we can start using the new code path and remove the static map of expected test durations. Updates golang/go#12669 Change-Id: Ibe5b41d6a3009c2ade8ab728fa1cad646788e621 Reviewed-on: https://go-review.googlesource.com/30716 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
diff --git a/cmd/buildstats/buildstats.go b/cmd/buildstats/buildstats.go
@@ -17,27 +17,36 @@ import (
 
 	"cloud.google.com/go/bigquery"
 	"cloud.google.com/go/datastore"
+	"golang.org/x/build/buildenv"
 	"golang.org/x/build/types"
+	"google.golang.org/api/googleapi"
 	"google.golang.org/api/iterator"
 )
 
 var (
 	doSync = flag.Bool("sync", false, "sync build stats data from Datastore to BigQuery")
 )
 
+var env *buildenv.Environment
+
 func main() {
+	buildenv.RegisterFlags()
 	flag.Parse()
+
+	env = buildenv.FromFlags()
+
 	ctx := context.Background()
 	if *doSync {
-		sync(ctx)
+		syncBuilds(ctx)
+		syncSpans(ctx)
 	} else {
 		log.Fatalf("the buildstats command doesn't yet do anything except the --sync mode")
 	}
 
 }
 
-func sync(ctx context.Context) {
-	bq, err := bigquery.NewClient(ctx, "symbolic-datum-552")
+func syncBuilds(ctx context.Context) {
+	bq, err := bigquery.NewClient(ctx, env.ProjectName)
 	if err != nil {
 		log.Fatal(err)
 	}
@@ -73,7 +82,7 @@ func sync(ctx context.Context) {
 	}
 	log.Printf("Max is %v (%v)", t, t.Location())
 
-	ds, err := datastore.NewClient(ctx, "symbolic-datum-552")
+	ds, err := datastore.NewClient(ctx, env.ProjectName)
 	if err != nil {
 		log.Fatalf("datastore.NewClient: %v", err)
 	}
@@ -99,7 +108,6 @@ func sync(ctx context.Context) {
 			if s.EndTime.IsZero() {
 				log.Fatalf("got zero endtime")
 			}
-			//log.Printf("need to add %s: %+v", key.Encode(), s)
 
 			var row []bigquery.Value
 			var putSchema bigquery.Schema
@@ -132,5 +140,130 @@ func sync(ctx context.Context) {
 			os.Exit(1)
 		}
 	}
+}
+
+func syncSpans(ctx context.Context) {
+	bq, err := bigquery.NewClient(ctx, env.ProjectName)
+	if err != nil {
+		log.Fatal(err)
+	}
+	table := bq.Dataset("builds").Table("Spans")
+	meta, err := table.Metadata(ctx)
+	if ae, ok := err.(*googleapi.Error); ok && ae.Code == 404 {
+		log.Printf("Creating table Spans...")
+		err = table.Create(ctx)
+		if err == nil {
+			meta, err = table.Metadata(ctx)
+		}
+	}
+	if err != nil {
+		log.Fatalf("Metadata: %#v", err)
+	}
+	log.Printf("Metadata: %#v", meta)
+	schema := meta.Schema
+	if len(schema) == 0 {
+		log.Printf("EMPTY SCHEMA")
+		schema, err = bigquery.InferSchema(types.SpanRecord{})
+		if err != nil {
+			log.Fatalf("InferSchema: %v", err)
+		}
+		meta, err := table.Update(ctx, bigquery.TableMetadataToUpdate{Schema: schema})
+		if err != nil {
+			log.Fatalf("table.Update schema: %v", err)
+		}
+		schema = meta.Schema
+	}
+	for i, fs := range schema {
+		log.Printf("  schema[%v]: %+v", i, fs)
+		for j, fs := range fs.Schema {
+			log.Printf("     .. schema[%v]: %+v", j, fs)
+		}
+	}
+
+	q := bq.Query("SELECT MAX(EndTime) FROM [symbolic-datum-552:builds.Spans]")
+	it, err := q.Read(ctx)
+	if err != nil {
+		log.Fatalf("Read: %v", err)
+	}
+
+	var since time.Time
+	var values []bigquery.Value
+	if err := it.Next(&values); err != nil {
+		if err == iterator.Done {
+			log.Fatalf("Expected at least one row fro MAX(EndTime) query; got none.")
+		}
+		log.Fatalf("Next: %v", err)
+	}
+	switch t := values[0].(type) {
+	case nil:
+		// NULL. No rows.
+		log.Printf("starting from the beginning...")
+	case time.Time:
+		since = values[0].(time.Time)
+	default:
+		log.Fatalf("MAX(EndType) = %T: want nil or time.Time", t)
+	}
+	if since.IsZero() {
+		since = time.Unix(1, 0) // arbitrary
+	}
+
+	ds, err := datastore.NewClient(ctx, env.ProjectName)
+	if err != nil {
+		log.Fatalf("datastore.NewClient: %v", err)
+	}
+
+	up := table.Uploader()
+
+	log.Printf("Max: %v", since)
+	dsit := ds.Run(ctx, datastore.NewQuery("Span").Filter("EndTime >", since).Order("EndTime"))
+	var maxPut time.Time
+	for {
+		n := 0
+		var rows []*bigquery.ValuesSaver
+		for {
+			var s types.SpanRecord
+			key, err := dsit.Next(&s)
+			if err == iterator.Done {
+				break
+			}
+			n++
+			if err != nil {
+				log.Fatal(err)
+			}
+			if s.EndTime.IsZero() {
+				log.Fatalf("got zero endtime")
+			}
+			//log.Printf("need to add %s: %+v", key.Encode(), s)
 
+			var row []bigquery.Value
+			var putSchema bigquery.Schema
+			rv := reflect.ValueOf(s)
+			for _, fs := range meta.Schema {
+				if fs.Name[0] == '_' {
+					continue
+				}
+				putSchema = append(putSchema, fs)
+				row = append(row, rv.FieldByName(fs.Name).Interface())
+				maxPut = s.EndTime
+			}
+
+			rows = append(rows, &bigquery.ValuesSaver{
+				Schema:   putSchema,
+				InsertID: key.Encode(),
+				Row:      row,
+			})
+			if len(rows) == 1000 {
+				break
+			}
+		}
+		if n == 0 {
+			log.Printf("Done.")
+			return
+		}
+		err = up.Put(ctx, rows)
+		log.Printf("Put %d rows, up to %v. error = %v", len(rows), maxPut, err)
+		if err != nil {
+			os.Exit(1)
+		}
+	}
 }
diff --git a/cmd/coordinator/coordinator.go b/cmd/coordinator/coordinator.go
@@ -19,6 +19,7 @@ import (
 	"crypto/rand"
 	"crypto/sha1"
 	"crypto/tls"
+	"encoding/csv"
 	"errors"
 	"flag"
 	"fmt"
@@ -36,6 +37,7 @@ import (
 	"path"
 	"runtime"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -2028,7 +2030,7 @@ func (st *buildStatus) newTestSet(names []string, benchmarks []*benchmarkItem) *
 		set.items = append(set.items, &testItem{
 			set:      set,
 			name:     name,
-			duration: testDuration(name),
+			duration: testDuration(st.builderRev.name, name),
 			take:     make(chan token, 1),
 			done:     make(chan token),
 		})
@@ -2039,15 +2041,15 @@ func (st *buildStatus) newTestSet(names []string, benchmarks []*benchmarkItem) *
 			set:      set,
 			name:     name,
 			bench:    bench,
-			duration: testDuration(name),
+			duration: testDuration(st.builderRev.name, name),
 			take:     make(chan token, 1),
 			done:     make(chan token),
 		})
 	}
 	return set
 }
 
-func partitionGoTests(tests []string) (sets [][]string) {
+func partitionGoTests(builderName string, tests []string) (sets [][]string) {
 	var srcTests []string
 	var cmdTests []string
 	for _, name := range tests {
@@ -2073,19 +2075,99 @@ func partitionGoTests(tests []string) (sets [][]string) {
 			curDur = 0
 		}
 	}
-	for _, name := range goTests {
-		d := testDuration(name) - minGoTestSpeed // subtract 'go' tool overhead
+	for _, testName := range goTests {
+		d := testDuration(builderName, testName)
 		if curDur+d > sizeThres {
 			flush() // no-op if empty
 		}
-		curSet = append(curSet, name)
+		curSet = append(curSet, testName)
 		curDur += d
 	}
 
 	flush()
 	return
 }
 
+func secondsToDuration(sec float64) time.Duration {
+	return time.Duration(float64(sec) * float64(time.Second))
+}
+
+type testDurationMap map[string]map[string]time.Duration // builder name => test name => avg
+
+var (
+	testDurations   atomic.Value // of testDurationMap
+	testDurationsMu sync.Mutex   // held while updating testDurations
+)
+
+func getTestDurations() testDurationMap {
+	if m, ok := testDurations.Load().(testDurationMap); ok {
+		return m
+	}
+	testDurationsMu.Lock()
+	defer testDurationsMu.Unlock()
+	if m, ok := testDurations.Load().(testDurationMap); ok {
+		return m
+	}
+	updateTestDurationsLocked()
+	return testDurations.Load().(testDurationMap)
+}
+
+func updateTestDurations() {
+	testDurationsMu.Lock()
+	defer testDurationsMu.Unlock()
+	updateTestDurationsLocked()
+}
+
+func updateTestDurationsLocked() {
+	defer time.AfterFunc(1*time.Hour, updateTestDurations)
+	m := loadTestDurations()
+	testDurations.Store(m)
+}
+
+// The csv file on cloud storage looks like:
+//    Builder,Event,MedianSeconds,count
+//    linux-arm-arm5,run_test:runtime:cpu124,334.49922194,10
+//    linux-arm,run_test:runtime:cpu124,284.609130993,26
+//    linux-arm-arm5,run_test:go_test:cmd/compile/internal/gc,260.0241916,12
+//    linux-arm,run_test:go_test:cmd/compile/internal/gc,224.425924681,26
+//    solaris-amd64-smartosbuildlet,run_test:test:2_5,199.653975717,9
+//    solaris-amd64-smartosbuildlet,run_test:test:1_5,169.89733442,9
+//    solaris-amd64-smartosbuildlet,run_test:test:3_5,163.770453839,9
+//    solaris-amd64-smartosbuildlet,run_test:test:0_5,158.250119402,9
+//    openbsd-386-gce58,run_test:runtime:cpu124,146.494229388,12
+func loadTestDurations() (m testDurationMap) {
+	m = make(testDurationMap)
+	r, err := storageClient.Bucket(buildEnv.BuildletBucket).Object("test-durations.csv").NewReader(context.Background())
+	if err != nil {
+		log.Printf("loading test durations object from GCS: %v", err)
+		return
+	}
+	defer r.Close()
+	recs, err := csv.NewReader(r).ReadAll()
+	if err != nil {
+		log.Printf("reading test durations CSV: %v", err)
+		return
+	}
+	for _, rec := range recs {
+		if len(rec) < 3 || rec[0] == "Builder" {
+			continue
+		}
+		builder, testName, secondsStr := rec[0], rec[1], rec[2]
+		secs, err := strconv.ParseFloat(secondsStr, 64)
+		if err != nil {
+			log.Printf("unexpected seconds value in test durations CSV: %v", err)
+			continue
+		}
+		mm := m[builder]
+		if mm == nil {
+			mm = make(map[string]time.Duration)
+			m[builder] = mm
+		}
+		mm[testName] = secondsToDuration(secs)
+	}
+	return
+}
+
 var minGoTestSpeed = (func() time.Duration {
 	var min Seconds
 	for name, secs := range fixedTestDuration {
@@ -2325,11 +2407,18 @@ var fixedTestDuration = map[string]Seconds{
 
 // testDuration predicts how long the dist test 'name' will take.
 // It's only a scheduling guess.
-func testDuration(name string) time.Duration {
-	if secs, ok := fixedTestDuration[name]; ok {
+func testDuration(builderName, testName string) time.Duration {
+	if false { // disabled for now. never tested. TODO: test, enable.
+		durs := getTestDurations()
+		bdur := durs[builderName]
+		if d, ok := bdur[testName]; ok {
+			return d
+		}
+	}
+	if secs, ok := fixedTestDuration[testName]; ok {
 		return secs.Duration()
 	}
-	if strings.HasPrefix(name, "bench:") {
+	if strings.HasPrefix(testName, "bench:") {
 		// Assume benchmarks are roughly 20 seconds per run.
 		return 2 * benchRuns * 20 * time.Second
 	}
@@ -2845,7 +2934,7 @@ func (s *testSet) initInOrder() {
 
 	// First do the go_test:* ones. partitionGoTests
 	// only returns those, which are the ones we merge together.
-	stdSets := partitionGoTests(names)
+	stdSets := partitionGoTests(s.st.builderRev.name, names)
 	for _, set := range stdSets {
 		tis := make([]*testItem, len(set))
 		for i, name := range set {