Skip to content

Commit 79f0ef2

Browse files
committed
internal/coordinator/pool: add a hard limit for a1.metal instances
This change allows the ledger to track how many a1.metal instances have either been created or are in the process of being created. A hard limit of 1 has been set for a1.metal instances while testing is being performed.

For golang/go#42604

Change-Id: I5bcb3a65407af07d225caf2884877ce040ee011b
Reviewed-on: https://go-review.googlesource.com/c/build/+/322855
Trust: Carlos Amedee <[email protected]>
Run-TryBot: Carlos Amedee <[email protected]>
TryBot-Result: Go Bot <[email protected]>
Reviewed-by: Alexander Rakoczy <[email protected]>
1 parent f2096ad commit 79f0ef2

File tree

2 files changed

+131
-13
lines changed

2 files changed

+131
-13
lines changed

internal/coordinator/pool/ledger.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type entry struct {
2525
createdAt time.Time
2626
instanceID string
2727
instanceName string
28+
instanceType string
2829
vCPUCount int64
2930
}
3031

@@ -40,6 +41,10 @@ type ledger struct {
4041
// entries contains a mapping of instance name to entries for each instance
4142
// that has resources allocated to it.
4243
entries map[string]*entry
44+
// instanceA1Limit is the maximum number of a1.metal instances that can be created on EC2.
45+
instanceA1Limit int64
46+
// instanceA1Used is the current count of a1.metal instances that have been created or are being created.
47+
instanceA1Used int64
4348
// types contains a mapping of instance type names to instance types for each
4449
// ARM64 EC2 instance.
4550
types map[string]*cloud.InstanceType
@@ -48,8 +53,9 @@ type ledger struct {
4853
// newLedger creates a new ledger.
4954
func newLedger() *ledger {
5055
return &ledger{
51-
entries: make(map[string]*entry),
52-
types: make(map[string]*cloud.InstanceType),
56+
entries: make(map[string]*entry),
57+
instanceA1Limit: 1, // TODO(golang.org/issue/42604) query for limit once issue is resolved.
58+
types: make(map[string]*cloud.InstanceType),
5359
}
5460
}
5561

@@ -65,7 +71,7 @@ func (l *ledger) ReserveResources(ctx context.Context, instName, vmType string)
6571
t := time.NewTicker(2 * time.Second)
6672
defer t.Stop()
6773
for {
68-
if l.allocateCPU(instType.CPU, instName) {
74+
if l.allocateResources(instType.CPU, instName, instType.Type) {
6975
return nil
7076
}
7177
select {
@@ -94,44 +100,58 @@ func (l *ledger) PrepareReservationRequest(instName, vmType string) (*cloud.Inst
94100
return instType, nil
95101
}
96102

97-
// releaseResources deletes the entry associated with an instance. The resources associated to the
103+
const a1MetalInstance = "a1.metal" // added for golang.org/issue/42604
104+
105+
// releaseResources deletes the entry associated with an instance. The resources associated with the
98106
// instance will also be released. An error is returned if the instance entry is not found.
99107
// Lock l.mu must be held by the caller.
100108
func (l *ledger) releaseResources(instName string) error {
101109
e, ok := l.entries[instName]
102110
if !ok {
103111
return fmt.Errorf("instance not found for releasing quota: %s", instName)
104112
}
113+
if e.instanceType == a1MetalInstance && l.instanceA1Used > 0 {
114+
l.instanceA1Used--
115+
}
105116
l.deallocateCPU(e.vCPUCount)
106117
return nil
107118
}
108119

109-
// allocateCPU ensures that there is enough CPU to allocate below the CPU Quota
120+
// allocateResources ensures that there is enough CPU to allocate below the CPU Quota
110121
// for the caller to create a resource with the numCPU passed in. If there is enough
111122
// then the amount of used CPU will increase by the requested amount. If there is
112123
// not enough CPU available, then false is returned. In the event that CPU is allocated
113124
// an entry will be added in the entries map for the instance.
114-
func (l *ledger) allocateCPU(numCPU int64, instName string) bool {
125+
// It also enforces instance type limits.
126+
func (l *ledger) allocateResources(numCPU int64, instName, instType string) bool {
115127
// should never happen
116128
if numCPU <= 0 {
117129
log.Printf("invalid allocation requested: %d", numCPU)
118130
return false
119131
}
132+
isA1Metal := instType == a1MetalInstance
120133

121134
l.mu.Lock()
122135
defer l.mu.Unlock()
123136

137+
if isA1Metal && l.instanceA1Used >= l.instanceA1Limit {
138+
return false
139+
}
124140
if numCPU+l.cpuUsed > l.cpuLimit {
125141
return false
126142
}
127143
l.cpuUsed += numCPU
144+
if isA1Metal {
145+
l.instanceA1Used++
146+
}
128147
e, ok := l.entries[instName]
129148
if ok {
130149
e.vCPUCount = numCPU
131150
} else {
132151
l.entries[instName] = &entry{
133152
instanceName: instName,
134153
vCPUCount: numCPU,
154+
instanceType: instType,
135155
}
136156
}
137157
return true

internal/coordinator/pool/ledger_test.go

Lines changed: 105 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,9 @@ func TestLedgerReleaseResources(t *testing.T) {
115115
instName string
116116
entry *entry
117117
cpuUsed int64
118+
a1Used int64
118119
wantCPUUsed int64
120+
wantA1Used int64
119121
wantErr bool
120122
}{
121123
{
@@ -126,7 +128,9 @@ func TestLedgerReleaseResources(t *testing.T) {
126128
vCPUCount: 10,
127129
},
128130
cpuUsed: 20,
131+
a1Used: 0,
129132
wantCPUUsed: 10,
133+
wantA1Used: 0,
130134
wantErr: false,
131135
},
132136
{
@@ -137,14 +141,45 @@ func TestLedgerReleaseResources(t *testing.T) {
137141
vCPUCount: 10,
138142
},
139143
cpuUsed: 20,
144+
a1Used: 0,
140145
wantCPUUsed: 20,
146+
wantA1Used: 0,
147+
wantErr: true,
148+
},
149+
{
150+
desc: "success-with-a1-instance",
151+
instName: "inst-x",
152+
entry: &entry{
153+
instanceName: "inst-x",
154+
vCPUCount: 10,
155+
instanceType: a1MetalInstance,
156+
},
157+
cpuUsed: 20,
158+
a1Used: 1,
159+
wantCPUUsed: 10,
160+
wantA1Used: 0,
161+
wantErr: false,
162+
},
163+
{
164+
desc: "entry-not-found-with-a1-instance",
165+
instName: "inst-x",
166+
entry: &entry{
167+
instanceName: "inst-w",
168+
vCPUCount: 10,
169+
instanceType: a1MetalInstance,
170+
},
171+
cpuUsed: 20,
172+
a1Used: 1,
173+
wantCPUUsed: 20,
174+
wantA1Used: 1,
141175
wantErr: true,
142176
},
143177
}
144178
for _, tc := range testCases {
145179
t.Run(tc.desc, func(t *testing.T) {
146180
l := &ledger{
147-
cpuUsed: tc.cpuUsed,
181+
cpuUsed: tc.cpuUsed,
182+
instanceA1Used: tc.a1Used,
148183
entries: map[string]*entry{
149184
tc.entry.instanceName: tc.entry,
150185
},
@@ -156,62 +191,125 @@ func TestLedgerReleaseResources(t *testing.T) {
156191
if l.cpuUsed != tc.wantCPUUsed {
157192
t.Errorf("ledger.cpuUsed = %d; wanted %d", l.cpuUsed, tc.wantCPUUsed)
158193
}
194+
if l.instanceA1Used != tc.wantA1Used {
195+
t.Errorf("ledger.instanceA1Used = %d; wanted %d", l.instanceA1Used, tc.wantA1Used)
196+
}
159197
})
160198
}
161199
}
162200

163-
func TestLedgerAllocateCPU(t *testing.T) {
201+
func TestLedgerAllocateResources(t *testing.T) {
164202
testCases := []struct {
165203
desc string
166204
numCPU int64
167205
cpuLimit int64
168206
cpuUsed int64
207+
a1Used int64
208+
a1Limit int64
169209
instName string
210+
instType string
170211
wantReserve bool
171212
wantCPUUsed int64
213+
wantA1Used int64
172214
}{
173215
{
174216
desc: "reservation-success",
175217
numCPU: 10,
176218
cpuLimit: 10,
177219
cpuUsed: 0,
220+
a1Used: 0,
221+
a1Limit: 1,
178222
instName: "chacha",
223+
instType: "x.type",
179224
wantReserve: true,
180225
wantCPUUsed: 10,
226+
wantA1Used: 0,
181227
},
182228
{
183229
desc: "failed-to-reserve",
230+
a1Used: 0,
231+
a1Limit: 1,
184232
numCPU: 10,
185233
cpuLimit: 5,
186234
cpuUsed: 0,
187235
instName: "pasa",
236+
instType: "x.type",
188237
wantReserve: false,
189238
wantCPUUsed: 0,
239+
wantA1Used: 0,
190240
},
191241
{
192242
desc: "invalid-cpu-count",
243+
a1Used: 0,
244+
a1Limit: 1,
193245
numCPU: 0,
194246
cpuLimit: 50,
195247
cpuUsed: 20,
196248
instName: "double",
249+
instType: "x.type",
197250
wantReserve: false,
198251
wantCPUUsed: 20,
252+
wantA1Used: 0,
253+
},
254+
{
255+
desc: "reservation-success with a1.metal instance",
256+
numCPU: 10,
257+
cpuLimit: 10,
258+
cpuUsed: 0,
259+
a1Used: 0,
260+
a1Limit: 1,
261+
instName: "chacha",
262+
instType: a1MetalInstance,
263+
wantReserve: true,
264+
wantCPUUsed: 10,
265+
wantA1Used: 1,
266+
},
267+
{
268+
desc: "failed-to-reserve with a1.metal instance",
269+
a1Used: 0,
270+
a1Limit: 1,
271+
numCPU: 10,
272+
cpuLimit: 5,
273+
cpuUsed: 0,
274+
instName: "pasa",
275+
instType: a1MetalInstance,
276+
wantReserve: false,
277+
wantCPUUsed: 0,
278+
wantA1Used: 0,
279+
},
280+
{
281+
desc: "invalid-cpu-count with a1.metal instance",
282+
a1Used: 0,
283+
a1Limit: 10,
284+
numCPU: 0,
285+
cpuLimit: 50,
286+
cpuUsed: 20,
287+
instName: "double",
288+
instType: a1MetalInstance,
289+
wantReserve: false,
290+
wantCPUUsed: 20,
291+
wantA1Used: 0,
199292
},
200293
}
201294
for _, tc := range testCases {
202295
t.Run(tc.desc, func(t *testing.T) {
203296
l := &ledger{
204-
entries: make(map[string]*entry),
205-
cpuLimit: tc.cpuLimit,
206-
cpuUsed: tc.cpuUsed,
297+
entries: make(map[string]*entry),
298+
cpuLimit: tc.cpuLimit,
299+
cpuUsed: tc.cpuUsed,
300+
instanceA1Limit: tc.a1Limit,
301+
instanceA1Used: tc.a1Used,
207302
}
208-
gotReserve := l.allocateCPU(tc.numCPU, tc.instName)
303+
gotReserve := l.allocateResources(tc.numCPU, tc.instName, tc.instType)
209304
if gotReserve != tc.wantReserve {
210-
t.Errorf("ledger.allocateCPU(%d) = %v, want %v", tc.numCPU, gotReserve, tc.wantReserve)
305+
t.Errorf("ledger.allocateResources(%d) = %v, want %v", tc.numCPU, gotReserve, tc.wantReserve)
211306
}
212307
if l.cpuUsed != tc.wantCPUUsed {
213308
t.Errorf("ledger.cpuUsed = %d; want %d", l.cpuUsed, tc.wantCPUUsed)
214309
}
310+
if l.instanceA1Used != tc.wantA1Used {
311+
t.Errorf("ledger.instanceA1Used = %d; want %d", l.instanceA1Used, tc.wantA1Used)
312+
}
215313
if _, ok := l.entries[tc.instName]; tc.wantReserve && !ok {
216314
t.Fatalf("ledger.entries[%s] = nil; want it to exist", tc.instName)
217315
}

0 commit comments

Comments
 (0)