scheduler: preserve allocations enriched during placement as 'informational' #24960
base: main
@@ -388,6 +388,9 @@ func TestReconciler_Place_Existing(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -429,6 +432,9 @@ func TestReconciler_ScaleDown_Partial(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -471,6 +477,9 @@ func TestReconciler_ScaleDown_Zero(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -514,6 +523,9 @@ func TestReconciler_ScaleDown_Zero_DuplicateNames(t *testing.T) {
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2))
        allocs = append(allocs, alloc)
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        expectedStopped = append(expectedStopped, i%2)
    }

@@ -552,6 +564,9 @@ func TestReconciler_Inplace(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -593,6 +608,9 @@ func TestReconciler_Inplace_ScaleUp(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -636,6 +654,9 @@ func TestReconciler_Inplace_ScaleDown(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -686,6 +707,9 @@ func TestReconciler_Inplace_Rollback(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }
    // allocs[0] is an allocation from version 0

@@ -746,6 +770,9 @@ func TestReconciler_Destructive(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -782,6 +809,9 @@ func TestReconciler_DestructiveMaxParallel(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -821,6 +851,9 @@ func TestReconciler_Destructive_ScaleUp(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -863,6 +896,9 @@ func TestReconciler_Destructive_ScaleDown(t *testing.T) {
        alloc.JobID = job.ID
        alloc.NodeID = uuid.Generate()
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}
        allocs = append(allocs, alloc)
    }

@@ -1018,6 +1054,10 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) {
        alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
        alloc.DesiredStatus = structs.AllocDesiredStatusRun

        // set host volume IDs on running allocations to make sure their presence doesn't
        // interfere with reconciler behavior
        alloc.HostVolumeIDs = []string{"host-volume1", "host-volume2"}

        // Set one of the allocations to failed
        if i == 4 {
            alloc.ClientStatus = structs.AllocClientStatusFailed

@@ -1063,6 +1103,126 @@ func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) {
    }
}

func TestReconciler_InformationalAllocs(t *testing.T) {
    disabledReschedulePolicy := &structs.ReschedulePolicy{
        Attempts:  0,
        Unlimited: false,
    }

    ci.Parallel(t)
    now := time.Now()

    testCases := []struct {
        name             string
        count            int
        stoppedCount     int
        failedCount      int
        reschedulePolicy *structs.ReschedulePolicy
        expectPlace      int
        expectStop       int
        expectIgnore     int
    }{
        {
            name:             "Count 3, 2 allocs failed, 1 stopped, no reschedule",
            count:            3,
            stoppedCount:     1,
            failedCount:      2,
            reschedulePolicy: disabledReschedulePolicy,
            expectPlace:      2,
            expectStop:       1,
            expectIgnore:     1,
        },
Comment on lines +1125 to +1134

This case appears to cover your comment @tgross, does it not? The desired behavior, if I'm correct, should be 2 placed allocs, 1 stopped, and 1 ignored in this case, which is exactly what we're getting.

Oh, I see... this test is actually quite complicated, as the first failed alloc is on the down node and the second failed alloc is on the disconnected node. So the 2nd failed alloc is resulting in a replacement for the disconnect? We should probably leave a comment on the …
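Following up on that suggestion, one way this case's expectations could be annotated — a sketch only, based on the reviewers' reading of the node setup later in this test (allocs are built failed-first, and the tainted map marks allocs[0]'s node down and allocs[1]'s node disconnected); this is not part of the PR's diff:

        {
            // allocs[0] (failed) is on the down node, allocs[1] (failed) is on the
            // disconnected node, and the single stopped alloc is on an untainted node.
            name:             "Count 3, 2 allocs failed, 1 stopped, no reschedule",
            count:            3,
            stoppedCount:     1,
            failedCount:      2,
            reschedulePolicy: disabledReschedulePolicy,
            expectPlace:      2, // replace the alloc on the down node + a replacement for the disconnected alloc
            expectStop:       1,
            expectIgnore:     1,
        },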
        {
            name:             "Count 1, 1 alloc failed, 1 stopped, reschedule",
            count:            1,
            stoppedCount:     1,
            failedCount:      1,
            reschedulePolicy: &structs.ReschedulePolicy{
                Attempts: 1,
            },
            expectPlace:  1,
            expectStop:   2,
            expectIgnore: 0,
        },
        {
            name:             "Count 2, no allocs failed, 2 stopped, no reschedule",
            count:            2,
            stoppedCount:     2,
            failedCount:      0,
            reschedulePolicy: disabledReschedulePolicy,
            expectPlace:      2,
            expectStop:       1,
            expectIgnore:     0,
Comment on lines +1153 to +1155

I'm not 100% confident about these. Intuitively, I would expect 2 allocs to place, 0 to ignore, and 0 to stop. This might have to do with which nodes are available in this case; I will look into it and do some additional manual testing to be sure we're setting alloc desired status and client status correctly.

Yeah, agreed, this one looks funny. Like the one above, I'd annotate the expectations here because it's not intuitive. You've got no failed allocs, so one stopped alloc is sitting on a down node, and the other stopped alloc is disconnected. So I'd expect 1 placement for the down node, and 1 temporary replacement for the disconnected alloc. Where does the stop come from? Are we calling stop for an allocation that's already been stopped?
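As with the first case, a short annotation could capture the reviewer's reading of the setup. This is a sketch only; it deliberately leaves the source of the extra stop as the open question raised above rather than asserting an answer:

        {
            // With no failed allocs, the two stopped allocs land on the down node
            // (allocs[0]) and the disconnected node (allocs[1]) respectively.
            name:             "Count 2, no allocs failed, 2 stopped, no reschedule",
            count:            2,
            stoppedCount:     2,
            failedCount:      0,
            reschedulePolicy: disabledReschedulePolicy,
            expectPlace:      2, // one placement for the down node, one replacement for the disconnected alloc
            expectStop:       1, // open question in review: where does this stop come from?
            expectIgnore:     0,
        },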
        },
    }

    for _, tc := range testCases {
        t.Run(tc.name, func(t *testing.T) {
            job := mock.Job()
            job.TaskGroups[0].Count = tc.count
            job.TaskGroups[0].ReschedulePolicy = tc.reschedulePolicy

            var allocs []*structs.Allocation
            for i := 0; i < tc.failedCount; i++ {
                alloc := mock.Alloc()
                alloc.Job = job
                alloc.JobID = job.ID
                alloc.NodeID = uuid.Generate()
                alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
                alloc.HostVolumeIDs = []string{"foo"}
                alloc.DesiredStatus = structs.AllocDesiredStatusRun
                alloc.ClientStatus = structs.AllocClientStatusFailed

                allocs = append(allocs, alloc)
            }

            for i := 0; i < tc.stoppedCount; i++ {
                alloc := mock.Alloc()
                alloc.Job = job
                alloc.JobID = job.ID
                alloc.NodeID = uuid.Generate()
                alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
                alloc.HostVolumeIDs = []string{"foo"}
                alloc.DesiredStatus = structs.AllocDesiredStatusStop
                alloc.ClientStatus = structs.AllocClientStatusComplete

                allocs = append(allocs, alloc)
            }

            // Build a map of tainted nodes, one down one disconnected
            tainted := make(map[string]*structs.Node, 2)
            downNode := mock.Node()
            downNode.ID = allocs[0].NodeID
            downNode.Status = structs.NodeStatusDown
            tainted[downNode.ID] = downNode

            disconnected := mock.Node()
            disconnected.ID = allocs[1].NodeID
            disconnected.Status = structs.NodeStatusDisconnected
            tainted[disconnected.ID] = disconnected

            reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job,
                nil, allocs, tainted, "", 50, true, AllocRenconcilerWithNow(now))
            r := reconciler.Compute()

            // Assert the correct results
            assertResults(t, r, &resultExpectation{
                createDeployment:  nil,
                deploymentUpdates: nil,
                place:             tc.expectPlace,
                stop:              tc.expectStop,
                desiredTGUpdates: map[string]*structs.DesiredUpdates{
                    job.TaskGroups[0].Name: {
                        Place:  uint64(tc.expectPlace),
                        Stop:   uint64(tc.expectStop),
                        Ignore: uint64(tc.expectIgnore),
                    },
                },
            })
        })
    }
}

// Tests the reconciler properly handles lost nodes with allocations
func TestReconciler_LostNode(t *testing.T) {
    ci.Parallel(t)

These are a great quick-and-dirty addition for avoiding regressions 👍