diff --git a/.changelog/24974.txt b/.changelog/24974.txt new file mode 100644 index 00000000000..b45d5ef63c4 --- /dev/null +++ b/.changelog/24974.txt @@ -0,0 +1,3 @@ +```release-note:bug +state store: fix for setting correct job status in various scenarios +``` diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index fe700090f35..9f16d0cdb0e 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -117,28 +117,28 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) { defer cleanupS1() testutil.WaitForLeader(t, s1.RPC) + job := mock.Job() + // Insert "dead" eval store := s1.fsm.State() eval := mock.Eval() + eval.JobModifyIndex = job.ModifyIndex eval.CreateTime = time.Now().UTC().Add(-6 * time.Hour).UnixNano() // make sure objects we insert are older than GC thresholds eval.ModifyTime = time.Now().UTC().Add(-5 * time.Hour).UnixNano() eval.Status = structs.EvalStatusFailed - must.NoError(t, store.UpsertJobSummary(999, mock.JobSummary(eval.JobID))) - must.NoError(t, store.UpsertEvals(structs.MsgTypeTestSetup, 1000, []*structs.Evaluation{eval})) + must.NoError(t, store.UpsertEvals(structs.MsgTypeTestSetup, 1001, []*structs.Evaluation{eval})) + + // Insert mock job with default reschedule policy of 2 in 10 minutes + job.ID = eval.JobID + must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, 1002, nil, job)) // Insert "pending" eval for same job eval2 := mock.Eval() eval2.JobID = eval.JobID + eval2.JobModifyIndex = job.ModifyIndex // must have same modify index as job in order to set job status correctly eval2.CreateTime = time.Now().UTC().Add(-6 * time.Hour).UnixNano() // make sure objects we insert are older than GC thresholds eval2.ModifyTime = time.Now().UTC().Add(-5 * time.Hour).UnixNano() - must.NoError(t, store.UpsertJobSummary(999, mock.JobSummary(eval2.JobID))) - must.NoError(t, store.UpsertEvals(structs.MsgTypeTestSetup, 1003, []*structs.Evaluation{eval2})) - - // Insert mock job with default reschedule policy of 2 in 10 minutes - job := mock.Job() - job.ID = eval.JobID - - must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, 1001, nil, job)) + must.NoError(t, store.UpsertEvals(structs.MsgTypeTestSetup, 1002, []*structs.Evaluation{eval2})) // Insert failed alloc with an old reschedule attempt, can be GCed alloc := mock.Alloc() @@ -1049,12 +1049,14 @@ func TestCoreScheduler_JobGC_OutstandingEvals(t *testing.T) { // Insert two evals, one terminal and one not eval := mock.Eval() eval.JobID = job.ID + eval.JobModifyIndex = job.ModifyIndex eval.Status = structs.EvalStatusComplete eval.CreateTime = time.Now().Add(-6 * time.Hour).UnixNano() // make sure objects we insert are older than GC thresholds eval.ModifyTime = time.Now().Add(-5 * time.Hour).UnixNano() eval2 := mock.Eval() eval2.JobID = job.ID + eval2.JobModifyIndex = job.ModifyIndex eval2.Status = structs.EvalStatusPending eval2.CreateTime = time.Now().Add(-6 * time.Hour).UnixNano() // make sure objects we insert are older than GC thresholds eval2.ModifyTime = time.Now().Add(-5 * time.Hour).UnixNano() @@ -1145,6 +1147,7 @@ func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) { // Insert two allocs, one terminal and one not alloc := mock.Alloc() alloc.JobID = job.ID + alloc.Job = job alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusRun alloc.ClientStatus = structs.AllocClientStatusComplete @@ -1152,6 +1155,7 @@ func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) { alloc2 := mock.Alloc() alloc2.JobID = job.ID + alloc.Job = job alloc2.EvalID = eval.ID alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = structs.AllocClientStatusRunning @@ -1484,6 +1488,7 @@ func TestCoreScheduler_JobGC_Force(t *testing.T) { // Insert a terminal eval eval := mock.Eval() eval.JobID = job.ID + eval.JobModifyIndex = job.ModifyIndex eval.Status = structs.EvalStatusComplete eval.CreateTime = time.Now().Add(-6 * time.Hour).UnixNano() // make sure objects we insert are older than GC thresholds eval.ModifyTime = time.Now().Add(-5 * time.Hour).UnixNano() diff --git a/nomad/job_endpoint.go b/nomad/job_endpoint.go index 7267f0a15b6..60f2d67c101 100644 --- a/nomad/job_endpoint.go +++ b/nomad/job_endpoint.go @@ -422,7 +422,7 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis return nil } - if eval != nil && !submittedEval { + if !submittedEval { eval.JobModifyIndex = reply.JobModifyIndex update := &structs.EvalUpdateRequest{ Evals: []*structs.Evaluation{eval}, @@ -659,6 +659,10 @@ func (j *Job) Revert(args *structs.JobRevertRequest, reply *structs.JobRegisterR // Clear out the VersionTag to prevent tag duplication revJob.VersionTag = nil + // Set the stable flag to false as this is functionally a new registration + // and should handle deployment updates + revJob.Stable = false + reg := &structs.JobRegisterRequest{ Job: revJob, WriteRequest: args.WriteRequest, diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index f5b690291f3..0ea7e2510a5 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -3716,6 +3716,9 @@ func (s *StateStore) DeleteEval(index uint64, evals, allocs []string, userInitia } } + // TODO: should we really be doing this here? The only time this will affect the + // status of a job is if it's the last eval and alloc for a client, at which point + // the status of the job will already be "dead" from handling the alloc update. // Set the job's status if err := s.setJobStatuses(index, txn, jobs, true); err != nil { return fmt.Errorf("setting job status failed: %v", err) @@ -3851,11 +3854,6 @@ func (s *StateStore) EvalsByJob(ws memdb.WatchSet, namespace, jobID string) ([]* e := raw.(*structs.Evaluation) - // Filter non-exact matches - if e.JobID != jobID { - continue - } - out = append(out, e) } return out, nil @@ -4824,14 +4822,13 @@ func (s *StateStore) UpdateDeploymentStatus(msgType structs.MessageType, index u return err } - // Upsert the job if necessary + // On failed deployments with auto_revert set to true, a new eval and job will be included on the request. + // We should upsert them both if req.Job != nil { if err := s.upsertJobImpl(index, nil, req.Job, false, txn); err != nil { return err } } - - // Upsert the optional eval if req.Eval != nil { if err := s.nestedUpsertEval(txn, index, req.Eval); err != nil { return err @@ -5532,6 +5529,12 @@ func (s *StateStore) setJobStatus(index uint64, txn *txn, if err := s.setJobSummary(txn, updated, index, oldStatus, newStatus); err != nil { return fmt.Errorf("job summary update failed %w", err) } + + // Update the job version details + if err := s.upsertJobVersion(index, updated, txn); err != nil { + return err + } + return nil } @@ -5604,7 +5607,7 @@ func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) ( if job.Type == structs.JobTypeSystem || job.IsParameterized() || job.IsPeriodic() { - if job.Stop { + if job.Stopped() { return structs.JobStatusDead, nil } return structs.JobStatusRunning, nil @@ -5615,13 +5618,27 @@ func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) ( return "", err } - // If there is a non-terminal allocation, the job is running. - hasAlloc := false + terminalAllocs := false for alloc := allocs.Next(); alloc != nil; alloc = allocs.Next() { - hasAlloc = true - if !alloc.(*structs.Allocation).TerminalStatus() { + a := alloc.(*structs.Allocation) + + // If there is a non-terminal allocation, the job is running. + if !a.TerminalStatus() { return structs.JobStatusRunning, nil } + + // Check if the allocs are reschedulable before before + // marking the job dead. If any of the allocs are terminal + // and not reschedulable, mark the job dead. + if !isReschedulable(a) { + terminalAllocs = true + } + } + + // The job is dead if it is stopped and there are no allocs + // or all allocs are terminal + if job.Stopped() { + return structs.JobStatusDead, nil } evals, err := txn.Get("evals", "job_prefix", job.Namespace, job.ID) @@ -5629,27 +5646,36 @@ func (s *StateStore) getJobStatus(txn *txn, job *structs.Job, evalDelete bool) ( return "", err } - hasEval := false + terminalEvals := false for raw := evals.Next(); raw != nil; raw = evals.Next() { e := raw.(*structs.Evaluation) - // Filter non-exact matches - if e.JobID != job.ID { + // This handles restarting stopped jobs, or else they are marked dead. + // We need to be careful with this, because an eval can technically + // still apply to a job with a greater modify index, i.e. during reschedule + // but we handle reschedule above. + if e.JobModifyIndex < job.ModifyIndex { continue } - hasEval = true if !e.TerminalStatus() { return structs.JobStatusPending, nil } + + terminalEvals = true } - // The job is dead if all the allocations and evals are terminal or if there - // are no evals because of garbage collection. - if evalDelete || hasEval || hasAlloc { + // The job is dead if all allocations for this version are terminal, + // all evals are terminal. + // Also, in the event a jobs allocs and evals are all GC'd, we don't + // want the job to be marked pending. + if terminalAllocs || terminalEvals || evalDelete { return structs.JobStatusDead, nil } + // There are no allocs/evals yet, which can happen for new job submissions, + // running new versions of a job, or reverting. This will happen if + // the evaluation is persisted after the job is persisted. return structs.JobStatusPending, nil } @@ -7348,6 +7374,30 @@ func (s *StateStore) ScalingPoliciesByIDPrefix(ws memdb.WatchSet, namespace stri return iter, nil } +func isReschedulable(a *structs.Allocation) bool { + if a.ReschedulePolicy() == nil || a.Job.Type != structs.JobTypeService { + return false + } + + reschedulePolicy := a.ReschedulePolicy() + availableAttempts := reschedulePolicy.Attempts + interval := reschedulePolicy.Interval + attempted := 0 + currTime := time.Now() + + // Loop over reschedule tracker to find attempts within the restart policy's interval + if a.RescheduleTracker != nil && availableAttempts > 0 && interval > 0 { + for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- { + lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime + timeDiff := currTime.UTC().UnixNano() - lastAttempt + if timeDiff < interval.Nanoseconds() { + attempted += 1 + } + } + } + return attempted < availableAttempts +} + // scalingPolicyNamespaceFilter returns a filter function that filters all // scaling policies not targeting the given namespace. func scalingPolicyNamespaceFilter(namespace string) func(interface{}) bool { diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index c0efa31112f..f00f27b9bf3 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -5,6 +5,7 @@ package state import ( "context" + "errors" "fmt" "reflect" "sort" @@ -3653,19 +3654,22 @@ func TestStateStore_JobsByGC(t *testing.T) { } for i := 0; i < 20; i += 2 { + idx := 2000 + uint64(i+1) job := mock.Job() job.Type = structs.JobTypeBatch + job.ModifyIndex = idx gc[job.ID] = struct{}{} - if err := state.UpsertJob(structs.MsgTypeTestSetup, 2000+uint64(i), nil, job); err != nil { + if err := state.UpsertJob(structs.MsgTypeTestSetup, idx, nil, job); err != nil { t.Fatalf("err: %v", err) } // Create an eval for it eval := mock.Eval() eval.JobID = job.ID + eval.JobModifyIndex = job.ModifyIndex eval.Status = structs.EvalStatusComplete - if err := state.UpsertEvals(structs.MsgTypeTestSetup, 2000+uint64(i+1), []*structs.Evaluation{eval}); err != nil { + if err := state.UpsertEvals(structs.MsgTypeTestSetup, idx, []*structs.Evaluation{eval}); err != nil { t.Fatalf("err: %v", err) } @@ -4867,6 +4871,7 @@ func TestStateStore_UpsertEvals_Eval_ChildJob(t *testing.T) { eval := mock.Eval() eval.Status = structs.EvalStatusComplete eval.JobID = child.ID + eval.JobModifyIndex = child.ModifyIndex // Create watchsets so we can test that upsert fires the watch ws := memdb.NewWatchSet() @@ -5084,6 +5089,7 @@ func TestStateStore_DeleteEval_Eval(t *testing.T) { require.Equal(t, uint64(1002), evalsIndex) } +// This tests the evalDelete boolean by deleting a Pending eval and Pending Alloc. func TestStateStore_DeleteEval_ChildJob(t *testing.T) { ci.Parallel(t) @@ -5792,6 +5798,7 @@ func TestStateStore_UpdateAllocsFromClient(t *testing.T) { must.NoError(t, state.UpsertJob(structs.MsgTypeTestSetup, 998, nil, parent)) child := mock.Job() + child.Type = structs.JobTypeBatch child.Status = "" child.ParentID = parent.ID must.NoError(t, state.UpsertJob(structs.MsgTypeTestSetup, 999, nil, child)) @@ -7678,232 +7685,228 @@ func TestStateStore_SetJobStatus(t *testing.T) { } } -func TestStateStore_GetJobStatus_NoEvalsOrAllocs(t *testing.T) { +func TestStateStore_GetJobStatus(t *testing.T) { ci.Parallel(t) - job := mock.Job() - state := testStateStore(t) - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, false) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusPending { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusPending) - } -} - -func TestStateStore_GetJobStatus_NoEvalsOrAllocs_Periodic(t *testing.T) { - ci.Parallel(t) - - job := mock.PeriodicJob() - state := testStateStore(t) - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, false) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusRunning { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusRunning) - } -} - -func TestStateStore_GetJobStatus_NoEvalsOrAllocs_EvalDelete(t *testing.T) { - ci.Parallel(t) - - job := mock.Job() - state := testStateStore(t) - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, true) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusDead { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusDead) - } -} - -func TestStateStore_GetJobStatus_DeadEvalsAndAllocs(t *testing.T) { - ci.Parallel(t) - - state := testStateStore(t) - job := mock.Job() - - // Create a mock alloc that is dead. - alloc := mock.Alloc() - alloc.JobID = job.ID - alloc.DesiredStatus = structs.AllocDesiredStatusStop - state.UpsertJobSummary(999, mock.JobSummary(alloc.JobID)) - if err := state.UpsertAllocs(structs.MsgTypeTestSetup, 1000, []*structs.Allocation{alloc}); err != nil { - t.Fatalf("err: %v", err) - } - - // Create a mock eval that is complete - eval := mock.Eval() - eval.JobID = job.ID - eval.Status = structs.EvalStatusComplete - if err := state.UpsertEvals(structs.MsgTypeTestSetup, 1001, []*structs.Evaluation{eval}); err != nil { - t.Fatalf("err: %v", err) - } - - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, false) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusDead { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusDead) - } -} - -func TestStateStore_GetJobStatus_RunningAlloc(t *testing.T) { - ci.Parallel(t) - - state := testStateStore(t) - job := mock.Job() - - // Create a mock alloc that is running. - alloc := mock.Alloc() - alloc.JobID = job.ID - alloc.DesiredStatus = structs.AllocDesiredStatusRun - state.UpsertJobSummary(999, mock.JobSummary(alloc.JobID)) - if err := state.UpsertAllocs(structs.MsgTypeTestSetup, 1000, []*structs.Allocation{alloc}); err != nil { - t.Fatalf("err: %v", err) - } - - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, true) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusRunning { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusRunning) - } -} - -func TestStateStore_GetJobStatus_PeriodicJob(t *testing.T) { - ci.Parallel(t) - - state := testStateStore(t) - job := mock.PeriodicJob() - - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, false) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusRunning { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusRunning) - } - - // Mark it as stopped - job.Stop = true - status, err = state.getJobStatus(txn, job, false) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusDead { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusDead) - } -} - -func TestStateStore_GetJobStatus_ParameterizedJob(t *testing.T) { - ci.Parallel(t) - - state := testStateStore(t) - job := mock.Job() - job.ParameterizedJob = &structs.ParameterizedJobConfig{} - - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, false) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusRunning { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusRunning) - } - - // Mark it as stopped - job.Stop = true - status, err = state.getJobStatus(txn, job, false) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } - - if status != structs.JobStatusDead { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusDead) - } -} - -func TestStateStore_SetJobStatus_PendingEval(t *testing.T) { - ci.Parallel(t) - - state := testStateStore(t) - job := mock.Job() - - // Create a mock eval that is pending. - eval := mock.Eval() - eval.JobID = job.ID - eval.Status = structs.EvalStatusPending - if err := state.UpsertEvals(structs.MsgTypeTestSetup, 1000, []*structs.Evaluation{eval}); err != nil { - t.Fatalf("err: %v", err) - } + testCases := []struct { + name string + setup func(*txn) (*structs.Job, error) + exp string + }{ + { + name: "stopped job with running allocations is still running", + setup: func(txn *txn) (*structs.Job, error) { + j := mock.Job() + + a := mock.Alloc() + a.JobID = j.ID + a.Job = j + a.ClientStatus = structs.AllocClientStatusRunning + if err := txn.Insert("allocs", a); err != nil { + return nil, err + } + + stoppedJob := j.Copy() + stoppedJob.Stop = true + stoppedJob.Version += 1 + return stoppedJob, nil + }, + exp: structs.JobStatusRunning, + }, + { + name: "stopped job with terminal allocs is dead", + setup: func(txn *txn) (*structs.Job, error) { + j := mock.Job() + j.Stop = true + + a := mock.Alloc() + a.JobID = j.ID + a.Job = j + a.ClientStatus = structs.AllocClientStatusComplete + if err := txn.Insert("allocs", a); err != nil { + return nil, err + } + return j, nil + }, + exp: structs.JobStatusDead, + }, + { + name: "parameterized job", + setup: func(txn *txn) (*structs.Job, error) { + j := mock.Job() + j.ParameterizedJob = &structs.ParameterizedJobConfig{} + j.Dispatched = false + return j, nil + }, + exp: structs.JobStatusRunning, + }, + { + name: "periodic job", + setup: func(txn *txn) (*structs.Job, error) { + j := mock.Job() + j.Periodic = &structs.PeriodicConfig{} + return j, nil + }, + exp: structs.JobStatusRunning, + }, + { + name: "no allocs", + setup: func(txn *txn) (*structs.Job, error) { + return mock.Job(), nil + }, + exp: structs.JobStatusPending, + }, + { + name: "current job has running alloc", + setup: func(txn *txn) (*structs.Job, error) { + j := mock.Job() + a := mock.Alloc() + + a.JobID = j.ID + if err := txn.Insert("allocs", a); err != nil { + return nil, err + } + return j, nil + }, + exp: structs.JobStatusRunning, + }, + { + name: "previous job version had allocs", + setup: func(txn *txn) (*structs.Job, error) { + j := mock.Job() + a := mock.Alloc() + e := mock.Eval() + + e.JobID = j.ID + e.JobModifyIndex = j.ModifyIndex + e.Status = structs.EvalStatusPending + + a.JobID = j.ID + a.Job = j + a.ClientStatus = structs.AllocClientStatusFailed + + j.Version += 1 + if err := txn.Insert("allocs", a); err != nil { + return nil, err + } + if err := txn.Insert("evals", e); err != nil { + return nil, err + } + return j, nil + }, + exp: structs.JobStatusPending, + }, + { + name: "batch job has all terminal allocs, with no evals", + setup: func(txn *txn) (*structs.Job, error) { + j := mock.Job() + j.Type = structs.JobTypeBatch + + a := mock.Alloc() + a.ClientStatus = structs.AllocClientStatusFailed + a.JobID = j.ID + a.Job = j + + if err := txn.Insert("allocs", a); err != nil { + return nil, err + } + return j, nil + }, + exp: structs.JobStatusDead, + }, + { + name: "job has all terminal allocs, but pending eval", + setup: func(txn *txn) (*structs.Job, error) { + j := mock.Job() + a := mock.Alloc() - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, true) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } + a.ClientStatus = structs.AllocClientStatusFailed + a.JobID = j.ID - if status != structs.JobStatusPending { - t.Fatalf("getJobStatus() returned %v; expected %v", status, structs.JobStatusPending) - } -} + e := mock.Eval() + e.JobID = j.ID + e.JobModifyIndex = j.ModifyIndex + e.Status = structs.EvalStatusPending -// TestStateStore_SetJobStatus_SystemJob asserts that system jobs are still -// considered running until explicitly stopped. -func TestStateStore_SetJobStatus_SystemJob(t *testing.T) { - ci.Parallel(t) + if err := txn.Insert("allocs", a); err != nil { + return nil, err + } - state := testStateStore(t) - job := mock.SystemJob() + if err := txn.Insert("evals", e); err != nil { + return nil, err + } + return j, nil - // Create a mock eval that is pending. - eval := mock.Eval() - eval.JobID = job.ID - eval.Type = job.Type - eval.Status = structs.EvalStatusComplete - if err := state.UpsertEvals(structs.MsgTypeTestSetup, 1000, []*structs.Evaluation{eval}); err != nil { - t.Fatalf("err: %v", err) + }, + exp: structs.JobStatusPending, + }, + { + name: "reschedulable alloc is pending waiting for replacement", + setup: func(t *txn) (*structs.Job, error) { + j := mock.Job() + if j.TaskGroups[0].ReschedulePolicy == nil { + return nil, errors.New("mock job doesn't have replacement policy") + } + a := mock.Alloc() + a.Job = j + a.JobID = j.ID + a.ClientStatus = structs.AllocClientStatusFailed + if err := t.Insert("allocs", a); err != nil { + return nil, err + } + return j, nil + }, + exp: structs.JobStatusPending, + }, + { + name: "reschedulable alloc is dead after replacement fails", + setup: func(t *txn) (*structs.Job, error) { + j := mock.Job() + // give job one reschedule attempt + j.TaskGroups[0].ReschedulePolicy.Attempts = 1 + j.TaskGroups[0].ReschedulePolicy.Interval = time.Hour + + // Replacement alloc + a := mock.Alloc() + a.Job = j + a.JobID = j.ID + a.ClientStatus = structs.AllocClientStatusFailed + a.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + structs.NewRescheduleEvent(time.Now().UTC().UnixNano(), "", "", time.Minute), + }, + } + if err := t.Insert("allocs", a); err != nil { + return nil, err + } + + // Original alloc + a2 := mock.Alloc() + a2.Job = j + a2.JobID = j.ID + a2.ClientStatus = structs.AllocClientStatusFailed + a2.NextAllocation = a.ID + if err := t.Insert("allocs", a2); err != nil { + return nil, err + } + return j, nil + }, + exp: structs.JobStatusDead, + }, } - txn := state.db.ReadTxn() - status, err := state.getJobStatus(txn, job, true) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + state := testStateStore(t) - if expected := structs.JobStatusRunning; status != expected { - t.Fatalf("getJobStatus() returned %v; expected %v", status, expected) - } + txn := state.db.WriteTxn(0) - // Stop the job - job.Stop = true - status, err = state.getJobStatus(txn, job, true) - if err != nil { - t.Fatalf("getJobStatus() failed: %v", err) - } + job, _ := tc.setup(txn) - if expected := structs.JobStatusDead; status != expected { - t.Fatalf("getJobStatus() returned %v; expected %v", status, expected) + status, err := state.getJobStatus(txn, job, false) + require.NoError(t, err) + require.Equal(t, tc.exp, status) + }) } } diff --git a/nomad/system_endpoint_test.go b/nomad/system_endpoint_test.go index 209e875492b..f43d2791b45 100644 --- a/nomad/system_endpoint_test.go +++ b/nomad/system_endpoint_test.go @@ -41,6 +41,7 @@ func TestSystemEndpoint_GarbageCollect(t *testing.T) { eval := mock.Eval() eval.Status = structs.EvalStatusComplete eval.JobID = job.ID + eval.JobModifyIndex = job.ModifyIndex // set modify time older than now but still newer than default GC threshold eval.ModifyTime = time.Now().Add(-10 * time.Millisecond).UnixNano() must.NoError(t, state.UpsertEvals(structs.MsgTypeTestSetup, 1001, []*structs.Evaluation{eval}))