Skip to content

Commit

Permalink
Fix nil pointer exception when retrying local activity (#913)
Browse files Browse the repository at this point in the history
  • Loading branch information
yycptt authored and yux0 committed Dec 18, 2019
1 parent 9bbb0d3 commit 8b5a4e5
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 23 deletions.
90 changes: 67 additions & 23 deletions internal/internal_task_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"reflect"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/opentracing/opentracing-go"
Expand Down Expand Up @@ -91,7 +92,10 @@ type (
workflowInfo *WorkflowInfo
wth *workflowTaskHandlerImpl

eventHandler *workflowExecutionEventHandlerImpl
// eventHandler is changed to a atomic.Value as a temporally bug fix for local activity
// retry issue (github issue #915). Therefore, when accessing/modifying this field, the
// mutex should still be held.
eventHandler atomic.Value

isWorkflowCompleted bool
result []byte
Expand Down Expand Up @@ -123,6 +127,7 @@ type (
}

activityProvider func(name string) activity

// activityTaskHandlerImpl is the implementation of ActivityTaskHandler
activityTaskHandlerImpl struct {
taskListName string
Expand Down Expand Up @@ -401,6 +406,20 @@ func removeWorkflowContext(runID string) {
getWorkflowCache().Delete(runID)
}

func newWorkflowExecutionContext(
startTime time.Time,
workflowInfo *WorkflowInfo,
taskHandler *workflowTaskHandlerImpl,
) *workflowExecutionContextImpl {
workflowContext := &workflowExecutionContextImpl{
workflowStartTime: startTime,
workflowInfo: workflowInfo,
wth: taskHandler,
}
workflowContext.createEventHandler()
return workflowContext
}

func (w *workflowExecutionContextImpl) Lock() {
w.mutex.Lock()
}
Expand All @@ -421,6 +440,18 @@ func (w *workflowExecutionContextImpl) Unlock(err error) {
w.mutex.Unlock()
}

func (w *workflowExecutionContextImpl) getEventHandler() *workflowExecutionEventHandlerImpl {
eventHandler := w.eventHandler.Load()
if eventHandler == nil {
return nil
}
eventHandlerImpl, ok := eventHandler.(*workflowExecutionEventHandlerImpl)
if !ok {
panic("unknown type for workflow execution event handler")
}
return eventHandlerImpl
}

func (w *workflowExecutionContextImpl) completeWorkflow(result []byte, err error) {
w.isWorkflowCompleted = true
w.result = result
Expand Down Expand Up @@ -453,7 +484,7 @@ func (w *workflowExecutionContextImpl) onEviction() {
}

func (w *workflowExecutionContextImpl) IsDestroyed() bool {
return w.eventHandler == nil
return w.getEventHandler() == nil
}

func (w *workflowExecutionContextImpl) queueResetStickinessTask() {
Expand All @@ -479,17 +510,19 @@ func (w *workflowExecutionContextImpl) clearState() {
w.err = nil
w.previousStartedEventID = 0
w.newDecisions = nil
if w.eventHandler != nil {

eventHandler := w.getEventHandler()
if eventHandler != nil {
// Set isReplay to true to prevent user code in defer guarded by !isReplaying() from running
w.eventHandler.isReplay = true
w.eventHandler.Close()
w.eventHandler = nil
eventHandler.isReplay = true
eventHandler.Close()
w.eventHandler.Store((*workflowExecutionEventHandlerImpl)(nil))
}
}

func (w *workflowExecutionContextImpl) createEventHandler() {
w.clearState()
w.eventHandler = newWorkflowExecutionEventHandler(
eventHandler := newWorkflowExecutionEventHandler(
w.workflowInfo,
w.completeWorkflow,
w.wth.logger,
Expand All @@ -499,7 +532,8 @@ func (w *workflowExecutionContextImpl) createEventHandler() {
w.wth.dataConverter,
w.wth.contextPropagators,
w.wth.tracer,
).(*workflowExecutionEventHandlerImpl)
)
w.eventHandler.Store(eventHandler)
}

func resetHistory(task *s.PollForDecisionTaskResponse, historyIterator HistoryIterator) (*s.History, error) {
Expand Down Expand Up @@ -555,10 +589,7 @@ func (wth *workflowTaskHandlerImpl) createWorkflowContext(task *s.PollForDecisio
}

wfStartTime := time.Unix(0, h.Events[0].GetTimestamp())
workflowContext := &workflowExecutionContextImpl{workflowStartTime: wfStartTime, workflowInfo: workflowInfo, wth: wth}
workflowContext.createEventHandler()

return workflowContext, nil
return newWorkflowExecutionContext(wfStartTime, workflowInfo, wth), nil
}

func (wth *workflowTaskHandlerImpl) getOrCreateWorkflowContext(
Expand Down Expand Up @@ -733,7 +764,7 @@ func (w *workflowExecutionContextImpl) ProcessWorkflowTask(workflowTask *workflo
}
w.SetCurrentTask(task)

eventHandler := w.eventHandler
eventHandler := w.getEventHandler()
reorderedHistory := newHistory(workflowTask, eventHandler)
var replayDecisions []*s.Decision
var respondEvents []*s.HistoryEvent
Expand Down Expand Up @@ -870,7 +901,7 @@ func (w *workflowExecutionContextImpl) ProcessLocalActivityResult(workflowTask *
return nil, nil // nothing to do here as we are retrying...
}

err := w.eventHandler.ProcessLocalActivityResult(lar)
err := w.getEventHandler().ProcessLocalActivityResult(lar)
if err != nil {
return nil, err
}
Expand All @@ -887,7 +918,17 @@ func (w *workflowExecutionContextImpl) retryLocalActivity(lar *localActivityResu
if backoff > 0 && backoff <= w.GetDecisionTimeout() {
// we need a local retry
time.AfterFunc(backoff, func() {
if _, ok := w.eventHandler.pendingLaTasks[lar.task.activityID]; !ok {
// TODO: this should not be a separate goroutine as it introduces race condition when accessing eventHandler.
// currently this is solved by changing eventHandler to an atomic.Value. Ideally, this retry timer should be
// part of the event loop for processing the workflow task.
eventHandler := w.getEventHandler()

// if decision heartbeat failed, the workflow execution context will be cleared and eventHandler will be nil
if eventHandler == nil {
return
}

if _, ok := eventHandler.pendingLaTasks[lar.task.activityID]; !ok {
return
}

Expand Down Expand Up @@ -964,42 +1005,45 @@ func (w *workflowExecutionContextImpl) CompleteDecisionTask(workflowTask *workfl
if w.currentDecisionTask == nil {
return nil
}
eventHandler := w.getEventHandler()

// w.laTunnel could be nil for worker.ReplayHistory() because there is no worker started, in that case we don't
// care about the pending local activities, and just return because the result is ignored anyway by the caller.
if w.hasPendingLocalActivityWork() && w.laTunnel != nil {
if len(w.eventHandler.unstartedLaTasks) > 0 {
if len(eventHandler.unstartedLaTasks) > 0 {
// start new local activity tasks
for activityID := range w.eventHandler.unstartedLaTasks {
task := w.eventHandler.pendingLaTasks[activityID]
for activityID := range eventHandler.unstartedLaTasks {
task := eventHandler.pendingLaTasks[activityID]
task.wc = w
task.workflowTask = workflowTask
w.laTunnel.sendTask(task)
}
w.eventHandler.unstartedLaTasks = make(map[string]struct{})
eventHandler.unstartedLaTasks = make(map[string]struct{})
}
// cannot complete decision task as there are pending local activities
if waitLocalActivities {
return nil
}
}

eventDecisions := w.eventHandler.decisionsHelper.getDecisions(true)
eventDecisions := eventHandler.decisionsHelper.getDecisions(true)
if len(eventDecisions) > 0 {
w.newDecisions = append(w.newDecisions, eventDecisions...)
}

completeRequest := w.wth.completeWorkflow(w.eventHandler, w.currentDecisionTask, w, w.newDecisions, !waitLocalActivities)
completeRequest := w.wth.completeWorkflow(eventHandler, w.currentDecisionTask, w, w.newDecisions, !waitLocalActivities)
w.clearCurrentTask()

return completeRequest
}

func (w *workflowExecutionContextImpl) hasPendingLocalActivityWork() bool {
eventHandler := w.getEventHandler()
return !w.isWorkflowCompleted &&
w.currentDecisionTask != nil &&
w.currentDecisionTask.Query == nil && // don't run local activity for query task
w.eventHandler != nil &&
len(w.eventHandler.pendingLaTasks) > 0
eventHandler != nil &&
len(eventHandler.pendingLaTasks) > 0
}

func (w *workflowExecutionContextImpl) clearCurrentTask() {
Expand Down
91 changes: 91 additions & 0 deletions internal/internal_task_handlers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,97 @@ func (t *TaskHandlersTestSuite) TestWorkflowTask_PageToken() {
t.NotNil(response)
}

func (t *TaskHandlersTestSuite) TestLocalActivityRetry_DecisionHeartbeatFail() {
backoffIntervalInSeconds := int32(1)
backoffDuration := time.Second * time.Duration(backoffIntervalInSeconds)
workflowComplete := false

retryLocalActivityWorkflowFunc := func(ctx Context, intput []byte) error {
ao := LocalActivityOptions{
ScheduleToCloseTimeout: time.Minute,
RetryPolicy: &RetryPolicy{
InitialInterval: backoffDuration,
BackoffCoefficient: 1.1,
MaximumInterval: time.Minute,
ExpirationInterval: time.Minute,
},
}
ctx = WithLocalActivityOptions(ctx, ao)

err := ExecuteLocalActivity(ctx, func() error {
return errors.New("some random error")
}).Get(ctx, nil)
workflowComplete = true
return err
}
RegisterWorkflowWithOptions(
retryLocalActivityWorkflowFunc,
RegisterWorkflowOptions{Name: "RetryLocalActivityWorkflow"},
)

decisionTaskStartedEvent := createTestEventDecisionTaskStarted(3)
decisionTaskStartedEvent.Timestamp = common.Int64Ptr(time.Now().UnixNano())
testEvents := []*s.HistoryEvent{
createTestEventWorkflowExecutionStarted(1, &s.WorkflowExecutionStartedEventAttributes{
// make sure the timeout is same as the backoff interval
TaskStartToCloseTimeoutSeconds: common.Int32Ptr(backoffIntervalInSeconds),
TaskList: &s.TaskList{Name: &testWorkflowTaskTasklist}},
),
createTestEventDecisionTaskScheduled(2, &s.DecisionTaskScheduledEventAttributes{}),
decisionTaskStartedEvent,
}

task := createWorkflowTask(testEvents, 0, "RetryLocalActivityWorkflow")
params := workerExecutionParameters{
TaskList: testWorkflowTaskTasklist,
Identity: "test-id-1",
Logger: t.logger,
Tracer: opentracing.NoopTracer{},
}

taskHandler := newWorkflowTaskHandler(testDomain, params, nil, getHostEnvironment())
laTunnel := &localActivityTunnel{
taskCh: make(chan *localActivityTask, 1000),
resultCh: make(chan interface{}),
}
taskHandlerImpl, ok := taskHandler.(*workflowTaskHandlerImpl)
t.True(ok)
taskHandlerImpl.laTunnel = laTunnel

laTaskPoller := newLocalActivityPoller(params, laTunnel)
doneCh := make(chan struct{})
go func() {
// laTaskPoller needs to poll the local activity and process it
task, err := laTaskPoller.PollTask()
t.NoError(err)
err = laTaskPoller.ProcessTask(task)
t.NoError(err)

// before clearing workflow state, a reset sticky task will be sent to this chan,
// drain the chan so that workflow state can be cleared
<-laTunnel.resultCh

close(doneCh)
}()

laResultCh := make(chan *localActivityResult)
response, err := taskHandler.ProcessWorkflowTask(
&workflowTask{
task: task,
laResultCh: laResultCh,
},
func(response interface{}, startTime time.Time) (*workflowTask, error) {
return nil, &s.EntityNotExistsError{Message: "Decision task not found."}
})
t.Nil(response)
t.Error(err)

// wait for the retry timer to fire
time.Sleep(backoffDuration)
t.False(workflowComplete)
<-doneCh
}

func (t *TaskHandlersTestSuite) TestHeartBeat_NoError() {
mockCtrl := gomock.NewController(t.T())
mockService := workflowservicetest.NewMockClient(mockCtrl)
Expand Down

0 comments on commit 8b5a4e5

Please sign in to comment.