Skip to content

Commit 865db66

Browse files
committed
[core] OCTRL-912 Cancel calls pending await upon environment destruction
This addresses the problem of stuck calls that keep waiting for something to receive from c.await and thereby outlive their parent environment. This may happen if the awaited trigger never occurs after the call has been started, e.g. if the trigger is before_START_ACTIVITY and the await is after_START_ACTIVITY, but the environment fails to transition to RUNNING.
1 parent e101416 commit 865db66

4 files changed

Lines changed: 70 additions & 2 deletions

File tree

core/environment/environment.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,7 @@ func (env *Environment) handleHooks(workflow workflow.Role, trigger string, weig
706706
// respected.
707707

708708
callErrors = pendingCalls.AwaitAll()
709+
delete(env.callsPendingAwait[trigger], weight)
709710
}
710711
}
711712

core/environment/hooks_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"github.com/AliceO2Group/Control/common/utils/uid"
77
"github.com/AliceO2Group/Control/core/task"
88
"github.com/AliceO2Group/Control/core/workflow"
9+
"github.com/AliceO2Group/Control/core/workflow/callable"
910
. "github.com/onsi/ginkgo/v2"
1011
. "github.com/onsi/gomega"
1112
)
@@ -459,4 +460,35 @@ var _ = Describe("calling hooks on FSM events", func() {
459460
Expect(ok).To(BeTrue())
460461
Expect(v).To(Equal("root.call1,root.call2,root.call3"))
461462
})
463+
464+
It("should allow to cancel hooks in case that await trigger never happens", func() {
465+
env.workflow = workflow.NewAggregatorRole("root", []workflow.Role{
466+
workflow.NewCallRole(
467+
"call1", // this call should return immediately and should not be accessible later
468+
task.Traits{Trigger: "before_CONFIGURE", Timeout: "5s", Critical: true, Await: "before_CONFIGURE"},
469+
"testplugin.Test()",
470+
""),
471+
workflow.NewCallRole(
472+
"call2", // this call should not return, but should be cancelled later
473+
task.Traits{Trigger: "before_CONFIGURE", Timeout: "5s", Critical: true, Await: "after_CONFIGURE"},
474+
"testplugin.Test()",
475+
"")})
476+
workflow.LinkChildrenToParents(env.workflow)
477+
env.Sm.SetState("DEPLOYED")
478+
479+
err := env.Sm.Event(context.Background(), "CONFIGURE", NewDummyTransition("CONFIGURE", true))
480+
Expect(err).To(HaveOccurred())
481+
482+
callMapForAwait := env.callsPendingAwait
483+
Expect(callMapForAwait).To(HaveKey("after_CONFIGURE"))
484+
callsForWeight := callMapForAwait["after_CONFIGURE"]
485+
Expect(callsForWeight).To(HaveKey(callable.HookWeight(0)))
486+
calls := callsForWeight[0]
487+
Expect(calls).To(HaveLen(1))
488+
Expect(calls[0]).NotTo(BeNil())
489+
// the first cancel attempt should return "true" to say it was successful
490+
Expect(calls[0].Cancel()).To(BeTrue())
491+
// the subsequent cancel attempts should return "false", because the call was already cancelled
492+
Expect(calls[0].Cancel()).To(BeFalse())
493+
})
462494
})

core/environment/manager.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -811,6 +811,8 @@ func (envs *Manager) TeardownEnvironment(environmentId uid.ID, force bool) error
811811
}
812812
}
813813

814+
envs.cancelCallsPendingAwait(env)
815+
814816
// we remake the pending teardown channel too, because each completed TasksReleasedEvent
815817
// automatically closes it
816818
pendingCh = make(chan *event.TasksReleasedEvent)
@@ -880,6 +882,22 @@ func (envs *Manager) TeardownEnvironment(environmentId uid.ID, force bool) error
880882
return err
881883
}
882884

885+
// cancelCallsPendingAwait cancels every call still registered in
// env.callsPendingAwait. It walks the nested collections (outer level, then
// the per-entry level named "weight" by the loop variables, then the calls
// themselves) and invokes Cancel on each non-nil call. The boolean returned by
// Cancel is discarded, so calls that were already cancelled are silently
// skipped over. A nil env is a no-op.
func (envs *Manager) cancelCallsPendingAwait(env *Environment) {
886+
// unblock all calls which are stuck waiting for an await trigger which never happened
887+
if env == nil {
888+
return
889+
}
890+
for _, callMapForAwait := range env.callsPendingAwait {
891+
for _, callsForWeight := range callMapForAwait {
892+
for _, call := range callsForWeight {
893+
if call != nil {
894+
call.Cancel()
895+
}
896+
}
897+
}
898+
}
899+
}
900+
883901
/*func (envs *Manager) Configuration(environmentId uuid.UUID) EnvironmentCfg {
884902
envs.mu.RLock()
885903
defer envs.mu.RUnlock()

core/workflow/callable/call.go

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
package callable
2626

2727
import (
28+
"context"
2829
"errors"
2930
"fmt"
3031
"strconv"
@@ -54,7 +55,8 @@ type Call struct {
5455
Traits task.Traits
5556
parentRole ParentRole
5657

57-
await chan error
58+
await chan error
59+
awaitCancel context.CancelFunc
5860
}
5961

6062
type Calls []*Call
@@ -220,11 +222,17 @@ func (c *Call) Call() error {
220222

221223
// Start launches the call asynchronously. It (re)creates the c.await channel
// and a cancellable context whose cancel function is stored in c.awaitCancel,
// then spawns a goroutine that logs the start, runs c.Call() and offers the
// resulting error on c.await. The goroutine closes c.await once the result was
// received or the context was cancelled.
//
// Note on the select below: the operand of a send case is evaluated when the
// select statement is entered, so c.Call() always runs to completion first.
// Cancellation therefore only aborts the wait for a receiver on c.await; it
// does not interrupt the call itself.
func (c *Call) Start() {
222224
c.await = make(chan error)
225+
ctx, cancel := context.WithCancel(context.Background())
226+
c.awaitCancel = cancel
223227
go func() {
224228
callId := fmt.Sprintf("hook:%s:%s", c.GetTraits().Trigger, c.GetName())
225229
log.Debugf("%s started", callId)
226230
defer utils.TimeTrack(time.Now(), callId, log.WithPrefix("callable"))
227-
c.await <- c.Call()
231+
select {
232+
case c.await <- c.Call():
233+
case <-ctx.Done():
234+
log.Debugf("%s cancelled", callId)
235+
}
228236
close(c.await)
229237
}()
230238
}
@@ -234,6 +242,15 @@ func (c *Call) Await() error {
234242
return <-c.await
235243
}
236244

245+
// Cancel invokes the stored c.awaitCancel function, if there is one, and then
// clears it. It returns true when a cancel function was present and has been
// called, and false when c.awaitCancel was nil (for instance because Cancel
// was already called before).
//
// NOTE(review): c.awaitCancel is read and written here without any locking
// visible in this block — confirm that callers never run Cancel concurrently
// with another Cancel or with the code that sets c.awaitCancel.
func (c *Call) Cancel() bool {
246+
if c.awaitCancel != nil {
247+
c.awaitCancel()
248+
c.awaitCancel = nil
249+
return true
250+
}
251+
return false
252+
}
253+
237254
// GetParentRole returns the call's parentRole field as an untyped interface{}
// value.
func (c *Call) GetParentRole() interface{} {
238255
return c.parentRole
239256
}

0 commit comments

Comments
 (0)