Avoid returning early on agent join failures

mrjana · mrjana · commit 23a782bb92d5 · 2016-09-27T08:36:10.000-07:00
When a gossip join failure happens, do not return early in the call chain,
because a join failure is most likely transient and the retry logic
built into networkdb is going to retry and succeed. Returning early
prevents the initialization of the ingress network/sandbox, which
causes a problem even after the gossip join succeeds on retry.

Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
diff --git a/agent.go b/agent.go
@@ -191,8 +191,7 @@ func (c *controller) agentSetup() error {
 
 	if remoteAddr != "" {
 		if err := c.agentJoin(remoteAddr); err != nil {
-			logrus.Errorf("Error in agentJoin : %v", err)
-			return nil
+			logrus.Errorf("Error in joining gossip cluster : %v(join will be retried in background)", err)
 		}
 	}
 
diff --git a/networkdb/cluster.go b/networkdb/cluster.go
@@ -161,6 +161,10 @@ func (nDB *NetworkDB) retryJoin(members []string, stop <-chan struct{}) {
 				logrus.Errorf("Failed to join memberlist %s on retry: %v", members, err)
 				continue
 			}
+			if err := nDB.sendNodeEvent(NodeEventTypeJoin); err != nil {
+				logrus.Errorf("failed to send node join on retry: %v", err)
+				continue
+			}
 			return
 		case <-stop:
 			return

Original file line number	Diff line number	Diff line change
`@@ -191,8 +191,7 @@ func (c *controller) agentSetup() error {`
`191`	`191`
`192`	`192`	`if remoteAddr != "" {`
`193`	`193`	`if err := c.agentJoin(remoteAddr); err != nil {`
`194`		`- logrus.Errorf("Error in agentJoin : %v", err)`
`195`		`- return nil`
	`194`	`+ logrus.Errorf("Error in joining gossip cluster : %v(join will be retried in background)", err)`
`196`	`195`	`}`
`197`	`196`	`}`
`198`	`197`