diff --git a/CHANGELOG.md b/CHANGELOG.md index 912343ec00..f6f021fbd2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changes + +- Improve handling of transient P2P network failures by automatically reconnecting to seed peers with exponential backoff [#3212](https://github.com/evstack/ev-node/pull/3212) + ## v1.1.0-rc.1 ### Added diff --git a/apps/evm/go.mod b/apps/evm/go.mod index edd9c14311..cdce2d0ee3 100644 --- a/apps/evm/go.mod +++ b/apps/evm/go.mod @@ -2,10 +2,10 @@ module github.com/evstack/ev-node/apps/evm go 1.25.7 -// replace ( -// github.com/evstack/ev-node => ../../ -// github.com/evstack/ev-node/execution/evm => ../../execution/evm -// ) +replace ( + github.com/evstack/ev-node => ../../ + github.com/evstack/ev-node/execution/evm => ../../execution/evm +) require ( github.com/ethereum/go-ethereum v1.17.2 diff --git a/apps/evm/go.sum b/apps/evm/go.sum index 239a59c985..e0249473d4 100644 --- a/apps/evm/go.sum +++ b/apps/evm/go.sum @@ -472,12 +472,8 @@ github.com/ethereum/go-bigmodexpfix v0.0.0-20250911101455-f9e208c548ab h1:rvv6MJ github.com/ethereum/go-bigmodexpfix v0.0.0-20250911101455-f9e208c548ab/go.mod h1:IuLm4IsPipXKF7CW5Lzf68PIbZ5yl7FFd74l/E0o9A8= github.com/ethereum/go-ethereum v1.17.2 h1:ag6geu0kn8Hv5FLKTpH+Hm2DHD+iuFtuqKxEuwUsDOI= github.com/ethereum/go-ethereum v1.17.2/go.mod h1:KHcRXfGOUfUmKg51IhQ0IowiqZ6PqZf08CMtk0g5K1o= -github.com/evstack/ev-node v1.1.0-rc.1 h1:NtPuuDLqN2h4/edu5zxRlZAxmLkTG3ncXBO2PlCDvVs= -github.com/evstack/ev-node v1.1.0-rc.1/go.mod h1:6rhWWzuyiqNn/erDmWCk1aLxUuQphyOGIRq56/smSyk= github.com/evstack/ev-node/core v1.0.0 h1:s0Tx0uWHme7SJn/ZNEtee4qNM8UO6PIxXnHhPbbKTz8= github.com/evstack/ev-node/core v1.0.0/go.mod h1:n2w/LhYQTPsi48m6lMj16YiIqsaQw6gxwjyJvR+B3sY= -github.com/evstack/ev-node/execution/evm v1.0.0 h1:UTAdCrnPsLoGzSgsBx4Kv76jkXpMmHBIpNv3MxyzWPo= -github.com/evstack/ev-node/execution/evm v1.0.0/go.mod h1:UrqkiepfTMiot6M8jnswgu3VU8SSucZpaMIHIl22/1A= github.com/fatih/color
v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= diff --git a/apps/testapp/go.mod b/apps/testapp/go.mod index c235c3f07b..aeb4b4bbb2 100644 --- a/apps/testapp/go.mod +++ b/apps/testapp/go.mod @@ -2,7 +2,7 @@ module github.com/evstack/ev-node/apps/testapp go 1.25.7 -// replace github.com/evstack/ev-node => ../../. +replace github.com/evstack/ev-node => ../../. require ( github.com/evstack/ev-node v1.1.0-rc.1 diff --git a/apps/testapp/go.sum b/apps/testapp/go.sum index be901c574f..8670e575e8 100644 --- a/apps/testapp/go.sum +++ b/apps/testapp/go.sum @@ -432,8 +432,6 @@ github.com/envoyproxy/protoc-gen-validate v1.0.1/go.mod h1:0vj8bNkYbSTNS2PIyH87K github.com/envoyproxy/protoc-gen-validate v1.0.2/go.mod h1:GpiZQP3dDbg4JouG/NNS7QWXpgx6x8QiMKdmN72jogE= github.com/envoyproxy/protoc-gen-validate v1.3.0 h1:TvGH1wof4H33rezVKWSpqKz5NXWg5VPuZ0uONDT6eb4= github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA= -github.com/evstack/ev-node v1.1.0-rc.1 h1:NtPuuDLqN2h4/edu5zxRlZAxmLkTG3ncXBO2PlCDvVs= -github.com/evstack/ev-node v1.1.0-rc.1/go.mod h1:6rhWWzuyiqNn/erDmWCk1aLxUuQphyOGIRq56/smSyk= github.com/evstack/ev-node/core v1.0.0 h1:s0Tx0uWHme7SJn/ZNEtee4qNM8UO6PIxXnHhPbbKTz8= github.com/evstack/ev-node/core v1.0.0/go.mod h1:n2w/LhYQTPsi48m6lMj16YiIqsaQw6gxwjyJvR+B3sY= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= diff --git a/pkg/p2p/client.go b/pkg/p2p/client.go index e8fc6eabe1..0093a8ad60 100644 --- a/pkg/p2p/client.go +++ b/pkg/p2p/client.go @@ -35,6 +35,12 @@ const ( // peerLimit defines limit of number of peers returned during active peer discovery. peerLimit = 60 + + // seedReconnectBackoff is the initial backoff when reconnecting to a disconnected seed peer. 
+ seedReconnectBackoff = 1 * time.Second + + // seedReconnectMaxBackoff is the maximum backoff for seed peer reconnection attempts. + seedReconnectMaxBackoff = 30 * time.Second ) // Client is a P2P client, implemented with libp2p. @@ -56,6 +62,11 @@ type Client struct { ps *pubsub.PubSub started bool + ctx context.Context + cancel context.CancelFunc + + seedPeers []peer.AddrInfo + metrics *Metrics } @@ -140,6 +151,7 @@ func (c *Client) Start(ctx context.Context) error { func (c *Client) startWithHost(ctx context.Context, h host.Host) error { c.host = h + c.ctx, c.cancel = context.WithCancel(ctx) for _, a := range c.host.Addrs() { c.logger.Info().Str("address", fmt.Sprintf("%s/p2p/%s", a, c.host.ID())).Msg("listening on address") } @@ -170,11 +182,17 @@ func (c *Client) startWithHost(ctx context.Context, h host.Host) error { } c.started = true + + c.host.Network().Notify(c.newDisconnectNotifee()) + return nil } // Close gently stops Client. func (c *Client) Close() error { + if c.cancel != nil { + c.cancel() + } var err error if c.dht != nil { err = errors.Join(err, c.dht.Close()) @@ -245,6 +263,77 @@ func (c *Client) Peers() []PeerConnection { return res } +type disconnectNotifee struct { + c *Client +} + +func (n disconnectNotifee) Connected(_ network.Network, conn network.Conn) { + p := conn.RemotePeer() + for _, sp := range n.c.seedPeers { + if sp.ID == p { + n.c.logger.Info().Str("peer", p.String()).Msg("connected to seed peer") + return + } + } +} +func (n disconnectNotifee) OpenedStream(_ network.Network, _ network.Stream) {} +func (n disconnectNotifee) ClosedStream(_ network.Network, _ network.Stream) {} +func (n disconnectNotifee) Listen(_ network.Network, _ multiaddr.Multiaddr) {} +func (n disconnectNotifee) ListenClose(_ network.Network, _ multiaddr.Multiaddr) {} + +func (n disconnectNotifee) Disconnected(_ network.Network, conn network.Conn) { + p := conn.RemotePeer() + for _, sp := range n.c.seedPeers { + if sp.ID == p { + n.c.logger.Warn().Str("peer", 
p.String()).Msg("disconnected from seed peer, scheduling reconnect") + go n.c.reconnectSeedPeer(sp) + return + } + } +} + +func (c *Client) reconnectSeedPeer(sp peer.AddrInfo) { + backoff := seedReconnectBackoff + for { + if c.ctx.Err() != nil { + return + } + if c.isConnected(sp.ID) { + return + } + + err := c.host.Connect(c.ctx, sp) + if err == nil { + c.logger.Info().Str("peer", sp.ID.String()).Msg("reconnected to seed peer") + return + } + if c.ctx.Err() != nil { + return + } + + c.logger.Debug().Str("peer", sp.ID.String()).Dur("backoff", backoff).Err(err).Msg("failed to reconnect to seed peer, retrying") + select { + case <-c.ctx.Done(): + return + case <-time.After(backoff): + } + + backoff *= 2 + if backoff > seedReconnectMaxBackoff { + backoff = seedReconnectMaxBackoff + } + } +} + +func (c *Client) newDisconnectNotifee() disconnectNotifee { + return disconnectNotifee{c: c} +} + +// isConnected returns true if there is an active connection to the given peer. +func (c *Client) isConnected(id peer.ID) bool { + return c.host.Network().Connectedness(id) == network.Connected +} + func (c *Client) listen() (host.Host, error) { maddr, err := multiaddr.NewMultiaddr(c.conf.ListenAddress) if err != nil { @@ -256,6 +345,7 @@ func (c *Client) listen() (host.Host, error) { func (c *Client) setupDHT(ctx context.Context) error { peers := c.parseAddrInfoList(c.conf.Peers) + c.seedPeers = peers if len(peers) == 0 { c.logger.Info().Msg("no peers - only listening for connections") } diff --git a/pkg/p2p/client_test.go b/pkg/p2p/client_test.go index e3ac6f1fab..568b152ea4 100644 --- a/pkg/p2p/client_test.go +++ b/pkg/p2p/client_test.go @@ -278,6 +278,104 @@ func waitForCondition(timeout time.Duration, conditionFunc func() bool) error { } } +func TestSeedPeerReconnect(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + logger := zerolog.Nop() + + mn := mocknet.New() + defer mn.Close() + + seedKey, err := key.GenerateNodeKey() + require.NoError(err) + 
seedAddr, err := getAddr(seedKey.PrivKey) + require.NoError(err) + seedHost, err := mn.AddPeer(seedKey.PrivKey, seedAddr) + require.NoError(err) + + clientKey, err := key.GenerateNodeKey() + require.NoError(err) + clientAddr, err := getAddr(clientKey.PrivKey) + require.NoError(err) + clientHost, err := mn.AddPeer(clientKey.PrivKey, clientAddr) + require.NoError(err) + + seedAddrStr := seedHost.Addrs()[0].String() + "/p2p/" + seedHost.ID().String() + conf := config.P2PConfig{Peers: seedAddrStr} + + client, err := NewClient(conf, clientKey.PrivKey, dssync.MutexWrap(datastore.NewMapDatastore()), "test-chain", logger, NopMetrics()) + require.NoError(err) + require.NotNil(client) + + err = mn.LinkAll() + require.NoError(err) + err = mn.ConnectAllButSelf() + require.NoError(err) + + ctx := t.Context() + err = client.startWithHost(ctx, clientHost) + require.NoError(err) + defer client.Close() + + err = waitForCondition(2*time.Second, func() bool { + return client.isConnected(seedHost.ID()) + }) + require.NoError(err, "client should connect to seed peer on start") + + conns := client.host.Network().ConnsToPeer(seedHost.ID()) + for _, conn := range conns { + conn.Close() + } + client.host.Network().ClosePeer(seedHost.ID()) + + assert.False(client.isConnected(seedHost.ID()), "seed peer should be disconnected") + + err = waitForCondition(5*time.Second, func() bool { + return client.isConnected(seedHost.ID()) + }) + require.NoError(err, "client should reconnect to seed peer after disconnect") +} + +func TestSeedPeerReconnectStopsOnClose(t *testing.T) { + require := require.New(t) + + mn := mocknet.New() + defer mn.Close() + + seedKey, err := key.GenerateNodeKey() + require.NoError(err) + seedAddr, err := getAddr(seedKey.PrivKey) + require.NoError(err) + seedHost, err := mn.AddPeer(seedKey.PrivKey, seedAddr) + require.NoError(err) + + clientKey, err := key.GenerateNodeKey() + require.NoError(err) + clientAddr, err := getAddr(clientKey.PrivKey) + require.NoError(err) + 
clientHost, err := mn.AddPeer(clientKey.PrivKey, clientAddr) + require.NoError(err) + + seedAddrStr := seedHost.Addrs()[0].String() + "/p2p/" + seedHost.ID().String() + conf := config.P2PConfig{Peers: seedAddrStr} + + client, err := NewClient(conf, clientKey.PrivKey, dssync.MutexWrap(datastore.NewMapDatastore()), "test-chain", zerolog.Nop(), NopMetrics()) + require.NoError(err) + + err = mn.LinkAll() + require.NoError(err) + err = mn.ConnectAllButSelf() + require.NoError(err) + + ctx := t.Context() + err = client.startWithHost(ctx, clientHost) + require.NoError(err) + + require.NoError(client.Close()) + + require.Error(client.ctx.Err(), "client context should be cancelled after Close") +} + func TestClientInfoMethods(t *testing.T) { require := require.New(t) assert := assert.New(t) diff --git a/pkg/signer/aws/signer.go b/pkg/signer/aws/signer.go index 4c0d9e1d63..f5a2d5c270 100644 --- a/pkg/signer/aws/signer.go +++ b/pkg/signer/aws/signer.go @@ -159,7 +159,7 @@ func (s *KmsSigner) Sign(ctx context.Context, message []byte) ([]byte, error) { timeout := s.opts.timeout() maxAttempts := maxRetries + 1 - for attempt := 0; attempt < maxAttempts; attempt++ { + for attempt := range maxAttempts { if err := ctx.Err(); err != nil { return nil, err } diff --git a/pkg/signer/gcp/signer.go b/pkg/signer/gcp/signer.go index 70f6667c24..2dc8b27a31 100644 --- a/pkg/signer/gcp/signer.go +++ b/pkg/signer/gcp/signer.go @@ -189,7 +189,7 @@ func (s *KmsSigner) Sign(ctx context.Context, message []byte) ([]byte, error) { timeout := s.opts.timeout() maxAttempts := maxRetries + 1 - for attempt := 0; attempt < maxAttempts; attempt++ { + for attempt := range maxAttempts { if err := ctx.Err(); err != nil { return nil, err }