diff --git a/internal/app/active_nodes.go b/internal/app/active_nodes.go index d70b57c..3076cd7 100644 --- a/internal/app/active_nodes.go +++ b/internal/app/active_nodes.go @@ -87,6 +87,24 @@ func (app *App) updateActiveNodes(state, stateDcs map[string]*HostState, oldActi return err } + if removingNodes { + expectedNumReplicas := app.getNumReplicasToWrite(activeNodes) + masterNode := app.shard.Get(master) + actualNumReplicas, err := masterNode.GetNumQuorumReplicas(app.ctx) + if err != nil { + app.logger.Error().Err(err).Msg("Update active nodes: failed to get num quorum replicas on master") + } else if expectedNumReplicas < actualNumReplicas { + app.logger.Info().Msgf("Update active nodes: changing num quorum replicas from %d to %d on master", actualNumReplicas, expectedNumReplicas) + err, rewriteErr := masterNode.SetNumQuorumReplicas(app.ctx, expectedNumReplicas) + if err != nil { + app.logger.Error().Err(err).Msg("Update active nodes: failed to set num quorum replicas on master") + } + if rewriteErr != nil { + app.logger.Error().Err(rewriteErr).Msg("Update active nodes: failed to rewrite config on master") + } + } + } + if !removingNodes { err := app.dcs.Set(pathActiveNodes, activeNodes) if err != nil { diff --git a/internal/app/repair.go b/internal/app/repair.go index 880b391..efba9a3 100644 --- a/internal/app/repair.go +++ b/internal/app/repair.go @@ -54,15 +54,6 @@ func (app *App) repairShard(shardState map[string]*HostState, activeNodes []stri } func (app *App) repairMaster(node *valkey.Node, activeNodes []string, state *HostState) { - if state.IsReadOnly || state.MinReplicasToWrite != 0 { - err, rewriteErr := node.SetReadWrite(app.ctx) - if err != nil { - app.logger.Error().Str("fqdn", node.FQDN()).Err(err).Msg("Unable to set master read-write") - } - if rewriteErr != nil { - app.logger.Error().Str("fqdn", node.FQDN()).Err(rewriteErr).Msg("Unable to rewrite config on master") - } - } expectedNumReplicas := app.getNumReplicasToWrite(activeNodes) actualNumReplicas, err := node.GetNumQuorumReplicas(app.ctx) if err != nil { @@ -74,6 +65,16 @@ func (app *App) repairMaster(node *valkey.Node, activeNodes []string, state *Hos err, rewriteErr := node.SetNumQuorumReplicas(app.ctx, expectedNumReplicas) if err != nil { app.logger.Error().Str("fqdn", node.FQDN()).Err(err).Msg("Unable to set num quorum replicas on master") + return + } + if rewriteErr != nil { + app.logger.Error().Str("fqdn", node.FQDN()).Err(rewriteErr).Msg("Unable to rewrite config on master") + } + } + if state.IsReadOnly || state.MinReplicasToWrite != 0 { + err, rewriteErr := node.SetReadWrite(app.ctx) + if err != nil { + app.logger.Error().Str("fqdn", node.FQDN()).Err(err).Msg("Unable to set master read-write") } if rewriteErr != nil { app.logger.Error().Str("fqdn", node.FQDN()).Err(rewriteErr).Msg("Unable to rewrite config on master") @@ -322,6 +323,30 @@ func (app *App) repairLocalNode(master string) bool { app.timings.reportTiming("node_offline", dur) delete(app.nodeFailTime, local.FQDN()) } + if master == local.FQDN() { + activeNodes, err := app.GetActiveNodes() + if err != nil { + app.logger.Error().Err(err).Msg("Unable to get active nodes before setting local master online") + return false + } + expectedNumReplicas := app.getNumReplicasToWrite(activeNodes) + actualNumReplicas, err := local.GetNumQuorumReplicas(app.ctx) + if err != nil { + app.logger.Error().Err(err).Msg("Unable to get num quorum replicas before setting local master online") + return false + } + if expectedNumReplicas > actualNumReplicas { + app.logger.Info().Msgf("Setting num quorum replicas to %d before setting local master online", expectedNumReplicas) + err, rewriteErr := local.SetNumQuorumReplicas(app.ctx, expectedNumReplicas) + if err != nil { + app.logger.Error().Err(err).Msg("Unable to set num quorum replicas before setting local master online") + return false + } + if rewriteErr != nil { + app.logger.Error().Err(rewriteErr).Msg("Unable to rewrite config after setting num quorum replicas on local master") + } + } + } err = local.SetOnline(app.ctx) if err != nil { app.logger.Error().Err(err).Msg("Unable to set local node online") diff --git a/internal/app/switchover.go b/internal/app/switchover.go index 62f067f..038445e 100644 --- a/internal/app/switchover.go +++ b/internal/app/switchover.go @@ -370,22 +370,31 @@ func (app *App) performSwitchover(shardState map[string]*HostState, activeNodes app.waitPoisonPill(app.config.Valkey.WaitPoisonPillTimeout) } + newMasterNode := app.shard.Get(newMaster) if len(aliveActiveNodes) == 1 || app.config.Valkey.AllowDataLoss { - node := app.shard.Get(newMaster) - err, errConf := node.SetReadWrite(app.ctx) + err, errConf := newMasterNode.SetReadWrite(app.ctx) if err != nil { return fmt.Errorf("unable to set %s available for write before promote: %s", newMaster, err.Error()) } if errConf != nil { return fmt.Errorf("unable to rewrite config on %s before promote: %s", newMaster, errConf.Error()) } - err, errConf = node.SetNumQuorumReplicas(app.ctx, 0) + err, errConf = newMasterNode.SetNumQuorumReplicas(app.ctx, 0) if err != nil { return fmt.Errorf("unable to set num quorum replicas to 0 on %s: %s", newMaster, err.Error()) } if errConf != nil { return fmt.Errorf("unable to rewrite config on %s before promote: %s", newMaster, errConf.Error()) } + } else { + expectedNumReplicas := app.getNumReplicasToWrite(aliveActiveNodes) + err, errConf := newMasterNode.SetNumQuorumReplicas(app.ctx, expectedNumReplicas) + if err != nil { + return fmt.Errorf("unable to set num quorum replicas to %d on %s before promote: %s", expectedNumReplicas, newMaster, err.Error()) + } + if errConf != nil { + app.logger.Warn().Err(errConf).Msgf("Unable to rewrite config on %s after setting num quorum replicas before promote", newMaster) + } } if app.config.Valkey.TurnBeforeSwitchover {