Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions internal/app/active_nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,24 @@ func (app *App) updateActiveNodes(state, stateDcs map[string]*HostState, oldActi
return err
}

// When nodes are being removed, compare the quorum replica count computed
// from the new activeNodes list with the value the master currently reports,
// and lower the master's setting if the expected value is smaller.
// Every failure in this block is only logged; nothing here returns early.
if removingNodes {
expectedNumReplicas := app.getNumReplicasToWrite(activeNodes)
masterNode := app.shard.Get(master)
actualNumReplicas, err := masterNode.GetNumQuorumReplicas(app.ctx)
if err != nil {
app.logger.Error().Err(err).Msg("Update active nodes: failed to get num quorum replicas on master")
} else if expectedNumReplicas < actualNumReplicas {
// Only a decrease is applied here; an equal or larger expected value leaves the master untouched.
app.logger.Info().Msgf("Update active nodes: changing num quorum replicas from %d to %d on master", actualNumReplicas, expectedNumReplicas)
// SetNumQuorumReplicas returns two errors, reported separately: the set itself and the config rewrite.
err, rewriteErr := masterNode.SetNumQuorumReplicas(app.ctx, expectedNumReplicas)
if err != nil {
app.logger.Error().Err(err).Msg("Update active nodes: failed to set num quorum replicas on master")
}
if rewriteErr != nil {
app.logger.Error().Err(rewriteErr).Msg("Update active nodes: failed to rewrite config on master")
}
}
}

if !removingNodes {
err := app.dcs.Set(pathActiveNodes, activeNodes)
if err != nil {
Expand Down
43 changes: 34 additions & 9 deletions internal/app/repair.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,6 @@ func (app *App) repairShard(shardState map[string]*HostState, activeNodes []stri
}

func (app *App) repairMaster(node *valkey.Node, activeNodes []string, state *HostState) {
if state.IsReadOnly || state.MinReplicasToWrite != 0 {
err, rewriteErr := node.SetReadWrite(app.ctx)
if err != nil {
app.logger.Error().Str("fqdn", node.FQDN()).Err(err).Msg("Unable to set master read-write")
}
if rewriteErr != nil {
app.logger.Error().Str("fqdn", node.FQDN()).Err(rewriteErr).Msg("Unable to rewrite config on master")
}
}
expectedNumReplicas := app.getNumReplicasToWrite(activeNodes)
actualNumReplicas, err := node.GetNumQuorumReplicas(app.ctx)
if err != nil {
Expand All @@ -74,6 +65,16 @@ func (app *App) repairMaster(node *valkey.Node, activeNodes []string, state *Hos
err, rewriteErr := node.SetNumQuorumReplicas(app.ctx, expectedNumReplicas)
if err != nil {
app.logger.Error().Str("fqdn", node.FQDN()).Err(err).Msg("Unable to set num quorum replicas on master")
return
}
if rewriteErr != nil {
app.logger.Error().Str("fqdn", node.FQDN()).Err(rewriteErr).Msg("Unable to rewrite config on master")
}
}
if state.IsReadOnly || state.MinReplicasToWrite != 0 {
err, rewriteErr := node.SetReadWrite(app.ctx)
if err != nil {
app.logger.Error().Str("fqdn", node.FQDN()).Err(err).Msg("Unable to set master read-write")
}
if rewriteErr != nil {
app.logger.Error().Str("fqdn", node.FQDN()).Err(rewriteErr).Msg("Unable to rewrite config on master")
Expand Down Expand Up @@ -322,6 +323,30 @@ func (app *App) repairLocalNode(master string) bool {
app.timings.reportTiming("node_offline", dur)
delete(app.nodeFailTime, local.FQDN())
}
// If the local node is the master, raise its quorum replica count to the
// value expected for the current active nodes before the node is set online
// (the SetOnline call follows this block). The count is only ever increased
// here. Failing to read the active nodes, read the current count, or apply
// the new count aborts with false; a config rewrite failure is only logged.
if master == local.FQDN() {
activeNodes, err := app.GetActiveNodes()
if err != nil {
app.logger.Error().Err(err).Msg("Unable to get active nodes before setting local master online")
return false
}
expectedNumReplicas := app.getNumReplicasToWrite(activeNodes)
actualNumReplicas, err := local.GetNumQuorumReplicas(app.ctx)
if err != nil {
app.logger.Error().Err(err).Msg("Unable to get num quorum replicas before setting local master online")
return false
}
if expectedNumReplicas > actualNumReplicas {
app.logger.Info().Msgf("Setting num quorum replicas to %d before setting local master online", expectedNumReplicas)
err, rewriteErr := local.SetNumQuorumReplicas(app.ctx, expectedNumReplicas)
if err != nil {
app.logger.Error().Err(err).Msg("Unable to set num quorum replicas before setting local master online")
return false
}
// The rewrite error is logged without returning, so execution still reaches the code after this block.
if rewriteErr != nil {
app.logger.Error().Err(rewriteErr).Msg("Unable to rewrite config after setting num quorum replicas on local master")
}
}
}
err = local.SetOnline(app.ctx)
if err != nil {
app.logger.Error().Err(err).Msg("Unable to set local node online")
Expand Down
15 changes: 12 additions & 3 deletions internal/app/switchover.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,22 +370,31 @@ func (app *App) performSwitchover(shardState map[string]*HostState, activeNodes
app.waitPoisonPill(app.config.Valkey.WaitPoisonPillTimeout)
}

newMasterNode := app.shard.Get(newMaster)
if len(aliveActiveNodes) == 1 || app.config.Valkey.AllowDataLoss {
node := app.shard.Get(newMaster)
err, errConf := node.SetReadWrite(app.ctx)
err, errConf := newMasterNode.SetReadWrite(app.ctx)
if err != nil {
return fmt.Errorf("unable to set %s available for write before promote: %s", newMaster, err.Error())
}
if errConf != nil {
return fmt.Errorf("unable to rewrite config on %s before promote: %s", newMaster, errConf.Error())
}
err, errConf = node.SetNumQuorumReplicas(app.ctx, 0)
err, errConf = newMasterNode.SetNumQuorumReplicas(app.ctx, 0)
if err != nil {
return fmt.Errorf("unable to set num quorum replicas to 0 on %s: %s", newMaster, err.Error())
}
if errConf != nil {
return fmt.Errorf("unable to rewrite config on %s before promote: %s", newMaster, errConf.Error())
}
} else {
expectedNumReplicas := app.getNumReplicasToWrite(aliveActiveNodes)
err, errConf := newMasterNode.SetNumQuorumReplicas(app.ctx, expectedNumReplicas)
if err != nil {
return fmt.Errorf("unable to set num quorum replicas to %d on %s before promote: %s", expectedNumReplicas, newMaster, err.Error())
}
if errConf != nil {
app.logger.Warn().Err(errConf).Msgf("Unable to rewrite config on %s after setting num quorum replicas before promote", newMaster)
}
}

if app.config.Valkey.TurnBeforeSwitchover {
Expand Down
Loading