Skip to content

Commit e8df2a8

Browse files
bdchatham and claude authored
feat: dedicated storage class and nodepool for archive nodes (#85)
* feat: dedicated storage class and nodepool for archive nodes

  Archive nodes require io2 storage (25TB+) and memory-optimized instances (256GB+ RAM) that differ fundamentally from other node types. This adds archive-specific infrastructure routing:

  - New platform config fields: StorageClassArchive, NodepoolArchive
  - Archive nodes route to io2-archive StorageClass (25TB, Retain)
  - Archive pods schedule on sei-archive Karpenter NodePool via mode-aware tolerations and node affinity
  - Bootstrap Jobs get the same nodepool affinity as StatefulSet pods
  - Removes unused TolerationVal (toleration value now derived from nodepool name via NodepoolForMode)
  - Ships gp3-10k-750 and io2-archive StorageClasses in config/storage/ (independent of config/default namePrefix)

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test: add coverage for archive nodepool, storage class, and resource routing

  Tests verify:

  - Archive PVC uses io2-archive StorageClass at 25TB
  - Archive pods schedule on sei-archive nodepool with correct tolerations
  - Full node pods schedule on sei-node nodepool (regression)
  - defaultStorageForMode routes archive vs full correctly
  - defaultResourcesForMode returns archive-specific resources

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent fe7688e commit e8df2a8

10 files changed

Lines changed: 157 additions & 13 deletions

File tree

cmd/main.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,12 @@ func main() {
114114

115115
platformCfg := platform.Config{
116116
NodepoolName: os.Getenv("SEI_NODEPOOL_NAME"),
117+
NodepoolArchive: os.Getenv("SEI_NODEPOOL_ARCHIVE"),
117118
TolerationKey: os.Getenv("SEI_TOLERATION_KEY"),
118-
TolerationVal: os.Getenv("SEI_TOLERATION_VALUE"),
119119
ServiceAccount: os.Getenv("SEI_SERVICE_ACCOUNT"),
120120
StorageClassPerf: os.Getenv("SEI_STORAGE_CLASS_PERF"),
121121
StorageClassDefault: os.Getenv("SEI_STORAGE_CLASS_DEFAULT"),
122+
StorageClassArchive: os.Getenv("SEI_STORAGE_CLASS_ARCHIVE"),
122123
StorageSizeDefault: os.Getenv("SEI_STORAGE_SIZE_DEFAULT"),
123124
StorageSizeArchive: os.Getenv("SEI_STORAGE_SIZE_ARCHIVE"),
124125
ResourceCPUArchive: os.Getenv("SEI_RESOURCE_CPU_ARCHIVE"),

config/manager/manager.yaml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,20 +37,22 @@ spec:
3737
env:
3838
- name: SEI_NODEPOOL_NAME
3939
value: sei-node
40+
- name: SEI_NODEPOOL_ARCHIVE
41+
value: sei-archive
4042
- name: SEI_TOLERATION_KEY
4143
value: sei.io/workload
42-
- name: SEI_TOLERATION_VALUE
43-
value: sei-node
4444
- name: SEI_SERVICE_ACCOUNT
4545
value: seid-node
4646
- name: SEI_STORAGE_CLASS_PERF
4747
value: gp3-10k-750
4848
- name: SEI_STORAGE_CLASS_DEFAULT
4949
value: gp3
50+
- name: SEI_STORAGE_CLASS_ARCHIVE
51+
value: io2-archive
5052
- name: SEI_STORAGE_SIZE_DEFAULT
5153
value: 2000Gi
5254
- name: SEI_STORAGE_SIZE_ARCHIVE
53-
value: 4000Gi
55+
value: 25000Gi
5456
- name: SEI_RESOURCE_CPU_ARCHIVE
5557
value: "16"
5658
- name: SEI_RESOURCE_MEM_ARCHIVE

config/storage/kustomization.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
4+
resources:
5+
- storage-classes.yaml

config/storage/storage-classes.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
apiVersion: storage.k8s.io/v1
2+
kind: StorageClass
3+
metadata:
4+
name: gp3-10k-750
5+
provisioner: ebs.csi.aws.com
6+
allowVolumeExpansion: true
7+
reclaimPolicy: Delete
8+
volumeBindingMode: WaitForFirstConsumer
9+
parameters:
10+
type: gp3
11+
iops: "10000"
12+
throughput: "750"
13+
---
14+
apiVersion: storage.k8s.io/v1
15+
kind: StorageClass
16+
metadata:
17+
name: io2-archive
18+
provisioner: ebs.csi.aws.com
19+
allowVolumeExpansion: true
20+
reclaimPolicy: Retain
21+
volumeBindingMode: WaitForFirstConsumer
22+
parameters:
23+
type: io2
24+
iopsPerGB: "3"

internal/controller/node/resources.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,12 @@ func buildNodePodSpec(node *seiv1alpha1.SeiNode, platform PlatformConfig) corev1
5555
},
5656
}
5757

58+
pool := platform.NodepoolForMode(nodeMode(node))
59+
5860
spec := corev1.PodSpec{
5961
ServiceAccountName: platform.ServiceAccount,
6062
Tolerations: []corev1.Toleration{
61-
{Key: platform.TolerationKey, Value: platform.TolerationVal, Effect: corev1.TaintEffectNoSchedule},
63+
{Key: platform.TolerationKey, Value: pool, Effect: corev1.TaintEffectNoSchedule},
6264
},
6365
Affinity: &corev1.Affinity{
6466
NodeAffinity: &corev1.NodeAffinity{
@@ -67,7 +69,7 @@ func buildNodePodSpec(node *seiv1alpha1.SeiNode, platform PlatformConfig) corev1
6769
MatchExpressions: []corev1.NodeSelectorRequirement{{
6870
Key: "karpenter.sh/nodepool",
6971
Operator: corev1.NodeSelectorOpIn,
70-
Values: []string{platform.NodepoolName},
72+
Values: []string{pool},
7173
}},
7274
}},
7375
},

internal/controller/node/resources_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,3 +624,83 @@ func TestGenerateNodeDataPVC(t *testing.T) {
624624
storage := pvc.Spec.Resources.Requests[corev1.ResourceStorage]
625625
g.Expect(storage.String()).To(Equal("2000Gi"))
626626
}
627+
628+
func newArchiveNode(name, namespace string) *seiv1alpha1.SeiNode {
629+
return &seiv1alpha1.SeiNode{
630+
ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
631+
Spec: seiv1alpha1.SeiNodeSpec{
632+
ChainID: "pacific-1",
633+
Image: "ghcr.io/sei-protocol/seid:v6.4.1",
634+
Archive: &seiv1alpha1.ArchiveSpec{},
635+
Sidecar: &seiv1alpha1.SidecarConfig{Port: 7777},
636+
},
637+
}
638+
}
639+
640+
func TestGenerateNodeDataPVC_Archive(t *testing.T) {
641+
g := NewWithT(t)
642+
node := newArchiveNode("archive-0", "pacific-1")
643+
644+
pvc := generateNodeDataPVC(node, platformtest.Config())
645+
646+
g.Expect(*pvc.Spec.StorageClassName).To(Equal("io2-archive"))
647+
648+
storage := pvc.Spec.Resources.Requests[corev1.ResourceStorage]
649+
g.Expect(storage.String()).To(Equal("25000Gi"))
650+
}
651+
652+
func TestBuildNodePodSpec_Archive_SchedulesOnArchiveNodepool(t *testing.T) {
653+
g := NewWithT(t)
654+
node := newArchiveNode("archive-0", "pacific-1")
655+
656+
spec := buildNodePodSpec(node, platformtest.Config())
657+
658+
g.Expect(spec.Tolerations).To(HaveLen(1))
659+
g.Expect(spec.Tolerations[0].Key).To(Equal("sei.io/workload"))
660+
g.Expect(spec.Tolerations[0].Value).To(Equal("sei-archive"))
661+
662+
terms := spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
663+
g.Expect(terms).To(HaveLen(1))
664+
g.Expect(terms[0].MatchExpressions).To(HaveLen(1))
665+
g.Expect(terms[0].MatchExpressions[0].Key).To(Equal("karpenter.sh/nodepool"))
666+
g.Expect(terms[0].MatchExpressions[0].Values).To(ConsistOf("sei-archive"))
667+
}
668+
669+
func TestBuildNodePodSpec_FullNode_SchedulesOnDefaultNodepool(t *testing.T) {
670+
g := NewWithT(t)
671+
node := newSnapshotNode("syncer-0", "pacific-1")
672+
673+
spec := buildNodePodSpec(node, platformtest.Config())
674+
675+
g.Expect(spec.Tolerations[0].Value).To(Equal("sei-node"))
676+
677+
terms := spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
678+
g.Expect(terms[0].MatchExpressions[0].Values).To(ConsistOf("sei-node"))
679+
}
680+
681+
func TestDefaultStorageForMode_Archive(t *testing.T) {
682+
g := NewWithT(t)
683+
cfg := platformtest.Config()
684+
685+
sc, size := defaultStorageForMode(string(seiconfig.ModeArchive), cfg)
686+
g.Expect(sc).To(Equal("io2-archive"))
687+
g.Expect(size).To(Equal("25000Gi"))
688+
}
689+
690+
func TestDefaultStorageForMode_FullNode(t *testing.T) {
691+
g := NewWithT(t)
692+
cfg := platformtest.Config()
693+
694+
sc, size := defaultStorageForMode(string(seiconfig.ModeFull), cfg)
695+
g.Expect(sc).To(Equal("gp3-10k-750"))
696+
g.Expect(size).To(Equal("2000Gi"))
697+
}
698+
699+
func TestDefaultResourcesForMode_Archive(t *testing.T) {
700+
g := NewWithT(t)
701+
cfg := platformtest.Config()
702+
703+
res := defaultResourcesForMode(string(seiconfig.ModeArchive), cfg)
704+
g.Expect(res.Requests[corev1.ResourceCPU]).To(Equal(resource.MustParse("16")))
705+
g.Expect(res.Requests[corev1.ResourceMemory]).To(Equal(resource.MustParse("256Gi")))
706+
}

internal/controller/node/sizing.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,13 @@ func defaultResourcesForMode(mode string, platform PlatformConfig) corev1.Resour
3939
func defaultStorageForMode(mode string, platform PlatformConfig) (storageClass string, size string) {
4040
switch mode {
4141
case string(seiconfig.ModeArchive):
42-
return platform.StorageClassPerf, platform.StorageSizeArchive
42+
return platform.StorageClassArchive, platform.StorageSizeArchive
4343
case string(seiconfig.ModeFull), string(seiconfig.ModeValidator):
4444
return platform.StorageClassPerf, platform.StorageSizeDefault
4545
default:
4646
return platform.StorageClassDefault, platform.StorageSizeDefault
4747
}
4848
}
49-
5049
func makeResources(cpu, memory string) corev1.ResourceRequirements {
5150
return corev1.ResourceRequirements{
5251
Requests: corev1.ResourceList{

internal/platform/platform.go

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,22 @@ const (
99

1010
// DataDir is the mount path for the sei data volume inside node pods.
1111
DataDir = "/sei"
12+
13+
// modeArchive matches seiconfig.ModeArchive without importing sei-config.
14+
modeArchive = "archive"
1215
)
1316

1417
// Config holds infrastructure-level settings that vary per deployment
1518
// environment. All fields are required and read from environment variables
1619
// in main.go. See platformtest.Config() for test fixtures.
1720
type Config struct {
1821
NodepoolName string
22+
NodepoolArchive string
1923
TolerationKey string
20-
TolerationVal string
2124
ServiceAccount string
2225
StorageClassPerf string
2326
StorageClassDefault string
27+
StorageClassArchive string
2428
StorageSizeDefault string
2529
StorageSizeArchive string
2630
ResourceCPUArchive string
@@ -43,17 +47,28 @@ type Config struct {
4347
GatewayPublicDomain string
4448
}
4549

50+
// NodepoolForMode returns the Karpenter NodePool name for the given
51+
// sei-config mode string. Archive nodes use a dedicated pool; all
52+
// other modes share the default pool.
53+
func (c Config) NodepoolForMode(mode string) string {
54+
if mode == modeArchive {
55+
return c.NodepoolArchive
56+
}
57+
return c.NodepoolName
58+
}
59+
4660
// Validate returns an error if required fields are missing.
4761
func (c Config) Validate() error {
4862
required := map[string]string{
4963
"SEI_NODEPOOL_NAME": c.NodepoolName,
5064
"SEI_TOLERATION_KEY": c.TolerationKey,
51-
"SEI_TOLERATION_VALUE": c.TolerationVal,
5265
"SEI_SERVICE_ACCOUNT": c.ServiceAccount,
5366
"SEI_STORAGE_CLASS_PERF": c.StorageClassPerf,
5467
"SEI_STORAGE_CLASS_DEFAULT": c.StorageClassDefault,
68+
"SEI_STORAGE_CLASS_ARCHIVE": c.StorageClassArchive,
5569
"SEI_STORAGE_SIZE_DEFAULT": c.StorageSizeDefault,
5670
"SEI_STORAGE_SIZE_ARCHIVE": c.StorageSizeArchive,
71+
"SEI_NODEPOOL_ARCHIVE": c.NodepoolArchive,
5772
"SEI_RESOURCE_CPU_ARCHIVE": c.ResourceCPUArchive,
5873
"SEI_RESOURCE_MEM_ARCHIVE": c.ResourceMemArchive,
5974
"SEI_RESOURCE_CPU_DEFAULT": c.ResourceCPUDefault,

internal/platform/platformtest/config.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@ import "github.com/sei-protocol/sei-k8s-controller/internal/platform"
88
func Config() platform.Config {
99
return platform.Config{
1010
NodepoolName: "sei-node",
11+
NodepoolArchive: "sei-archive",
1112
TolerationKey: "sei.io/workload",
12-
TolerationVal: "sei-node",
1313
ServiceAccount: "seid-node",
1414
StorageClassPerf: "gp3-10k-750",
1515
StorageClassDefault: "gp3",
16+
StorageClassArchive: "io2-archive",
1617
StorageSizeDefault: "2000Gi",
17-
StorageSizeArchive: "4000Gi",
18+
StorageSizeArchive: "25000Gi",
1819
ResourceCPUArchive: "16",
1920
ResourceMemArchive: "256Gi",
2021
ResourceCPUDefault: "4",

internal/task/bootstrap_resources.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,8 @@ func buildBootstrapPodSpec(node *seiv1alpha1.SeiNode, snap *seiv1alpha1.Snapshot
162162
seidInit := bootstrapSeidInitContainer(node)
163163
seidInit.Image = bootstrapImage
164164

165+
pool := platformCfg.NodepoolForMode(bootstrapNodeMode(node))
166+
165167
return corev1.PodSpec{
166168
Hostname: fmt.Sprintf("%s-0", node.Name),
167169
Subdomain: serviceName,
@@ -170,7 +172,20 @@ func buildBootstrapPodSpec(node *seiv1alpha1.SeiNode, snap *seiv1alpha1.Snapshot
170172
RestartPolicy: corev1.RestartPolicyNever,
171173
TerminationGracePeriodSeconds: ptr.To(bootstrapTerminationGracePeriod),
172174
Tolerations: []corev1.Toleration{
173-
{Key: platformCfg.TolerationKey, Value: platformCfg.TolerationVal, Effect: corev1.TaintEffectNoSchedule},
175+
{Key: platformCfg.TolerationKey, Value: pool, Effect: corev1.TaintEffectNoSchedule},
176+
},
177+
Affinity: &corev1.Affinity{
178+
NodeAffinity: &corev1.NodeAffinity{
179+
RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
180+
NodeSelectorTerms: []corev1.NodeSelectorTerm{{
181+
MatchExpressions: []corev1.NodeSelectorRequirement{{
182+
Key: "karpenter.sh/nodepool",
183+
Operator: corev1.NodeSelectorOpIn,
184+
Values: []string{pool},
185+
}},
186+
}},
187+
},
188+
},
174189
},
175190
Volumes: []corev1.Volume{dataVolume},
176191
InitContainers: []corev1.Container{seidInit, sidecar},

0 commit comments

Comments
 (0)