From a675bd2dc0f70886612c0c6d09fcf332456cefa5 Mon Sep 17 00:00:00 2001 From: edersonbrilhante Date: Thu, 4 Dec 2025 14:45:34 +0100 Subject: [PATCH 01/46] feat: add support to use custom scale errors --- .../control-plane/src/aws/runners.d.ts | 2 +- .../control-plane/src/aws/runners.test.ts | 15 ++++++++-- .../control-plane/src/aws/runners.ts | 18 ++++++++++- .../functions/control-plane/src/pool/pool.ts | 2 +- .../src/scale-runners/scale-up.test.ts | 13 ++++---- .../src/scale-runners/scale-up.ts | 4 +-- main.tf | 2 +- modules/multi-runner/runners.tf | 2 +- modules/multi-runner/variables.tf | 30 +++++++------------ modules/runners/pool.tf | 2 +- modules/runners/pool/variables.tf | 2 +- modules/runners/scale-up.tf | 2 +- modules/runners/variables.tf | 16 ++-------- 13 files changed, 58 insertions(+), 52 deletions(-) diff --git a/lambdas/functions/control-plane/src/aws/runners.d.ts b/lambdas/functions/control-plane/src/aws/runners.d.ts index f57652d491..e5b4e67d80 100644 --- a/lambdas/functions/control-plane/src/aws/runners.d.ts +++ b/lambdas/functions/control-plane/src/aws/runners.d.ts @@ -69,5 +69,5 @@ export interface RunnerInputParameters { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; - scaleErrors: string[]; + customScaleErrors?: string[]; } diff --git a/lambdas/functions/control-plane/src/aws/runners.test.ts b/lambdas/functions/control-plane/src/aws/runners.test.ts index 2f1b05792e..7dd69765a6 100644 --- a/lambdas/functions/control-plane/src/aws/runners.test.ts +++ b/lambdas/functions/control-plane/src/aws/runners.test.ts @@ -717,6 +717,17 @@ describe('create runner with errors', () => { expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand); }); + it('test ScaleError with custom scale error.', async () => { + createFleetMockWithErrors(['CustomAWSError']); + + await expect(createRunner(createRunnerConfig({ ...defaultRunnerConfig, customScaleErrors: ['CustomAWSError'] }))).rejects.toBeInstanceOf(ScaleError); + expect(mockEC2Client).toHaveReceivedCommandWith( + CreateFleetCommand, + expectedCreateFleetRequest(defaultExpectedFleetRequestValues), + ); + expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand); + }); + it('test ScaleError with multiple error.', async () => { createFleetMockWithErrors(['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'NotMappedError']); @@ -962,7 +973,7 @@ interface RunnerConfig { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; - scaleErrors: string[]; + customScaleErrors?: string[]; source: LambdaRunnerSource; } @@ -983,7 +994,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters { amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName, tracingEnabled: runnerConfig.tracingEnabled, onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError, - scaleErrors: runnerConfig.scaleErrors, + customScaleErrors: runnerConfig.customScaleErrors, source: runnerConfig.source, }; } diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 61edf99af6..3f9b921359 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -205,8 +205,24 @@ async function processFleetResult( return instances; } - const scaleErrors = runnerParameters.scaleErrors; + // Educated guess of errors that would make sense to retry based on the list + // https://docs.aws.amazon.com/AWSEC2/latest/APIReference/errors-overview.html + const defaultScaleErrors = [ + 'UnfulfillableCapacity', + 'MaxSpotInstanceCountExceeded', + 'TargetCapacityLimitExceededException', + 'RequestLimitExceeded', + 'ResourceLimitExceeded', + 'MaxSpotInstanceCountExceeded', + 'MaxSpotFleetRequestCountExceeded', + 'InsufficientInstanceCapacity', + ]; + const scaleErrors = + runnerParameters.customScaleErrors && runnerParameters.customScaleErrors.length > 0 + ? runnerParameters.customScaleErrors + : defaultScaleErrors; + const failedCount = countScaleErrors(errors, scaleErrors); if (failedCount > 0) { logger.warn('Create fleet failed, ScaleError will be thrown to trigger retry for ephemeral runners.'); diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts index cece8d9951..b370e2e4b7 100644 --- a/lambdas/functions/control-plane/src/pool/pool.ts +++ b/lambdas/functions/control-plane/src/pool/pool.ts @@ -102,7 +102,7 @@ export async function adjust(event: PoolEvent): Promise { amiIdSsmParameterName, tracingEnabled, onDemandFailoverOnError, - scaleErrors, + customScaleErrors }, topUp, githubInstallationClient, diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts index 2245f29b90..fa2bca6cc4 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.test.ts @@ -112,8 +112,8 @@ const EXPECTED_RUNNER_PARAMS: RunnerInputParameters = { subnets: ['subnet-123'], tracingEnabled: false, onDemandFailoverOnError: [], - scaleErrors: ['UnfulfillableCapacity', 'MaxSpotInstanceCountExceeded', 'TargetCapacityLimitExceededException'], - source: 'scale-up-lambda', + customScaleErrors: [], + source: 'scale-up-lambda', }; let expectedRunnerParams: RunnerInputParameters; @@ -131,8 +131,7 @@ function setDefaults() { process.env.INSTANCE_TYPES = 'm5.large'; process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot'; process.env.ENABLE_ON_DEMAND_FAILOVER = undefined; - process.env.SCALE_ERRORS = - '["UnfulfillableCapacity","MaxSpotInstanceCountExceeded","TargetCapacityLimitExceededException"]'; + process.env.CUSTOM_SCALE_ERRORS = undefined; } beforeEach(() => { @@ -1354,11 +1353,11 @@ describe('scaleUp with public GH', () => { it('creates a runner with correct config and labels and custom scale errors enabled.', async () => { process.env.RUNNER_LABELS = 'label1,label2'; - process.env.SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']); - await scaleUpModule.scaleUp(TEST_DATA); + process.env.CUSTOM_SCALE_ERRORS = JSON.stringify(['RequestLimitExceeded']); + await scaleUpModule.scaleUp('aws:sqs', TEST_DATA); expect(createRunner).toBeCalledWith({ ...expectedRunnerParams, - scaleErrors: ['RequestLimitExceeded'], + customScaleErrors: ['RequestLimitExceeded'], }); }); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts index b51731f180..6756d9792d 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-up.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-up.ts @@ -98,7 +98,7 @@ interface CreateEC2RunnerConfig { amiIdSsmParameterName?: string; tracingEnabled?: boolean; onDemandFailoverOnError?: string[]; - scaleErrors: string[]; + customScaleErrors?: string[]; } function generateRunnerServiceConfig(githubRunnerConfig: CreateGitHubRunnerConfig, token: string) { @@ -582,7 +582,7 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise:' labels to dynamically configure EC2 instances (e.g., 'ghr-ec2-instance-type:t3.large') and 'ghr-run-