-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathQUICK_FIX_SPOT_LIMIT.sh
More file actions
executable file
·63 lines (53 loc) · 2.28 KB
/
QUICK_FIX_SPOT_LIMIT.sh
File metadata and controls
executable file
·63 lines (53 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/bin/bash
# Quick fix for MaxSpotInstanceCountExceeded error
#
# Steps 1-2: run the project's cleanup script (best effort) and pause briefly.
# Paths are relative to the current working directory, so run this from the
# directory that contains scripts/.
set -e

echo "🔧 Fixing Spot Instance Limit Issue"
echo "===================================="
echo ""

# Step 1: Clean up any existing resources
echo "Step 1: Cleaning up existing resources..."
CLEANUP_SCRIPT="./scripts/cleanup_aws_resources.sh"
if [[ -x "$CLEANUP_SCRIPT" ]]; then
  # Best effort: only the status lines are shown. `|| true` keeps `set -e`
  # from aborting the script when grep matches nothing.
  "$CLEANUP_SCRIPT" --force 2>&1 | grep -E "✅|❌|Found|Terminating|Cancelling" || true
else
  # Without this check the shell's "No such file" / "Permission denied"
  # message would be swallowed by the grep filter and cleanup would be
  # skipped silently.
  echo "⚠️  $CLEANUP_SCRIPT not found or not executable; skipping cleanup" >&2
fi
echo ""

# Step 2: Wait a moment for AWS to process
echo "Step 2: Waiting for AWS to process cleanup (10 seconds)..."
sleep 10
echo ""
# Step 3: Try different regions
REGIONS=("us-east-1" "us-west-2" "us-west-1" "eu-west-1")

#######################################
# Print the Spot request ID contained in captured AWS CLI output.
# Arguments: $1 - captured output of `aws ec2 request-spot-instances`
#                 (may contain stderr text mixed with the JSON document)
# Outputs:   the request ID on stdout, or an empty line when none is found
# Returns:   does not fail when no ID is found, so a caller running under
#            `set -e` is not aborted by a parse problem
#######################################
extract_spot_request_id() {
  local output=$1
  local request_id=""

  if command -v jq >/dev/null 2>&1; then
    # jq rejects the input when non-JSON text is mixed in; treat that as
    # "not found" and fall through to the textual match below.
    request_id=$(jq -r '.SpotInstanceRequests[0].SpotInstanceRequestId // empty' \
      <<<"$output" 2>/dev/null) || request_id=""
  fi

  if [[ -z "$request_id" ]]; then
    # Fallback for missing jq or noisy output: take the first "sir-..." token.
    request_id=$(grep -oE 'sir-[0-9A-Za-z]+' <<<"$output" | head -n 1) || request_id=""
  fi

  printf '%s\n' "$request_id"
}

for REGION in "${REGIONS[@]}"; do
  echo "Step 3: Trying region: $REGION"
  export AWS_REGION=$REGION

  # Quick test - just check if we can make a request.
  # This submits a real one-time Spot request; on success it is left open and
  # the cancel command is printed below.
  # NOTE(review): AMI and security group IDs are presumably scoped to a single
  # region, so this launch specification may only resolve in the region it was
  # created for - confirm; other regions would then land in the ⚠️ branch.
  # `|| echo "ERROR"` keeps a failing CLI call from tripping `set -e`.
  TEST_OUTPUT=$(aws ec2 request-spot-instances \
    --instance-count 1 \
    --launch-specification '{"ImageId":"ami-03deb8c961063af8c","InstanceType":"g4dn.xlarge","KeyName":"tarek","SecurityGroupIds":["sg-de86b4ac"]}' \
    --spot-price "0.50" \
    --type "one-time" \
    --region "$REGION" \
    --output json 2>&1 || echo "ERROR")

  if [[ "$TEST_OUTPUT" == *"SpotInstanceRequestId"* ]]; then
    SPOT_REQ=$(extract_spot_request_id "$TEST_OUTPUT")
    echo "✅ Success! Region $REGION works. Spot Request: $SPOT_REQ"
    echo ""
    if [[ -n "$SPOT_REQ" ]]; then
      echo "Cancel test request:"
      echo " aws ec2 cancel-spot-instance-requests --spot-instance-request-ids $SPOT_REQ --region $REGION"
    else
      # The request exists but its ID could not be parsed; tell the user how
      # to find it instead of printing an unusable cancel command.
      echo "⚠️  Could not parse the request ID. List open requests with:"
      echo " aws ec2 describe-spot-instance-requests --region $REGION"
    fi
    echo ""
    echo "Launch with this region:"
    echo " AWS_REGION=$REGION ./scripts/scale_gpu_training.sh up g4dn.xlarge 24"
    exit 0
  elif [[ "$TEST_OUTPUT" == *"MaxSpotInstanceCountExceeded"* ]]; then
    echo "❌ Region $REGION also has limit issue"
  else
    # Any other failure: show the first lines of the CLI output as a hint.
    echo "⚠️ Region $REGION: $(head -n 3 <<<"$TEST_OUTPUT")"
  fi
  echo ""
done
# Reached only when no region in the loop above ended the script with
# `exit 0`. Print the manual options and exit non-zero so that callers
# (e.g. `script && next-step`) can tell that no usable region was found.
cat <<'EOF'
❌ All regions hit limits. Options:

1. Request limit increase from AWS Support:
 https://console.aws.amazon.com/support/ -> Create case -> Service limit increase

2. Wait 15-30 minutes for limits to reset

3. Use On-Demand instances instead (more expensive):
 Modify scripts/scale_gpu_training.sh to use regular EC2 instances
EOF
exit 1