Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions aws_quickstart/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 4.10.0 (May 13, 2026)

- Add `InstrumentationResourceTypes` parameter to `main_v2.yaml`. When set to a comma-separated list of UDM resource types (e.g. `aws:ec2:instance,aws:ecs:cluster,aws:eks:cluster`), the integration role's permission-attach Lambda calls `GET /api/unstable/instrumenter/aws/iam_permissions?resource_type=...&chunked=true` and attaches the returned IAM permissions as additional managed policies on the integration role, so customers can install the Datadog Agent on those resources without extra IAM setup. Failure to fetch or attach these extra permissions is non-blocking — the integration install proceeds with a warning. Affects `main_v2.yaml`, `datadog_integration_role.yaml`, `attach_integration_permissions.py`

# 4.9.1 (April 22, 2026)

- Fix `Template error: Unable to get mapping for DdAccountIdBySite::<site>::AccountIdGovCloud` on commercial-site deploys. CloudFormation's `Fn::FindInMap` is resolved at template-parse time regardless of the surrounding `Fn::If`, so every site row now carries an `AccountIdGovCloud` key (commercial sites use `"NOT_APPLICABLE"`, which is discarded by the `IsGov` guard). Affects `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml`
Expand Down
182 changes: 121 additions & 61 deletions aws_quickstart/attach_integration_permissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
from urllib.request import Request
import urllib.error
import urllib.parse
import urllib.request
import cfnresponse
import boto3
Expand All @@ -11,32 +12,52 @@
API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart"
POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy"
BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions"
BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions"
STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard"
RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true"
INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions"


class DatadogAPIError(Exception):
    """Raised when the Datadog API answers a permissions request with an HTTP error."""


def fetch_permissions_from_datadog(api_url):
    """Fetch permissions from a Datadog IAM-permissions endpoint.

    Sends a GET request to ``api_url`` carrying the ``Dd-Aws-Api-Call-Source``
    header and returns the ``data.attributes.permissions`` value of the JSON
    response body.

    Raises:
        DatadogAPIError: if the API answers with an HTTP error status. The
            message carries the first entry of the response's ``errors`` list
            when one is present, otherwise the HTTP status code.
    """
    headers = {
        "Dd-Aws-Api-Call-Source": API_CALL_SOURCE_HEADER_VALUE,
    }
    request = Request(api_url, headers=headers)
    request.get_method = lambda: "GET"

    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        # The error body is not guaranteed to be JSON (or to carry a non-empty
        # "errors" list); never let a parsing problem mask the HTTP failure itself.
        error_message = f"Unknown error (HTTP {e.code})"
        try:
            error_body = json.loads(e.read())
        except ValueError:
            error_body = None
        errors = error_body.get('errors') if isinstance(error_body, dict) else None
        if isinstance(errors, list) and errors:
            error_message = errors[0]
        raise DatadogAPIError(f"Datadog API error: {error_message}") from e

    # Close the HTTP response deterministically once the body has been read.
    with response:
        return json.loads(response.read())["data"]["attributes"]["permissions"]


def parse_resource_types(raw):
    """Normalize the ``InstrumentationResourceTypes`` property into a list of strings.

    Args:
        raw: ``None``, a comma-delimited string, or an iterable of strings.

    Returns:
        A list of the non-empty entries with surrounding whitespace removed,
        in their original order. ``None`` yields an empty list.
    """
    # CFN forwards CommaDelimitedList parameters as JSON arrays to custom resources,
    # while String parameters arrive as comma-delimited strings; accept both.
    if raw is None:
        return []
    items = raw.split(",") if isinstance(raw, str) else list(raw)
    stripped = (item.strip() for item in items if item)
    return [item for item in stripped if item]


def build_instrumentation_permissions_url(datadog_site, resource_types):
    """Build the instrumentation IAM-permissions URL for a Datadog site.

    Args:
        datadog_site: Datadog site domain; the host used is ``api.<datadog_site>``.
        resource_types: iterable of resource-type strings, each sent as its own
            ``resource_type`` query parameter.

    Returns:
        The HTTPS URL with one URL-encoded ``resource_type`` parameter per entry,
        followed by ``chunked=true``.
    """
    query = urllib.parse.urlencode(
        [("resource_type", t) for t in resource_types] + [("chunked", "true")]
    )
    return f"https://api.{datadog_site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}"


def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name):
"""Detach a managed policy from a role and delete it. Ignores missing entities."""
# Detach + delete are both no-ops if the entity is already gone, so callers can blindly
# iterate the policy-name space without first checking what actually exists.
try:
iam_client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn)
except iam_client.exceptions.NoSuchEntityException:
Expand All @@ -53,105 +74,144 @@ def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name):
except Exception as e:
LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}")

def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10):
# Remove role-scoped resource collection policies

def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, prefix, max_policies=10):
    """Detach and delete the numbered managed policies that share ``prefix`` for a role.

    Policy names follow ``<prefix>-<role_name>-<n>`` for ``n`` in
    ``1..max_policies``; each candidate is handed to ``_detach_and_delete_policy``
    without first checking whether it exists.
    """
    for index in range(1, max_policies + 1):
        policy_name = f"{prefix}-{role_name}-{index}"
        policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}"
        _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name)

# Remove standard permissions


def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10):
    """Remove the resource-collection managed policies and the standard inline policy.

    A missing inline policy is ignored; any other failure while deleting it is
    logged and not re-raised.
    """
    _cleanup_chunked_policies(
        iam_client, role_name, account_id, partition,
        BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies,
    )

    try:
        iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD)
    except iam_client.exceptions.NoSuchEntityException:
        # Nothing to remove: the inline policy was never attached or is already gone.
        pass
    except Exception as e:
        LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}")



def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10):
    """Remove the numbered instrumentation managed policies attached to the role."""
    _cleanup_chunked_policies(
        iam_client, role_name, account_id, partition,
        BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies,
    )


def attach_standard_permissions(iam_client, role_name):
    """Fetch the standard permissions and store them as an inline policy on the role.

    The permissions returned by ``STANDARD_PERMISSIONS_API_URL`` become the
    ``Action`` list of a single allow-all-resources statement, written under the
    policy name ``POLICY_NAME_STANDARD``.
    """
    permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL)
    policy_document = {
        "Version": "2012-10-17",
        "Statement": [{"Effect": "Allow", "Action": permissions, "Resource": "*"}],
    }

    iam_client.put_role_policy(
        RoleName=role_name,
        PolicyName=POLICY_NAME_STANDARD,
        # Compact separators keep the serialized document as small as possible.
        PolicyDocument=json.dumps(policy_document, separators=(',', ':')),
    )



def _create_and_attach_policy(iam_client, role_name, policy_name, actions):
    """Create a managed policy allowing ``actions`` on all resources and attach it to the role.

    Args:
        iam_client: IAM client used for the ``create_policy`` / ``attach_role_policy`` calls.
        role_name: name of the role the new policy is attached to.
        policy_name: name given to the new managed policy.
        actions: list of IAM actions placed in the policy's single ``Allow`` statement.
    """
    policy_json = json.dumps(
        {
            "Version": "2012-10-17",
            "Statement": [{"Effect": "Allow", "Action": actions, "Resource": "*"}],
        },
        # Compact separators keep the serialized document as small as possible.
        separators=(',', ':'),
    )
    LOGGER.info(f"Creating policy {policy_name} with {len(actions)} permissions ({len(policy_json)} characters)")
    policy = iam_client.create_policy(PolicyName=policy_name, PolicyDocument=policy_json)
    iam_client.attach_role_policy(RoleName=role_name, PolicyArn=policy['Policy']['Arn'])


def attach_resource_collection_permissions(iam_client, role_name):
    """Fetch the chunked resource-collection permissions and attach one policy per chunk.

    Chunk ``n`` (1-based) is created as the managed policy
    ``<BASE_POLICY_PREFIX_RESOURCE_COLLECTION>-<role_name>-<n>`` and attached to
    the role.
    """
    permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL)

    for i, chunk in enumerate(permission_chunks):
        _create_and_attach_policy(
            iam_client,
            role_name,
            f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}",
            chunk,
        )


def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types):
# Best-effort: instrumentation permissions are additive convenience on top of the
# integration, so any failure here is logged and swallowed rather than blocking install.
# Fetch before cleanup so that a transient API failure on an Update leaves the
# previously-attached policies in place instead of silently revoking them.
if not resource_types:
# Only clean up if the previous Update had instrumentation enabled — avoids running
# delete calls on stacks that never opted in to instrumentation in the first place.
if previous_resource_types:
cleanup_instrumentation_policies(iam_client, role_name, account_id, partition)
return

try:
url = build_instrumentation_permissions_url(datadog_site, resource_types)
LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}")
permission_chunks = fetch_permissions_from_datadog(url)
except Exception as e:
LOGGER.warning(
f"Failed to fetch instrumentation permissions for {resource_types}: {e}. "
"Leaving any previously-attached instrumentation policies in place."
)
return
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve existing instrumentation policies on fetch failure

When an Update runs after instrumentation policies were previously attached, handle_create_update deletes those policies via cleanup_existing_policies before calling this best-effort path. If the Datadog API or network is temporarily unavailable, this return makes the custom resource report success without recreating the already-requested instrumentation policies, so an unrelated stack update can silently revoke the Agent instrumentation permissions. Consider fetching before cleanup or failing/restoring on update failures.

Useful? React with 👍 / 👎.


cleanup_instrumentation_policies(iam_client, role_name, account_id, partition)
for i, chunk in enumerate(permission_chunks):
policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}"
try:
_create_and_attach_policy(iam_client, role_name, policy_name, chunk)
except Exception as e:
LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.")

def handle_delete(event, context):
    """Handle stack deletion.

    Removes the standard, resource-collection and instrumentation policies from
    the integration role named in the event's ``ResourceProperties`` and reports
    the outcome to CloudFormation.
    """
    try:
        # Property lookup and client creation live inside the try so that a
        # malformed event still produces a FAILED response instead of an
        # unhandled exception that leaves CloudFormation without any response.
        props = event['ResourceProperties']
        role_name = props['DatadogIntegrationRole']
        account_id = props['AccountId']
        partition = props.get('Partition', 'aws')
        iam_client = boto3.client('iam')
        cleanup_existing_policies(iam_client, role_name, account_id, partition)
        cleanup_instrumentation_policies(iam_client, role_name, account_id, partition)
        cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={})
    except Exception as e:
        LOGGER.error(f"Error deleting policy: {str(e)}")
        cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)})

def handle_create_update(event, context):
    """Handle stack creation or update.

    Re-creates the standard policy, optionally the resource-collection policies,
    and (best-effort) the instrumentation policies on the integration role named
    in the event's ``ResourceProperties``, then reports the outcome to
    CloudFormation.
    """
    try:
        # Property parsing lives inside the try so that a malformed event still
        # produces a FAILED response instead of an unhandled exception that
        # leaves CloudFormation without any response.
        props = event['ResourceProperties']
        role_name = props['DatadogIntegrationRole']
        account_id = props['AccountId']
        partition = props.get('Partition', 'aws')
        should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true'
        datadog_site = props.get('DatadogSite') or 'datadoghq.com'
        instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes'))
        # Tolerate the key being absent or null (no previous properties to compare against).
        previous_props = event.get('OldResourceProperties') or {}
        previous_instrumentation_resource_types = parse_resource_types(
            previous_props.get('InstrumentationResourceTypes')
        )

        iam_client = boto3.client('iam')
        cleanup_existing_policies(iam_client, role_name, account_id, partition)
        attach_standard_permissions(iam_client, role_name)
        if should_install_security_audit_policy:
            attach_resource_collection_permissions(iam_client, role_name)
        attach_instrumentation_permissions(
            iam_client, role_name, account_id, partition,
            datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types,
        )
        cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={})
    except Exception as e:
        LOGGER.error(f"Error creating/attaching policy: {str(e)}")
        cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)})


def handler(event, context):
    """Lambda entry point: log the event and dispatch on its ``RequestType``.

    ``Delete`` requests go to ``handle_delete``; every other request type is
    treated as a create/update and goes to ``handle_create_update``. Both read
    what they need from the event themselves.
    """
    LOGGER.info("Event received: %s", json.dumps(event))

    if event['RequestType'] == 'Delete':
        handle_delete(event, context)
    else:
        handle_create_update(event, context)
Loading
Loading