From 73fb70935dd72b12ff6ac7046b6889bcf5727fd4 Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Wed, 13 May 2026 15:33:34 -0400 Subject: [PATCH 1/4] [TON-388] feat(aws_quickstart): grant instrumenter IAM permissions for selected resource types (v4.10.0) Add an `InstrumentationResourceTypes` CFN parameter to the Quick Start template that accepts a comma-separated list of UDM resource types (e.g. `aws:ec2:instance, aws:ecs:cluster, aws:eks:cluster`). When non-empty, the integration-role permission-attach Lambda calls `GET /api/unstable/instrumenter/aws/iam_permissions?resource_type=...&chunked=true` and attaches each returned permission chunk as an additional managed policy on the integration role, so customers who install the Datadog Agent on those resource types via the Quick Start don't have to wire up the extra IAM actions themselves (ssm:SendCommand for EC2, EKS access-entry + lambda actions, ECS service/task-definition actions, etc.). Failure to fetch or attach the instrumenter permissions is non-blocking: a warning is logged and the integration install completes successfully. The new policies are also cleaned up alongside the existing resource-collection policies on stack delete. Co-Authored-By: Claude Opus 4.7 (1M context) --- aws_quickstart/CHANGELOG.md | 4 + .../attach_integration_permissions.py | 100 ++++++++-- .../attach_integration_permissions_test.py | 185 ++++++++++++++++++ aws_quickstart/datadog_integration_role.yaml | 144 +++++++++++--- aws_quickstart/main_v2.yaml | 10 + aws_quickstart/version.txt | 2 +- 6 files changed, 404 insertions(+), 41 deletions(-) create mode 100644 aws_quickstart/attach_integration_permissions_test.py diff --git a/aws_quickstart/CHANGELOG.md b/aws_quickstart/CHANGELOG.md index 71f4fcd6..2edc93e9 100644 --- a/aws_quickstart/CHANGELOG.md +++ b/aws_quickstart/CHANGELOG.md @@ -1,3 +1,7 @@ +# 4.10.0 (May 13, 2026) + +- Add `InstrumentationResourceTypes` parameter to `main_v2.yaml`. When set to a comma-separated list of UDM resource types (e.g. `aws:ec2:instance,aws:ecs:cluster,aws:eks:cluster`), the integration role's permission-attach Lambda calls `GET /api/unstable/instrumenter/aws/iam_permissions?resource_type=...&chunked=true` and attaches the returned IAM permissions as additional managed policies on the integration role, so customers can install the Datadog Agent on those resources without extra IAM setup. Failure to fetch or attach these extra permissions is non-blocking — the integration install proceeds with a warning. Affects `main_v2.yaml`, `datadog_integration_role.yaml`, `attach_integration_permissions.py` + # 4.9.1 (April 22, 2026) - Fix `Template error: Unable to get mapping for DdAccountIdBySite::::AccountIdGovCloud` on commercial-site deploys. CloudFormation's `Fn::FindInMap` is resolved at template-parse time regardless of the surrounding `Fn::If`, so every site row now carries an `AccountIdGovCloud` key (commercial sites use `"NOT_APPLICABLE"`, which is discarded by the `IsGov` guard). Affects `main_v2.yaml`, `main_workflow.yaml`, `main_extended.yaml`, and `main_extended_workflow.yaml` diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index 977cb4a2..dc188905 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -2,6 +2,7 @@ import logging from urllib.request import Request import urllib.error +import urllib.parse import urllib.request import cfnresponse import boto3 @@ -11,8 +12,10 @@ API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" +BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" +INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" class DatadogAPIError(Exception): pass @@ -24,7 +27,7 @@ def fetch_permissions_from_datadog(api_url): } request = Request(api_url, headers=headers) request.get_method = lambda: "GET" - + try: response = urllib.request.urlopen(request) except urllib.error.HTTPError as e: @@ -35,6 +38,22 @@ def fetch_permissions_from_datadog(api_url): json_response = json.loads(response.read()) return json_response["data"]["attributes"]["permissions"] +def parse_resource_types(raw): + """Parse the InstrumentationResourceTypes ResourceProperty into a clean list of UDM strings. + Accepts a CFN comma-delimited string or an already-split list (CFN serializes + CommaDelimitedList parameters as JSON arrays when forwarded to a custom resource).""" + if raw is None: + return [] + items = raw.split(",") if isinstance(raw, str) else list(raw) + return [t.strip() for t in items if t and t.strip()] + +def build_instrumentation_permissions_url(datadog_site, resource_types): + site = datadog_site or "datadoghq.com" + query = urllib.parse.urlencode( + [("resource_type", t) for t in resource_types] + [("chunked", "true")] + ) + return f"https://api.{site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" + def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): """Detach a managed policy from a role and delete it. Ignores missing entities.""" try: @@ -54,12 +73,13 @@ def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - # Remove role-scoped resource collection policies - for i in range(max_policies): - policy_name = f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}" - policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" - _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) - + # Remove role-scoped chunked policies for both prefixes + for prefix in (BASE_POLICY_PREFIX_RESOURCE_COLLECTION, BASE_POLICY_PREFIX_INSTRUMENTATION): + for i in range(max_policies): + policy_name = f"{prefix}-{role_name}-{i+1}" + policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" + _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) + # Remove standard permissions try: iam_client.delete_role_policy( @@ -70,7 +90,7 @@ def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_ pass except Exception as e: LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") - + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { @@ -89,7 +109,7 @@ def attach_standard_permissions(iam_client, role_name): PolicyName=POLICY_NAME_STANDARD, PolicyDocument=json.dumps(policy_document, separators=(',', ':')) ) - + def attach_resource_collection_permissions(iam_client, role_name): permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL) @@ -113,13 +133,64 @@ def attach_resource_collection_permissions(iam_client, role_name): PolicyName=policy_name, PolicyDocument=policy_json ) - + # Attach policy to role iam_client.attach_role_policy( RoleName=role_name, PolicyArn=policy['Policy']['Arn'] ) +def attach_instrumentation_permissions(iam_client, role_name, datadog_site, resource_types): + """Best-effort attach IAM permissions required to instrument the given UDM resource types. + + Never raises: instrumentation permissions are additive convenience on top of the integration, + so any failure here surfaces a warning but does not block the install. + """ + if not resource_types: + return + + try: + url = build_instrumentation_permissions_url(datadog_site, resource_types) + LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") + permission_chunks = fetch_permissions_from_datadog(url) + except Exception as e: + LOGGER.warning( + f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " + "Integration install will continue without these permissions." + ) + return + + attached, failed = 0, 0 + for i, chunk in enumerate(permission_chunks): + policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" + policy_document = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": chunk, + "Resource": "*" + } + ] + } + policy_json = json.dumps(policy_document, separators=(',', ':')) + LOGGER.info(f"Creating policy {policy_name} with {len(chunk)} permissions ({len(policy_json)} characters)") + try: + policy = iam_client.create_policy( + PolicyName=policy_name, + PolicyDocument=policy_json + ) + iam_client.attach_role_policy( + RoleName=role_name, + PolicyArn=policy['Policy']['Arn'] + ) + attached += 1 + except Exception as e: + failed += 1 + LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") + + LOGGER.info(f"Instrumentation permissions: {attached} attached, {failed} failed out of {len(permission_chunks)} chunks") + def handle_delete(event, context, role_name, account_id, partition): """Handle stack deletion.""" iam_client = boto3.client('iam') @@ -130,7 +201,7 @@ def handle_delete(event, context, role_name, account_id, partition): LOGGER.error(f"Error deleting policy: {str(e)}") cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) -def handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy): +def handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy, datadog_site, instrumentation_resource_types): """Handle stack creation or update.""" try: iam_client = boto3.client('iam') @@ -138,6 +209,7 @@ def handle_create_update(event, context, role_name, account_id, partition, shoul attach_standard_permissions(iam_client, role_name) if should_install_security_audit_policy: attach_resource_collection_permissions(iam_client, role_name) + attach_instrumentation_permissions(iam_client, role_name, datadog_site, instrumentation_resource_types) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: LOGGER.error(f"Error creating/attaching policy: {str(e)}") @@ -145,13 +217,15 @@ def handle_create_update(event, context, role_name, account_id, partition, shoul def handler(event, context): LOGGER.info("Event received: %s", json.dumps(event)) - + role_name = event['ResourceProperties']['DatadogIntegrationRole'] account_id = event['ResourceProperties']['AccountId'] partition = event['ResourceProperties'].get('Partition', 'aws') should_install_security_audit_policy = str(event['ResourceProperties']['ResourceCollectionPermissions']).lower() == 'true' + datadog_site = event['ResourceProperties'].get('DatadogSite', 'datadoghq.com') + instrumentation_resource_types = parse_resource_types(event['ResourceProperties'].get('InstrumentationResourceTypes')) if event['RequestType'] == 'Delete': handle_delete(event, context, role_name, account_id, partition) else: - handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy) + handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy, datadog_site, instrumentation_resource_types) diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py new file mode 100644 index 00000000..64a2df0d --- /dev/null +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 + +import json +import sys +import unittest +from unittest.mock import patch, Mock, MagicMock, call +from urllib.error import HTTPError +from urllib.parse import urlparse, parse_qsl +from io import BytesIO + +if "boto3" not in sys.modules: + sys.modules["boto3"] = MagicMock() +if "cfnresponse" not in sys.modules: + sys.modules["cfnresponse"] = MagicMock() + +from attach_integration_permissions import ( + parse_resource_types, + build_instrumentation_permissions_url, + attach_instrumentation_permissions, + cleanup_existing_policies, + BASE_POLICY_PREFIX_INSTRUMENTATION, + BASE_POLICY_PREFIX_RESOURCE_COLLECTION, +) + + +class TestParseResourceTypes(unittest.TestCase): + def test_none(self): + self.assertEqual(parse_resource_types(None), []) + + def test_empty_string(self): + self.assertEqual(parse_resource_types(""), []) + + def test_single(self): + self.assertEqual(parse_resource_types("aws:ec2:instance"), ["aws:ec2:instance"]) + + def test_multiple_with_whitespace(self): + self.assertEqual( + parse_resource_types("aws:ec2:instance, aws:ecs:cluster ,aws:eks:cluster"), + ["aws:ec2:instance", "aws:ecs:cluster", "aws:eks:cluster"], + ) + + def test_list_input(self): + # CFN may forward a CommaDelimitedList as a JSON array + self.assertEqual( + parse_resource_types(["aws:ec2:instance", " aws:ecs:cluster "]), + ["aws:ec2:instance", "aws:ecs:cluster"], + ) + + def test_drops_empties(self): + self.assertEqual(parse_resource_types(",,aws:ec2:instance,,"), ["aws:ec2:instance"]) + + +class TestBuildInstrumentationURL(unittest.TestCase): + def _query_pairs(self, url): + return parse_qsl(urlparse(url).query) + + def test_path_and_host(self): + url = build_instrumentation_permissions_url("datadoghq.eu", ["aws:ec2:instance"]) + parsed = urlparse(url) + self.assertEqual(parsed.scheme, "https") + self.assertEqual(parsed.netloc, "api.datadoghq.eu") + self.assertEqual(parsed.path, "/api/unstable/instrumenter/aws/iam_permissions") + + def test_default_site(self): + url = build_instrumentation_permissions_url("", ["aws:ec2:instance"]) + self.assertIn("api.datadoghq.com", url) + + def test_repeated_resource_type_and_chunked(self): + url = build_instrumentation_permissions_url( + "datadoghq.com", + ["aws:ec2:instance", "aws:ecs:cluster", "aws:eks:cluster"], + ) + pairs = self._query_pairs(url) + resource_types = [v for k, v in pairs if k == "resource_type"] + self.assertEqual( + resource_types, + ["aws:ec2:instance", "aws:ecs:cluster", "aws:eks:cluster"], + ) + self.assertIn(("chunked", "true"), pairs) + + +class TestAttachInstrumentationPermissions(unittest.TestCase): + def setUp(self): + self.iam = MagicMock() + self.iam.create_policy.return_value = {"Policy": {"Arn": "arn:aws:iam::123:policy/X"}} + self.role_name = "DatadogIntegrationRole" + self.site = "datadoghq.com" + + def _mock_chunks_response(self, chunks): + body = json.dumps({"data": {"attributes": {"permissions": chunks}}}).encode() + resp = Mock() + resp.read.return_value = body + return resp + + def test_empty_resource_types_is_noop(self): + attach_instrumentation_permissions(self.iam, self.role_name, self.site, []) + self.iam.create_policy.assert_not_called() + self.iam.attach_role_policy.assert_not_called() + + @patch("attach_integration_permissions.urllib.request.urlopen") + def test_happy_path_attaches_each_chunk(self, mock_urlopen): + mock_urlopen.return_value = self._mock_chunks_response( + [["ec2:Describe*"], ["ssm:SendCommand", "eks:DescribeCluster"]] + ) + + attach_instrumentation_permissions( + self.iam, self.role_name, self.site, ["aws:ec2:instance", "aws:eks:cluster"] + ) + + # Two chunks → two create_policy + two attach_role_policy + self.assertEqual(self.iam.create_policy.call_count, 2) + self.assertEqual(self.iam.attach_role_policy.call_count, 2) + + names = [c.kwargs["PolicyName"] for c in self.iam.create_policy.call_args_list] + self.assertEqual( + names, + [ + f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{self.role_name}-1", + f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{self.role_name}-2", + ], + ) + + # Sanity: Dd-Aws-Api-Call-Source header is sent + sent_request = mock_urlopen.call_args[0][0] + self.assertEqual(sent_request.headers.get("Dd-aws-api-call-source"), "cfn-quickstart") + + @patch("attach_integration_permissions.urllib.request.urlopen") + def test_fetch_failure_is_swallowed(self, mock_urlopen): + mock_urlopen.side_effect = HTTPError( + "u", 500, "boom", {}, BytesIO(b'{"errors":["upstream down"]}') + ) + + # Must not raise + attach_instrumentation_permissions( + self.iam, self.role_name, self.site, ["aws:ec2:instance"] + ) + self.iam.create_policy.assert_not_called() + self.iam.attach_role_policy.assert_not_called() + + @patch("attach_integration_permissions.urllib.request.urlopen") + def test_per_chunk_failure_is_swallowed_and_others_continue(self, mock_urlopen): + mock_urlopen.return_value = self._mock_chunks_response( + [["chunk1:Action"], ["chunk2:Action"], ["chunk3:Action"]] + ) + # Fail the second create_policy call; the others succeed. + self.iam.create_policy.side_effect = [ + {"Policy": {"Arn": "arn:aws:iam::123:policy/A"}}, + Exception("EntityAlreadyExists"), + {"Policy": {"Arn": "arn:aws:iam::123:policy/C"}}, + ] + + attach_instrumentation_permissions( + self.iam, self.role_name, self.site, ["aws:ec2:instance"] + ) + + # All 3 chunks attempted, but only 2 attaches succeeded. + self.assertEqual(self.iam.create_policy.call_count, 3) + self.assertEqual(self.iam.attach_role_policy.call_count, 2) + + +class TestCleanupAlsoRemovesInstrumentationPolicies(unittest.TestCase): + def test_cleanup_iterates_both_prefixes(self): + iam = MagicMock() + iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) + iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + iam.detach_role_policy.side_effect = iam.exceptions.NoSuchEntityException + iam.delete_policy.side_effect = iam.exceptions.NoSuchEntityException + + cleanup_existing_policies(iam, "MyRole", "123456789012", "aws", max_policies=2) + + detached = [c.kwargs["PolicyArn"] for c in iam.detach_role_policy.call_args_list] + # Should have iterated 2 times over each of the 2 prefixes. + self.assertEqual(len(detached), 4) + self.assertIn( + f"arn:aws:iam::123456789012:policy/{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-MyRole-1", + detached, + ) + self.assertIn( + f"arn:aws:iam::123456789012:policy/{BASE_POLICY_PREFIX_INSTRUMENTATION}-MyRole-1", + detached, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/aws_quickstart/datadog_integration_role.yaml b/aws_quickstart/datadog_integration_role.yaml index 821aff84..51de5098 100644 --- a/aws_quickstart/datadog_integration_role.yaml +++ b/aws_quickstart/datadog_integration_role.yaml @@ -18,6 +18,19 @@ Parameters: - false Description: >- Set this value to "true" to add permissions for Datadog to collect resource configuration data. + InstrumentationResourceTypes: + Type: String + Default: "" + Description: >- + Comma-separated list of AWS resource types (UDM form, e.g. aws:ec2:instance, aws:ecs:cluster, + aws:eks:cluster) that the Datadog integration role should be granted the IAM permissions + required to instrument with the Datadog Agent. Leave blank to skip. + DatadogSite: + Type: String + Default: "datadoghq.com" + Description: >- + Datadog site the integration is being installed against. Used to call the Datadog API that + returns the IAM permissions required to instrument the selected resource types. DdAWSAccountId: Description: >- Datadog AWS account ID allowed to assume the integration IAM role. DO NOT CHANGE! @@ -89,9 +102,10 @@ Resources: - iam:AttachRolePolicy - iam:DetachRolePolicy - iam:PutRolePolicy - Resource: + Resource: - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMRoleName} - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-resource-collection-permissions-* + - !Sub arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/datadog-aws-integration-instrumentation-permissions-* - !Sub "arn:${AWS::Partition}:iam::aws:policy/SecurityAudit" DatadogAttachIntegrationPermissionsFunction: Type: AWS::Lambda::Function @@ -110,21 +124,24 @@ Resources: import logging from urllib.request import Request import urllib.error + import urllib.parse import urllib.request import cfnresponse import boto3 - + LOGGER = logging.getLogger() LOGGER.setLevel(logging.INFO) API_CALL_SOURCE_HEADER_VALUE = "cfn-quickstart" POLICY_NAME_STANDARD = "DatadogAWSIntegrationPolicy" BASE_POLICY_PREFIX_RESOURCE_COLLECTION = "datadog-aws-integration-resource-collection-permissions" + BASE_POLICY_PREFIX_INSTRUMENTATION = "datadog-aws-integration-instrumentation-permissions" STANDARD_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/standard" RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" - + INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" + class DatadogAPIError(Exception): pass - + def fetch_permissions_from_datadog(api_url): """Fetch permissions from Datadog API""" headers = { @@ -132,17 +149,33 @@ Resources: } request = Request(api_url, headers=headers) request.get_method = lambda: "GET" - + try: response = urllib.request.urlopen(request) except urllib.error.HTTPError as e: error_body = json.loads(e.read()) error_message = error_body.get('errors', ['Unknown error'])[0] raise DatadogAPIError(f"Datadog API error: {error_message}") from e - + json_response = json.loads(response.read()) return json_response["data"]["attributes"]["permissions"] - + + def parse_resource_types(raw): + """Parse the InstrumentationResourceTypes ResourceProperty into a clean list of UDM strings. + Accepts a CFN comma-delimited string or an already-split list (CFN serializes + CommaDelimitedList parameters as JSON arrays when forwarded to a custom resource).""" + if raw is None: + return [] + items = raw.split(",") if isinstance(raw, str) else list(raw) + return [t.strip() for t in items if t and t.strip()] + + def build_instrumentation_permissions_url(datadog_site, resource_types): + site = datadog_site or "datadoghq.com" + query = urllib.parse.urlencode( + [("resource_type", t) for t in resource_types] + [("chunked", "true")] + ) + return f"https://api.{site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" + def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): """Detach a managed policy from a role and delete it. Ignores missing entities.""" try: @@ -151,7 +184,7 @@ Resources: pass except Exception as e: LOGGER.error(f"Error detaching policy {policy_name}: {str(e)}") - + try: iam_client.delete_policy(PolicyArn=policy_arn) except iam_client.exceptions.NoSuchEntityException: @@ -160,14 +193,15 @@ Resources: LOGGER.warning(f"Policy {policy_name} still attached, skipping delete") except Exception as e: LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") - + def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - # Remove role-scoped resource collection policies - for i in range(max_policies): - policy_name = f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}" - policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" - _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) - + # Remove role-scoped chunked policies for both prefixes + for prefix in (BASE_POLICY_PREFIX_RESOURCE_COLLECTION, BASE_POLICY_PREFIX_INSTRUMENTATION): + for i in range(max_policies): + policy_name = f"{prefix}-{role_name}-{i+1}" + policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" + _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) + # Remove standard permissions try: iam_client.delete_role_policy( @@ -178,7 +212,7 @@ Resources: pass except Exception as e: LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") - + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { @@ -191,16 +225,16 @@ Resources: } ] } - + iam_client.put_role_policy( RoleName=role_name, PolicyName=POLICY_NAME_STANDARD, PolicyDocument=json.dumps(policy_document, separators=(',', ':')) ) - + def attach_resource_collection_permissions(iam_client, role_name): permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL) - + # Create and attach new policies for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}" @@ -221,13 +255,64 @@ Resources: PolicyName=policy_name, PolicyDocument=policy_json ) - + # Attach policy to role iam_client.attach_role_policy( RoleName=role_name, PolicyArn=policy['Policy']['Arn'] ) - + + def attach_instrumentation_permissions(iam_client, role_name, datadog_site, resource_types): + """Best-effort attach IAM permissions required to instrument the given UDM resource types. + + Never raises: instrumentation permissions are additive convenience on top of the integration, + so any failure here surfaces a warning but does not block the install. + """ + if not resource_types: + return + + try: + url = build_instrumentation_permissions_url(datadog_site, resource_types) + LOGGER.info(f"Fetching instrumentation permissions for {resource_types} from {url}") + permission_chunks = fetch_permissions_from_datadog(url) + except Exception as e: + LOGGER.warning( + f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " + "Integration install will continue without these permissions." + ) + return + + attached, failed = 0, 0 + for i, chunk in enumerate(permission_chunks): + policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" + policy_document = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": chunk, + "Resource": "*" + } + ] + } + policy_json = json.dumps(policy_document, separators=(',', ':')) + LOGGER.info(f"Creating policy {policy_name} with {len(chunk)} permissions ({len(policy_json)} characters)") + try: + policy = iam_client.create_policy( + PolicyName=policy_name, + PolicyDocument=policy_json + ) + iam_client.attach_role_policy( + RoleName=role_name, + PolicyArn=policy['Policy']['Arn'] + ) + attached += 1 + except Exception as e: + failed += 1 + LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") + + LOGGER.info(f"Instrumentation permissions: {attached} attached, {failed} failed out of {len(permission_chunks)} chunks") + def handle_delete(event, context, role_name, account_id, partition): """Handle stack deletion.""" iam_client = boto3.client('iam') @@ -237,8 +322,8 @@ Resources: except Exception as e: LOGGER.error(f"Error deleting policy: {str(e)}") cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) - - def handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy): + + def handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy, datadog_site, instrumentation_resource_types): """Handle stack creation or update.""" try: iam_client = boto3.client('iam') @@ -246,23 +331,26 @@ Resources: attach_standard_permissions(iam_client, role_name) if should_install_security_audit_policy: attach_resource_collection_permissions(iam_client, role_name) + attach_instrumentation_permissions(iam_client, role_name, datadog_site, instrumentation_resource_types) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: LOGGER.error(f"Error creating/attaching policy: {str(e)}") cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) - + def handler(event, context): LOGGER.info("Event received: %s", json.dumps(event)) - + role_name = event['ResourceProperties']['DatadogIntegrationRole'] account_id = event['ResourceProperties']['AccountId'] partition = event['ResourceProperties'].get('Partition', 'aws') should_install_security_audit_policy = str(event['ResourceProperties']['ResourceCollectionPermissions']).lower() == 'true' - + datadog_site = event['ResourceProperties'].get('DatadogSite', 'datadoghq.com') + instrumentation_resource_types = parse_resource_types(event['ResourceProperties'].get('InstrumentationResourceTypes')) + if event['RequestType'] == 'Delete': handle_delete(event, context, role_name, account_id, partition) else: - handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy) + handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy, datadog_site, instrumentation_resource_types) DatadogAttachIntegrationPermissionsFunctionTrigger: Type: Custom::DatadogAttachIntegrationPermissionsFunctionTrigger DependsOn: DatadogIntegrationRole @@ -272,6 +360,8 @@ Resources: AccountId: !Ref AWS::AccountId Partition: !Sub "${AWS::Partition}" ResourceCollectionPermissions: !Ref ResourceCollectionPermissions + InstrumentationResourceTypes: !Ref InstrumentationResourceTypes + DatadogSite: !Ref DatadogSite Metadata: AWS::CloudFormation::Interface: ParameterGroups: diff --git a/aws_quickstart/main_v2.yaml b/aws_quickstart/main_v2.yaml index 3c7ba77f..3e86464c 100644 --- a/aws_quickstart/main_v2.yaml +++ b/aws_quickstart/main_v2.yaml @@ -69,6 +69,13 @@ Parameters: Datadog CSPM is a product that automatically detects resource misconfigurations in your AWS account according to industry benchmarks. More info: https://www.datadoghq.com/product/security-platform/cloud-security-posture-management/ Default: false + InstrumentationResourceTypes: + Type: CommaDelimitedList + Description: >- + Comma-separated list of AWS resource types (UDM form, e.g. aws:ec2:instance, aws:ecs:cluster, aws:eks:cluster) + that the Datadog integration role should be granted the IAM permissions required to instrument with the Datadog + Agent. Leave blank to skip granting any extra instrumentation permissions. + Default: "" Rules: ResourceCollectionValidState: Assertions: @@ -158,6 +165,8 @@ Resources: ExternalId: !GetAtt DatadogAWSAccountIntegration.Outputs.ExternalId IAMRoleName: !Ref IAMRoleName ResourceCollectionPermissions: !If [ResourceCollectionPermissions, true, false] + InstrumentationResourceTypes: !Join [",", !Ref InstrumentationResourceTypes] + DatadogSite: !Ref DatadogSite DdAWSAccountId: !If - IsGov - !If @@ -212,6 +221,7 @@ Metadata: - IAMRoleName - DisableMetricCollection - DisableResourceCollection + - InstrumentationResourceTypes ParameterLabels: APIKey: default: "DatadogApiKey *" diff --git a/aws_quickstart/version.txt b/aws_quickstart/version.txt index 1dff3233..d3d9bf7a 100644 --- a/aws_quickstart/version.txt +++ b/aws_quickstart/version.txt @@ -1 +1 @@ -v4.9.1 +v4.10.0 From c61da828530d9a20243ed7728a5ee9f2fcc6807e Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Wed, 13 May 2026 15:38:01 -0400 Subject: [PATCH 2/4] refactor(aws_quickstart): simplify attach-integration-permissions Lambda MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract `_create_and_attach_policy` helper so chunked-policy attach is shared between the resource-collection and instrumentation code paths instead of being duplicated byte-for-byte. - Have `handle_delete`/`handle_create_update` unpack `ResourceProperties` themselves rather than threading 4–8 positional args from `handler`. - Drop the `or "datadoghq.com"` fallback inside `build_instrumentation_permissions_url` — the handler already defaults `DatadogSite`, so the builder can trust its input. - Prune docstrings/comments that only restated the function name or the next statement, per CLAUDE.md policy. Kept only the non-obvious "why" comments (CommaDelimitedList serialization, best-effort/non-raising contract, ignored-missing-entity semantics). - Strip trailing whitespace from the embedded ZipFile. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../attach_integration_permissions.py | 147 +++++++----------- .../attach_integration_permissions_test.py | 13 +- aws_quickstart/datadog_integration_role.yaml | 147 +++++++----------- 3 files changed, 114 insertions(+), 193 deletions(-) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index dc188905..fbc7b357 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -17,11 +17,12 @@ RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" + class DatadogAPIError(Exception): pass + def fetch_permissions_from_datadog(api_url): - """Fetch permissions from Datadog API""" headers = { "Dd-Aws-Api-Call-Source": API_CALL_SOURCE_HEADER_VALUE, } @@ -35,27 +36,28 @@ def fetch_permissions_from_datadog(api_url): error_message = error_body.get('errors', ['Unknown error'])[0] raise DatadogAPIError(f"Datadog API error: {error_message}") from e - json_response = json.loads(response.read()) - return json_response["data"]["attributes"]["permissions"] + return json.loads(response.read())["data"]["attributes"]["permissions"] + def parse_resource_types(raw): - """Parse the InstrumentationResourceTypes ResourceProperty into a clean list of UDM strings. - Accepts a CFN comma-delimited string or an already-split list (CFN serializes - CommaDelimitedList parameters as JSON arrays when forwarded to a custom resource).""" + # CFN forwards CommaDelimitedList parameters as JSON arrays to custom resources, + # while String parameters arrive as comma-delimited strings; accept both. if raw is None: return [] items = raw.split(",") if isinstance(raw, str) else list(raw) return [t.strip() for t in items if t and t.strip()] + def build_instrumentation_permissions_url(datadog_site, resource_types): - site = datadog_site or "datadoghq.com" query = urllib.parse.urlencode( [("resource_type", t) for t in resource_types] + [("chunked", "true")] ) - return f"https://api.{site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" + return f"https://api.{datadog_site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" + def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): - """Detach a managed policy from a role and delete it. Ignores missing entities.""" + # Detach + delete are both no-ops if the entity is already gone, so callers can blindly + # iterate the policy-name space without first checking what actually exists. try: iam_client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn) except iam_client.exceptions.NoSuchEntityException: @@ -72,80 +74,62 @@ def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): except Exception as e: LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") + def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - # Remove role-scoped chunked policies for both prefixes for prefix in (BASE_POLICY_PREFIX_RESOURCE_COLLECTION, BASE_POLICY_PREFIX_INSTRUMENTATION): for i in range(max_policies): policy_name = f"{prefix}-{role_name}-{i+1}" policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) - # Remove standard permissions try: - iam_client.delete_role_policy( - RoleName=role_name, - PolicyName=POLICY_NAME_STANDARD - ) + iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) except iam_client.exceptions.NoSuchEntityException: pass except Exception as e: LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": permissions, - "Resource": "*" - } - ] + "Statement": [{"Effect": "Allow", "Action": permissions, "Resource": "*"}], } - iam_client.put_role_policy( RoleName=role_name, PolicyName=POLICY_NAME_STANDARD, - PolicyDocument=json.dumps(policy_document, separators=(',', ':')) + PolicyDocument=json.dumps(policy_document, separators=(',', ':')), + ) + + +def _create_and_attach_policy(iam_client, role_name, policy_name, actions): + policy_json = json.dumps( + { + "Version": "2012-10-17", + "Statement": [{"Effect": "Allow", "Action": actions, "Resource": "*"}], + }, + separators=(',', ':'), ) + LOGGER.info(f"Creating policy {policy_name} with {len(actions)} permissions ({len(policy_json)} characters)") + policy = iam_client.create_policy(PolicyName=policy_name, PolicyDocument=policy_json) + iam_client.attach_role_policy(RoleName=role_name, PolicyArn=policy['Policy']['Arn']) + def attach_resource_collection_permissions(iam_client, role_name): permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL) - - # Create and attach new policies for i, chunk in enumerate(permission_chunks): - policy_name = f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}" - policy_document = { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": chunk, - "Resource": "*" - } - ] - } - policy_json = json.dumps(policy_document, separators=(',', ':')) - policy_size = len(policy_json) - LOGGER.info(f"Creating policy {policy_name} with {len(chunk)} permissions ({policy_size} characters)") - policy = iam_client.create_policy( - PolicyName=policy_name, - PolicyDocument=policy_json + _create_and_attach_policy( + iam_client, + role_name, + f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}", + chunk, ) - # Attach policy to role - iam_client.attach_role_policy( - RoleName=role_name, - PolicyArn=policy['Policy']['Arn'] - ) def attach_instrumentation_permissions(iam_client, role_name, datadog_site, resource_types): - """Best-effort attach IAM permissions required to instrument the given UDM resource types. - - Never raises: instrumentation permissions are additive convenience on top of the integration, - so any failure here surfaces a warning but does not block the install. - """ + # Best-effort: instrumentation permissions are additive convenience on top of the + # integration, so any failure here is logged and swallowed rather than blocking install. if not resource_types: return @@ -160,39 +144,19 @@ def attach_instrumentation_permissions(iam_client, role_name, datadog_site, reso ) return - attached, failed = 0, 0 for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" - policy_document = { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": chunk, - "Resource": "*" - } - ] - } - policy_json = json.dumps(policy_document, separators=(',', ':')) - LOGGER.info(f"Creating policy {policy_name} with {len(chunk)} permissions ({len(policy_json)} characters)") try: - policy = iam_client.create_policy( - PolicyName=policy_name, - PolicyDocument=policy_json - ) - iam_client.attach_role_policy( - RoleName=role_name, - PolicyArn=policy['Policy']['Arn'] - ) - attached += 1 + _create_and_attach_policy(iam_client, role_name, policy_name, chunk) except Exception as e: - failed += 1 LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") - LOGGER.info(f"Instrumentation permissions: {attached} attached, {failed} failed out of {len(permission_chunks)} chunks") -def handle_delete(event, context, role_name, account_id, partition): - """Handle stack deletion.""" +def handle_delete(event, context): + props = event['ResourceProperties'] + role_name = props['DatadogIntegrationRole'] + account_id = props['AccountId'] + partition = props.get('Partition', 'aws') iam_client = boto3.client('iam') try: cleanup_existing_policies(iam_client, role_name, account_id, partition) @@ -201,8 +165,16 @@ def handle_delete(event, context, role_name, account_id, partition): LOGGER.error(f"Error deleting policy: {str(e)}") cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) -def handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy, datadog_site, instrumentation_resource_types): - """Handle stack creation or update.""" + +def handle_create_update(event, context): + props = event['ResourceProperties'] + role_name = props['DatadogIntegrationRole'] + account_id = props['AccountId'] + partition = props.get('Partition', 'aws') + should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' + datadog_site = props.get('DatadogSite') or 'datadoghq.com' + instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) + try: iam_client = boto3.client('iam') cleanup_existing_policies(iam_client, role_name, account_id, partition) @@ -215,17 +187,10 @@ def handle_create_update(event, context, role_name, account_id, partition, shoul LOGGER.error(f"Error creating/attaching policy: {str(e)}") cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) + def handler(event, context): LOGGER.info("Event received: %s", json.dumps(event)) - - role_name = event['ResourceProperties']['DatadogIntegrationRole'] - account_id = event['ResourceProperties']['AccountId'] - partition = event['ResourceProperties'].get('Partition', 'aws') - should_install_security_audit_policy = str(event['ResourceProperties']['ResourceCollectionPermissions']).lower() == 'true' - datadog_site = event['ResourceProperties'].get('DatadogSite', 'datadoghq.com') - instrumentation_resource_types = parse_resource_types(event['ResourceProperties'].get('InstrumentationResourceTypes')) - if event['RequestType'] == 'Delete': - handle_delete(event, context, role_name, account_id, partition) + handle_delete(event, context) else: - handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy, datadog_site, instrumentation_resource_types) + handle_create_update(event, context) diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index 64a2df0d..6c0a24db 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -61,10 +61,6 @@ def test_path_and_host(self): self.assertEqual(parsed.netloc, "api.datadoghq.eu") self.assertEqual(parsed.path, "/api/unstable/instrumenter/aws/iam_permissions") - def test_default_site(self): - url = build_instrumentation_permissions_url("", ["aws:ec2:instance"]) - self.assertIn("api.datadoghq.com", url) - def test_repeated_resource_type_and_chunked(self): url = build_instrumentation_permissions_url( "datadoghq.com", @@ -107,7 +103,6 @@ def test_happy_path_attaches_each_chunk(self, mock_urlopen): self.iam, self.role_name, self.site, ["aws:ec2:instance", "aws:eks:cluster"] ) - # Two chunks → two create_policy + two attach_role_policy self.assertEqual(self.iam.create_policy.call_count, 2) self.assertEqual(self.iam.attach_role_policy.call_count, 2) @@ -120,7 +115,6 @@ def test_happy_path_attaches_each_chunk(self, mock_urlopen): ], ) - # Sanity: Dd-Aws-Api-Call-Source header is sent sent_request = mock_urlopen.call_args[0][0] self.assertEqual(sent_request.headers.get("Dd-aws-api-call-source"), "cfn-quickstart") @@ -130,7 +124,7 @@ def test_fetch_failure_is_swallowed(self, mock_urlopen): "u", 500, "boom", {}, BytesIO(b'{"errors":["upstream down"]}') ) - # Must not raise + # Contract: attach_instrumentation_permissions must never raise. attach_instrumentation_permissions( self.iam, self.role_name, self.site, ["aws:ec2:instance"] ) @@ -142,7 +136,6 @@ def test_per_chunk_failure_is_swallowed_and_others_continue(self, mock_urlopen): mock_urlopen.return_value = self._mock_chunks_response( [["chunk1:Action"], ["chunk2:Action"], ["chunk3:Action"]] ) - # Fail the second create_policy call; the others succeed. self.iam.create_policy.side_effect = [ {"Policy": {"Arn": "arn:aws:iam::123:policy/A"}}, Exception("EntityAlreadyExists"), @@ -153,7 +146,6 @@ def test_per_chunk_failure_is_swallowed_and_others_continue(self, mock_urlopen): self.iam, self.role_name, self.site, ["aws:ec2:instance"] ) - # All 3 chunks attempted, but only 2 attaches succeeded. self.assertEqual(self.iam.create_policy.call_count, 3) self.assertEqual(self.iam.attach_role_policy.call_count, 2) @@ -169,8 +161,7 @@ def test_cleanup_iterates_both_prefixes(self): cleanup_existing_policies(iam, "MyRole", "123456789012", "aws", max_policies=2) detached = [c.kwargs["PolicyArn"] for c in iam.detach_role_policy.call_args_list] - # Should have iterated 2 times over each of the 2 prefixes. - self.assertEqual(len(detached), 4) + self.assertEqual(len(detached), 4) # 2 prefixes × max_policies=2 self.assertIn( f"arn:aws:iam::123456789012:policy/{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-MyRole-1", detached, diff --git a/aws_quickstart/datadog_integration_role.yaml b/aws_quickstart/datadog_integration_role.yaml index 51de5098..1c886169 100644 --- a/aws_quickstart/datadog_integration_role.yaml +++ b/aws_quickstart/datadog_integration_role.yaml @@ -139,11 +139,12 @@ Resources: RESOURCE_COLLECTION_PERMISSIONS_API_URL = "https://api.datadoghq.com/api/v2/integration/aws/iam_permissions/resource_collection?chunked=true" INSTRUMENTATION_PERMISSIONS_API_PATH = "/api/unstable/instrumenter/aws/iam_permissions" + class DatadogAPIError(Exception): pass + def fetch_permissions_from_datadog(api_url): - """Fetch permissions from Datadog API""" headers = { "Dd-Aws-Api-Call-Source": API_CALL_SOURCE_HEADER_VALUE, } @@ -157,27 +158,28 @@ Resources: error_message = error_body.get('errors', ['Unknown error'])[0] raise DatadogAPIError(f"Datadog API error: {error_message}") from e - json_response = json.loads(response.read()) - return json_response["data"]["attributes"]["permissions"] + return json.loads(response.read())["data"]["attributes"]["permissions"] + def parse_resource_types(raw): - """Parse the InstrumentationResourceTypes ResourceProperty into a clean list of UDM strings. - Accepts a CFN comma-delimited string or an already-split list (CFN serializes - CommaDelimitedList parameters as JSON arrays when forwarded to a custom resource).""" + # CFN forwards CommaDelimitedList parameters as JSON arrays to custom resources, + # while String parameters arrive as comma-delimited strings; accept both. if raw is None: return [] items = raw.split(",") if isinstance(raw, str) else list(raw) return [t.strip() for t in items if t and t.strip()] + def build_instrumentation_permissions_url(datadog_site, resource_types): - site = datadog_site or "datadoghq.com" query = urllib.parse.urlencode( [("resource_type", t) for t in resource_types] + [("chunked", "true")] ) - return f"https://api.{site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" + return f"https://api.{datadog_site}{INSTRUMENTATION_PERMISSIONS_API_PATH}?{query}" + def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): - """Detach a managed policy from a role and delete it. Ignores missing entities.""" + # Detach + delete are both no-ops if the entity is already gone, so callers can blindly + # iterate the policy-name space without first checking what actually exists. try: iam_client.detach_role_policy(RoleName=role_name, PolicyArn=policy_arn) except iam_client.exceptions.NoSuchEntityException: @@ -194,80 +196,62 @@ Resources: except Exception as e: LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") + def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - # Remove role-scoped chunked policies for both prefixes for prefix in (BASE_POLICY_PREFIX_RESOURCE_COLLECTION, BASE_POLICY_PREFIX_INSTRUMENTATION): for i in range(max_policies): policy_name = f"{prefix}-{role_name}-{i+1}" policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) - # Remove standard permissions try: - iam_client.delete_role_policy( - RoleName=role_name, - PolicyName=POLICY_NAME_STANDARD - ) + iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) except iam_client.exceptions.NoSuchEntityException: pass except Exception as e: LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": permissions, - "Resource": "*" - } - ] + "Statement": [{"Effect": "Allow", "Action": permissions, "Resource": "*"}], } - iam_client.put_role_policy( RoleName=role_name, PolicyName=POLICY_NAME_STANDARD, - PolicyDocument=json.dumps(policy_document, separators=(',', ':')) + PolicyDocument=json.dumps(policy_document, separators=(',', ':')), + ) + + + def _create_and_attach_policy(iam_client, role_name, policy_name, actions): + policy_json = json.dumps( + { + "Version": "2012-10-17", + "Statement": [{"Effect": "Allow", "Action": actions, "Resource": "*"}], + }, + separators=(',', ':'), ) + LOGGER.info(f"Creating policy {policy_name} with {len(actions)} permissions ({len(policy_json)} characters)") + policy = iam_client.create_policy(PolicyName=policy_name, PolicyDocument=policy_json) + iam_client.attach_role_policy(RoleName=role_name, PolicyArn=policy['Policy']['Arn']) + def attach_resource_collection_permissions(iam_client, role_name): permission_chunks = fetch_permissions_from_datadog(RESOURCE_COLLECTION_PERMISSIONS_API_URL) - - # Create and attach new policies for i, chunk in enumerate(permission_chunks): - policy_name = f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}" - policy_document = { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": chunk, - "Resource": "*" - } - ] - } - policy_json = json.dumps(policy_document, separators=(',', ':')) - policy_size = len(policy_json) - LOGGER.info(f"Creating policy {policy_name} with {len(chunk)} permissions ({policy_size} characters)") - policy = iam_client.create_policy( - PolicyName=policy_name, - PolicyDocument=policy_json + _create_and_attach_policy( + iam_client, + role_name, + f"{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-{role_name}-{i+1}", + chunk, ) - # Attach policy to role - iam_client.attach_role_policy( - RoleName=role_name, - PolicyArn=policy['Policy']['Arn'] - ) def attach_instrumentation_permissions(iam_client, role_name, datadog_site, resource_types): - """Best-effort attach IAM permissions required to instrument the given UDM resource types. - - Never raises: instrumentation permissions are additive convenience on top of the integration, - so any failure here surfaces a warning but does not block the install. - """ + # Best-effort: instrumentation permissions are additive convenience on top of the + # integration, so any failure here is logged and swallowed rather than blocking install. if not resource_types: return @@ -282,39 +266,19 @@ Resources: ) return - attached, failed = 0, 0 for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" - policy_document = { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": chunk, - "Resource": "*" - } - ] - } - policy_json = json.dumps(policy_document, separators=(',', ':')) - LOGGER.info(f"Creating policy {policy_name} with {len(chunk)} permissions ({len(policy_json)} characters)") try: - policy = iam_client.create_policy( - PolicyName=policy_name, - PolicyDocument=policy_json - ) - iam_client.attach_role_policy( - RoleName=role_name, - PolicyArn=policy['Policy']['Arn'] - ) - attached += 1 + _create_and_attach_policy(iam_client, role_name, policy_name, chunk) except Exception as e: - failed += 1 LOGGER.warning(f"Failed to create/attach instrumentation policy {policy_name}: {e}. Continuing.") - LOGGER.info(f"Instrumentation permissions: {attached} attached, {failed} failed out of {len(permission_chunks)} chunks") - def handle_delete(event, context, role_name, account_id, partition): - """Handle stack deletion.""" + def handle_delete(event, context): + props = event['ResourceProperties'] + role_name = props['DatadogIntegrationRole'] + account_id = props['AccountId'] + partition = props.get('Partition', 'aws') iam_client = boto3.client('iam') try: cleanup_existing_policies(iam_client, role_name, account_id, partition) @@ -323,8 +287,16 @@ Resources: LOGGER.error(f"Error deleting policy: {str(e)}") cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) - def handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy, datadog_site, instrumentation_resource_types): - """Handle stack creation or update.""" + + def handle_create_update(event, context): + props = event['ResourceProperties'] + role_name = props['DatadogIntegrationRole'] + account_id = props['AccountId'] + partition = props.get('Partition', 'aws') + should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' + datadog_site = props.get('DatadogSite') or 'datadoghq.com' + instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) + try: iam_client = boto3.client('iam') cleanup_existing_policies(iam_client, role_name, account_id, partition) @@ -337,20 +309,13 @@ Resources: LOGGER.error(f"Error creating/attaching policy: {str(e)}") cfnresponse.send(event, context, cfnresponse.FAILED, responseData={"Message": str(e)}) + def handler(event, context): LOGGER.info("Event received: %s", json.dumps(event)) - - role_name = event['ResourceProperties']['DatadogIntegrationRole'] - account_id = event['ResourceProperties']['AccountId'] - partition = event['ResourceProperties'].get('Partition', 'aws') - should_install_security_audit_policy = str(event['ResourceProperties']['ResourceCollectionPermissions']).lower() == 'true' - datadog_site = event['ResourceProperties'].get('DatadogSite', 'datadoghq.com') - instrumentation_resource_types = parse_resource_types(event['ResourceProperties'].get('InstrumentationResourceTypes')) - if event['RequestType'] == 'Delete': - handle_delete(event, context, role_name, account_id, partition) + handle_delete(event, context) else: - handle_create_update(event, context, role_name, account_id, partition, should_install_security_audit_policy, datadog_site, instrumentation_resource_types) + handle_create_update(event, context) DatadogAttachIntegrationPermissionsFunctionTrigger: Type: Custom::DatadogAttachIntegrationPermissionsFunctionTrigger DependsOn: DatadogIntegrationRole From c45020e260798a3e08e61df814a83445c1023f9c Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Wed, 13 May 2026 15:40:57 -0400 Subject: [PATCH 3/4] fix(aws_quickstart): preserve instrumentation policies on fetch failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a stack Update, the previous shape of `handle_create_update` deleted existing instrumentation managed policies via `cleanup_existing_policies` before attempting to refetch the latest set from Datadog. If the fetch returned an error (transient API outage, network blip), the best-effort path early-returned with the policies gone — so an unrelated stack update could silently revoke the Agent's instrumentation permissions on the integration role. Fetch first, then clean up + reattach in a single step. If the fetch fails, leave any previously-attached policies in place. An empty `InstrumentationResourceTypes` list still cleans them up (user explicitly opting out), and `handle_delete` still wipes them on stack deletion. Surfaced by codex review on #306. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../attach_integration_permissions.py | 28 +++++-- .../attach_integration_permissions_test.py | 76 +++++++++++-------- aws_quickstart/datadog_integration_role.yaml | 28 +++++-- 3 files changed, 85 insertions(+), 47 deletions(-) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index fbc7b357..865aca8f 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -75,12 +75,15 @@ def _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name): LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") +def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, prefix, max_policies=10): + for i in range(max_policies): + policy_name = f"{prefix}-{role_name}-{i+1}" + policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" + _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) + + def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - for prefix in (BASE_POLICY_PREFIX_RESOURCE_COLLECTION, BASE_POLICY_PREFIX_INSTRUMENTATION): - for i in range(max_policies): - policy_name = f"{prefix}-{role_name}-{i+1}" - policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" - _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies) try: iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) @@ -90,6 +93,10 @@ def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_ LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") +def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) + + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { @@ -127,10 +134,13 @@ def attach_resource_collection_permissions(iam_client, role_name): ) -def attach_instrumentation_permissions(iam_client, role_name, datadog_site, resource_types): +def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types): # Best-effort: instrumentation permissions are additive convenience on top of the # integration, so any failure here is logged and swallowed rather than blocking install. + # Fetch before cleanup so that a transient API failure on an Update leaves the + # previously-attached policies in place instead of silently revoking them. if not resource_types: + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) return try: @@ -140,10 +150,11 @@ def attach_instrumentation_permissions(iam_client, role_name, datadog_site, reso except Exception as e: LOGGER.warning( f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " - "Integration install will continue without these permissions." + "Leaving any previously-attached instrumentation policies in place." ) return + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" try: @@ -160,6 +171,7 @@ def handle_delete(event, context): iam_client = boto3.client('iam') try: cleanup_existing_policies(iam_client, role_name, account_id, partition) + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: LOGGER.error(f"Error deleting policy: {str(e)}") @@ -181,7 +193,7 @@ def handle_create_update(event, context): attach_standard_permissions(iam_client, role_name) if should_install_security_audit_policy: attach_resource_collection_permissions(iam_client, role_name) - attach_instrumentation_permissions(iam_client, role_name, datadog_site, instrumentation_resource_types) + attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, instrumentation_resource_types) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: LOGGER.error(f"Error creating/attaching policy: {str(e)}") diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index 6c0a24db..d5d694db 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -18,6 +18,7 @@ build_instrumentation_permissions_url, attach_instrumentation_permissions, cleanup_existing_policies, + cleanup_instrumentation_policies, BASE_POLICY_PREFIX_INSTRUMENTATION, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, ) @@ -78,20 +79,33 @@ def test_repeated_resource_type_and_chunked(self): class TestAttachInstrumentationPermissions(unittest.TestCase): def setUp(self): self.iam = MagicMock() + self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) + self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) self.iam.create_policy.return_value = {"Policy": {"Arn": "arn:aws:iam::123:policy/X"}} + self.iam.detach_role_policy.side_effect = self.iam.exceptions.NoSuchEntityException + self.iam.delete_policy.side_effect = self.iam.exceptions.NoSuchEntityException self.role_name = "DatadogIntegrationRole" + self.account_id = "123456789012" + self.partition = "aws" self.site = "datadoghq.com" + def _attach(self, resource_types): + attach_instrumentation_permissions( + self.iam, self.role_name, self.account_id, self.partition, self.site, resource_types, + ) + def _mock_chunks_response(self, chunks): body = json.dumps({"data": {"attributes": {"permissions": chunks}}}).encode() resp = Mock() resp.read.return_value = body return resp - def test_empty_resource_types_is_noop(self): - attach_instrumentation_permissions(self.iam, self.role_name, self.site, []) + def test_empty_resource_types_cleans_up_previously_attached_policies(self): + # Toggling off instrumentation should remove the previously-attached policies. + self._attach([]) self.iam.create_policy.assert_not_called() self.iam.attach_role_policy.assert_not_called() + self.assertGreater(self.iam.detach_role_policy.call_count, 0) @patch("attach_integration_permissions.urllib.request.urlopen") def test_happy_path_attaches_each_chunk(self, mock_urlopen): @@ -99,9 +113,7 @@ def test_happy_path_attaches_each_chunk(self, mock_urlopen): [["ec2:Describe*"], ["ssm:SendCommand", "eks:DescribeCluster"]] ) - attach_instrumentation_permissions( - self.iam, self.role_name, self.site, ["aws:ec2:instance", "aws:eks:cluster"] - ) + self._attach(["aws:ec2:instance", "aws:eks:cluster"]) self.assertEqual(self.iam.create_policy.call_count, 2) self.assertEqual(self.iam.attach_role_policy.call_count, 2) @@ -119,17 +131,20 @@ def test_happy_path_attaches_each_chunk(self, mock_urlopen): self.assertEqual(sent_request.headers.get("Dd-aws-api-call-source"), "cfn-quickstart") @patch("attach_integration_permissions.urllib.request.urlopen") - def test_fetch_failure_is_swallowed(self, mock_urlopen): + def test_fetch_failure_preserves_existing_policies(self, mock_urlopen): + # Regression: a transient API failure on Update must not silently revoke the + # previously-attached instrumentation policies. The function must neither + # call detach_role_policy / delete_policy nor raise. mock_urlopen.side_effect = HTTPError( "u", 500, "boom", {}, BytesIO(b'{"errors":["upstream down"]}') ) - # Contract: attach_instrumentation_permissions must never raise. - attach_instrumentation_permissions( - self.iam, self.role_name, self.site, ["aws:ec2:instance"] - ) + self._attach(["aws:ec2:instance"]) + self.iam.create_policy.assert_not_called() self.iam.attach_role_policy.assert_not_called() + self.iam.detach_role_policy.assert_not_called() + self.iam.delete_policy.assert_not_called() @patch("attach_integration_permissions.urllib.request.urlopen") def test_per_chunk_failure_is_swallowed_and_others_continue(self, mock_urlopen): @@ -142,34 +157,33 @@ def test_per_chunk_failure_is_swallowed_and_others_continue(self, mock_urlopen): {"Policy": {"Arn": "arn:aws:iam::123:policy/C"}}, ] - attach_instrumentation_permissions( - self.iam, self.role_name, self.site, ["aws:ec2:instance"] - ) + self._attach(["aws:ec2:instance"]) self.assertEqual(self.iam.create_policy.call_count, 3) self.assertEqual(self.iam.attach_role_policy.call_count, 2) -class TestCleanupAlsoRemovesInstrumentationPolicies(unittest.TestCase): - def test_cleanup_iterates_both_prefixes(self): - iam = MagicMock() - iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) - iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) - iam.detach_role_policy.side_effect = iam.exceptions.NoSuchEntityException - iam.delete_policy.side_effect = iam.exceptions.NoSuchEntityException +class TestCleanup(unittest.TestCase): + def setUp(self): + self.iam = MagicMock() + self.iam.exceptions.NoSuchEntityException = type("NSE", (Exception,), {}) + self.iam.exceptions.DeleteConflictException = type("DCE", (Exception,), {}) + self.iam.detach_role_policy.side_effect = self.iam.exceptions.NoSuchEntityException + self.iam.delete_policy.side_effect = self.iam.exceptions.NoSuchEntityException + + def test_cleanup_existing_does_not_touch_instrumentation(self): + cleanup_existing_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=2) + + detached = [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION not in arn for arn in detached)) + self.assertTrue(any(BASE_POLICY_PREFIX_RESOURCE_COLLECTION in arn for arn in detached)) - cleanup_existing_policies(iam, "MyRole", "123456789012", "aws", max_policies=2) + def test_cleanup_instrumentation_targets_only_instrumentation_prefix(self): + cleanup_instrumentation_policies(self.iam, "MyRole", "123456789012", "aws", max_policies=2) - detached = [c.kwargs["PolicyArn"] for c in iam.detach_role_policy.call_args_list] - self.assertEqual(len(detached), 4) # 2 prefixes × max_policies=2 - self.assertIn( - f"arn:aws:iam::123456789012:policy/{BASE_POLICY_PREFIX_RESOURCE_COLLECTION}-MyRole-1", - detached, - ) - self.assertIn( - f"arn:aws:iam::123456789012:policy/{BASE_POLICY_PREFIX_INSTRUMENTATION}-MyRole-1", - detached, - ) + detached = [c.kwargs["PolicyArn"] for c in self.iam.detach_role_policy.call_args_list] + self.assertEqual(len(detached), 2) + self.assertTrue(all(BASE_POLICY_PREFIX_INSTRUMENTATION in arn for arn in detached)) if __name__ == "__main__": diff --git a/aws_quickstart/datadog_integration_role.yaml b/aws_quickstart/datadog_integration_role.yaml index 1c886169..4e9d8682 100644 --- a/aws_quickstart/datadog_integration_role.yaml +++ b/aws_quickstart/datadog_integration_role.yaml @@ -197,12 +197,15 @@ Resources: LOGGER.error(f"Error deleting policy {policy_name}: {str(e)}") + def _cleanup_chunked_policies(iam_client, role_name, account_id, partition, prefix, max_policies=10): + for i in range(max_policies): + policy_name = f"{prefix}-{role_name}-{i+1}" + policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" + _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) + + def cleanup_existing_policies(iam_client, role_name, account_id, partition, max_policies=10): - for prefix in (BASE_POLICY_PREFIX_RESOURCE_COLLECTION, BASE_POLICY_PREFIX_INSTRUMENTATION): - for i in range(max_policies): - policy_name = f"{prefix}-{role_name}-{i+1}" - policy_arn = f"arn:{partition}:iam::{account_id}:policy/{policy_name}" - _detach_and_delete_policy(iam_client, role_name, policy_arn, policy_name) + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_RESOURCE_COLLECTION, max_policies) try: iam_client.delete_role_policy(RoleName=role_name, PolicyName=POLICY_NAME_STANDARD) @@ -212,6 +215,10 @@ Resources: LOGGER.error(f"Error deleting inline policy {POLICY_NAME_STANDARD}: {str(e)}") + def cleanup_instrumentation_policies(iam_client, role_name, account_id, partition, max_policies=10): + _cleanup_chunked_policies(iam_client, role_name, account_id, partition, BASE_POLICY_PREFIX_INSTRUMENTATION, max_policies) + + def attach_standard_permissions(iam_client, role_name): permissions = fetch_permissions_from_datadog(STANDARD_PERMISSIONS_API_URL) policy_document = { @@ -249,10 +256,13 @@ Resources: ) - def attach_instrumentation_permissions(iam_client, role_name, datadog_site, resource_types): + def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types): # Best-effort: instrumentation permissions are additive convenience on top of the # integration, so any failure here is logged and swallowed rather than blocking install. + # Fetch before cleanup so that a transient API failure on an Update leaves the + # previously-attached policies in place instead of silently revoking them. if not resource_types: + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) return try: @@ -262,10 +272,11 @@ Resources: except Exception as e: LOGGER.warning( f"Failed to fetch instrumentation permissions for {resource_types}: {e}. " - "Integration install will continue without these permissions." + "Leaving any previously-attached instrumentation policies in place." ) return + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) for i, chunk in enumerate(permission_chunks): policy_name = f"{BASE_POLICY_PREFIX_INSTRUMENTATION}-{role_name}-{i+1}" try: @@ -282,6 +293,7 @@ Resources: iam_client = boto3.client('iam') try: cleanup_existing_policies(iam_client, role_name, account_id, partition) + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: LOGGER.error(f"Error deleting policy: {str(e)}") @@ -303,7 +315,7 @@ Resources: attach_standard_permissions(iam_client, role_name) if should_install_security_audit_policy: attach_resource_collection_permissions(iam_client, role_name) - attach_instrumentation_permissions(iam_client, role_name, datadog_site, instrumentation_resource_types) + attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, instrumentation_resource_types) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: LOGGER.error(f"Error creating/attaching policy: {str(e)}") From 8890d14e554cde31d9cdfcbed0448c6d794155b7 Mon Sep 17 00:00:00 2001 From: Grant Palmer Date: Thu, 14 May 2026 15:43:34 -0400 Subject: [PATCH 4/4] refactor(aws_quickstart): gate instrumentation cleanup on previous value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `attach_instrumentation_permissions` now also takes the previous value of `InstrumentationResourceTypes` (sourced from `event['OldResourceProperties']` inside `handle_create_update`) and only runs the IAM cleanup when the parameter actually transitioned from non-empty to empty between Updates. This skips ~20 no-op IAM delete attempts on first-time stack Creates that never opted in to instrumentation, while still cleaning up policies on toggle-off Updates and on stack Delete. Per @raymondeah's review comment on #306 — the previous shape ran the deletion path on every Create/Update regardless of whether the parameter had ever been non-empty. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../attach_integration_permissions.py | 15 +++++++++++--- .../attach_integration_permissions_test.py | 20 ++++++++++++++----- aws_quickstart/datadog_integration_role.yaml | 15 +++++++++++--- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/aws_quickstart/attach_integration_permissions.py b/aws_quickstart/attach_integration_permissions.py index 865aca8f..f9ea665c 100644 --- a/aws_quickstart/attach_integration_permissions.py +++ b/aws_quickstart/attach_integration_permissions.py @@ -134,13 +134,16 @@ def attach_resource_collection_permissions(iam_client, role_name): ) -def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types): +def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types): # Best-effort: instrumentation permissions are additive convenience on top of the # integration, so any failure here is logged and swallowed rather than blocking install. # Fetch before cleanup so that a transient API failure on an Update leaves the # previously-attached policies in place instead of silently revoking them. if not resource_types: - cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + # Only clean up if the previous Update had instrumentation enabled — avoids running + # delete calls on stacks that never opted in to instrumentation in the first place. + if previous_resource_types: + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) return try: @@ -186,6 +189,9 @@ def handle_create_update(event, context): should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' datadog_site = props.get('DatadogSite') or 'datadoghq.com' instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) + previous_instrumentation_resource_types = parse_resource_types( + event.get('OldResourceProperties', {}).get('InstrumentationResourceTypes') + ) try: iam_client = boto3.client('iam') @@ -193,7 +199,10 @@ def handle_create_update(event, context): attach_standard_permissions(iam_client, role_name) if should_install_security_audit_policy: attach_resource_collection_permissions(iam_client, role_name) - attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, instrumentation_resource_types) + attach_instrumentation_permissions( + iam_client, role_name, account_id, partition, + datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, + ) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: LOGGER.error(f"Error creating/attaching policy: {str(e)}") diff --git a/aws_quickstart/attach_integration_permissions_test.py b/aws_quickstart/attach_integration_permissions_test.py index d5d694db..aca49602 100644 --- a/aws_quickstart/attach_integration_permissions_test.py +++ b/aws_quickstart/attach_integration_permissions_test.py @@ -89,9 +89,10 @@ def setUp(self): self.partition = "aws" self.site = "datadoghq.com" - def _attach(self, resource_types): + def _attach(self, resource_types, previous_resource_types=()): attach_instrumentation_permissions( - self.iam, self.role_name, self.account_id, self.partition, self.site, resource_types, + self.iam, self.role_name, self.account_id, self.partition, self.site, + resource_types, previous_resource_types, ) def _mock_chunks_response(self, chunks): @@ -100,9 +101,18 @@ def _mock_chunks_response(self, chunks): resp.read.return_value = body return resp - def test_empty_resource_types_cleans_up_previously_attached_policies(self): - # Toggling off instrumentation should remove the previously-attached policies. - self._attach([]) + def test_empty_resource_types_no_op_when_previously_empty(self): + # Stack Create (or Update with no change) and no instrumentation requested: + # don't touch IAM at all — there's nothing to clean up. + self._attach([], previous_resource_types=[]) + self.iam.create_policy.assert_not_called() + self.iam.attach_role_policy.assert_not_called() + self.iam.detach_role_policy.assert_not_called() + self.iam.delete_policy.assert_not_called() + + def test_empty_resource_types_cleans_up_when_previously_set(self): + # Toggling instrumentation off on an Update should remove the previously-attached policies. + self._attach([], previous_resource_types=["aws:ec2:instance"]) self.iam.create_policy.assert_not_called() self.iam.attach_role_policy.assert_not_called() self.assertGreater(self.iam.detach_role_policy.call_count, 0) diff --git a/aws_quickstart/datadog_integration_role.yaml b/aws_quickstart/datadog_integration_role.yaml index 4e9d8682..8ebd27b4 100644 --- a/aws_quickstart/datadog_integration_role.yaml +++ b/aws_quickstart/datadog_integration_role.yaml @@ -256,13 +256,16 @@ Resources: ) - def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types): + def attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, resource_types, previous_resource_types): # Best-effort: instrumentation permissions are additive convenience on top of the # integration, so any failure here is logged and swallowed rather than blocking install. # Fetch before cleanup so that a transient API failure on an Update leaves the # previously-attached policies in place instead of silently revoking them. if not resource_types: - cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) + # Only clean up if the previous Update had instrumentation enabled — avoids running + # delete calls on stacks that never opted in to instrumentation in the first place. + if previous_resource_types: + cleanup_instrumentation_policies(iam_client, role_name, account_id, partition) return try: @@ -308,6 +311,9 @@ Resources: should_install_security_audit_policy = str(props['ResourceCollectionPermissions']).lower() == 'true' datadog_site = props.get('DatadogSite') or 'datadoghq.com' instrumentation_resource_types = parse_resource_types(props.get('InstrumentationResourceTypes')) + previous_instrumentation_resource_types = parse_resource_types( + event.get('OldResourceProperties', {}).get('InstrumentationResourceTypes') + ) try: iam_client = boto3.client('iam') @@ -315,7 +321,10 @@ Resources: attach_standard_permissions(iam_client, role_name) if should_install_security_audit_policy: attach_resource_collection_permissions(iam_client, role_name) - attach_instrumentation_permissions(iam_client, role_name, account_id, partition, datadog_site, instrumentation_resource_types) + attach_instrumentation_permissions( + iam_client, role_name, account_id, partition, + datadog_site, instrumentation_resource_types, previous_instrumentation_resource_types, + ) cfnresponse.send(event, context, cfnresponse.SUCCESS, responseData={}) except Exception as e: LOGGER.error(f"Error creating/attaching policy: {str(e)}")