From c12d6feb4db30ecf22078cadbb279e6856be8ce9 Mon Sep 17 00:00:00 2001 From: Joshua Garde Date: Thu, 1 May 2025 13:23:31 -0700 Subject: [PATCH 01/11] Initial terraform module refactor --- terraform/main.tf | 19 ++++- .../combine/confluence-combine_data.tf | 71 +++++++++++-------- terraform/modules/combine/main.tf | 16 ----- terraform/modules/combine/variables.tf | 15 ++++ 4 files changed, 75 insertions(+), 46 deletions(-) diff --git a/terraform/main.tf b/terraform/main.tf index 6ea4533..f482ad5 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -19,6 +19,18 @@ provider "aws" { data "aws_caller_identity" "current" {} +data "aws_efs_file_system" "input" { + creation_token = "${var.prefix}-input" +} + +data "aws_iam_role" "job" { + name = "${var.prefix}-batch-job-role" +} + +data "aws_iam_role" "exec" { + name = "${var.prefix}-ecs-exe-task-role" +} + locals { account_id = sensitive(data.aws_caller_identity.current.account_id) default_tags = length(var.default_tags) == 0 ? { @@ -35,4 +47,9 @@ module "confluence-combine-data" { aws_region = var.aws_region environment = var.environment prefix = var.prefix -} \ No newline at end of file + iam_execution_role_arn = data.aws_iam_role.exec.arn + iam_job_role_arn = data.aws_iam_role.job.arn + efs_file_system_ids = { + input = data.aws_efs_file_system.input.file_system_id + } +} diff --git a/terraform/modules/combine/confluence-combine_data.tf b/terraform/modules/combine/confluence-combine_data.tf index fc5712b..e975446 100644 --- a/terraform/modules/combine/confluence-combine_data.tf +++ b/terraform/modules/combine/confluence-combine_data.tf @@ -2,40 +2,53 @@ resource "aws_batch_job_definition" "generate_batch_jd_combine_data" { name = "${var.prefix}-combine-data" type = "container" - container_properties = < Date: Thu, 1 May 2025 13:28:22 -0700 Subject: [PATCH 02/11] Update variables --- terraform/variables.tf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/terraform/variables.tf b/terraform/variables.tf index 30c4a5d..c77d847 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -29,3 +29,13 @@ variable "prefix" { type = string description = "Prefix to add to all AWS resources as a unique identifier" } + +variable "iam_job_role_arn" { + type = string + description = "The IAM ARN of the job role" +} + +variable "iam_execution_role_arn" { + type = string + description = "The IAM ARN of the execution role" +} From 6cc8fc946bbe3e3ad2a0d8d24504d8a4bf37890b Mon Sep 17 00:00:00 2001 From: Joshua Garde Date: Mon, 5 May 2025 11:24:53 -0700 Subject: [PATCH 03/11] Remove unused variables --- terraform/variables.tf | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/terraform/variables.tf b/terraform/variables.tf index c77d847..30c4a5d 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -29,13 +29,3 @@ variable "prefix" { type = string description = "Prefix to add to all AWS resources as a unique identifier" } - -variable "iam_job_role_arn" { - type = string - description = "The IAM ARN of the job role" -} - -variable "iam_execution_role_arn" { - type = string - description = "The IAM ARN of the execution role" -} From 1ede630c001b74802f7082efe0f3049e537a0147 Mon Sep 17 00:00:00 2001 From: Joshua Garde Date: Mon, 19 May 2025 13:09:39 -0700 Subject: [PATCH 04/11] Cleanup --- terraform/main.tf | 18 +++---- .../combine/confluence-combine_data.tf | 47 ++++++++----------- terraform/modules/combine/main.tf | 7 +-- terraform/modules/combine/variables.tf | 29 +++++++----- 4 files changed, 47 insertions(+), 54 deletions(-) diff --git a/terraform/main.tf b/terraform/main.tf index f482ad5..6efd5f1 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -14,7 +14,7 @@ provider "aws" { default_tags { tags = local.default_tags } - region = var.aws_region + region = var.aws_region } data "aws_caller_identity" "current" {} @@ -41,15 +41,15 @@ locals { } module "confluence-combine-data" { - source = "./modules/combine" - app_name = var.app_name - app_version = var.app_version - aws_region = var.aws_region - environment = var.environment - prefix = var.prefix - iam_execution_role_arn = data.aws_iam_role.exec.arn - iam_job_role_arn = data.aws_iam_role.job.arn + source = "./modules/combine" + app_name = var.app_name + app_version = var.app_version + aws_region = var.aws_region efs_file_system_ids = { input = data.aws_efs_file_system.input.file_system_id } + environment = var.environment + iam_execution_role_arn = data.aws_iam_role.exec.arn + iam_job_role_arn = data.aws_iam_role.job.arn + prefix = var.prefix } diff --git a/terraform/modules/combine/confluence-combine_data.tf b/terraform/modules/combine/confluence-combine_data.tf index e975446..6c25ae7 100644 --- a/terraform/modules/combine/confluence-combine_data.tf +++ b/terraform/modules/combine/confluence-combine_data.tf @@ -7,8 +7,8 @@ resource "aws_batch_job_definition" "generate_batch_jd_combine_data" { tags = { "job_definition": "${var.prefix}-combine-data" } container_properties = jsonencode({ - image = "${local.account_id}.dkr.ecr.us-west-2.amazonaws.com/${var.prefix}-combine-data" - executionRoleArn = var.iam_job_role_arn + image = "${local.account_id}.dkr.ecr.us-west-2.amazonaws.com/${var.prefix}-combine-data:${var.image_tag}" + executionRoleArn = var.iam_execution_role_arn jobRoleArn = var.iam_job_role_arn fargatePlatformConfiguration = { platformVersion = "LATEST" @@ -19,32 +19,25 @@ resource "aws_batch_job_definition" "generate_batch_jd_combine_data" { awslogs-group = aws_cloudwatch_log_group.cw_log_group.name } } - resourceRequirements = [ - { - type = "MEMORY" - value = "2048" - }, - { - type = "VCPU", - value = "1" + resourceRequirements = [{ + type = "MEMORY" + value = "2048" + }, { + type = "VCPU", + value = "1" + }] + mountPoints = [{ + sourceVolume = "input", + containerPath = "/data" + readOnly = false + }] + volumes = [{ + name = "input" + efsVolumeConfiguration = { + fileSystemId = var.efs_file_system_ids["input"] + rootDirectory = "/" } - ] - mountPoints = [ - { - sourceVolume = "input", - containerPath = "/data" - readOnly = false - } - ] - volumes = [ - { - name = "input" - efsVolumeConfiguration = { - fileSystemId = var.efs_file_system_ids["input"] - rootDirectory = "/" - } - } - ] + }] }) } diff --git a/terraform/modules/combine/main.tf b/terraform/modules/combine/main.tf index 08b9f14..b81ede2 100644 --- a/terraform/modules/combine/main.tf +++ b/terraform/modules/combine/main.tf @@ -4,9 +4,4 @@ data "aws_caller_identity" "current" {} # Local variables locals { account_id = data.aws_caller_identity.current.account_id - default_tags = length(var.default_tags) == 0 ? { - application : var.app_name, - environment : var.environment, - version : var.app_version - } : var.default_tags -} \ No newline at end of file +} diff --git a/terraform/modules/combine/variables.tf b/terraform/modules/combine/variables.tf index 699b768..20ccceb 100644 --- a/terraform/modules/combine/variables.tf +++ b/terraform/modules/combine/variables.tf @@ -8,7 +8,6 @@ variable "app_version" { type = string description = "The application version number" } - variable "aws_region" { type = string description = "AWS region to deploy to" @@ -16,8 +15,13 @@ variable "aws_region" { } variable "default_tags" { - type = map(string) - default = {} + type = map(string) + default = {} +} + +variable "efs_file_system_ids" { + type = map(string) + description = "Map of EFS file system ids to pass to the container definition" } variable "environment" { @@ -25,14 +29,9 @@ variable "environment" { description = "The environment in which to deploy to" } -variable "prefix" { +variable "iam_execution_role_arn" { type = string - description = "Prefix to add to all AWS resources as a unique identifier" -} - -variable "efs_file_system_ids" { - type = map(string) - description = "Map of EFS file system ids to pass to the container definition" + description = "The IAM ARN of the execution role" } variable "iam_job_role_arn" { @@ -40,7 +39,13 @@ variable "iam_job_role_arn" { description = "The IAM ARN of the job role" } -variable "iam_execution_role_arn" { +variable "image_tag" { type = string - description = "The IAM ARN of the execution role" + description = "The container image tag to utilize" + default = "latest" +} + +variable "prefix" { + type = string + description = "Prefix to add to all AWS resources as a unique identifier" } From 5ed6e87aa7c4ad72845b560e8319f80866eaf0c0 Mon Sep 17 00:00:00 2001 From: Joshua Garde Date: Mon, 19 May 2025 13:22:21 -0700 Subject: [PATCH 05/11] Lint --- terraform/modules/combine/confluence-combine_data.tf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/terraform/modules/combine/confluence-combine_data.tf b/terraform/modules/combine/confluence-combine_data.tf index 6c25ae7..7bd5a7d 100644 --- a/terraform/modules/combine/confluence-combine_data.tf +++ b/terraform/modules/combine/confluence-combine_data.tf @@ -1,10 +1,10 @@ # Job Definition resource "aws_batch_job_definition" "generate_batch_jd_combine_data" { - name = "${var.prefix}-combine-data" - type = "container" + name = "${var.prefix}-combine-data" + type = "container" platform_capabilities = ["FARGATE"] - propagate_tags = true - tags = { "job_definition": "${var.prefix}-combine-data" } + propagate_tags = true + tags = { "job_definition": "${var.prefix}-combine-data" } container_properties = jsonencode({ image = "${local.account_id}.dkr.ecr.us-west-2.amazonaws.com/${var.prefix}-combine-data:${var.image_tag}" From 93e69d7db224c6a3d5ec72f2f7c22bf42847e438 Mon Sep 17 00:00:00 2001 From: Joshua Garde Date: Mon, 19 May 2025 16:57:16 -0700 Subject: [PATCH 06/11] default_tags padding --- terraform/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terraform/variables.tf b/terraform/variables.tf index 30c4a5d..c368ffb 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -16,8 +16,8 @@ variable "aws_region" { } variable "default_tags" { - type = map(string) - default = {} + type = map(string) + default = {} } variable "environment" { From a2e380ece652a537d3007dda14371874ff10aad5 Mon Sep 17 00:00:00 2001 From: Travis-Simmons Date: Thu, 8 May 2025 19:16:08 +0000 Subject: [PATCH 07/11] ssc combine, use the --ssc flag --- combine_data.py | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/combine_data.py b/combine_data.py index e980ce8..495e1be 100644 --- a/combine_data.py +++ b/combine_data.py @@ -82,6 +82,9 @@ def create_args(): "--expanded", help="Indicate we are looking for expanded set files.", action="store_true") + arg_parser.add_argument("--ssc", + help="Indicate we are looking for expanded set files.", + action="store_true") return arg_parser def get_logger(): @@ -106,7 +109,7 @@ def get_logger(): # Return logger return logger -def combine_continents(continents, data_dir, sword_version,expanded, logger): +def combine_continents(continents, data_dir, sword_version,expanded,ssc, logger): """Combine continent-level data in to global data. Parameters @@ -176,8 +179,44 @@ def combine_continents(continents, data_dir, sword_version,expanded, logger): json.dump(continent_json, jf, indent=2) logger.info(f"Written: {c_file}") + if ssc: + ssc_json_data = combine_ssc(data_dir=data_dir, logger = logger) + + with open(os.path.join(data_dir,"ssc_hls_list.json"), "w") as jf: + json.dump(ssc_json_data, jf, indent=2) + return reaches_json_list +def combine_ssc(data_dir:str, logger): + """Combine SSC input data into a single file.""" + ssc_input_data = glob.glob(os.path.join(data_dir, "ssc", "*.json")) + logger.info('found', len(ssc_input_data), 'ssc files...') + + + ssc_json_data = {} + count = 0 + for ssc_input in ssc_input_data: + logger.info('processing ssc') + with open(ssc_input) as jf: + data = json.load(jf) + logger.info(f'{ssc_input}') + for key in list(data.keys()): + short_key = key[:-10] + if short_key in list(ssc_json_data.keys()): + prev_len = len(ssc_json_data[short_key]) + ssc_json_data[short_key].extend(data[key]) + ssc_json_data[short_key] = list(set(ssc_json_data[short_key])) + after_len = len(ssc_json_data[short_key]) + if prev_len != after_len: + logger.info(f'{ssc_input} difference {short_key}') + + else: + ssc_json_data[short_key] = data[key] + + + # ssc_json_data.extend(data) + return ssc_json_data + def create_basin_data(data_dir, basin_id, base_reaches, sword_version): continent_codes = { '1': "af", '2': "eu", '3': "as", '4': "as", '5': "oc", '6': "sa", '7': "na", '8': "na", '9':"na" } @@ -260,7 +299,7 @@ def combine_data(): ] # Combine continent-level data - json_file_list = combine_continents(continents, args.datadir, args.sword_version, args.expanded, logger) + json_file_list = combine_continents(continents, args.datadir, args.sword_version, args.expanded, args.ssc, logger) # Upload JSON files to S3 if args.uploadbucket: From eb232bcb05cd19d8531722266ffaa3e1e4386139 Mon Sep 17 00:00:00 2001 From: Travis-Simmons Date: Thu, 8 May 2025 19:24:17 +0000 Subject: [PATCH 08/11] turning to list --- combine_data.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/combine_data.py b/combine_data.py index 495e1be..3e26f6c 100644 --- a/combine_data.py +++ b/combine_data.py @@ -177,8 +177,7 @@ def combine_continents(continents, data_dir, sword_version,expanded,ssc, logger) reaches_json_list.append(c_file) with open(c_file, 'w') as jf: json.dump(continent_json, jf, indent=2) - logger.info(f"Written: {c_file}") - + logger.info(f"Written: {c_file}") if ssc: ssc_json_data = combine_ssc(data_dir=data_dir, logger = logger) @@ -215,7 +214,9 @@ def combine_ssc(data_dir:str, logger): # ssc_json_data.extend(data) - return ssc_json_data + single_entry_list = [{k: v} for k, v in ssc_json_data.items()] + + return single_entry_list def create_basin_data(data_dir, basin_id, base_reaches, sword_version): continent_codes = { '1': "af", '2': "eu", '3': "as", '4': "as", '5': "oc", '6': "sa", '7': "na", '8': "na", '9':"na" } From 107ace456a77ac015ba599bdebeb312400328944 Mon Sep 17 00:00:00 2001 From: Travis-Simmons Date: Thu, 8 May 2025 19:34:06 +0000 Subject: [PATCH 09/11] removed logger --- combine_data.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/combine_data.py b/combine_data.py index 3e26f6c..6504cbb 100644 --- a/combine_data.py +++ b/combine_data.py @@ -189,16 +189,13 @@ def combine_continents(continents, data_dir, sword_version,expanded,ssc, logger) def combine_ssc(data_dir:str, logger): """Combine SSC input data into a single file.""" ssc_input_data = glob.glob(os.path.join(data_dir, "ssc", "*.json")) - logger.info('found', len(ssc_input_data), 'ssc files...') ssc_json_data = {} count = 0 for ssc_input in ssc_input_data: - logger.info('processing ssc') with open(ssc_input) as jf: data = json.load(jf) - logger.info(f'{ssc_input}') for key in list(data.keys()): short_key = key[:-10] if short_key in list(ssc_json_data.keys()): @@ -206,8 +203,6 @@ def combine_ssc(data_dir:str, logger): ssc_json_data[short_key].extend(data[key]) ssc_json_data[short_key] = list(set(ssc_json_data[short_key])) after_len = len(ssc_json_data[short_key]) - if prev_len != after_len: - logger.info(f'{ssc_input} difference {short_key}') else: ssc_json_data[short_key] = data[key] From ac8cb0ca0be3ad51722996c9d94e672631dda071 Mon Sep 17 00:00:00 2001 From: Nikki <17799906+nikki-t@users.noreply.github.com> Date: Thu, 8 May 2025 16:11:27 -0400 Subject: [PATCH 10/11] Make sure SSC HLS JSON is uploaded to S3 --- combine_data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/combine_data.py b/combine_data.py index 6504cbb..d4eb226 100644 --- a/combine_data.py +++ b/combine_data.py @@ -181,8 +181,11 @@ def combine_continents(continents, data_dir, sword_version,expanded,ssc, logger) if ssc: ssc_json_data = combine_ssc(data_dir=data_dir, logger = logger) - with open(os.path.join(data_dir,"ssc_hls_list.json"), "w") as jf: + ssc_json = os.path.join(data_dir,"ssc_hls_list.json") + with open(ssc_json, "w") as jf: json.dump(ssc_json_data, jf, indent=2) + reaches_json_list.append(ssc_json) + logger.info(f"Written: %s", ssc_json) return reaches_json_list From fde75f305e500b375d218b84fc4d91f9ecff2c89 Mon Sep 17 00:00:00 2001 From: Nikki Tebaldi <17799906+nikki-t@users.noreply.github.com> Date: Thu, 15 May 2025 16:30:56 -0400 Subject: [PATCH 11/11] Integrate SSC and lakeflow without any extra command line args (#11) --- combine_data.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/combine_data.py b/combine_data.py index d4eb226..ce15e2f 100644 --- a/combine_data.py +++ b/combine_data.py @@ -82,9 +82,6 @@ def create_args(): "--expanded", help="Indicate we are looking for expanded set files.", action="store_true") - arg_parser.add_argument("--ssc", - help="Indicate we are looking for expanded set files.", - action="store_true") return arg_parser def get_logger(): @@ -109,7 +106,7 @@ def get_logger(): # Return logger return logger -def combine_continents(continents, data_dir, sword_version,expanded,ssc, logger): +def combine_continents(continents, data_dir, sword_version, expanded, logger): """Combine continent-level data in to global data. Parameters @@ -171,21 +168,23 @@ def combine_continents(continents, data_dir, sword_version,expanded,ssc, logger) with open(outpath, 'w') as jf: json.dump(out_dict[a_key], jf, indent=2) logger.info(f"Written: {outpath}.") - + if not expanded: c_file = os.path.join(data_dir, 'continent.json') reaches_json_list.append(c_file) with open(c_file, 'w') as jf: json.dump(continent_json, jf, indent=2) logger.info(f"Written: {c_file}") - if ssc: - ssc_json_data = combine_ssc(data_dir=data_dir, logger = logger) + ssc_json_data = combine_ssc(data_dir=data_dir, logger = logger) + if len(ssc_json_data) > 0: ssc_json = os.path.join(data_dir,"ssc_hls_list.json") with open(ssc_json, "w") as jf: json.dump(ssc_json_data, jf, indent=2) reaches_json_list.append(ssc_json) logger.info(f"Written: %s", ssc_json) + else: + logger.info("No SSC JSON written.") return reaches_json_list @@ -193,7 +192,6 @@ def combine_ssc(data_dir:str, logger): """Combine SSC input data into a single file.""" ssc_input_data = glob.glob(os.path.join(data_dir, "ssc", "*.json")) - ssc_json_data = {} count = 0 for ssc_input in ssc_input_data: @@ -210,7 +208,6 @@ def combine_ssc(data_dir:str, logger): else: ssc_json_data[short_key] = data[key] - # ssc_json_data.extend(data) single_entry_list = [{k: v} for k, v in ssc_json_data.items()] @@ -298,7 +295,11 @@ def combine_data(): ] # Combine continent-level data - json_file_list = combine_continents(continents, args.datadir, args.sword_version, args.expanded, args.ssc, logger) + json_file_list = combine_continents(continents, args.datadir, args.sword_version, args.expanded, logger) + + # Check for lakeflow data + viable_lakes = pathlib.Path(args.datadir).joinpath("lakeflow", "viable", "viable_locations.csv") + if viable_lakes.exists(): json_file_list.append(str(viable_lakes)) # Upload JSON files to S3 if args.uploadbucket: