From 2dc517d705e956aad62c3ce222de1e9d5427e086 Mon Sep 17 00:00:00 2001 From: deadex-ng Date: Sat, 25 Mar 2023 17:14:45 +0200 Subject: [PATCH 1/5] Add raw data Signed-off-by: deadex-ng --- Data/Soil/.gitignore | 3 +++ Data/Soil/soilcarbon.ovr.dvc | 4 ++++ 2 files changed, 7 insertions(+) create mode 100644 Data/Soil/.gitignore create mode 100644 Data/Soil/soilcarbon.ovr.dvc diff --git a/Data/Soil/.gitignore b/Data/Soil/.gitignore new file mode 100644 index 000000000..2c40d69a4 --- /dev/null +++ b/Data/Soil/.gitignore @@ -0,0 +1,3 @@ +/prepared +/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif +/soilcarbon.ovr diff --git a/Data/Soil/soilcarbon.ovr.dvc b/Data/Soil/soilcarbon.ovr.dvc new file mode 100644 index 000000000..0ce8f3eec --- /dev/null +++ b/Data/Soil/soilcarbon.ovr.dvc @@ -0,0 +1,4 @@ +outs: +- md5: 33956e4dd24c1caa1dcea956e85e1f5f + size: 131473 + path: soilcarbon.ovr From f697ccd28a902e90432ea886efc3ab390dd61d7a Mon Sep 17 00:00:00 2001 From: deadex-ng Date: Sat, 25 Mar 2023 17:29:20 +0200 Subject: [PATCH 2/5] Configure remote storage Signed-off-by: deadex-ng --- .dvc/config | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .dvc/config diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 000000000..aefccf2ac --- /dev/null +++ b/.dvc/config @@ -0,0 +1,4 @@ +[core] + remote = storage +['remote "storage"'] + url = gdrive://1bozOw-FD0JkthpUQnkrJqxYtU2Uwj6E8 From d0ab6012082a8e826fd49c8934fc2d7ede5518c1 Mon Sep 17 00:00:00 2001 From: deadex-ng Date: Sun, 26 Mar 2023 17:21:53 +0200 Subject: [PATCH 3/5] Update dvc.yaml Signed-off-by: deadex-ng --- .dvc/.gitignore | 3 +++ Data/Soil/.gitignore | 2 +- dvc.lock | 39 +++++++++++++++++++++++++++++++++++++++ dvc.yaml | 20 ++++++++++++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 .dvc/.gitignore create mode 100644 dvc.lock create mode 100644 dvc.yaml diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 000000000..528f30c71 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/Data/Soil/.gitignore b/Data/Soil/.gitignore index 2c40d69a4..b5c873d4f 100644 --- a/Data/Soil/.gitignore +++ b/Data/Soil/.gitignore @@ -1,3 +1,3 @@ -/prepared +/processed /GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif /soilcarbon.ovr diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 000000000..3abed6cb6 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,39 @@ +schema: '2.0' +stages: + prepare: + cmd: + - "gdalwarp -s_srs EPSG:4326 -t_srs EPSG:4326 \\\n -to SRC_METHOD=NO_GEOTRANSFORM\ + \ -tr 0.5 0.5 \\\n -r near -te -180.0 -90.0 180.0 90.0 -te_srs EPSG:4326\ + \ \\\n -of GTiff Data/Soil/soilcarbon.ovr \\\n Data/Soil/prepared/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif" + deps: + - path: Data/Soil/soilcarbon.ovr + md5: 33956e4dd24c1caa1dcea956e85e1f5f + size: 131473 + outs: + - path: Data/Soil/prepared/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + md5: 96f78155b79a835f56d019586d4c1f14 + size: 1038282 + transform: + cmd: + - "gdalwarp -s_srs EPSG:4326 -t_srs EPSG:4326 \\\n -to SRC_METHOD=NO_GEOTRANSFORM\ + \ -tr 0.5 0.5 \\\n -r near -te -180.0 -90.0 180.0 90.0 -te_srs EPSG:4326\ + \ \\\n -of GTiff Data/Soil/soilcarbon.ovr \\\n Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif" + deps: + - path: Data/Soil/soilcarbon.ovr + md5: 33956e4dd24c1caa1dcea956e85e1f5f + size: 131473 + outs: + - path: Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + md5: 96f78155b79a835f56d019586d4c1f14 + size: 1038282 + Load: + cmd: + - cp Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif Data/Soil/processed/ + - rm -f Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + extract: + cmd: dvc pull + load: + cmd: + - mkdir Data/Soil/processed + - cp Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif Data/Soil/processed/ + - rm -f Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 000000000..a609c5a46 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,20 @@ +stages: + extract: + cmd: dvc pull + transform: + cmd: + - >- + gdalwarp -s_srs EPSG:4326 -t_srs EPSG:4326 \ + -to SRC_METHOD=NO_GEOTRANSFORM -tr 0.5 0.5 \ + -r near -te -180.0 -90.0 180.0 90.0 -te_srs EPSG:4326 \ + -of GTiff Data/Soil/soilcarbon.ovr \ + Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + deps: + - Data/Soil/soilcarbon.ovr + outs: + - Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + load: + cmd: + - mkdir Data/Soil/processed + - cp Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif Data/Soil/processed/ + - rm -f Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif \ No newline at end of file From ab7c7e711cf5fe208393f1fdc3d3832f3d2a8b8f Mon Sep 17 00:00:00 2001 From: deadex-ng Date: Wed, 29 Mar 2023 16:12:05 +0200 Subject: [PATCH 4/5] Update ETL process Signed-off-by: deadex-ng --- scripts/extract.py | 29 +++++++++++++++++++++++++++++ scripts/rename_files.py | 12 ++++++++++++ scripts/transform.py | 22 ++++++++++++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 scripts/extract.py create mode 100644 scripts/rename_files.py create mode 100644 scripts/transform.py diff --git a/scripts/extract.py b/scripts/extract.py new file mode 100644 index 000000000..1d0cc4db5 --- /dev/null +++ b/scripts/extract.py @@ -0,0 +1,29 @@ +import argparse +import requests +import zipfile +import io +import os + +URL = "https://databasin2-filestore.s3.amazonaws.com:443/a4cb6d367eae4e52a08902874f8bfedf/download/a4cb6d367eae4e52a08902874f8bfedf_1_zip_en.zip?Signature=O6QSoOR%2BisIRVE2mpxzkphTkhmw%3D&Expires=1680100356&AWSAccessKeyId=AKIAI4RK5BEPK3FCQPUQ" + +def ensure_url_is_accessible(URL): + r = requests.get(URL) + if not r.ok: + print("Download link expired. Please update download link") + else: + download_and_unzip_files(r.content) + +def download_and_unzip_files(content): + current_directory = os.getcwd() + target_parent_dir = os.path.join(current_directory, r'tmp_unzip_path') + if not os.path.exists(target_parent_dir): + os.mkdir(target_parent_dir) + try: + z = zipfile.ZipFile(io.BytesIO(content)) + z.extractall(target_parent_dir) + except Exception as e: + print(e) + else: + print("unzipped successfully") + +ensure_url_is_accessible(URL) \ No newline at end of file diff --git a/scripts/rename_files.py b/scripts/rename_files.py new file mode 100644 index 000000000..a95438ba1 --- /dev/null +++ b/scripts/rename_files.py @@ -0,0 +1,12 @@ +import os + +current_directory = os.getcwd() +target_parent_dir = os.path.join(current_directory, r'tmp_unzip_path/data') +if os.path.exists(target_parent_dir): + for file_name in os.listdir(target_parent_dir): + if '\\' in file_name: + old_file_name = os.path.join(target_parent_dir, file_name) + filename = os.fsdecode(file_name) + changed_name = filename.replace("\\", "_") + new_file_name = os.path.join(target_parent_dir, changed_name) + os.rename(old_file_name,new_file_name) \ No newline at end of file diff --git a/scripts/transform.py b/scripts/transform.py new file mode 100644 index 000000000..f8e3ce2ad --- /dev/null +++ b/scripts/transform.py @@ -0,0 +1,22 @@ +import subprocess +import argparse +import os + +parser = argparse.ArgumentParser() +parser.add_argument('--input', help="Directory of file to transform") +parser.add_argument('--output', help="Directory for transformed files") +args = vars(parser.parse_args()) + + +def run_shell_cmd(cmd): + try: + p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) + last_stdout_bytes, last_stderr_bytes = p.communicate() + if last_stdout_bytes: + return last_stdout_bytes.decode('utf-8', 'replace') + else: + return last_stderr_bytes + except Exception as e: + print(e) + +run_shell_cmd("gdalwarp -s_srs EPSG:4326 -t_srs EPSG:4326 -to SRC_METHOD=NO_GEOTRANSFORM -tr 0.5 0.5 -r near -te -180.0 -90.0 180.0 90.0 -te_srs EPSG:4326 -of GTiff " + args.get('input') + " " + args.get('output')) \ No newline at end of file From e528bf6ae3fb3567c3320bd8c88893c671136251 Mon Sep 17 00:00:00 2001 From: deadex-ng Date: Wed, 29 Mar 2023 16:15:50 +0200 Subject: [PATCH 5/5] Update ETL process Signed-off-by: deadex-ng --- dvc.lock | 19 ++++++++++--------- dvc.yaml | 21 ++++++++++----------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/dvc.lock b/dvc.lock index 3abed6cb6..bc154f352 100644 --- a/dvc.lock +++ b/dvc.lock @@ -15,15 +15,14 @@ stages: size: 1038282 transform: cmd: - - "gdalwarp -s_srs EPSG:4326 -t_srs EPSG:4326 \\\n -to SRC_METHOD=NO_GEOTRANSFORM\ - \ -tr 0.5 0.5 \\\n -r near -te -180.0 -90.0 180.0 90.0 -te_srs EPSG:4326\ - \ \\\n -of GTiff Data/Soil/soilcarbon.ovr \\\n Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif" + - "python scripts/transform.py --input tmp_unzip_path/data/commonData_Data0_soilcarbon.ovr\ + \ \\\n --output Data/Soil/processed/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif" deps: - - path: Data/Soil/soilcarbon.ovr + - path: tmp_unzip_path/data/commonData_Data0_soilcarbon.ovr md5: 33956e4dd24c1caa1dcea956e85e1f5f size: 131473 outs: - - path: Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + - path: Data/Soil/processed/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif md5: 96f78155b79a835f56d019586d4c1f14 size: 1038282 Load: @@ -31,9 +30,11 @@ stages: - cp Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif Data/Soil/processed/ - rm -f Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif extract: - cmd: dvc pull + cmd: + - python scripts/extract.py + - python scripts/rename_files.py load: cmd: - - mkdir Data/Soil/processed - - cp Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif Data/Soil/processed/ - - rm -f Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + - dvc push + clean: + cmd: rm -rf tmp_unzip_path diff --git a/dvc.yaml b/dvc.yaml index a609c5a46..4c6ad9bcb 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -1,20 +1,19 @@ stages: extract: - cmd: dvc pull + cmd: + - python scripts/extract.py + - python scripts/rename_files.py transform: cmd: - >- - gdalwarp -s_srs EPSG:4326 -t_srs EPSG:4326 \ - -to SRC_METHOD=NO_GEOTRANSFORM -tr 0.5 0.5 \ - -r near -te -180.0 -90.0 180.0 90.0 -te_srs EPSG:4326 \ - -of GTiff Data/Soil/soilcarbon.ovr \ - Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + python scripts/transform.py --input tmp_unzip_path/data/commonData_Data0_soilcarbon.ovr \ + --output Data/Soil/processed/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif deps: - - Data/Soil/soilcarbon.ovr + - tmp_unzip_path/data/commonData_Data0_soilcarbon.ovr outs: - - Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif + - Data/Soil/processed/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif load: cmd: - - mkdir Data/Soil/processed - - cp Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif Data/Soil/processed/ - - rm -f Data/Soil/GlobalSoilOrganicCarbonDensityinkgCm_1mDepth.tif \ No newline at end of file + - dvc push + clean: + cmd: rm -rf tmp_unzip_path \ No newline at end of file