Skip to content

Commit ce795ab

Browse files
committed
finished building metadata
1 parent 2d2a71a commit ce795ab

8 files changed

Lines changed: 172 additions & 65 deletions

src/dev/dev_helpers.R

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1291,6 +1291,11 @@ correct_all_geometries <- function(path, dir_pattern = 'ws_boundary'){
12911291
}
12921292
}
12931293

1294+
# correct_all_geometries(path = '~/git/macrosheds/data_acquisition/data')
1295+
# correct_all_geometries(path = '~/git/macrosheds/portal/data')
1296+
# correct_all_geometries(path = '~/git/macrosheds/data_acquisition/macrosheds_dataset_v1')
1297+
# correct_all_geometries(path = '~/git/macrosheds/data_acquisition/macrosheds_figshare_v1')
1298+
12941299
rebuild_portal_data_before_postprocessing <- function(network_domain, backup = TRUE){
12951300

12961301
dir_wrapper <- function(path, keyword){
@@ -1393,7 +1398,28 @@ rebuild_portal_data_before_postprocessing <- function(network_domain, backup = T
13931398
'sure it looks good before and after postprocessing. especially portal/data/general'))
13941399
}
13951400

1396-
# correct_all_geometries(path = '~/git/macrosheds/data_acquisition/data')
1397-
# correct_all_geometries(path = '~/git/macrosheds/portal/data')
1398-
# correct_all_geometries(path = '~/git/macrosheds/data_acquisition/macrosheds_dataset_v1')
1399-
# correct_all_geometries(path = '~/git/macrosheds/data_acquisition/macrosheds_figshare_v1')
1401+
insert_retrieval_datetimes <- function(data_dir = 'data'){

    #appends the last modification datetime (in UTC) of each raw documentation
    #file to that file's single line, as the presumed retrieval datetime for
    #the corresponding raw data product.

    #used in a pinch. hopefully a real recording method has been implemented by now.

    #data_dir: directory searched recursively. only files whose paths contain
    #   both '/raw/' and '/documentation/' are modified.
    #returns NULL invisibly. called for its side effect of rewriting files in place.
    #stops with an error if a matched file does not contain exactly one line.

    fs <- list.files(data_dir,
                     recursive = TRUE, full.names = TRUE)
    fs <- fs[grepl('/raw/', fs) & grepl('/documentation/', fs)]

    for(f in fs){

        rt <- read_lines(f)
        if(length(rt) != 1){
            stop('expected exactly 1 line in ', f, ' but found ', length(rt))
        }

        #already stamped on an earlier run. skip so the datetime isn't appended twice
        if(grepl('UTC\\)$', rt)) next

        #explicit format so the stamp is always whole seconds. as.character()
        #on POSIXct may emit fractional seconds depending on R version.
        mtime_utc <- format(file.info(f)$mtime,
                            format = '%Y-%m-%d %H:%M:%S', tz = 'UTC')

        #paste0 rather than glue: glue would try to evaluate any {braces}
        #present in the existing line
        rt <- paste0(rt, ' (', mtime_utc, ' UTC)')

        write_lines(rt, f)
    }

    invisible(NULL)
}

src/global/global_helpers.R

Lines changed: 50 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -9568,10 +9568,6 @@ postprocess_entire_dataset <- function(site_data,
95689568
logger = logger_module)
95699569
}
95709570

9571-
log_with_indent('adding legal metadata to each domain directory',
9572-
logger = logger_module)
9573-
# legal_details_scrape(dataset_version = dataset_version)
9574-
95759571
log_with_indent(glue('Generating output dataset v',
95769572
dataset_version),
95779573
logger = logger_module)
@@ -9594,13 +9590,6 @@ postprocess_entire_dataset <- function(site_data,
95949590
# log_with_indent(glue('Removing unneeded files from portal dataset.',
95959591
# logger = logger_module)
95969592
# clean_portal_dataset()
9597-
9598-
if(reformat_camels) {
9599-
log_with_indent('Reformatting CAMELS attributes to MacroSheds format', logger = logger_module)
9600-
reformat_camels_for_ms()
9601-
} else{
9602-
log_with_indent('NOT reformatting CAMELS attributes to MacroSheds format', logger = logger_module)
9603-
}
96049593

96059594
log_with_indent('Generating spatial summary data',
96069595
logger = logger_module)
@@ -9634,6 +9623,11 @@ postprocess_entire_dataset <- function(site_data,
96349623
dataset_version = dataset_version)
96359624
prepare_for_figshare_packageformat(where = fs_dir,
96369625
dataset_version = dataset_version)
9626+
reformat_camels_for_ms()
9627+
9628+
# log_with_indent('adding legal metadata to each domain directory',
9629+
# logger = logger_module)
9630+
# legal_details_scrape(dataset_version = dataset_version)
96379631

96389632
log_with_indent(glue('Uploading dataset v{vv} to Figshare',
96399633
vv = dataset_version),
@@ -9654,6 +9648,7 @@ make_figshare_docs_skeleton <- function(where){
96549648
dir.create(file.path(where, 'macrosheds_documentation', '04_site_documentation'), showWarnings = FALSE)
96559649
dir.create(file.path(where, 'macrosheds_documentation', '05_timeseries_documentation'), showWarnings = FALSE)
96569650
dir.create(file.path(where, 'macrosheds_documentation', '06_ws_attr_documentation'), showWarnings = FALSE)
9651+
dir.create(file.path(where, 'macrosheds_documentation', '07_CAMELS-compliant_datasets_documentation'), showWarnings = FALSE)
96579652
dir.create(file.path(where, 'macrosheds_documentation_packageformat'), showWarnings = FALSE)
96589653
}
96599654

@@ -9825,11 +9820,18 @@ assemble_misc_docs_figshare <- function(where){
98259820
overwrite = TRUE)
98269821
select(domain_detection_limits, -precision, -sigfigs, -added_programmatically) %>%
98279822
write_csv(file.path(docs_dir, '05_timeseries_documentation', '05f_detection_limits_and_precision.csv'))
9828-
file.copy('src/templates/figshare_docfiles/05g_detection_limits_and_precision_column_descriptions.txt', docs_dir)
9823+
file.copy('src/templates/figshare_docfiles/05g_detection_limits_and_precision_column_descriptions.txt',
9824+
file.path(docs_dir, '05_timeseries_documentation'))
98299825
file.copy('/home/mike/git/macrosheds/papers/release_paper/tables/timeseries_refs.bib',
98309826
file.path(docs_dir, '05_timeseries_documentation', '05h_timeseries_refs.bib'))
98319827
file.copy('/home/mike/git/macrosheds/papers/release_paper/tables/ws_attr_refs.bib',
98329828
file.path(docs_dir, '06_ws_attr_documentation', '06h_ws_attr_refs.bib'))
9829+
file.copy('src/templates/figshare_docfiles/07a_CAMELS-compliant_datasets_metadata.txt',
9830+
file.path(docs_dir, '07_CAMELS-compliant_datasets_documentation'))
9831+
file.copy('src/templates/figshare_docfiles/07b_CAMELS-compliant_ws_attributes_column_descriptions.txt',
9832+
file.path(docs_dir, '07_CAMELS-compliant_datasets_documentation'))
9833+
file.copy('src/templates/figshare_docfiles/07c_CAMELS-compliant_Daymet_forcings_column_descriptions.txt',
9834+
file.path(docs_dir, '07_CAMELS-compliant_datasets_documentation'))
98339835
file.copy('src/templates/figshare_docfiles/02_glossary.txt', docs_dir)
98349836
file.copy('src/templates/figshare_docfiles/03_changelog.txt', docs_dir)
98359837
file.copy('/home/mike/git/macrosheds/data_acquisition/src/templates/figshare_docfiles/04b_site_metadata_column_descriptions.txt',
@@ -9981,6 +9983,14 @@ prepare_for_figshare <- function(where, dataset_version){
99819983
prepare_ts_data_for_figshare(where = where,
99829984
dataset_version = dataset_version)
99839985
prepare_ws_attr_data_for_figshare(where = where)
9986+
9987+
#decided to change some dirnames. easiest to just do that as a patch here
9988+
file.rename(file.path(where, 'macrosheds_documentation'),
9989+
file.path(where, '0_documentation_and_metadata'))
9990+
file.rename(file.path(where, 'macrosheds_watershed_attribute_data'),
9991+
file.path(where, '1_watershed_attribute_data'))
9992+
file.rename(file.path(where, 'macrosheds_timeseries_data'),
9993+
file.path(where, '2_timeseries_data'))
99849994
}
99859995

99869996
prepare_for_figshare_packageformat <- function(where, dataset_version){
@@ -11570,7 +11580,7 @@ get_source_urls <- function(result_obj, processing_func){
1157011580

1157111581
if(uses_gdrive_func){
1157211582

11573-
source_urls <- 'MacroSheds drive; not yet public'
11583+
source_urls <- 'MacroSheds drive (contact us for original source): https://drive.google.com/drive/folders/1gugTmDybtMTbmKRq2WQvw2K1WkJjcmJr?usp=sharing'
1157411584

1157511585
} else if('url' %in% names(result_obj)){
1157611586

@@ -14489,41 +14499,42 @@ legal_details_scrape <- function(dataset_version){
1448914499
}
1449014500

1449114501
reformat_camels_for_ms <- function(){
14492-
14493-
# This function will reformat the camels metrics computed for MacroSheds
14494-
# watersheds to the MacroSheds format.
14495-
14496-
ms_attributes_dir <- '../timeseries_experimentation/neon_camels_attr/data/ms_attributes'
14497-
14498-
all_files <- list.files(ms_attributes_dir, recursive = T, full.names = TRUE)
14499-
14500-
14501-
14502+
14503+
# ms_attributes_dir <- '../timeseries_experimentation/neon_camels_attr/data/ms_attributes'
14504+
ms_attributes_dir <- '../qa_experimentation/data/ms_in_camels_format'
14505+
14506+
all_files <- list.files(ms_attributes_dir, recursive = TRUE, full.names = TRUE)
14507+
1450214508
soil_files <- all_files[grep('soil.feather', all_files)]
1450314509
clim_files <- all_files[grep('clim.feather', all_files)]
1450414510
topo_files <- all_files[grep('topo.feather', all_files)]
14511+
vege_files <- all_files[grep('vege.feather', all_files)]
1450514512
geol_files <- all_files[grep('geol.feather', all_files)]
1450614513
daymet_files <- all_files[grep('daymet_full_climate.feather', all_files)]
14507-
14514+
daymet_files <- daymet_files[! grepl('/NC/', daymet_files)]
14515+
warning('removing NC daymet data from camels set')
14516+
1450814517
soil <- map_dfr(soil_files, read_feather)
1450914518
clim <- map_dfr(clim_files, read_feather)
1451014519
topo <- map_dfr(topo_files, read_feather)
1451114520
geol <- map_dfr(geol_files, read_feather)
14512-
14513-
dir.create('data/camels_compliant')
14514-
dir.create('data/camels_compliant/daymet')
14515-
14516-
14517-
write_csv(soil, 'data/camels_compliant/soil.csv')
14518-
write_csv(clim, 'data/camels_compliant/clim.csv')
14519-
write_csv(topo, 'data/camels_compliant/topo.csv')
14520-
write_csv(geol, 'data/camels_compliant/geol.csv')
14521-
14521+
vege <- map_dfr(vege_files, read_feather)
14522+
14523+
dir.create('macrosheds_figshare_v1/3_CAMELS-compliant_watershed_attributes')
14524+
dir.create('macrosheds_figshare_v1/4_CAMELS-compliant_Daymet_forcings')
14525+
14526+
write_csv(soil, 'macrosheds_figshare_v1/3_CAMELS-compliant_watershed_attributes/soil.csv')
14527+
write_csv(clim, 'macrosheds_figshare_v1/3_CAMELS-compliant_watershed_attributes/clim.csv')
14528+
write_csv(topo, 'macrosheds_figshare_v1/3_CAMELS-compliant_watershed_attributes/topo.csv')
14529+
write_csv(geol, 'macrosheds_figshare_v1/3_CAMELS-compliant_watershed_attributes/geol.csv')
14530+
write_csv(vege, 'macrosheds_figshare_v1/3_CAMELS-compliant_watershed_attributes/vege.csv')
14531+
1452214532
for(i in 1:length(daymet_files)){
14533+
1452314534
this_daymet <- read_feather(daymet_files[i])
14524-
14535+
1452514536
sites <- unique(this_daymet$site_code)
14526-
14537+
1452714538
for(s in 1:length(sites)){
1452814539
this_site <- this_daymet %>%
1452914540
filter(site_code == !!sites[s]) %>%
@@ -14534,13 +14545,10 @@ reformat_camels_for_ms <- function(){
1453414545
`tmax(C)` = tmax,
1453514546
`tmin(C)` = tmin,
1453614547
`vp(Pa)` = vp,
14537-
`pet(mm)` = pet)
14538-
14539-
write_csv(this_site, glue('data/camels_compliant/daymet/{s}.csv',
14548+
`pet(mm)` = pet)
14549+
14550+
write_csv(this_site, glue('macrosheds_figshare_v1/4_CAMELS-compliant_Daymet_forcings/{s}.csv',
1454014551
s = sites[s]))
1454114552
}
1454214553
}
1454314554
}
14544-
14545-
14546-

src/templates/figshare_docfiles/00_MAIN_README.txt

Lines changed: 57 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,30 @@ Details are in 01a_data_use_agreements.docx, alongside this readme. For citation
88
by domain and product, see 01b_attribution_and_intellectual_rights_complete.xlsx.
99
And please contact us (mail@macrosheds.org) if you feel any MacroSheds content should be amended.
1010

11-
Otherwise, just refer to this file, and the many others it describes, as needed. And
12-
don't forget about macrosheds.org for visualizing the MacroSheds dataset, and the "macrosheds"
13-
R package for working with it (https://github.com/MacroSHEDS/macrosheds).
11+
For a thorough description of the methods involved in building this dataset, see the
12+
corresponding publication (pending). Also note that the easiest way to explore
13+
our data catalog is to use the interactive catalogs on macrosheds.org (Data tab).
14+
You can also visualize the dataset there.
15+
16+
Otherwise, just refer to this file, and the many others it names, as needed. And don't forget
17+
about the "macrosheds" R package, which helps with retrieving and working with MacroSheds data
18+
(https://github.com/MacroSHEDS/macrosheds). All project code is available
19+
on GitHub: https://github.com/MacroSHEDS
1420

1521
---
1622

1723
MacroSheds dataset contents:
1824

19-
DOCUMENTATION (this directory)
25+
NOTE: in addition to the files in 0_documentation_and_metadata, there are
26+
individual "documentation" directories in 2_timeseries_data/<n>/<d>,
27+
where <n> is any MacroSheds network and <d> is any MacroSheds domain within
28+
<n>. These additional documentation directories contain information on dataset
29+
retrieval locations, and code used to retrieve, munge, and derive each
30+
timeseries data product.
31+
32+
0_documentation_and_metadata (this directory)
2033
├── 01a_data_use_agreements.docx
34+
├── 01b_attribution_and_intellectual_rights_complete.xlsx
2135
├── 02_glossary.txt
2236
├── 03_changelog.txt
2337
├── 04_site_documentation
@@ -27,27 +41,53 @@ DOCUMENTATION (this directory)
2741
│   ├── 05b_timeseries_variable_metadata.csv
2842
│   ├── 05c_timeseries_variable_metadata_column_descriptions.txt
2943
│   ├── 05d_timeseries_column_descriptions.txt
30-
│   └── 05e_range_check_limits.csv
44+
│   ├── 05e_range_check_limits.csv
45+
│   ├── 05f_detection_limits_and_precision.csv
46+
│   ├── 05g_detection_limits_and_precision_column_descriptions.txt
47+
│   └── 05h_timeseries_refs.bib
3148
├── 06_ws_attr_documentation
3249
│   ├── 06b_ws_attr_variable_metadata.csv
3350
│   ├── 06c_ws_attr_variable_metadata_column_descriptions.txt
3451
│   ├── 06d_ws_attr_variable_category_codes.csv
3552
│   ├── 06e_ws_attr_data_source_codes.csv
3653
│   ├── 06f_ws_attr_summary_column_descriptions.csv
37-
│   └── 06g_ws_attr_timeseries_column_descriptions.txt
54+
│   ├── 06g_ws_attr_timeseries_column_descriptions.txt
55+
│   └── 06h_ws_attr_refs.bib
56+
├── 07_CAMELS-compliant_datasets_documentation
57+
│   ├── 07a_CAMELS-compliant_datasets_metadata.txt
58+
│   ├── 07b_CAMELS-compliant_ws_attributes_column_descriptions.txt
59+
│   └── 07c_CAMELS-compliant_Daymet_forcings_column_descriptions.txt
3860
└── 08_data_irregularities.csv
3961

40-
WATERSHED ATTRIBUTES
41-
summaries
42-
timeseries by category, 1-6
43-
maybe add some readmes that refer back to the master readmes
62+
1_watershed_attribute_data
63+
├── ws_attr_summaries.csv
64+
└── ws_attr_timeseries
65+
├── climate.csv
66+
├── hydrology.csv
67+
├── landcover.csv
68+
├── parentmaterial.csv
69+
├── terrain.csv
70+
└── vegetation.csv
71+
72+
2_timeseries_data
73+
└── separate directories for each MacroSheds network
74+
└── separate directories for each MacroSheds domain within that network
75+
├── documentation (of retrieval locations and code used to retrieve/munge/derive each product)
76+
├── discharge (if available)
77+
├── precipitation (if available)
78+
├── stream_chemistry (if available)
79+
├── precip_chemistry (if available)
80+
├── stream_gauge_locations
81+
├── precip_gauge_locations (if available)
82+
└── ws_boundary (except McMurdo)
4483

45-
TIME SERIES
46-
network-domain
47-
should this documentation be moved to the huge docs section?
48-
these readmes should refer back to the master readmes
84+
3_CAMELS-compliant_watershed_attributes
85+
├── clim.csv
86+
├── geol.csv
87+
├── soil.csv
88+
├── topo.csv
89+
└── vege.csv
4990

50-
CAMELS-STYLE TIME SERIES AND WATERSHED ATTRIBUTES
51-
camels-style attributes
52-
camels-style timeseries data
91+
4_CAMELS-compliant_Daymet_forcings
92+
└── forcings for each site in MacroSheds
5393

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
The subset of MacroSheds that relates to streamflow and climate forcings makes it a valuable supplement to existing datasets like CAMELS (671 sites) and GAGES-II (9067 sites). Using CAMELS methods, we have compiled watershed attributes and Daymet forcings, for each MacroSheds site, that are immediately commensurable with the published CAMELS dataset, enhancing the predictive power of the combined set, especially for small watersheds. Of the 178 sites with discharge data that MacroSheds adds to this corpus (as of version 1), 122 have watershed areas of 10 km2 or less, and 68 have areas of 1 km2 or less. For CAMELS, these numbers are 8 and 0, respectively. For GAGES-II, they are 207 and 2 (see Figure 2 in the MacroSheds data paper, in review at the time of this writing).
2+
3+
Please note that we used gSSURGO (Soil Survey Staff 2022) instead of the superseded STATSGO dataset for soil characteristics. Two other CAMELS watershed attributes, pet_mean and aridity, were also computed differently for MacroSheds watersheds. For these, we solved the Priestley-Taylor formulation by using a gridded 𝛼 product (Aschonitis et al. 2017), rather than calibrating 𝛼 ourselves.
4+
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
MacroSheds CAMELS-compliant watershed attribute data
2+
Column descriptions
3+
4+
Except for the deviations below, we direct the user of these data to the variable descriptions in the original CAMELS dataset
5+
webpage: https://ral.ucar.edu/solutions/products/camels
6+
CAMELS attributes download URL: https://ral.ucar.edu/sites/default/files/public/product-tool/camels-catchment-attributes-and-meteorology-for-large-sample-studies-dataset-downloads/camels_attributes_v2.0.zip
7+
8+
See the file called camels_attributes_v2.0.xlsx
9+
10+
Deviations from CAMELS:
11+
We used gSSURGO (Soil Survey Staff 2022) instead of the superseded STATSGO dataset for soil characteristics, namely sand_frac, clay_frac, silt_frac, and organic_frac. We have omitted the following variables that are included with the original CAMELS dataset: soil_depth_pelletier, soil_depth_statsgo, soil_porosity, soil_conductivity, max_water_content, water_frac, other_frac. However, note that equivalents of these variables are included with the core MacroSheds watershed attribute dataset.
12+
13+
Two other CAMELS watershed attributes, pet_mean and aridity, were also computed differently for MacroSheds watersheds. For these, we solved the Priestley-Taylor formulation by using a gridded 𝛼 product (Aschonitis et al. 2017), rather than calibrating 𝛼 ourselves.
14+
15+
References:
16+
Aschonitis, V. G., Papamichail, D., Demertzi, K., Colombani, N., Mastrocicco, M., Ghirardini, A., Castaldelli, G., & Fano, E.-A. (2017). High resolution global grids of revised Priestley-Taylor and Hargreaves-Samani coefficients for assessing ASCE-standardized reference crop evapotranspiration and solar radiation, links to ESRI-grid files [Data set].
17+
Soil Survey Staff. (2022). National Value Added Look Up (valu) Table Database for the Gridded Soil Survey Geographic (gSSURGO) Database for the United States of America and the Territories, Commonwealths, and Island Nations served by the USDA-NRCS. United States Department of Agriculture, Natural Resources Conservation Service. https://gdg.sc.egov.usda.gov/
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
MacroSheds CAMELS-compliant Daymet climate forcings
2+
3+
The Daymet forcings accompanying the core MacroSheds dataset represent watershed averages of published, gridded Daymet products (Thornton et al. 2020). Note that our format is a little different from what CAMELS provides. Our Daymet files are CSVs with a single datetime column and only one header. For site locations and elevations (and lots of other site information) see 04_site_documentation.
4+
5+
We've included timeseries of potential evapotranspiration (pet) in this dataset, though pet is not a Daymet variable per se. The pet product was also computed differently for MacroSheds watersheds than for CAMELS in that we solved the Priestley-Taylor formulation by using a gridded 𝛼 product (Aschonitis et al. 2017), rather than calibrating 𝛼 ourselves.
6+
7+
References:
8+
Aschonitis, V. G., Papamichail, D., Demertzi, K., Colombani, N., Mastrocicco, M., Ghirardini, A., Castaldelli, G., & Fano, E.-A. (2017). High resolution global grids of revised Priestley-Taylor and Hargreaves-Samani coefficients for assessing ASCE-standardized reference crop evapotranspiration and solar radiation, links to ESRI-grid files [Data set].
9+
Thornton, M. M., Shrestha, R., Wei, Y., Thornton, P. E., Kao, S., & Wilson, B. E. (2020). Daymet: Daily Surface Weather Data on a 1-km Grid for North America, Version 4 [NetCDF]. ORNL DAAC, Oak Ridge, Tennessee, USA. https://doi.org/10.3334/ORNLDAAC/1840
10+

src/templates/write_metadata_d_boilerplate.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ This MacroSheds data product ({p}) was generated from the following munged Macro
33

44
{mp}
55

6-
Source data were retrieved from the following web page(s), static file(s), or web-API endpoint(s):
6+
Source data were retrieved from the following web page(s), static file(s), or web-API endpoint(s)
7+
on the datetime in parentheses:
78

89
{ru}
910

0 commit comments

Comments
 (0)