|
| 1 | +""" |
| 2 | +validate_and_extract_crown_metadata.py |
| 3 | +
|
| 4 | +Validate, deduplicate, and extract crown metadata from NEON shapefiles. |
| 5 | +
|
| 6 | +Usage: |
| 7 | + python validate_and_extract_crown_metadata.py /path/to/shapefile_directory /path/to/output.gpkg |
| 8 | +
|
| 9 | +This script validates all shapefiles in a directory, deduplicates them, extracts crown metadata, cleans the result, and saves as a GeoPackage. |
| 10 | +""" |
| 11 | + |
| 12 | +import os |
| 13 | +import glob |
| 14 | +import geopandas as gpd |
| 15 | +import pandas as pd |
| 16 | +from typing import List, Dict, Any |
| 17 | +import re |
| 18 | + |
| 19 | + |
def extract_site_plot_year_from_filename(filename):
    """Parse NEON ``SITE_PLOT_YEAR`` tokens out of a shapefile path.

    Strips the extension and any trailing duplicate-download suffix such as
    ``" (1)"``, then expects exactly three underscore-separated fields whose
    last field is purely numeric.  Returns ``(site, plot, year)`` as strings,
    or ``(None, None, None)`` when the name does not match that pattern.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    # Browsers append " (1)", " (2)", ... to repeated downloads — drop it.
    stem = re.sub(r" \(\d+\)$", "", stem)
    fields = stem.split("_")
    if len(fields) != 3 or not fields[2].isdigit():
        return None, None, None
    return fields[0], fields[1], fields[2]
| 28 | + |
| 29 | + |
def validate_shapefile(shp_path: str) -> Dict[str, Any]:
    """Check one shapefile for readability, CRS presence, and sidecar files.

    Returns a record dict with keys ``path``, ``valid``, ``has_crs``,
    ``missing_sidecars``, ``error``, ``site``, ``plot``, ``year``.  Read
    failures are captured in ``error`` rather than raised.
    """
    record: Dict[str, Any] = {
        "path": shp_path,
        "valid": False,
        "has_crs": False,
        "missing_sidecars": [],
        "error": None,
        "site": None,
        "plot": None,
        "year": None,
    }
    stem, _ = os.path.splitext(shp_path)
    # A shapefile is only complete with its .dbf and .shx companions.
    record["missing_sidecars"] = [
        stem + ext
        for ext in (".shp", ".dbf", ".shx")
        if not os.path.exists(stem + ext)
    ]
    try:
        frame = gpd.read_file(shp_path)
    except Exception as exc:
        record["error"] = str(exc)
        return record
    record["valid"] = True
    record["has_crs"] = frame.crs is not None
    record["site"], record["plot"], record["year"] = (
        extract_site_plot_year_from_filename(shp_path)
    )
    return record
| 58 | + |
| 59 | + |
def deduplicate_shapefiles(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep the first valid, CRS-bearing record per (site, plot, year).

    Records that failed validation or lack a CRS are dropped; among the
    survivors only the first occurrence of each (site, plot, year) key is
    retained, preserving input order.
    """
    kept: List[Dict[str, Any]] = []
    seen_keys = set()
    for record in results:
        if record["valid"] and record["has_crs"]:
            key = (record.get("site"), record.get("plot"), record.get("year"))
            if key not in seen_keys:
                seen_keys.add(key)
                kept.append(record)
    return kept
| 72 | + |
| 73 | + |
def summarize_validation(
    results: List[Dict[str, Any]], show_invalid: bool = False
) -> Dict[str, Any]:
    """Print and return aggregate counts for a list of validation records.

    Parameters
    ----------
    results : list of record dicts as produced by ``validate_shapefile``.
    show_invalid : when True, also print one line per invalid file.

    Returns
    -------
    dict with the counts (``total``, ``valid``, ``missing_sidecar``,
    ``missing_crs``, ``unreadable``) and the invalid records themselves
    under ``invalid_files``.
    """
    total = len(results)
    # bool sums: True counts as 1, False as 0.
    valid = sum(r["valid"] and r["has_crs"] for r in results)
    missing_sidecar = sum(r["valid"] and bool(r["missing_sidecars"]) for r in results)
    missing_crs = sum(r["valid"] and not r["has_crs"] for r in results)
    unreadable = sum(not r["valid"] for r in results)
    print(
        f"Summary:\n Total shapefiles: {total}\n Valid (with CRS): {valid}\n Missing sidecar files: {missing_sidecar}\n Missing CRS: {missing_crs}\n Unreadable: {unreadable}"
    )
    if show_invalid:
        print("\nInvalid files:")
        for r in results:
            if not (r["valid"] and r["has_crs"]):
                print(
                    f"- {r['path']} | Error: {r['error']} | Missing sidecars: {r['missing_sidecars']}"
                )
    else:
        # Only advertise the flag when the listing was actually suppressed;
        # previously this hint printed even alongside the listing itself.
        print(
            "\nTo view invalid files, set show_invalid=True when calling summarize_validation, or run the code separately to examine the paths."
        )
    return {
        "total": total,
        "valid": valid,
        "missing_sidecar": missing_sidecar,
        "missing_crs": missing_crs,
        "unreadable": unreadable,
        "invalid_files": [r for r in results if not (r["valid"] and r["has_crs"])],
    }
| 103 | + |
| 104 | + |
def extract_crowns_from_shapefiles(shapefile_paths, target_crs="EPSG:4326"):
    """Read crown polygons from shapefiles and combine into one GeoDataFrame.

    For each file, site/plot/year are parsed from the filename, centroids are
    computed in the site's UTM zone (so easting/northing are in meters), and
    the geometry is reprojected to ``target_crs`` before concatenation.  Rows
    missing any of individual_id/geometry/site/plot/year are dropped from the
    combined result.

    Parameters
    ----------
    shapefile_paths : iterable of shapefile paths.
    target_crs : CRS of the combined output geometry (default WGS84).

    Returns
    -------
    GeoDataFrame with columns: individual_id, geometry, site, plot, year,
    source_file, center_easting, center_northing.  Empty (with those columns)
    when nothing usable was read.
    """
    # Mapping from NEON site code to UTM CRS used for metric centroids.
    SITE_UTM_ZONES = {
        "ABBY": "EPSG:32610",
        "BART": "EPSG:32619",
        "BONA": "EPSG:32606",
        "CLBJ": "EPSG:32614",
        "DEJU": "EPSG:32606",
        "DELA": "EPSG:32616",
        "GRSM": "EPSG:32617",
        "GUAN": "EPSG:32619",
        "HARV": "EPSG:32618",
        "HEAL": "EPSG:32606",
        "JERC": "EPSG:32616",
        "KONZ": "EPSG:32614",
        "LENO": "EPSG:32616",
        "MLBS": "EPSG:32617",
        "MOAB": "EPSG:32612",
        "NIWO": "EPSG:32613",
        "ONAQ": "EPSG:32612",
        "OSBS": "EPSG:32617",
        "PUUM": "EPSG:32605",
        "RMNP": "EPSG:32613",
        "SCBI": "EPSG:32617",
        "SERC": "EPSG:32618",
        "SJER": "EPSG:32611",
        "SOAP": "EPSG:32611",
        "SRER": "EPSG:32612",
        "TALL": "EPSG:32616",
        "TEAK": "EPSG:32611",
        "UKFS": "EPSG:32615",
        "UNDE": "EPSG:32616",
        "WREF": "EPSG:32610",
    }

    keep_cols = [
        "individual_id",
        "geometry",
        "site",
        "plot",
        "year",
        "source_file",
        "center_easting",
        "center_northing",
    ]

    all_gdfs = []
    for shp in shapefile_paths:
        try:
            gdf = gpd.read_file(shp)
            site, plot, year = extract_site_plot_year_from_filename(shp)
            # Files saved without a CRS are assumed WGS84.  Do this before the
            # UTM branch: previously an unknown-site file with no CRS crashed
            # in the final to_crs() and was misreported as "Could not read".
            if gdf.crs is None:
                gdf = gdf.set_crs("EPSG:4326")
            utm_crs = SITE_UTM_ZONES.get(site)
            if utm_crs:
                if str(gdf.crs) != utm_crs:
                    gdf = gdf.to_crs(utm_crs)
                # Compute centroids once; .centroid re-derives all geometries.
                centroids = gdf.geometry.centroid
                gdf["center_easting"] = centroids.x
                gdf["center_northing"] = centroids.y
            else:
                # Only print warning if site is not None (avoid noisy output
                # for missing/invalid files).
                if site is not None:
                    print(
                        f"Warning: No UTM zone for site {site}, skipping easting/northing extraction."
                    )
                gdf["center_easting"] = None
                gdf["center_northing"] = None
            if "individual" in gdf.columns:
                gdf = gdf.rename(columns={"individual": "individual_id"})
            else:
                gdf["individual_id"] = None
            gdf["site"] = site
            gdf["plot"] = plot
            gdf["year"] = year
            gdf["source_file"] = shp
            for col in keep_cols:
                if col not in gdf.columns:
                    gdf[col] = None
            gdf = gdf[keep_cols]
            # Reproject to target_crs for concatenation, but only after the
            # metric easting/northing were extracted above.
            gdf = gdf.to_crs(target_crs)
            # Skip empty/all-NA frames so pd.concat dtype inference stays sane.
            if not gdf.empty and not gdf.isna().all(axis=None):
                all_gdfs.append(gdf)
        except Exception as e:
            print(f"Could not read {shp}: {e}")

    if not all_gdfs:
        return gpd.GeoDataFrame(columns=keep_cols, crs=target_crs)

    combined = gpd.GeoDataFrame(
        pd.concat(all_gdfs, ignore_index=True), crs=target_crs
    )
    cleaned = combined.dropna(
        subset=["individual_id", "geometry", "site", "plot", "year"]
    )
    return cleaned.reset_index(drop=True)
| 215 | + |
| 216 | + |
if __name__ == "__main__":
    import sys

    # CLI: <shapefile_directory> <output.gpkg>
    if len(sys.argv) < 3:
        print(
            "Usage: python validate_and_extract_crown_metadata.py /path/to/shapefile_directory /path/to/output.gpkg"
        )
        sys.exit(1)

    shapefile_dir, output_path = sys.argv[1], sys.argv[2]
    # Recursively discover every .shp under the input directory.
    found_shapefiles = glob.glob(
        os.path.join(shapefile_dir, "**", "*.shp"), recursive=True
    )
    validation_records = [validate_shapefile(path) for path in found_shapefiles]
    unique_records = deduplicate_shapefiles(validation_records)
    summarize_validation(validation_records)
    crowns_gdf = extract_crowns_from_shapefiles(
        [record["path"] for record in unique_records]
    )
    print(f"\nFinal cleaned crowns GeoDataFrame: {len(crowns_gdf)} rows")
    crowns_gdf.to_file(output_path, driver="GPKG")
    print(f"Saved cleaned crown metadata to {output_path}")
0 commit comments