Skip to content

Commit e80c911

Browse files
committed
Add new shp file processing code with bug fixes
1 parent a866034 commit e80c911

5 files changed

Lines changed: 275 additions & 77 deletions

File tree

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# validate_and_extract_crown_metadata.py
2+
3+
This script validates, deduplicates, and extracts crown metadata from NEON tree crown shapefiles, outputting a clean GeoDataFrame for downstream analysis.
4+
5+
## Features
6+
- Validates all shapefiles in a directory (checks readability, CRS, and required sidecar files)
7+
- Extracts `site`, `plot`, and `year` from the filename (expects `SITE_PLOT_YEAR.shp` format)
8+
- Deduplicates files by (site, plot, year)
9+
- Extracts all crowns and their metadata into a single GeoDataFrame
10+
- Cleans the data (removes rows with missing key fields, resets index)
11+
- Saves the result as a GeoPackage (GPKG)
12+
13+
## Usage
14+
```bash
15+
python validate_and_extract_crown_metadata.py /path/to/shapefile_directory /path/to/output.gpkg
16+
```
17+
18+
## Output
19+
- A GeoPackage file containing a single layer with columns:
20+
- `individual_id`: unique crown identifier
21+
- `geometry`: crown polygon
22+
- `site`: 4-letter NEON site code
23+
- `plot`: plot identifier from filename
24+
- `year`: 4-digit year from filename
25+
- `source_file`: original shapefile path
  - `center_easting`: crown centroid easting (meters) in the site's UTM zone; null when the site has no known UTM zone
  - `center_northing`: crown centroid northing (meters) in the site's UTM zone; null when the site has no known UTM zone
26+
27+
## Notes
28+
- Only shapefiles that are readable and have a valid CRS are processed; missing sidecar files (.dbf, .shx) are reported in the validation summary but do not by themselves exclude a file.
29+
- Filenames must follow the `SITE_PLOT_YEAR.shp` pattern for metadata extraction.
30+
- Rows with missing `individual_id`, `geometry`, `site`, `plot`, or `year` are dropped.
31+
- All geometries are reprojected to EPSG:4326 (WGS84) for consistency.
32+
33+
## Example
34+
```python
35+
import geopandas as gpd
36+
# Load the output GeoPackage
37+
crowns_gdf = gpd.read_file('/path/to/output.gpkg')
38+
print(crowns_gdf.info())
39+
print(crowns_gdf.head())
40+
```
Binary file not shown.
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
"""
2+
validate_and_extract_crown_metadata.py
3+
4+
Validate, deduplicate, and extract crown metadata from NEON shapefiles.
5+
6+
Usage:
7+
python validate_and_extract_crown_metadata.py /path/to/shapefile_directory /path/to/output.gpkg
8+
9+
This script validates all shapefiles in a directory, deduplicates them, extracts crown metadata, cleans the result, and saves as a GeoPackage.
10+
"""
11+
12+
import os
13+
import glob
14+
import geopandas as gpd
15+
import pandas as pd
16+
from typing import List, Dict, Any
17+
import re
18+
19+
20+
def extract_site_plot_year_from_filename(filename):
    """Parse a ``SITE_PLOT_YEAR.shp``-style filename into its components.

    A trailing " (N)" duplicate-download suffix is stripped before parsing.

    Args:
        filename: Path or bare filename of a shapefile.

    Returns:
        Tuple ``(site, plot, year)`` of strings, or ``(None, None, None)``
        when the name does not match the expected three-part pattern with a
        4-digit year.
    """
    base = os.path.splitext(os.path.basename(filename))[0]
    # Browsers append " (1)", " (2)", ... to re-downloaded files; drop it.
    base = re.sub(r" \(\d+\)$", "", base)
    parts = base.split("_")
    # Require exactly SITE_PLOT_YEAR with a 4-digit year, per the documented
    # naming convention; previously any all-digit token (e.g. "12") passed.
    if len(parts) == 3 and re.fullmatch(r"\d{4}", parts[2]):
        site, plot, year = parts
        return site, plot, year
    return None, None, None
28+
29+
30+
def validate_shapefile(shp_path: str) -> Dict[str, Any]:
    """Check one shapefile for readability, CRS presence, and sidecar files.

    Returns a report dict with keys: ``path``, ``valid`` (readable),
    ``has_crs``, ``missing_sidecars``, ``error``, ``site``, ``plot``,
    ``year``.
    """
    report: Dict[str, Any] = {
        "path": shp_path,
        "valid": False,
        "has_crs": False,
        "missing_sidecars": [],
        "error": None,
        "site": None,
        "plot": None,
        "year": None,
    }
    stem, _ = os.path.splitext(shp_path)
    # A usable shapefile needs its .dbf and .shx companions on disk too.
    report["missing_sidecars"] = [
        stem + ext
        for ext in (".shp", ".dbf", ".shx")
        if not os.path.exists(stem + ext)
    ]
    try:
        gdf = gpd.read_file(shp_path)
    except Exception as exc:
        report["error"] = str(exc)
        return report
    report["valid"] = True
    report["has_crs"] = gdf.crs is not None
    site, plot, year = extract_site_plot_year_from_filename(shp_path)
    report["site"] = site
    report["plot"] = plot
    report["year"] = year
    return report
58+
59+
60+
def deduplicate_shapefiles(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep one record per (site, plot, year), dropping unusable files.

    Records that are unreadable or lack a CRS are discarded; among the
    remainder, the first record seen for each (site, plot, year) key wins.
    """
    unique: List[Dict[str, Any]] = []
    taken = set()
    for record in results:
        if not (record["valid"] and record["has_crs"]):
            continue
        identity = (record.get("site"), record.get("plot"), record.get("year"))
        if identity not in taken:
            taken.add(identity)
            unique.append(record)
    return unique
72+
73+
74+
def summarize_validation(
    results: List[Dict[str, Any]], show_invalid: bool = False
) -> Dict[str, Any]:
    """Print a validation summary and return the counts as a dict.

    Args:
        results: Report dicts as produced by ``validate_shapefile``.
        show_invalid: When True, also list each invalid file with its error
            and any missing sidecar files.

    Returns:
        Dict with counts (``total``, ``valid``, ``missing_sidecar``,
        ``missing_crs``, ``unreadable``) plus ``invalid_files``, the report
        dicts that failed validation.
    """
    total = len(results)
    valid = sum(r["valid"] and r["has_crs"] for r in results)
    # Readable files that are nonetheless missing .dbf/.shx companions.
    missing_sidecar = sum(r["valid"] and bool(r["missing_sidecars"]) for r in results)
    missing_crs = sum(r["valid"] and not r["has_crs"] for r in results)
    unreadable = sum(not r["valid"] for r in results)
    print(
        f"Summary:\n Total shapefiles: {total}\n Valid (with CRS): {valid}\n Missing sidecar files: {missing_sidecar}\n Missing CRS: {missing_crs}\n Unreadable: {unreadable}"
    )
    if show_invalid:
        print("\nInvalid files:")
        for r in results:
            if not (r["valid"] and r["has_crs"]):
                print(
                    f"- {r['path']} | Error: {r['error']} | Missing sidecars: {r['missing_sidecars']}"
                )
    else:
        # Fix: this hint used to be printed even when show_invalid was
        # already True; only show it when the detailed listing is suppressed.
        print(
            "\nTo view invalid files, set show_invalid=True when calling summarize_validation, or run the code separately to examine the paths."
        )
    return {
        "total": total,
        "valid": valid,
        "missing_sidecar": missing_sidecar,
        "missing_crs": missing_crs,
        "unreadable": unreadable,
        "invalid_files": [r for r in results if not (r["valid"] and r["has_crs"])],
    }
103+
104+
105+
def extract_crowns_from_shapefiles(shapefile_paths, target_crs="EPSG:4326"):
    """Read crown shapefiles, attach metadata, and combine into one GeoDataFrame.

    For each path: reads the file, derives (site, plot, year) from the
    filename, computes crown centroid easting/northing in the site's UTM
    zone (when the site is known), normalizes columns, reprojects to
    ``target_crs``, then concatenates everything and drops rows missing any
    of individual_id / geometry / site / plot / year.

    Args:
        shapefile_paths: Iterable of .shp paths to process.
        target_crs: CRS for the combined output (default WGS84).

    Returns:
        A cleaned GeoDataFrame with columns individual_id, geometry, site,
        plot, year, source_file, center_easting, center_northing — empty
        (but with those columns) when nothing could be read.
    """
    # Mapping from NEON site code to UTM CRS
    SITE_UTM_ZONES = {
        "ABBY": "EPSG:32610",
        "BART": "EPSG:32619",
        "BONA": "EPSG:32606",
        "CLBJ": "EPSG:32614",
        "DEJU": "EPSG:32606",
        "DELA": "EPSG:32616",
        "GRSM": "EPSG:32617",
        "GUAN": "EPSG:32619",
        "HARV": "EPSG:32618",
        "HEAL": "EPSG:32606",
        "JERC": "EPSG:32616",
        "KONZ": "EPSG:32614",
        "LENO": "EPSG:32616",
        "MLBS": "EPSG:32617",
        "MOAB": "EPSG:32612",
        "NIWO": "EPSG:32613",
        "ONAQ": "EPSG:32612",
        "OSBS": "EPSG:32617",
        "PUUM": "EPSG:32605",
        "RMNP": "EPSG:32613",
        "SCBI": "EPSG:32617",
        "SERC": "EPSG:32618",
        "SJER": "EPSG:32611",
        "SOAP": "EPSG:32611",
        "SRER": "EPSG:32612",
        "TALL": "EPSG:32616",
        "TEAK": "EPSG:32611",
        "UKFS": "EPSG:32615",
        "UNDE": "EPSG:32616",
        "WREF": "EPSG:32610",
    }

    all_gdfs = []
    for shp in shapefile_paths:
        try:
            gdf = gpd.read_file(shp)
            site, plot, year = extract_site_plot_year_from_filename(shp)
            utm_crs = SITE_UTM_ZONES.get(site)
            if utm_crs:
                if gdf.crs is None:
                    # NOTE(review): assumes a file with no declared CRS is
                    # WGS84 before converting to UTM — TODO confirm upstream.
                    gdf = gdf.set_crs("EPSG:4326")
                if str(gdf.crs) != utm_crs:
                    gdf = gdf.to_crs(utm_crs)
                # Centroids are taken in the projected (UTM) CRS so the
                # easting/northing values are in meters.
                gdf["center_easting"] = gdf.geometry.centroid.x
                gdf["center_northing"] = gdf.geometry.centroid.y
            else:
                # Only print warning if site is not None (avoid noisy output for missing/invalid files)
                if site is not None:
                    print(
                        f"Warning: No UTM zone for site {site}, skipping easting/northing extraction."
                    )
                gdf["center_easting"] = None
                gdf["center_northing"] = None
            # Normalize the crown-id column name; files without it get None.
            if "individual" in gdf.columns:
                gdf = gdf.rename(columns={"individual": "individual_id"})
            else:
                gdf["individual_id"] = None
            gdf["site"] = site
            gdf["plot"] = plot
            gdf["year"] = year
            gdf["source_file"] = shp
            keep_cols = [
                "individual_id",
                "geometry",
                "site",
                "plot",
                "year",
                "source_file",
                "center_easting",
                "center_northing",
            ]
            # Guarantee a uniform schema so pd.concat aligns columns cleanly.
            for col in keep_cols:
                if col not in gdf.columns:
                    gdf[col] = None
            gdf = gdf[keep_cols]
            # Reproject to target_crs (e.g., EPSG:4326) for concatenation, but after extracting easting/northing
            gdf = gdf.to_crs(target_crs)
            # Only append if not empty/all-NA
            if not gdf.empty and not gdf.isna().all(axis=None):
                all_gdfs.append(gdf)
        except Exception as e:
            # Best-effort: a single bad file is reported and skipped rather
            # than aborting the whole batch.
            print(f"Could not read {shp}: {e}")
    # Filter out empty/all-NA DataFrames before concatenation
    all_gdfs = [g for g in all_gdfs if not g.empty and not g.isna().all(axis=None)]
    if all_gdfs:
        combined = gpd.GeoDataFrame(
            pd.concat(all_gdfs, ignore_index=True), crs=target_crs
        )
        # Rows missing any key field are unusable downstream; drop them.
        cleaned = combined.dropna(
            subset=["individual_id", "geometry", "site", "plot", "year"]
        )
        cleaned = cleaned.reset_index(drop=True)
        return cleaned
    else:
        # Nothing readable: return an empty frame with the expected schema.
        return gpd.GeoDataFrame(
            columns=[
                "individual_id",
                "geometry",
                "site",
                "plot",
                "year",
                "source_file",
                "center_easting",
                "center_northing",
            ],
            crs=target_crs,
        )
215+
216+
217+
if __name__ == "__main__":
    import sys

    # CLI entry point: <shapefile_directory> <output.gpkg>
    if len(sys.argv) < 3:
        print(
            "Usage: python validate_and_extract_crown_metadata.py /path/to/shapefile_directory /path/to/output.gpkg"
        )
        sys.exit(1)
    input_dir, output_gpkg = sys.argv[1], sys.argv[2]
    # Recursively collect every .shp beneath the input directory.
    discovered = glob.glob(os.path.join(input_dir, "**", "*.shp"), recursive=True)
    reports = [validate_shapefile(path) for path in discovered]
    unique_reports = deduplicate_shapefiles(reports)
    summarize_validation(reports)
    # Extract crowns only from the deduplicated, validated files.
    crowns_gdf = extract_crowns_from_shapefiles(
        [report["path"] for report in unique_reports]
    )
    print(f"\nFinal cleaned crowns GeoDataFrame: {len(crowns_gdf)} rows")
    crowns_gdf.to_file(output_gpkg, driver="GPKG")
    print(f"Saved cleaned crown metadata to {output_gpkg}")

neon_tree_classification/data/shapefile_processing/validation.py

Whitespace-only changes.

scripts/test_shapefile_processor.py

Lines changed: 0 additions & 77 deletions
This file was deleted.

0 commit comments

Comments
 (0)