Skip to content

Commit 6e8be2e

Browse files
committed
add shp files curation code
1 parent 949b878 commit 6e8be2e

1 file changed

Lines changed: 171 additions & 0 deletions

File tree

src/curate_shp_files.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import geopandas as gpd
2+
import pandas as pd
3+
import os
4+
import glob
5+
import re
6+
import shutil
7+
8+
def consolidate_files(parent_dir, destination_dir):
    """
    Consolidate shapefiles from subdirectories into a single directory.

    Walks ``parent_dir`` recursively and copies every shapefile component
    (.shp, .shx, .dbf, .prj, .cpg) found in its subdirectories into
    ``destination_dir``. Files whose names already exist in the destination
    are skipped with a warning (first copy wins). A summary of copies,
    errors, and per-extension counts is printed at the end.

    Parameters
    ----------
    parent_dir : str
        Directory tree to search for shapefile components.
    destination_dir : str
        Flat directory that receives the copies. Must already exist.
    """
    shapefile_exts = ['.shp', '.shx', '.dbf', '.prj', '.cpg']

    # Normalize so path comparisons aren't defeated by trailing slashes.
    parent_abs = os.path.abspath(parent_dir)
    dest_abs = os.path.abspath(destination_dir)

    errors = []        # (file_path, error message) pairs for the summary
    files_copied = []  # source paths successfully copied

    for root, dirs, files in os.walk(parent_dir):
        root_abs = os.path.abspath(root)

        # Prune the destination from the traversal: otherwise, when the
        # destination lives inside parent_dir, os.walk would descend into
        # it and "find" files that were consolidated on a previous run.
        dirs[:] = [d for d in dirs
                   if os.path.join(root_abs, d) != dest_abs]

        # Skip the parent directory itself and the destination directory.
        if root_abs == parent_abs or root_abs == dest_abs:
            continue

        print(f"Checking directory: {root}")

        # Find all shapefile-related files in this directory
        # (looking for all extensions: .shp, .shx, .dbf, .prj, .cpg).
        all_files = []
        for ext in shapefile_exts:
            all_files.extend(glob.glob(os.path.join(root, f'*{ext}')))

        # Copy each file to the destination.
        for file_path in all_files:
            try:
                file_name = os.path.basename(file_path)
                dest_path = os.path.join(destination_dir, file_name)

                # First copy wins: never overwrite an existing file.
                if os.path.exists(dest_path):
                    print(f"Warning: File {file_name} already exists in destination. Ignoring.")
                    continue

                # copy2 preserves timestamps/metadata, unlike plain copy.
                shutil.copy2(file_path, dest_path)
                files_copied.append(file_path)
                print(f"Copied: {file_path} -> {dest_path}")
            except Exception as e:
                errors.append((file_path, str(e)))
                print(f"Error copying {file_path}: {e}")

    # Print summary
    print("\nOperation complete!")
    print(f"Total files moved: {len(files_copied)}")
    print(f"Errors encountered: {len(errors)}")

    if errors:
        print("\nFiles that couldn't be copied:")
        for file_path, error in errors:
            print(f"- {file_path}: {error}")

    # Count files in destination by extension.
    print("\nFiles in destination directory by extension:")
    for ext in shapefile_exts:
        count = len(glob.glob(os.path.join(destination_dir, f'*{ext}')))
        print(f"{ext}: {count} files")
62+
63+
# Function to extract site, plot, and year from filename
64+
def extract_metadata(filename):
    """
    Parse (site, plot, year) out of a shapefile name.

    Expects names of the form ``SITE_PLOT_YEAR.shp``, e.g.
    ``SOAP_052_2019.shp``. Returns ``("unknown", "unknown", "unknown")``
    when the name does not have at least three underscore-separated parts.
    """
    stem = os.path.basename(filename).split('.')[0]
    parts = stem.split('_')
    if len(parts) < 3:
        # Name doesn't follow the site_plot_year convention.
        return "unknown", "unknown", "unknown"
    site, plot, year = parts[:3]
    return site, plot, year
73+
74+
# Function to calculate centroid coordinates
75+
def get_centroid_coords(gdf):
    """
    Return the bounding-box center, bounds, and UTM zone of a GeoDataFrame.

    Note: this is the center of the *total bounding box* over all
    geometries (which also handles MultiPolygons), not the true geometric
    centroid of the shapes.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Geometries to summarize.

    Returns
    -------
    tuple
        ``(center_x, center_y, bounds, utm_zone)`` where ``bounds`` is
        ``(minx, miny, maxx, maxy)`` and ``utm_zone`` is the zone number
        as a string, or ``"unknown"`` when the CRS is missing or not UTM.
    """
    bounds = gdf.total_bounds  # minx, miny, maxx, maxy
    center_x = (bounds[0] + bounds[2]) / 2
    center_y = (bounds[1] + bounds[3]) / 2

    # Get the UTM zone if available from projection info. Guard against
    # gdf.crs being None (e.g. a shapefile shipped without a .prj file),
    # which would otherwise raise AttributeError on .crs.name.
    utm_zone = "unknown"
    if gdf.crs is not None and 'utm' in gdf.crs.name.lower():
        utm_match = re.search(r'utm zone (\d+)', gdf.crs.name.lower())
        if utm_match:
            utm_zone = utm_match.group(1)

    return center_x, center_y, bounds, utm_zone
89+
90+
def process_shp_files(consolidated_dir):
    """
    Process shapefiles in the consolidated directory to extract metadata
    and coordinates.

    Reads every ``.shp`` in ``consolidated_dir``, records filename-derived
    metadata (site/plot/year), bounding-box center coordinates, CRS and
    polygon count, prints summary tables, and writes
    ``neon_sites_coordinates.csv`` into the same directory.

    Parameters
    ----------
    consolidated_dir : str
        Directory containing the consolidated shapefiles.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed shapefile; empty if no
        shapefiles were found or none could be read.
    """
    # Find all shapefiles
    shp_files = glob.glob(os.path.join(consolidated_dir, '*.shp'))

    # List to store metadata
    sites_data = []

    # Process each shapefile; a failure on one file must not abort the rest.
    for shp_file in shp_files:
        try:
            # Extract metadata from filename
            site, plot, year = extract_metadata(shp_file)

            # Read the shapefile
            gdf = gpd.read_file(shp_file)

            # Get centroid coordinates and bounds
            center_x, center_y, bounds, utm_zone = get_centroid_coords(gdf)

            # Store the data
            sites_data.append({
                'filename': os.path.basename(shp_file),
                'site': site,
                'plot': plot,
                'year': year,
                'center_easting': center_x,
                'center_northing': center_y,
                'min_easting': bounds[0],
                'min_northing': bounds[1],
                'max_easting': bounds[2],
                'max_northing': bounds[3],
                'utm_zone': utm_zone,
                'crs': str(gdf.crs),
                'num_polygons': len(gdf)
            })

            print(f"Processed: {os.path.basename(shp_file)}")

        except Exception as e:
            print(f"Error processing {shp_file}: {e}")

    # Create a DataFrame with the collected data
    sites_df = pd.DataFrame(sites_data)

    # Guard: on an empty frame the groupby/sum below would raise KeyError
    # (the columns don't exist), and writing an empty CSV is not useful.
    if sites_df.empty:
        print("\nNo shapefiles were processed; nothing to summarize.")
        return sites_df

    # Display the DataFrame
    print("\nSites and Coordinates Summary:")
    print(sites_df.head())

    # Save to CSV
    csv_path = os.path.join(consolidated_dir, 'neon_sites_coordinates.csv')
    sites_df.to_csv(csv_path, index=False)
    print(f"\nSaved coordinates to: {csv_path}")

    # Additional analysis - group by site
    print("\nNumber of plots per site:")
    site_counts = sites_df.groupby('site').size()
    print(site_counts)

    # Check which coordinate reference system (CRS) is used for each site
    print("\nCoordinate Reference Systems used:")
    crs_counts = sites_df.groupby(['site', 'crs']).size().reset_index(name='count')
    print(crs_counts)

    total_polygons = sites_df['num_polygons'].sum()
    print(f"\nTotal number of polygons across all shapefiles: {total_polygons}")

    return sites_df
158+
159+
def _main():
    """Consolidate hand-annotated NEON shapefiles and build the metadata CSV."""
    # Source tree of per-site shapefile subdirectories and the flat
    # directory that receives the consolidated copies.
    parent_dir = '/blue/azare/riteshchowdhry/Macrosystems/Data_files/hand_annotated_neon/'
    destination_dir = '/blue/azare/riteshchowdhry/Macrosystems/Data_files/hand_annotated_neon/consolidated_dir'

    # Make sure the destination exists before any copies are attempted.
    os.makedirs(destination_dir, exist_ok=True)

    # Step 1: flatten shapefile components into the destination directory.
    consolidate_files(parent_dir, destination_dir)

    # Step 2: extract metadata from the consolidated shapefiles into a CSV.
    process_shp_files(destination_dir)


if __name__ == "__main__":
    _main()
171+

0 commit comments

Comments
 (0)