1+ import geopandas as gpd
2+ import pandas as pd
3+ import os
4+ import glob
5+ import re
6+ import shutil
7+
def consolidate_files(parent_dir, destination_dir):
    """
    Copy all shapefile component files from subdirectories of ``parent_dir``
    into the flat directory ``destination_dir``.

    Files are copied (not moved) with ``shutil.copy2`` so timestamps are
    preserved. A file whose name already exists in the destination is skipped
    with a warning rather than overwritten. A per-extension summary of the
    destination is printed at the end.

    Parameters
    ----------
    parent_dir : str
        Root of the directory tree to search.
    destination_dir : str
        Directory receiving the copies. May be nested inside ``parent_dir``;
        it is excluded from the search.
    """
    # Normalize both paths so trailing slashes (the caller passes parent_dir
    # with one) cannot defeat the "skip this directory" comparisons below.
    parent_norm = os.path.normpath(parent_dir)
    dest_norm = os.path.normpath(destination_dir)

    errors = []        # (file_path, error message) for each failed copy
    files_copied = []  # source paths successfully copied

    # Extensions that together make up a complete shapefile.
    shapefile_exts = ('.shp', '.shx', '.dbf', '.prj', '.cpg')

    for root, dirs, files in os.walk(parent_norm):
        # Prune the destination from the walk so we never scan (or re-copy)
        # files that are already consolidated.
        dirs[:] = [d for d in dirs
                   if os.path.normpath(os.path.join(root, d)) != dest_norm]

        # Skip the parent directory itself and the destination directory.
        if os.path.normpath(root) in (parent_norm, dest_norm):
            continue

        print(f"Checking directory: {root}")

        # Collect every shapefile component file in this directory.
        all_files = []
        for ext in shapefile_exts:
            all_files.extend(glob.glob(os.path.join(root, f'*{ext}')))

        for file_path in all_files:
            try:
                file_name = os.path.basename(file_path)
                dest_path = os.path.join(destination_dir, file_name)

                # Never overwrite an existing file in the destination.
                if os.path.exists(dest_path):
                    print(f"Warning: File {file_name} already exists in destination. Ignoring.")
                    continue

                shutil.copy2(file_path, dest_path)  # copy2 preserves metadata
                files_copied.append(file_path)
                print(f"Copied: {file_path} -> {dest_path}")
            except Exception as e:
                # Record and continue: one bad file shouldn't abort the batch.
                errors.append((file_path, str(e)))
                print(f"Error copying {file_path}: {e}")

    # Summary.
    print("\nOperation complete!")
    print(f"Total files moved: {len(files_copied)}")
    print(f"Errors encountered: {len(errors)}")

    if errors:
        print("\nFiles that couldn't be copied:")
        for file_path, error in errors:
            print(f"- {file_path}: {error}")

    # Sanity check: tally destination contents by extension.
    print("\nFiles in destination directory by extension:")
    for ext in shapefile_exts:
        count = len(glob.glob(os.path.join(destination_dir, f'*{ext}')))
        print(f"{ext}: {count} files")
62+
def extract_metadata(filename):
    """Parse site, plot, and year from a filename like ``SOAP_052_2019.shp``.

    The stem (text before the first dot) is split on underscores; the first
    three fields are returned as ``(site, plot, year)``. Names with fewer
    than three fields yield ``("unknown", "unknown", "unknown")``.
    """
    stem = os.path.basename(filename).split('.')[0]
    fields = stem.split('_')
    if len(fields) < 3:
        return "unknown", "unknown", "unknown"
    return fields[0], fields[1], fields[2]
73+
def get_centroid_coords(gdf):
    """
    Return the bounding-box center, bounds, and UTM zone of a GeoDataFrame.

    Note: this is the center of ``total_bounds``, not the true geometric
    centroid — a cheap approximation that sidesteps MultiPolygon handling.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Anything exposing ``total_bounds`` and ``crs``.

    Returns
    -------
    tuple
        ``(center_x, center_y, bounds, utm_zone)`` where ``bounds`` is
        (minx, miny, maxx, maxy) and ``utm_zone`` is the zone number as a
        string, or ``"unknown"`` when it cannot be determined.
    """
    bounds = gdf.total_bounds  # minx, miny, maxx, maxy
    center_x = (bounds[0] + bounds[2]) / 2
    center_y = (bounds[1] + bounds[3]) / 2

    # Extract the UTM zone number from the CRS name, if present.
    # Guard against gdf.crs being None (e.g. a shapefile with no .prj),
    # which would otherwise raise AttributeError on .name.
    utm_zone = "unknown"
    if gdf.crs is not None and 'utm' in gdf.crs.name.lower():
        utm_match = re.search(r'utm zone (\d+)', gdf.crs.name.lower())
        if utm_match:
            utm_zone = utm_match.group(1)

    return center_x, center_y, bounds, utm_zone
89+
def process_shp_files(consolidated_dir):
    """
    Extract metadata and coordinates from every shapefile in a directory.

    For each ``*.shp`` file: parse site/plot/year from the filename, read the
    geometry with geopandas, and record the bounding-box center, bounds, UTM
    zone, CRS string, and feature count. The combined table is printed, saved
    as ``neon_sites_coordinates.csv`` inside ``consolidated_dir``, and
    returned.

    Parameters
    ----------
    consolidated_dir : str
        Directory containing the consolidated shapefiles.

    Returns
    -------
    pandas.DataFrame or None
        The metadata table, or ``None`` when no shapefile was processed.
    """
    shp_files = glob.glob(os.path.join(consolidated_dir, '*.shp'))

    sites_data = []  # one dict per successfully processed shapefile

    for shp_file in shp_files:
        try:
            # Metadata from the filename, geometry stats from the file.
            site, plot, year = extract_metadata(shp_file)
            gdf = gpd.read_file(shp_file)
            center_x, center_y, bounds, utm_zone = get_centroid_coords(gdf)

            sites_data.append({
                'filename': os.path.basename(shp_file),
                'site': site,
                'plot': plot,
                'year': year,
                'center_easting': center_x,
                'center_northing': center_y,
                'min_easting': bounds[0],
                'min_northing': bounds[1],
                'max_easting': bounds[2],
                'max_northing': bounds[3],
                'utm_zone': utm_zone,
                'crs': str(gdf.crs),
                'num_polygons': len(gdf)
            })

            print(f"Processed: {os.path.basename(shp_file)}")
        except Exception as e:
            # Keep going: one unreadable shapefile shouldn't abort the batch.
            print(f"Error processing {shp_file}: {e}")

    # Guard: an empty DataFrame has no 'site'/'crs'/'num_polygons' columns,
    # so the groupby/sum calls below would raise KeyError.
    if not sites_data:
        print("\nNo shapefiles were processed; nothing to summarize.")
        return None

    sites_df = pd.DataFrame(sites_data)

    print("\nSites and Coordinates Summary:")
    print(sites_df.head())

    # Persist the full table next to the shapefiles.
    csv_path = os.path.join(consolidated_dir, 'neon_sites_coordinates.csv')
    sites_df.to_csv(csv_path, index=False)
    print(f"\nSaved coordinates to: {csv_path}")

    print("\nNumber of plots per site:")
    print(sites_df.groupby('site').size())

    print("\nCoordinate Reference Systems used:")
    crs_counts = sites_df.groupby(['site', 'crs']).size().reset_index(name='count')
    print(crs_counts)

    total_polygons = sites_df['num_polygons'].sum()
    print(f"\nTotal number of polygons across all shapefiles: {total_polygons}")

    return sites_df
158+
def _main():
    """Entry point: consolidate the shapefiles, then summarize them to CSV."""
    parent_dir = '/blue/azare/riteshchowdhry/Macrosystems/Data_files/hand_annotated_neon/'
    destination_dir = '/blue/azare/riteshchowdhry/Macrosystems/Data_files/hand_annotated_neon/consolidated_dir'

    # Ensure the target directory exists before any copy happens.
    os.makedirs(destination_dir, exist_ok=True)

    # Flatten every shapefile component into the destination directory.
    consolidate_files(parent_dir, destination_dir)

    # Build the metadata CSV from the consolidated shapefiles.
    process_shp_files(destination_dir)


if __name__ == "__main__":
    _main()
171+