1+ from math import asin
2+
3+ import numpy as np
14import pandas as pd
2- from powersimdata .utility .distance import haversine
35from scipy .optimize import curve_fit
6+ from scipy .spatial import KDTree
47from scipy .stats import linregress
58
69from prereise .gather .griddata .hifld import const
710from prereise .gather .griddata .hifld .data_access import load
11+ from prereise .gather .helpers import latlon_to_xyz
812
913
1014def floatify (value , default = float ("nan" )):
@@ -22,54 +26,127 @@ def floatify(value, default=float("nan")):
2226 return default
2327
2428
25- def map_generator_to_sub_by_location (generator , substation_groupby ):
26- """Determine a likely substation for a generator to be connected to. Priority order
27- of mapping is: 1) if location is available and one or more substations exist in that
28- ZIP code, map by location to closest substation within that ZIP code, 2) if location
29- is available but no substations exist in that ZIP code, map to the closest
30- substation within neighboring ZIP codes, 3) if only ZIP code is available
31- (no location), and one or more substations exist, map to an arbitrarily chosen
32- substation within that ZIP code, 4) if only ZIP code is available (no location)
33- but no substations exist in that ZIP code, return NA.
34-
35- :param pandas.Series generator: one generating unit from data frame.
36- :param pandas.GroupBy substation_groupby: data frame of substations, grouped by
37- (interconnect, ZIP).
38- :return: (*int/pd.NA*) -- substation ID if the generator can be mapped successfully
39- to a substation, else pd.NA.
29+ def map_generators_to_sub_by_location (
30+ generators , substations , inplace = True , report_worst = None
31+ ):
32+ """Determine the closest substation to each generator. For generators without
33+ latitude and longitude, an attempt will be made to match via ZIP code, and failing
34+ that a pandas.NA value will be returned.
35+
36+ :param pandas.DataFrame generators: generator data. Required columns:
37+ 'interconnect', 'lat', 'lon', 'ZIP'.
38+ :param pandas.DataFrame substations: substation data. Required columns:
39+ 'interconnect', 'lat', 'lon', 'ZIP'.
40+ :param bool inplace: whether to modify the generator table inplace with new 'sub_id'
41+ and 'sub_dist' columns or to return a new one.
42+ :param int report_worst: if not None, display the distances of the worst N mappings.
43+ :return: (*pandas.DataFrame/None*) -- if ``inplace`` is `False`, return the modified
44+ DataFrame; otherwise return nothing.
4045 """
41- lookup_params = tuple (generator .loc [["interconnect" , "ZIP" ]])
42- if pd .isna (generator ["lat" ]) or pd .isna (generator ["lon" ]):
43- # No location available
44- try :
45- matching_subs = substation_groupby .get_group (lookup_params )
46- return matching_subs .index [0 ]
47- except KeyError :
48- return pd .NA
49- try :
50- # This ZIP code contains substations, this block will execute successfully
51- matching_subs = substation_groupby .get_group (lookup_params )
52- except KeyError :
53- # If this ZIP code does not contain substations, this block will execute, and
54- # we select a set of 'nearby' substations
55- zip_range = [int (generator .loc ["ZIP" ]) + offset for offset in range (- 100 , 101 )]
56- zip_range_strings = [str (z ).rjust (5 , "0" ) for z in zip_range ]
57- try :
58- matching_subs = pd .concat (
59- [
60- substation_groupby .get_group ((generator .loc ["interconnect" ], z ))
61- for z in zip_range_strings
62- if (generator .loc ["interconnect" ], z ) in substation_groupby .groups
63- ]
64- )
65- except ValueError :
66- # If no matching subs within the given interconnection and ZIPs, give up
46+
47+ def get_closest_substation (generator , voltage_trees , subs_voltage_lookup ):
48+ if not isinstance (generator ["xyz" ], list ):
6749 return pd .NA
68- distance_to_subs = matching_subs .apply (
69- lambda x : haversine ((x .lat , x .lon ), (generator .lat , generator .lon )),
50+ if pd .isnull (generator ["voltage_class" ]) or generator ["Pmax" ] < 100 :
51+ grouper_key = generator ["interconnect" ]
52+ else :
53+ grouper_key = (generator ["interconnect" ], generator ["voltage_class" ])
54+ chord_dist , array_index = voltage_trees [grouper_key ].query (generator ["xyz" ])
55+ sub_id = subs_voltage_lookup [grouper_key ][array_index ]
56+ # Translate chord distance (unit circle) to great circle distance (miles)
57+ dist_in_miles = 3963 * 2 * asin (chord_dist / 2 ) # use 3963 mi as earth radius
58+ return pd .Series ({"dist" : dist_in_miles , "sub_id" : sub_id })
59+
60+ def classify_voltages (voltage , voltage_ranges ):
61+ for v_range , bounds in voltage_ranges .items ():
62+ if bounds ["min" ] <= voltage <= bounds ["max" ]:
63+ return v_range
64+ return float ("nan" )
65+
66+ voltage_ranges = {
67+ "under 100" : {"min" : 0 , "max" : 99 },
68+ "100-161" : {"min" : 100 , "max" : 161 },
69+ "220-287" : {"min" : 220 , "max" : 287 },
70+ "345" : {"min" : 345 , "max" : 345 },
71+ "500" : {"min" : 500 , "max" : 500 },
72+ "735 and above" : {"min" : 735 , "max" : float ("inf" )},
73+ }
74+
75+ # Translate lat/lon to 3D positions (assume spherical earth, origin at center)
76+ substations_with_xyz = substations .assign (
77+ xyz = substations .apply (lambda x : latlon_to_xyz (x ["lat" ], x ["lon" ]), axis = 1 )
78+ )
79+ generators_with_xyz = generators .assign (
80+ xyz = generators .apply (
81+ lambda x : (
82+ pd .NA
83+ if pd .isna (x ["lat" ]) or pd .isna (x ["lon" ])
84+ else latlon_to_xyz (x ["lat" ], x ["lon" ])
85+ ),
86+ axis = 1 ,
87+ )
88+ )
89+
90+ # Bin voltages into broad classes
91+ generators_with_xyz ["voltage_class" ] = generators ["Grid Voltage (kV)" ].map (
92+ lambda x : classify_voltages (x , voltage_ranges )
93+ )
94+
95+ # Group substations by voltage to build KDTrees
96+ subs_voltage_lookup = {
97+ (interconnect , voltage_level ): substations_with_xyz .loc [
98+ (substations_with_xyz ["interconnect" ] == interconnect )
99+ & (substations_with_xyz ["MAX_VOLT" ] >= voltage_range ["min" ])
100+ ].index
101+ for interconnect in generators ["interconnect" ].unique ()
102+ for voltage_level , voltage_range in voltage_ranges .items ()
103+ }
104+ # Group substations by ZIP code for a fallback for generators without coordinates
105+ subs_zip_groupby = substations_with_xyz .groupby (["interconnect" , "ZIP" ])
106+
107+ # Create a KDTree for each combination of voltage and interconnect
108+ voltage_trees = {
109+ key : KDTree (np .vstack (substations_with_xyz .loc [sub_ids , "xyz" ]))
110+ for key , sub_ids in subs_voltage_lookup .items ()
111+ if len (sub_ids ) > 0
112+ }
113+ # Create a KDTree for each interconnect (all voltages)
114+ subs_interconnect_groupby = substations_with_xyz .groupby ("interconnect" )
115+ for interconnect in generators ["interconnect" ].unique ():
116+ tree_subs = subs_interconnect_groupby .get_group (interconnect )
117+ voltage_trees [interconnect ] = KDTree (np .vstack (tree_subs ["xyz" ]))
118+ subs_voltage_lookup [interconnect ] = tree_subs .index
119+
120+ # Query the appropriate tree for each generator to get the closest substation ID
121+ mapping_results = generators_with_xyz .apply (
122+ lambda x : get_closest_substation (x , voltage_trees , subs_voltage_lookup ),
70123 axis = 1 ,
71124 )
72- return distance_to_subs .idxmin ()
125+ # For generators without coordinates, try to pick a substation with a matching ZIP
126+ for g in generators .loc [mapping_results ["sub_id" ].isnull ()].index :
127+ try :
128+ candidates = subs_zip_groupby .get_group (
129+ (generators .loc [g , "interconnect" ], generators .loc [g , "ZIP" ])
130+ )
131+ # arbitrary choose the first one
132+ mapping_results .loc [g , "sub_id" ] = candidates .index [0 ]
133+ except KeyError :
134+ continue # No coordinates, no matching ZIP, we're out of luck
135+
136+ if report_worst is not None :
137+ print (
138+ mapping_results .sort_values ("sub_dist" , ascending = False )
139+ .join (generators [["Plant Code" , "Grid Voltage (kV)" , "Pmax" ]])
140+ .head (report_worst )
141+ )
142+
143+ if inplace :
144+ generators ["sub_id" ] = mapping_results ["sub_id" ]
145+ generators ["sub_dist" ] = mapping_results ["dist" ]
146+ else :
147+ return generators_with_xyz .drop (["xyz" , "voltage_class" ], axis = 1 ).join (
148+ mapping_results
149+ )
73150
74151
75152def map_generator_to_bus_by_sub (generator , bus_groupby ):
@@ -83,7 +160,15 @@ def map_generator_to_bus_by_sub(generator, bus_groupby):
83160 if pd .isna (generator .sub_id ):
84161 return pd .NA
85162 else :
86- return bus_groupby .get_group (generator .sub_id )["baseKV" ].idxmin ()
163+ bus_voltages = bus_groupby .get_group (generator .sub_id )["baseKV" ]
164+ if len (bus_voltages ) == 1 or generator .Pmax < 200 :
165+ # Return the lowest-voltage for small generators, or the only voltage
166+ return bus_voltages .idxmin ()
167+ if generator .Pmax < 500 :
168+ # Return the second-lowest voltage for mid-sized generators
169+ return bus_voltages .sort_values ().index [1 ]
170+ # Return the highest voltage for large generators
171+ return bus_voltages .idxmax ()
87172
88173
89174def estimate_heat_rate_curve (
@@ -246,6 +331,30 @@ def estimate_coefficients(generator, regressions):
246331 generators .update (linear_heat_rate_assumptions )
247332
248333
334+ def aggregate_hydro_generators_by_plant_id (generators ):
335+ """Combine hydro generators within the same plant into aggregated larger generators.
336+ 'Pmin' and 'Pmax' values will be summed, all other attributes (including the index)
337+ will be taken from the (somewhat arbitrary) first generator in the plant grouping.
338+
339+ :param pandas.DataFrame generators: data frame of generators.
340+ :return: (*pandas.DataFrame*) -- data frame of generators, with hydro generators
341+ aggregated.
342+ """
343+ indiv_hydro_gens = generators .query ("`Energy Source 1` == 'WAT'" ).copy ()
344+ original_hydro_indices = indiv_hydro_gens .index .tolist ()
345+ # Retain the original indices to keep track of original indices for later append
346+ indiv_hydro_gens .reset_index (inplace = True )
347+ hydro_groupby = indiv_hydro_gens .groupby ("Plant Code" )
348+ # Choose characteristics from the (arbitrary) first plant
349+ aggregated_hydro = hydro_groupby .first ()
350+ aggregated_hydro [["Pmin" , "Pmax" ]] = hydro_groupby [["Pmin" , "Pmax" ]].sum ()
351+ # Reset/set index to restore 'Plant Code' as a column and original index numbering
352+ aggregated_hydro .reset_index (inplace = True )
353+ aggregated_hydro .set_index ("index" , inplace = True ) # 'index' was the original
354+ generators = generators .drop (original_hydro_indices ).append (aggregated_hydro )
355+ return generators
356+
357+
249358def build_plant (bus , substations , kwargs = {}):
250359 """Use source data on generating units from EIA/EPA, along with transmission network
251360 data, to produce a plant data frame.
@@ -286,35 +395,47 @@ def build_plant(bus, substations, kwargs={}):
286395 bus_groupby = bus .groupby (bus ["sub_id" ].astype (int ))
287396 # Filter substations with no buses
288397 substations = substations .loc [set (bus_groupby .groups .keys ())]
289- substation_groupby = substations .groupby (["interconnect" , "ZIP" ])
290398 epa_ampd_groupby = epa_ampd .groupby (["ORISPL_CODE" , "UNITID" ])
291399
292- # Add information
293- generators ["interconnect" ] = (
294- generators ["Plant Code" ]
295- .map (plants ["NERC Region" ])
296- .map (const .nercregion2interconnect )
297- )
298- generators ["lat" ] = generators ["Plant Code" ].map (plants ["Latitude" ])
299- generators ["lon" ] = generators ["Plant Code" ].map (plants ["Longitude" ])
300- generators ["ZIP" ] = generators ["Plant Code" ].map (plants ["Zip" ])
301- generators ["Balancing Authority Code" ] = generators ["Plant Code" ].map (
302- plants ["Balancing Authority Code" ]
400+ # Add information to generators based on Form 860 Plant table
401+ # Merging this way allows column-on-column merge while preserving original index
402+ generators = (
403+ generators .reset_index ()
404+ .merge (
405+ plants ,
406+ on = "Plant Code" ,
407+ suffixes = (None , "_860Plant" ),
408+ )
409+ .set_index ("index" )
303410 )
304- print ("Mapping generators to substations... (this may take several minutes)" )
305- generators ["sub_id" ] = generators .apply (
306- lambda x : map_generator_to_sub_by_location (x , substation_groupby ), axis = 1
411+ generators .rename (
412+ {"Latitude" : "lat" , "Longitude" : "lon" , "Zip" : "ZIP" }, axis = 1 , inplace = True
307413 )
308- generators ["bus_id" ] = generators .apply (
309- lambda x : map_generator_to_bus_by_sub (x , bus_groupby ), axis = 1
414+ # Map interconnect via BA first (more reliable) then by NERC Region
415+ generators ["interconnect" ] = (
416+ generators ["Balancing Authority Code" ]
417+ .map (const .balancingauthority2interconnect )
418+ .combine_first (generators ["NERC Region" ].map (const .nercregion2interconnect ))
310419 )
420+ generators ["Grid Voltage (kV)" ] = generators ["Grid Voltage (kV)" ].map (floatify )
421+
422+ # Ensure we have Pmax and Pmin for each generator
311423 generators ["Pmax" ] = generators [
312424 ["Summer Capacity (MW)" , "Winter Capacity (MW)" ]
313425 ].max (axis = 1 )
314426 # Drop generators with no capacity listed
315427 generators = generators .loc [~ generators ["Pmax" ].isnull ()]
316428 generators .rename ({"Minimum Load (MW)" : "Pmin" }, inplace = True , axis = 1 )
317429 generators ["Pmin" ] = generators ["Pmin" ].fillna (0 )
430+
431+ # Aggregate hydro generators within each plant
432+ generators = aggregate_hydro_generators_by_plant_id (generators )
433+
434+ map_generators_to_sub_by_location (generators , substations )
435+ generators ["bus_id" ] = generators .apply (
436+ lambda x : map_generator_to_bus_by_sub (x , bus_groupby ), axis = 1
437+ )
438+
318439 print ("Fitting heat rate curves to EPA data... (this may take several minutes)" )
319440 heat_rate_curve_estimates = generators .apply (
320441 lambda x : estimate_heat_rate_curve (
0 commit comments