Skip to content

Commit 2dd569e

Browse files
authored
Merge pull request #267 from Breakthrough-Energy/daniel/hifld_gen_bus_mapping
refactor: improve mapping of generators to buses
2 parents fbf0a90 + ffb2d4d commit 2dd569e

6 files changed

Lines changed: 301 additions & 67 deletions

File tree

prereise/gather/griddata/hifld/const.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,79 @@
194194
"Flywheels",
195195
}
196196

197+
balancingauthority2interconnect = {
198+
"AEC": "Eastern",
199+
"AECI": "Eastern",
200+
"AVA": "Western",
201+
"AVRN": "Western",
202+
"AZPS": "Western",
203+
"BANC": "Western",
204+
"BPAT": "Western",
205+
"CHPD": "Western",
206+
"CISO": "Western",
207+
"CPLE": "Eastern",
208+
"CPLW": "Eastern",
209+
"CSTO": "Western",
210+
"DEAA": "Western",
211+
"DOPD": "Western",
212+
"DUK": "Eastern",
213+
"EEI": "Eastern",
214+
"EPE": "Western",
215+
"ERCO": "ERCOT",
216+
"FMPP": "Eastern",
217+
"FPC": "Eastern",
218+
"FPL": "Eastern",
219+
"GCPD": "Western",
220+
"GRIF": "Western",
221+
"GRIS": "Western",
222+
"GRMA": "Western",
223+
"GVL": "Eastern",
224+
"GWA": "Western",
225+
"HGMA": "Western",
226+
"HST": "Eastern",
227+
"IID": "Western",
228+
"IPCO": "Western",
229+
"ISNE": "Eastern",
230+
"JEA": "Eastern",
231+
"LDWP": "Western",
232+
"LGEE": "Eastern",
233+
"MISO": "Eastern",
234+
"NBSO": "Eastern",
235+
"NEVP": "Western",
236+
"NSB": "Eastern",
237+
"NWMT": "Western",
238+
"NYIS": "Eastern",
239+
"OVEC": "Eastern",
240+
"PACE": "Western",
241+
"PACW": "Western",
242+
"PGE": "Western",
243+
"PJM": "Eastern",
244+
"PNM": "Western",
245+
"PSCO": "Western",
246+
"PSEI": "Western",
247+
"SC": "Eastern",
248+
"SCEG": "Eastern",
249+
"SCL": "Western",
250+
"SEC": "Eastern",
251+
"SEPA": "Eastern",
252+
"SOCO": "Eastern",
253+
"SPA": "Eastern",
254+
"SRP": "Western",
255+
"SWPP": "Eastern",
256+
"TAL": "Eastern",
257+
"TEC": "Eastern",
258+
"TEPC": "Western",
259+
"TIDC": "Western",
260+
"TPWR": "Western",
261+
"TVA": "Eastern",
262+
# "WACM": "Western", # can be Western or Eastern
263+
# "WALC": "Western", # can be Western or Eastern
264+
# "WAUW": "Western", # can be Western or Eastern
265+
"WWA": "Western",
266+
"YAD": "Eastern",
267+
}
268+
269+
# Usage of this is deprecated, since these data seem noisier than Balancing Authorities
197270
nercregion2interconnect = {
198271
"ASCC": "Alaska", # Not currently used
199272
"HICC": "Hawaii", # Not currently used
@@ -438,6 +511,18 @@
438511
"west": -125,
439512
}
440513

514+
proxy_substations = [
515+
{"LATITUDE": 35.0514, "LONGITUDE": -81.0694, "NAME": "Catawba", "STATE": "SC"},
516+
{
517+
"LATITUDE": 44.2853,
518+
"LONGITUDE": -105.3826,
519+
"NAME": "Neil Simpson II",
520+
"STATE": "WY",
521+
},
522+
]
523+
524+
substations_lines_filter_override = {301995}
525+
441526
seams_substations = {
442527
"east_west": {
443528
202364, # 'LAMAR HVDC TIE'

prereise/gather/griddata/hifld/data_process/generators.py

Lines changed: 184 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
1+
from math import asin
2+
3+
import numpy as np
14
import pandas as pd
2-
from powersimdata.utility.distance import haversine
35
from scipy.optimize import curve_fit
6+
from scipy.spatial import KDTree
47
from scipy.stats import linregress
58

69
from prereise.gather.griddata.hifld import const
710
from prereise.gather.griddata.hifld.data_access import load
11+
from prereise.gather.helpers import latlon_to_xyz
812

913

1014
def floatify(value, default=float("nan")):
@@ -22,54 +26,127 @@ def floatify(value, default=float("nan")):
2226
return default
2327

2428

25-
def map_generator_to_sub_by_location(generator, substation_groupby):
26-
"""Determine a likely substation for a generator to be connected to. Priority order
27-
of mapping is: 1) if location is available and one or more substations exist in that
28-
ZIP code, map by location to closest substation within that ZIP code, 2) if location
29-
is available but no substations exist in that ZIP code, map to the closest
30-
substation within neighboring ZIP codes, 3) if only ZIP code is available
31-
(no location), and one or more substations exist, map to an arbitrarily chosen
32-
substation within that ZIP code, 4) if only ZIP code is available (no location)
33-
but no substations exist in that ZIP code, return NA.
34-
35-
:param pandas.Series generator: one generating unit from data frame.
36-
:param pandas.GroupBy substation_groupby: data frame of substations, grouped by
37-
(interconnect, ZIP).
38-
:return: (*int/pd.NA*) -- substation ID if the generator can be mapped successfully
39-
to a substation, else pd.NA.
29+
def map_generators_to_sub_by_location(
30+
generators, substations, inplace=True, report_worst=None
31+
):
32+
"""Determine the closest substation to each generator. For generators without
33+
latitude and longitude, an attempt will be made to match via ZIP code, and failing
34+
that a pandas.NA value will be returned.
35+
36+
:param pandas.DataFrame generators: generator data. Required columns:
37+
'interconnect', 'lat', 'lon', 'ZIP'.
38+
:param pandas.DataFrame substations: substation data. Required columns:
39+
'interconnect', 'lat', 'lon', 'ZIP'.
40+
:param bool inplace: whether to modify the generator table inplace with new 'sub_id'
41+
and 'sub_dist' columns or to return a new one.
42+
:param int report_worst: if not None, display the distances of the worst N mappings.
43+
:return: (*pandas.DataFrame/None*) -- if ``inplace`` is `False`, return the modified
44+
DataFrame; otherwise return nothing.
4045
"""
41-
lookup_params = tuple(generator.loc[["interconnect", "ZIP"]])
42-
if pd.isna(generator["lat"]) or pd.isna(generator["lon"]):
43-
# No location available
44-
try:
45-
matching_subs = substation_groupby.get_group(lookup_params)
46-
return matching_subs.index[0]
47-
except KeyError:
48-
return pd.NA
49-
try:
50-
# This ZIP code contains substations, this block will execute successfully
51-
matching_subs = substation_groupby.get_group(lookup_params)
52-
except KeyError:
53-
# If this ZIP code does not contain substations, this block will execute, and
54-
# we select a set of 'nearby' substations
55-
zip_range = [int(generator.loc["ZIP"]) + offset for offset in range(-100, 101)]
56-
zip_range_strings = [str(z).rjust(5, "0") for z in zip_range]
57-
try:
58-
matching_subs = pd.concat(
59-
[
60-
substation_groupby.get_group((generator.loc["interconnect"], z))
61-
for z in zip_range_strings
62-
if (generator.loc["interconnect"], z) in substation_groupby.groups
63-
]
64-
)
65-
except ValueError:
66-
# If no matching subs within the given interconnection and ZIPs, give up
46+
47+
def get_closest_substation(generator, voltage_trees, subs_voltage_lookup):
48+
if not isinstance(generator["xyz"], list):
6749
return pd.NA
68-
distance_to_subs = matching_subs.apply(
69-
lambda x: haversine((x.lat, x.lon), (generator.lat, generator.lon)),
50+
if pd.isnull(generator["voltage_class"]) or generator["Pmax"] < 100:
51+
grouper_key = generator["interconnect"]
52+
else:
53+
grouper_key = (generator["interconnect"], generator["voltage_class"])
54+
chord_dist, array_index = voltage_trees[grouper_key].query(generator["xyz"])
55+
sub_id = subs_voltage_lookup[grouper_key][array_index]
56+
# Translate chord distance (unit circle) to great circle distance (miles)
57+
dist_in_miles = 3963 * 2 * asin(chord_dist / 2) # use 3963 mi as earth radius
58+
return pd.Series({"dist": dist_in_miles, "sub_id": sub_id})
59+
60+
def classify_voltages(voltage, voltage_ranges):
61+
for v_range, bounds in voltage_ranges.items():
62+
if bounds["min"] <= voltage <= bounds["max"]:
63+
return v_range
64+
return float("nan")
65+
66+
voltage_ranges = {
67+
"under 100": {"min": 0, "max": 99},
68+
"100-161": {"min": 100, "max": 161},
69+
"220-287": {"min": 220, "max": 287},
70+
"345": {"min": 345, "max": 345},
71+
"500": {"min": 500, "max": 500},
72+
"735 and above": {"min": 735, "max": float("inf")},
73+
}
74+
75+
# Translate lat/lon to 3D positions (assume spherical earth, origin at center)
76+
substations_with_xyz = substations.assign(
77+
xyz=substations.apply(lambda x: latlon_to_xyz(x["lat"], x["lon"]), axis=1)
78+
)
79+
generators_with_xyz = generators.assign(
80+
xyz=generators.apply(
81+
lambda x: (
82+
pd.NA
83+
if pd.isna(x["lat"]) or pd.isna(x["lon"])
84+
else latlon_to_xyz(x["lat"], x["lon"])
85+
),
86+
axis=1,
87+
)
88+
)
89+
90+
# Bin voltages into broad classes
91+
generators_with_xyz["voltage_class"] = generators["Grid Voltage (kV)"].map(
92+
lambda x: classify_voltages(x, voltage_ranges)
93+
)
94+
95+
# Group substations by voltage to build KDTrees
96+
subs_voltage_lookup = {
97+
(interconnect, voltage_level): substations_with_xyz.loc[
98+
(substations_with_xyz["interconnect"] == interconnect)
99+
& (substations_with_xyz["MAX_VOLT"] >= voltage_range["min"])
100+
].index
101+
for interconnect in generators["interconnect"].unique()
102+
for voltage_level, voltage_range in voltage_ranges.items()
103+
}
104+
# Group substations by ZIP code for a fallback for generators without coordinates
105+
subs_zip_groupby = substations_with_xyz.groupby(["interconnect", "ZIP"])
106+
107+
# Create a KDTree for each combination of voltage and interconnect
108+
voltage_trees = {
109+
key: KDTree(np.vstack(substations_with_xyz.loc[sub_ids, "xyz"]))
110+
for key, sub_ids in subs_voltage_lookup.items()
111+
if len(sub_ids) > 0
112+
}
113+
# Create a KDTree for each interconnect (all voltages)
114+
subs_interconnect_groupby = substations_with_xyz.groupby("interconnect")
115+
for interconnect in generators["interconnect"].unique():
116+
tree_subs = subs_interconnect_groupby.get_group(interconnect)
117+
voltage_trees[interconnect] = KDTree(np.vstack(tree_subs["xyz"]))
118+
subs_voltage_lookup[interconnect] = tree_subs.index
119+
120+
# Query the appropriate tree for each generator to get the closest substation ID
121+
mapping_results = generators_with_xyz.apply(
122+
lambda x: get_closest_substation(x, voltage_trees, subs_voltage_lookup),
70123
axis=1,
71124
)
72-
return distance_to_subs.idxmin()
125+
# For generators without coordinates, try to pick a substation with a matching ZIP
126+
for g in generators.loc[mapping_results["sub_id"].isnull()].index:
127+
try:
128+
candidates = subs_zip_groupby.get_group(
129+
(generators.loc[g, "interconnect"], generators.loc[g, "ZIP"])
130+
)
131+
# arbitrary choose the first one
132+
mapping_results.loc[g, "sub_id"] = candidates.index[0]
133+
except KeyError:
134+
continue # No coordinates, no matching ZIP, we're out of luck
135+
136+
if report_worst is not None:
137+
print(
138+
mapping_results.sort_values("sub_dist", ascending=False)
139+
.join(generators[["Plant Code", "Grid Voltage (kV)", "Pmax"]])
140+
.head(report_worst)
141+
)
142+
143+
if inplace:
144+
generators["sub_id"] = mapping_results["sub_id"]
145+
generators["sub_dist"] = mapping_results["dist"]
146+
else:
147+
return generators_with_xyz.drop(["xyz", "voltage_class"], axis=1).join(
148+
mapping_results
149+
)
73150

74151

75152
def map_generator_to_bus_by_sub(generator, bus_groupby):
@@ -83,7 +160,15 @@ def map_generator_to_bus_by_sub(generator, bus_groupby):
83160
if pd.isna(generator.sub_id):
84161
return pd.NA
85162
else:
86-
return bus_groupby.get_group(generator.sub_id)["baseKV"].idxmin()
163+
bus_voltages = bus_groupby.get_group(generator.sub_id)["baseKV"]
164+
if len(bus_voltages) == 1 or generator.Pmax < 200:
165+
# Return the lowest-voltage for small generators, or the only voltage
166+
return bus_voltages.idxmin()
167+
if generator.Pmax < 500:
168+
# Return the second-lowest voltage for mid-sized generators
169+
return bus_voltages.sort_values().index[1]
170+
# Return the highest voltage for large generators
171+
return bus_voltages.idxmax()
87172

88173

89174
def estimate_heat_rate_curve(
@@ -246,6 +331,30 @@ def estimate_coefficients(generator, regressions):
246331
generators.update(linear_heat_rate_assumptions)
247332

248333

334+
def aggregate_hydro_generators_by_plant_id(generators):
335+
"""Combine hydro generators within the same plant into aggregated larger generators.
336+
'Pmin' and 'Pmax' values will be summed, all other attributes (including the index)
337+
will be taken from the (somewhat arbitrary) first generator in the plant grouping.
338+
339+
:param pandas.DataFrame generators: data frame of generators.
340+
:return: (*pandas.DataFrame*) -- data frame of generators, with hydro generators
341+
aggregated.
342+
"""
343+
indiv_hydro_gens = generators.query("`Energy Source 1` == 'WAT'").copy()
344+
original_hydro_indices = indiv_hydro_gens.index.tolist()
345+
# Retain the original indices to keep track of original indices for later append
346+
indiv_hydro_gens.reset_index(inplace=True)
347+
hydro_groupby = indiv_hydro_gens.groupby("Plant Code")
348+
# Choose characteristics from the (arbitrary) first plant
349+
aggregated_hydro = hydro_groupby.first()
350+
aggregated_hydro[["Pmin", "Pmax"]] = hydro_groupby[["Pmin", "Pmax"]].sum()
351+
# Reset/set index to restore 'Plant Code' as a column and original index numbering
352+
aggregated_hydro.reset_index(inplace=True)
353+
aggregated_hydro.set_index("index", inplace=True) # 'index' was the original
354+
generators = generators.drop(original_hydro_indices).append(aggregated_hydro)
355+
return generators
356+
357+
249358
def build_plant(bus, substations, kwargs={}):
250359
"""Use source data on generating units from EIA/EPA, along with transmission network
251360
data, to produce a plant data frame.
@@ -286,35 +395,47 @@ def build_plant(bus, substations, kwargs={}):
286395
bus_groupby = bus.groupby(bus["sub_id"].astype(int))
287396
# Filter substations with no buses
288397
substations = substations.loc[set(bus_groupby.groups.keys())]
289-
substation_groupby = substations.groupby(["interconnect", "ZIP"])
290398
epa_ampd_groupby = epa_ampd.groupby(["ORISPL_CODE", "UNITID"])
291399

292-
# Add information
293-
generators["interconnect"] = (
294-
generators["Plant Code"]
295-
.map(plants["NERC Region"])
296-
.map(const.nercregion2interconnect)
297-
)
298-
generators["lat"] = generators["Plant Code"].map(plants["Latitude"])
299-
generators["lon"] = generators["Plant Code"].map(plants["Longitude"])
300-
generators["ZIP"] = generators["Plant Code"].map(plants["Zip"])
301-
generators["Balancing Authority Code"] = generators["Plant Code"].map(
302-
plants["Balancing Authority Code"]
400+
# Add information to generators based on Form 860 Plant table
401+
# Merging this way allows column-on-column merge while preserving original index
402+
generators = (
403+
generators.reset_index()
404+
.merge(
405+
plants,
406+
on="Plant Code",
407+
suffixes=(None, "_860Plant"),
408+
)
409+
.set_index("index")
303410
)
304-
print("Mapping generators to substations... (this may take several minutes)")
305-
generators["sub_id"] = generators.apply(
306-
lambda x: map_generator_to_sub_by_location(x, substation_groupby), axis=1
411+
generators.rename(
412+
{"Latitude": "lat", "Longitude": "lon", "Zip": "ZIP"}, axis=1, inplace=True
307413
)
308-
generators["bus_id"] = generators.apply(
309-
lambda x: map_generator_to_bus_by_sub(x, bus_groupby), axis=1
414+
# Map interconnect via BA first (more reliable) then by NERC Region
415+
generators["interconnect"] = (
416+
generators["Balancing Authority Code"]
417+
.map(const.balancingauthority2interconnect)
418+
.combine_first(generators["NERC Region"].map(const.nercregion2interconnect))
310419
)
420+
generators["Grid Voltage (kV)"] = generators["Grid Voltage (kV)"].map(floatify)
421+
422+
# Ensure we have Pmax and Pmin for each generator
311423
generators["Pmax"] = generators[
312424
["Summer Capacity (MW)", "Winter Capacity (MW)"]
313425
].max(axis=1)
314426
# Drop generators with no capacity listed
315427
generators = generators.loc[~generators["Pmax"].isnull()]
316428
generators.rename({"Minimum Load (MW)": "Pmin"}, inplace=True, axis=1)
317429
generators["Pmin"] = generators["Pmin"].fillna(0)
430+
431+
# Aggregate hydro generators within each plant
432+
generators = aggregate_hydro_generators_by_plant_id(generators)
433+
434+
map_generators_to_sub_by_location(generators, substations)
435+
generators["bus_id"] = generators.apply(
436+
lambda x: map_generator_to_bus_by_sub(x, bus_groupby), axis=1
437+
)
438+
318439
print("Fitting heat rate curves to EPA data... (this may take several minutes)")
319440
heat_rate_curve_estimates = generators.apply(
320441
lambda x: estimate_heat_rate_curve(

0 commit comments

Comments
 (0)