Skip to content

Commit 8687bb5

Browse files
thodson-usgs and claude authored
Simplify defunct stubs and fix O(n^2) pagination concats (#225)
- nwis.py: reduce 5 defunct function stubs (get_qwdata, get_discharge_measurements, get_gwlevels, get_pmcodes, get_water_use) to minimal **kwargs signatures since they only raise NameError
- nwis.py: fix O(n^2) pd.concat in _read_json by collecting per-site DataFrames in a list and concatenating once; also drop an unnecessary str() + pd.read_json round-trip since the source is already parsed
- nwis.py: restore missing `elif service == "peaks"` dispatch branch in get_record (lost in an earlier lint/format pass)
- waterdata/utils.py: fix O(n^2) pd.concat in get_stats_data pagination loop using the same collect-then-concat pattern
- samples.py: replace the 230-line get_usgs_samples pass-through that duplicated 22 parameters verbatim with a **kwargs wrapper
- demos/hydroshare/*Peaks,Ratings,Statistics,WaterUse*.ipynb: fix SyntaxError from a joined `from dataretrieval import nwisfrom ...` import line; comment out two WaterUse cells that referenced a variable from a now-defunct get_water_use call
- nb-clean metadata on NWIS_demo_1 and WaterData_demo notebooks

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 0aec2c8 commit 8687bb5

9 files changed

Lines changed: 57 additions & 327 deletions

dataretrieval/nwis.py

Lines changed: 34 additions & 82 deletions
Original file line number | Diff line number | Diff line change
@@ -140,38 +140,15 @@ def preformat_peaks_response(df: pd.DataFrame) -> pd.DataFrame:
140140
return df
141141

142142

143-
def get_qwdata(
144-
sites: list[str] | str | None = None,
145-
start: str | None = None,
146-
end: str | None = None,
147-
multi_index: bool = True,
148-
wide_format: bool = True,
149-
datetime_index: bool = True,
150-
ssl_check: bool = True,
151-
**kwargs,
152-
) -> tuple[pd.DataFrame, BaseMetadata]:
153-
"""
154-
This function is defunct, use `get_samples()`
155-
in the waterdata module.
156-
157-
"""
143+
def get_qwdata(**kwargs):
144+
"""Defunct: use ``waterdata.get_samples()``."""
158145
raise NameError(
159146
"`nwis.get_qwdata` has been replaced with `waterdata.get_samples()`."
160147
)
161148

162149

163-
def get_discharge_measurements(
164-
sites: list[str] | str | None = None,
165-
start: str | None = None,
166-
end: str | None = None,
167-
ssl_check: bool = True,
168-
**kwargs,
169-
) -> tuple[pd.DataFrame, BaseMetadata]:
170-
"""
171-
This function is defunct, use `get_field_measurements()`
172-
in the waterdata module.
173-
174-
"""
150+
def get_discharge_measurements(**kwargs):
151+
"""Defunct: use ``waterdata.get_field_measurements()``."""
175152
raise NameError(
176153
"`nwis.get_discharge_measurements` has been replaced "
177154
"with `waterdata.get_field_measurements`."
@@ -247,20 +224,8 @@ def get_discharge_peaks(
247224
)
248225

249226

250-
def get_gwlevels(
251-
sites: list[str] | str | None = None,
252-
start: str = "1851-01-01",
253-
end: str | None = None,
254-
multi_index: bool = True,
255-
datetime_index: bool = True,
256-
ssl_check: bool = True,
257-
**kwargs,
258-
) -> tuple[pd.DataFrame, BaseMetadata]:
259-
"""
260-
This function is defunct, use `get_field_measurements()`
261-
in the waterdata module.
262-
263-
"""
227+
def get_gwlevels(**kwargs):
228+
"""Defunct: use ``waterdata.get_field_measurements()``."""
264229
raise NameError(
265230
"`nwis.get_gwlevels` has been replaced "
266231
"with `waterdata.get_field_measurements()`."
@@ -692,33 +657,16 @@ def get_iv(
692657
return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs)
693658

694659

695-
def get_pmcodes(
696-
parameterCd: str | list[str] = "All",
697-
partial: bool = True,
698-
ssl_check: bool = True,
699-
) -> tuple[pd.DataFrame, BaseMetadata]:
700-
"""
701-
This function is defunct, use
702-
`get_reference_table(collection="parameter-codes")`.
703-
704-
"""
660+
def get_pmcodes(**kwargs):
661+
"""Defunct: use ``get_reference_table(collection='parameter-codes')``."""
705662
raise NameError(
706663
"`nwis.get_pmcodes` has been replaced "
707664
"with `get_reference_table(collection='parameter-codes')`."
708665
)
709666

710667

711-
def get_water_use(
712-
years: str | list[str] = "ALL",
713-
state: str | None = None,
714-
counties: str | list[str] = "ALL",
715-
categories: str | list[str] = "ALL",
716-
ssl_check: bool = True,
717-
) -> tuple[pd.DataFrame, BaseMetadata]:
718-
"""
719-
This function is defunct and currently has no replacement.
720-
721-
"""
668+
def get_water_use(**kwargs):
669+
"""Defunct: no current replacement."""
722670
raise NameError("`nwis.get_water_use` is defunct.")
723671

724672

@@ -950,6 +898,17 @@ def get_record(
950898
df, _ = get_info(sites=sites, ssl_check=ssl_check, **kwargs)
951899
return df
952900

901+
elif service == "peaks":
902+
df, _ = get_discharge_peaks(
903+
sites=sites,
904+
start=start,
905+
end=end,
906+
multi_index=multi_index,
907+
ssl_check=ssl_check,
908+
**kwargs,
909+
)
910+
return df
911+
953912
elif service == "ratings":
954913
df, _ = get_ratings(site=sites, ssl_check=ssl_check, **kwargs)
955914
return df
@@ -979,7 +938,7 @@ def _read_json(json):
979938
A custom metadata object
980939
981940
"""
982-
merged_df = pd.DataFrame(columns=["site_no", "datetime"])
941+
all_site_dfs = []
983942

984943
site_list = [
985944
ts["sourceInfo"]["siteCode"][0]["value"] for ts in json["value"]["timeSeries"]
@@ -1008,14 +967,11 @@ def _read_json(json):
1008967
# check whether min, max, mean record XXX
1009968
option = timeseries["variable"]["options"]["option"][0].get("value")
1010969

1011-
# loop through each parameter in timeseries, then concat to the merged_df
1012970
for parameter in timeseries["values"]:
1013971
col_name = param_cd
1014972
method = parameter["method"][0]["methodDescription"]
1015973

1016-
# if len(timeseries['values']) > 1 and method:
1017974
if method:
1018-
# get method, format it, and append to column name
1019975
method = method.strip("[]()").lower()
1020976
col_name = f"{col_name}_{method}"
1021977

@@ -1025,22 +981,15 @@ def _read_json(json):
1025981
record_json = parameter["value"]
1026982

1027983
if not record_json:
1028-
# no data in record
1029984
continue
1030-
# should be able to avoid this by dumping
1031-
record_json = str(record_json).replace("'", '"')
1032-
1033-
# read json, converting all values to float64 and all qualifiers
1034-
# Lists can't be hashed, thus we cannot df.merge on a list column
1035-
record_df = pd.read_json(
1036-
StringIO(record_json),
1037-
orient="records",
1038-
dtype={"value": "float64", "qualifiers": "unicode"},
1039-
convert_dates=False,
1040-
)
1041985

986+
record_df = pd.DataFrame(record_json)
987+
record_df["value"] = pd.to_numeric(record_df["value"], errors="coerce")
1042988
record_df["qualifiers"] = (
1043-
record_df["qualifiers"].str.strip("[]").str.replace("'", "")
989+
record_df["qualifiers"]
990+
.astype(str)
991+
.str.strip("[]")
992+
.str.replace("'", "")
1044993
)
1045994

1046995
record_df.rename(
@@ -1054,11 +1003,14 @@ def _read_json(json):
10541003

10551004
site_df = site_df.merge(record_df, how="outer", on="datetime")
10561005

1057-
# end of site loop
10581006
site_df["site_no"] = site_no
1059-
merged_df = pd.concat([merged_df, site_df])
1007+
all_site_dfs.append(site_df)
1008+
1009+
if not all_site_dfs:
1010+
return pd.DataFrame(columns=["site_no", "datetime"])
1011+
1012+
merged_df = pd.concat(all_site_dfs, ignore_index=True)
10601013

1061-
# convert to datetime, normalizing the timezone to UTC when doing so
10621014
if "datetime" in merged_df.columns:
10631015
merged_df["datetime"] = pd.to_datetime(merged_df["datetime"], utc=True)
10641016

0 commit comments

Comments
 (0)