Skip to content

Commit efc3ec4

Browse files
Add patron data helper
1 parent 41bf522 commit efc3ec4

4 files changed

Lines changed: 537 additions & 2 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ help:
99
@echo " lint project files using the flake8 linter"
1010

1111
test:
12-
pytest
12+
pytest -W ignore::FutureWarning
1313

1414
lint:
1515
flake8 --exclude *env

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,15 @@ config-helper = [
6767
obfuscation-helper = [
6868
"bcrypt>=4.0.1"
6969
]
70+
patron-data-helper = [
71+
"nypl_py_utils[postgresql-client,redshift-client]>=1.5.0",
72+
"pandas>=2.2.2"
73+
]
7074
research-catalog-identifier-helper = [
7175
"requests>=2.28.1"
7276
]
7377
development = [
74-
"nypl_py_utils[avro-client,kinesis-client,kms-client,mysql-client,oauth2-api-client,postgresql-client,redshift-client,s3-client,secrets-manager-client,sftp-client,config-helper,obfuscation-helper,research-catalog-identifier-helper]",
78+
"nypl_py_utils[avro-client,kinesis-client,kms-client,mysql-client,oauth2-api-client,postgresql-client,redshift-client,s3-client,secrets-manager-client,sftp-client,config-helper,obfuscation-helper,patron-data-helper,research-catalog-identifier-helper]",
7579
"flake8>=6.0.0",
7680
"freezegun>=1.2.2",
7781
"mock>=4.0.3",
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
import pandas as pd
2+
3+
from nypl_py_utils.functions.log_helper import create_log
4+
5+
# Module-level logger shared by every helper function in this file
logger = create_log("patron_data_helpers")
6+
7+
# Redshift lookup template. It is filled with str.format: {table} receives the
# table name and {ids} a comma-separated list of quoted patron ids. Returns
# the patron_id, postal_code and geoid columns.
_REDSHIFT_QUERY = """
8+
SELECT patron_id, postal_code, geoid
9+
FROM {table}
10+
WHERE patron_id IN ({ids});"""
11+
12+
# Sierra barcode -> record id template. The single positional placeholder
# receives a comma-separated list of quoted values that are compared against
# index_tag || index_entry, so each value must already carry its tag prefix.
_SIERRA_BARCODES_TO_IDS_QUERY = """
13+
SELECT index_tag || index_entry, record_id
14+
FROM sierra_view.phrase_entry
15+
WHERE index_tag || index_entry IN ({});"""
16+
17+
# Sierra patron field template. The single positional placeholder receives a
# comma-separated list of unquoted patron ids. The home library code is
# trimmed, and a blank value or the literal 'none' is returned as NULL.
_SIERRA_PATRON_DATA_QUERY = """
18+
SELECT id, barcode, ptype_code, pcode3,
19+
CASE WHEN LENGTH(TRIM(home_library_code)) = 0
20+
OR TRIM(home_library_code) = 'none' THEN NULL
21+
ELSE TRIM(home_library_code) END
22+
FROM sierra_view.patron_view
23+
WHERE id IN ({});"""
24+
25+
26+
def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
                           remove_duplicates=True):
    """
    Converts barcodes into Sierra patron ids

    Parameters
    ----------
    sierra_client: PostgreSQLClient
        The client with which to query Sierra
    barcodes: sequence of strings
        The sequence of barcodes to be mapped. Must be iterable and without
        'None' entries. Each barcode is expected to be a string without a
        leading 'b' character.
    isolate_connection: bool, optional
        Whether the database connection should be opened and closed within this
        method or whether it will be handled by the user
    remove_duplicates: bool, optional
        Whether barcodes that map to multiple patron ids should be removed

    Returns
    -------
    DataFrame
        A pandas DataFrame with 'barcode' and 'patron_id' columns. The
        'patron_id' column is set to be a string. Rows with a null barcode
        or patron id and exact duplicate rows are dropped.
    """
    unique_barcodes = set(barcodes)
    if unique_barcodes:
        logger.info(f"Mapping ({len(unique_barcodes)}) barcodes to patron ids")
        # The query matches on index_tag || index_entry, so every barcode is
        # quoted and given the 'b' tag prefix.
        # NOTE(review): values are interpolated into the SQL text unescaped;
        # confirm callers only pass trusted barcodes.
        barcodes_str = "'b" + "','b".join(unique_barcodes) + "'"
        if isolate_connection:
            sierra_client.connect()
        raw_data = sierra_client.execute_query(
            _SIERRA_BARCODES_TO_IDS_QUERY.format(barcodes_str))
        if isolate_connection:
            sierra_client.close_connection()
    else:
        logger.info("No barcodes given with which to query Sierra")
        raw_data = []

    df = pd.DataFrame(raw_data, columns=["barcode", "patron_id"])
    # Copy the filtered rows so the column assignments below operate on an
    # independent frame
    df = df[pd.notnull(df[["barcode", "patron_id"]]).all(axis=1)].copy()
    # Remove only the single 'b' tag that was added for the query. Using
    # lstrip("b") would strip every leading 'b' and corrupt barcodes that
    # themselves begin with that character.
    df["barcode"] = df["barcode"].str.removeprefix("b")
    df["patron_id"] = df["patron_id"].astype("Int64").astype("string")
    df = df.drop_duplicates()
    if remove_duplicates:
        # keep=False drops every row of a barcode that maps to several ids
        return df.drop_duplicates("barcode", keep=False)
    return df
74+
75+
76+
def get_sierra_patron_data_from_ids(sierra_client, patron_ids,
                                    isolate_connection=True,
                                    remove_duplicates=False):
    """
    Given Sierra patron ids, returns standard patron fields from Sierra

    Parameters
    ----------
    sierra_client: PostgreSQLClient
        The client with which to query Sierra
    patron_ids: sequence of strings or integers
        The sequence of patron ids to be fetched. Must be iterable and without
        'None' entries. Each id is converted to a string before being placed
        into the query.
    isolate_connection: bool, optional
        Whether the database connection should be opened and closed within this
        method or whether it will be handled by the user
    remove_duplicates: bool, optional
        Whether patron ids that map to multiple rows with different values
        should be removed

    Returns
    -------
    DataFrame
        A pandas DataFrame with 'patron_id', 'barcode', 'ptype_code', 'pcode3',
        and 'patron_home_library_code' columns. The 'patron_id' column is set
        to be a string. Rows with a null patron id are dropped.
    """
    # Stringify before de-duplicating so integer ids are accepted as well;
    # str.join would otherwise raise a TypeError for non-string entries
    unique_patron_ids = {str(patron_id) for patron_id in patron_ids}
    if unique_patron_ids:
        logger.info(
            f"Fetching Sierra patron data for ({len(unique_patron_ids)}) "
            "patrons")
        # NOTE(review): ids are interpolated into the SQL text unquoted;
        # confirm callers only pass trusted, numeric ids.
        patron_ids_str = ",".join(unique_patron_ids)
        if isolate_connection:
            sierra_client.connect()
        raw_data = sierra_client.execute_query(
            _SIERRA_PATRON_DATA_QUERY.format(patron_ids_str))
        if isolate_connection:
            sierra_client.close_connection()
    else:
        logger.info("No patron ids given with which to query Sierra")
        raw_data = []

    df = pd.DataFrame(raw_data, columns=[
        "patron_id", "barcode", "ptype_code", "pcode3",
        "patron_home_library_code"])
    # Copy the filtered rows so the column assignment below operates on an
    # independent frame
    df = df[pd.notnull(df["patron_id"])].copy()
    df["patron_id"] = df["patron_id"].astype("Int64").astype("string")
    if remove_duplicates:
        # If one patron id maps to two rows that are identical except for the
        # barcode, arbitrarily delete one of the rows
        df = df.drop_duplicates(
            ["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])
        # Any patron id that still has several rows is dropped entirely
        return df.drop_duplicates("patron_id", keep=False)
    return df.drop_duplicates()
131+
132+
133+
def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
                                         isolate_connection=True):
    """
    Looks up the standard Sierra patron fields for a set of barcodes. Every
    barcode found in Sierra yields exactly one row in the result.

    Parameters
    ----------
    sierra_client: PostgreSQLClient
        The client with which to query Sierra
    barcodes: sequence of strings
        The barcodes to look up. Must be iterable and without 'None' entries.
        Each barcode is expected to be a string without a leading 'b'
        character.
    isolate_connection: bool, optional
        Whether this method opens and closes the database connection itself
        or leaves connection handling to the caller

    Returns
    -------
    DataFrame
        A pandas DataFrame with 'barcode', 'patron_id', 'ptype_code',
        'pcode3', and 'patron_home_library_code' columns. The 'patron_id'
        column is set to be a string. The three patron fields are null for
        barcodes whose patron data is missing or ambiguous.
    """
    patron_fields = ["ptype_code", "pcode3", "patron_home_library_code"]

    # Both helper calls reuse a single connection, so they are told not to
    # manage the connection themselves (third positional argument is False)
    if isolate_connection:
        sierra_client.connect()
    id_map_df = barcodes_to_patron_ids(sierra_client, barcodes, False, True)
    patron_rows_df = get_sierra_patron_data_from_ids(
        sierra_client, id_map_df["patron_id"], False, False)
    if isolate_connection:
        sierra_client.close_connection()

    # Rows for one patron id that differ only by barcode collapse into one
    # arbitrarily chosen row
    patron_rows_df = patron_rows_df.drop_duplicates(
        ["patron_id"] + patron_fields)

    # First pass: rows where both the barcode and the patron id agree
    merged_df = id_map_df.merge(
        patron_rows_df, how="left", on=["patron_id", "barcode"],
        indicator=True)
    exact_df = merged_df[merged_df["_merge"] == "both"].drop(
        columns=["_merge"])

    # Second pass: remaining barcodes are matched on the patron id alone
    leftover_df = merged_df[["barcode", "patron_id"]].drop(exact_df.index)
    id_only_df = leftover_df.merge(
        patron_rows_df.drop(columns=["barcode"]), how="left", on="patron_id")

    # A barcode with several candidate rows keeps none of their patron fields
    result_df = pd.concat([exact_df, id_only_df], ignore_index=True)
    is_ambiguous = result_df.duplicated("barcode", keep=False)
    result_df.loc[is_ambiguous, patron_fields] = None
    return result_df.drop_duplicates("barcode")
186+
187+
188+
def get_redshift_patron_data(redshift_client, obfuscated_patron_ids,
                             isolate_connection=True):
    """
    Given obfuscated patron ids, returns postal code and geoid from Redshift.
    At most one row per patron id is returned: patron ids with more than one
    Redshift row are logged in a warning and left out of the result.

    Parameters
    ----------
    redshift_client: RedshiftClient
        The client with which to query Redshift. Its 'database' attribute
        selects the table: 'patron_info' when it is 'production', otherwise
        'patron_info_<database>'.
    obfuscated_patron_ids: sequence of strings
        The sequence of patron ids to be mapped. Must be iterable and without
        'None' entries. Each patron id is expected to have been obfuscated.
    isolate_connection: bool, optional
        Whether the database connection should be opened and closed within this
        method or whether it will be handled by the user

    Returns
    -------
    DataFrame
        A pandas DataFrame with 'patron_id', 'postal_code', and 'geoid' columns
    """
    unique_patron_ids = set(obfuscated_patron_ids)
    if unique_patron_ids:
        logger.info(f"Querying Redshift for ({len(unique_patron_ids)}) "
                    "patrons")
        redshift_table = "patron_info"
        if redshift_client.database != "production":
            redshift_table += "_" + redshift_client.database
        # NOTE(review): ids are interpolated into the SQL text unescaped;
        # confirm callers only pass trusted, obfuscated ids.
        patron_ids_str = "'" + "','".join(unique_patron_ids) + "'"

        if isolate_connection:
            redshift_client.connect()
        raw_data = redshift_client.execute_query(
            _REDSHIFT_QUERY.format(table=redshift_table, ids=patron_ids_str))
        if isolate_connection:
            redshift_client.close_connection()
    else:
        logger.info("No patron ids given with which to query Redshift")
        raw_data = []

    df = pd.DataFrame(raw_data, columns=["patron_id", "postal_code", "geoid"])
    df = df[pd.notnull(df["patron_id"])]
    if not df["patron_id"].is_unique:
        # unique() reports each offending id once even when it has three or
        # more rows; str() keeps the join from failing on non-string ids
        duplicates = df.loc[df.duplicated("patron_id"), "patron_id"].unique()
        logger.warning(
            "More than one Redshift row found for the following patron ids: "
            f"{', '.join(map(str, duplicates))}")
        # keep=False drops every row belonging to a duplicated patron id
        return df.drop_duplicates("patron_id", keep=False)
    return df

0 commit comments

Comments
 (0)