Skip to content

Commit 896f130

Browse files
Merge pull request #40 from NYPL/add_record_num
Add search by record_num to patron_data_helper
2 parents f9f5746 + e0f64c0 commit 896f130

5 files changed

Lines changed: 172 additions & 74 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
# Changelog
2+
## v1.6.1 11/26/24
3+
- Add record_num capability to patron_data_helper
4+
25
## v1.6.0 11/20/24
36
- Added patron_data_helper functions
47
- Use executemany instead of execute when appropriate in PostgreSQLClient

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ This package contains common Python utility classes and functions.
2020
* Creating a logger in the appropriate format
2121
* Obfuscating a value using bcrypt
2222
* Parsing/building Research Catalog identifiers
23-
* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids
23+
* Mapping between barcodes and Sierra patron ids plus getting patron data from Sierra and Redshift using those ids or record_nums
2424

2525
## Usage
2626
```python

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "nypl_py_utils"
7-
version = "1.6.0"
7+
version = "1.6.1"
88
authors = [
99
{ name="Aaron Friedman", email="aaronfriedman@nypl.org" },
1010
]

src/nypl_py_utils/functions/patron_data_helper.py

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@
1515
WHERE index_tag || index_entry IN ({});"""
1616

1717
_SIERRA_PATRON_DATA_QUERY = """
18-
SELECT id, barcode, ptype_code, pcode3,
18+
SELECT id, record_num, barcode, ptype_code, pcode3,
1919
CASE WHEN LENGTH(TRIM(home_library_code)) = 0
2020
OR TRIM(home_library_code) = 'none' THEN NULL
2121
ELSE TRIM(home_library_code) END
2222
FROM sierra_view.patron_view
23-
WHERE id IN ({});"""
23+
WHERE {id_field} IN ({ids});"""
2424

2525

2626
def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
@@ -73,61 +73,76 @@ def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
7373
return df
7474

7575

76-
def get_sierra_patron_data_from_ids(sierra_client, patron_ids,
76+
def get_sierra_patron_data_from_ids(sierra_client, ids,
7777
isolate_connection=True,
78-
remove_duplicates=False):
78+
remove_duplicates=False,
79+
use_record_num=False):
7980
"""
8081
Given patron ids or record_nums, returns standard patron fields from Sierra
8182
8283
Parameters
8384
----------
8485
sierra_client: PostgreSQLClient
8586
The client with which to query Sierra
86-
patron_ids: sequence of strings
87-
The sequence of patron ids to be fetched. Must be iterable and without
88-
'None' entries. Each patron id is expected to be a string.
87+
ids: sequence of strings
88+
The sequence of patron ids or record_nums to be fetched. Must be
89+
iterable and without any 'None' entries. Each id is expected to be a
90+
string.
8991
isolate_connection: bool, optional
9092
Whether the database connection should be opened and closed within this
9193
method or whether it will be handled by the user
9294
remove_duplicates: bool, optional
9395
Whether ids (patron ids or record_nums) that map to multiple rows
9496
with different values should be removed
97+
use_record_num: bool, optional
98+
Whether the `ids` given are record_nums rather than patron ids
9599
96100
Returns
97101
-------
98102
DataFrame
99103
A pandas DataFrame with standard patron columns. The 'patron_id' column
100104
is set to be a string.
101105
"""
102-
unique_patron_ids = set(patron_ids)
103-
if unique_patron_ids:
106+
unique_ids = set(ids)
107+
if unique_ids:
104108
logger.info(
105-
f"Fetching Sierra patron data for ({len(unique_patron_ids)}) "
106-
"patrons")
107-
patron_ids_str = ",".join(unique_patron_ids)
109+
f"Fetching Sierra patron data for ({len(unique_ids)}) patrons")
110+
id_field = "record_num" if use_record_num else "id"
111+
ids_str = ",".join(unique_ids)
108112
if isolate_connection:
109113
sierra_client.connect()
110114
raw_data = sierra_client.execute_query(
111-
_SIERRA_PATRON_DATA_QUERY.format(patron_ids_str))
115+
_SIERRA_PATRON_DATA_QUERY.format(id_field=id_field, ids=ids_str))
112116
if isolate_connection:
113117
sierra_client.close_connection()
114118
else:
115119
logger.info("No patron ids given with which to query Sierra")
116120
raw_data = []
117121

118122
df = pd.DataFrame(raw_data, columns=[
119-
"patron_id", "barcode", "ptype_code", "pcode3",
123+
"patron_id", "record_num", "barcode", "ptype_code", "pcode3",
120124
"patron_home_library_code"])
121125
df = df[pd.notnull(df["patron_id"])]
122126
df["patron_id"] = df["patron_id"].astype("Int64").astype("string")
123-
if remove_duplicates:
127+
if use_record_num:
128+
df = df[pd.notnull(df["record_num"])]
129+
df["record_num"] = df["record_num"].astype("Int32").astype("string")
130+
131+
if not remove_duplicates:
132+
return df.drop_duplicates()
133+
elif use_record_num:
124134
# If one record_num maps to two rows that are identical except for the
125135
# barcode, arbitrarily delete one of the rows
136+
df = df.drop_duplicates(
137+
["patron_id", "record_num", "ptype_code", "pcode3",
138+
"patron_home_library_code"])
139+
return df.drop_duplicates("record_num", keep=False)
140+
else:
141+
# If one patron id maps to two rows that are identical except for the
142+
# barcode or record_num, arbitrarily delete one of the rows
126143
df = df.drop_duplicates(
127144
["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])
128145
return df.drop_duplicates("patron_id", keep=False)
129-
else:
130-
return df.drop_duplicates()
131146

132147

133148
def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
@@ -159,12 +174,12 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
159174
barcode_patron_id_df = barcodes_to_patron_ids(
160175
sierra_client, barcodes, False, True)
161176
patron_data_df = get_sierra_patron_data_from_ids(
162-
sierra_client, barcode_patron_id_df["patron_id"], False, False)
177+
sierra_client, barcode_patron_id_df["patron_id"], False, False, False)
163178
if isolate_connection:
164179
sierra_client.close_connection()
165180

166181
# If one patron id maps to two rows that are identical except for the
167-
# barcode, arbitrarily delete one of the rows
182+
# barcode or record_num, arbitrarily delete one of the rows
168183
patron_data_df = patron_data_df.drop_duplicates(
169184
["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])
170185

@@ -181,7 +196,8 @@ def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
181196
how="left", on="patron_id")
182197
df = pd.concat([perfect_match_df, imperfect_match_df], ignore_index=True)
183198
df.loc[df.duplicated("barcode", keep=False), [
184-
"ptype_code", "pcode3", "patron_home_library_code"]] = None
199+
"record_num", "ptype_code", "pcode3",
200+
"patron_home_library_code"]] = None
185201
return df.drop_duplicates("barcode")
186202

187203

tests/test_patron_data_helper.py

Lines changed: 131 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -19,36 +19,39 @@
1919
]
2020

2121
_TEST_SIERRA_IDS_RESPONSE = [
22-
(1, "b1", 11, 12, "aa"), (2, "b2", 21, 22, "bb"),
23-
(3, "b3", 31, 32, "cc"), (33, "b3", 331, 332, "ccc"),
24-
(4, None, None, None, None), (5, "b5", 51, 52, "dd"),
25-
(6, "b6", 61, 62, "ee"), (6, "b66", 61, 62, "ee"),
26-
(7, "b7", 71, 72, "ff"), (7, "b77", 771, 772, "ffff"),
27-
(None, "b4", None, None, None), (5, "b5", 51, 52, "dd"),
22+
(1, "1", "b1", 11, 12, "aa"), (2, "2", "b2", 21, 22, "bb"),
23+
(3, "3", "b3", 31, 32, "cc"), (33, "3", "b3", 331, 332, "ccc"),
24+
(4, None, None, None, None, None), (5, "5", "b5", 51, 52, "dd"),
25+
(6, "6", "b6", 61, 62, "ee"), (6, "6", "b66", 61, 62, "ee"),
26+
(7, "7", "b7", 71, 72, "ff"), (7, "77", "b77", 771, 772, "ffff"),
27+
(None, "4", "b4", None, None, None), (5, "5", "b5", 51, 52, "dd"),
2828
]
2929

3030
_TEST_BARCODE_DF = pd.DataFrame(
31-
[[f"b{i}", str(i)] for i in range(1, 10)],
31+
[[f"b{i}", str(i)] for i in range(1, 11)],
3232
columns=["barcode", "patron_id"])
3333
_TEST_BARCODE_DF["patron_id"] = _TEST_BARCODE_DF["patron_id"].astype("string")
3434

3535
_TEST_ID_DF = pd.DataFrame(
36-
[["1", "b1", 11, 12, "aa"], # one perfect match
37-
["2", "b5", 21, 22, "bb"], # different id and barcode matches
36+
[["1", "1", "b1", 11, 12, "aa"], # one perfect match
37+
["2", "2", "b5", 21, 22, "bb"], # different id and barcode matches
3838
# no match for patron id 3
39-
["4", "b4", 41, 42, "dd"], # two matches -- one perfect, one imperfect
40-
["4", "b444", 43, 44, "dddd"],
41-
["5", "b555", 51, 52, "eeee"], # two matches -- both imperfect
42-
["5", "b556", 53, 54, "eeef"],
43-
["6", "b6", 61, 62, "ffff"], # two matches -- both perfect
44-
["6", "b6", 63, 64, "fffg"],
45-
["7", "b777", 71, 72, "gg"], # two matches -- same except barcode
46-
["7", "b778", 71, 72, "gg"],
47-
["8", None, 81, 82, "hh"], # one imperfect match/no barcode
48-
["9", "b9", None, None, None]], # one perfect match/all null fields
49-
columns=["patron_id", "barcode", "ptype_code", "pcode3",
39+
["4", "4", "b4", 41, 42, "dd"], # two matches -- perfect and imperfect
40+
["4", "4", "b444", 43, 44, "dddd"],
41+
["5", "5", "b555", 51, 52, "eeee"], # two matches -- both imperfect
42+
["5", "5", "b556", 53, 54, "eeef"],
43+
["6", "6", "b6", 61, 62, "ffff"], # two matches -- both perfect
44+
["6", "6", "b6", 63, 64, "fffg"],
45+
["7", "7", "b777", 71, 72, "gg"], # two matches -- same but barcode
46+
["7", "7", "b778", 71, 72, "gg"],
47+
["8", "88", "b8", 81, 82, "hh"], # two matches -- same but record_num
48+
["8", "89", "b8", 81, 82, "hh"],
49+
["9", "9", None, 91, 92, "ii"], # one match/no barcode
50+
["10", "10", "b10", None, None, None]], # one match/all null fields
51+
columns=["patron_id", "record_num", "barcode", "ptype_code", "pcode3",
5052
"patron_home_library_code"])
51-
_TEST_ID_DF["patron_id"] = _TEST_ID_DF["patron_id"].astype("string")
53+
_TEST_ID_DF[["patron_id", "record_num"]] = _TEST_ID_DF[
54+
["patron_id", "record_num"]].astype("string")
5255

5356

5457
class TestPatronDataHelper:
@@ -108,16 +111,20 @@ def test_barcodes_to_patron_ids_with_duplicates(self, mocker):
108111
remove_duplicates=False
109112
))
110113

111-
def test_get_sierra_patron_data_from_ids(self, mocker):
114+
def test_get_sierra_patron_data_from_ids_pat_ids(self, mocker):
112115
RESULT = pd.DataFrame(
113-
[["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"],
114-
["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"],
115-
["4", None, None, None, None],
116-
["5", "b5", 51, 52, "dd"], ["6", "b6", 61, 62, "ee"],
117-
["6", "b66", 61, 62, "ee"], ["7", "b7", 71, 72, "ff"],
118-
["7", "b77", 771, 772, "ffff"]],
119-
columns=["patron_id", "barcode", "ptype_code", "pcode3",
120-
"patron_home_library_code"])
116+
[["1", "1", "b1", 11, 12, "aa"],
117+
["2", "2", "b2", 21, 22, "bb"],
118+
["3", "3", "b3", 31, 32, "cc"],
119+
["33", "3", "b3", 331, 332, "ccc"],
120+
["4", None, None, None, None, None],
121+
["5", "5", "b5", 51, 52, "dd"],
122+
["6", "6", "b6", 61, 62, "ee"],
123+
["6", "6", "b66", 61, 62, "ee"],
124+
["7", "7", "b7", 71, 72, "ff"],
125+
["7", "77", "b77", 771, 772, "ffff"]],
126+
columns=["patron_id", "record_num", "barcode", "ptype_code",
127+
"pcode3", "patron_home_library_code"])
121128
RESULT["patron_id"] = RESULT["patron_id"].astype("string")
122129

123130
mock_sierra_client = mocker.MagicMock()
@@ -137,7 +144,48 @@ def test_get_sierra_patron_data_from_ids(self, mocker):
137144
# directly. The workaround is to test the total length of the query
138145
# plus that each id appears in it.
139146
query = mock_sierra_client.execute_query.call_args[0][0]
140-
assert len(query) == 257
147+
assert len(query) == 269
148+
assert "WHERE id IN" in query
149+
for el in range(1, 9):
150+
assert str(el) in query
151+
152+
def test_get_sierra_patron_data_from_ids_record_nums(self, mocker):
153+
RESULT = pd.DataFrame(
154+
[["1", "1", "b1", 11., 12., "aa"],
155+
["2", "2", "b2", 21., 22., "bb"],
156+
["3", "3", "b3", 31., 32., "cc"],
157+
["33", "3", "b3", 331., 332., "ccc"],
158+
["5", "5", "b5", 51., 52., "dd"],
159+
["6", "6", "b6", 61., 62., "ee"],
160+
["6", "6", "b66", 61., 62., "ee"],
161+
["7", "7", "b7", 71., 72., "ff"],
162+
["7", "77", "b77", 771., 772., "ffff"]],
163+
columns=["patron_id", "record_num", "barcode", "ptype_code",
164+
"pcode3", "patron_home_library_code"],
165+
index=[0, 1, 2, 3, 5, 6, 7, 8, 9])
166+
RESULT[["patron_id", "record_num"]] = RESULT[
167+
["patron_id", "record_num"]].astype("string")
168+
169+
mock_sierra_client = mocker.MagicMock()
170+
mock_sierra_client.execute_query.return_value = \
171+
_TEST_SIERRA_IDS_RESPONSE
172+
173+
assert_frame_equal(
174+
RESULT, get_sierra_patron_data_from_ids(
175+
mock_sierra_client, [str(el) for el in range(1, 9)] + ["1",],
176+
use_record_num=True,
177+
))
178+
179+
mock_sierra_client.connect.assert_called_once()
180+
mock_sierra_client.execute_query.assert_called_once()
181+
mock_sierra_client.close_connection.assert_called_once()
182+
183+
# Because the set of record_nums is unordered, it can't be tested
184+
# directly. The workaround is to test the total length of the query
185+
# plus that each id appears in it.
186+
query = mock_sierra_client.execute_query.call_args[0][0]
187+
assert len(query) == 277
188+
assert "WHERE record_num IN" in query
141189
for el in range(1, 9):
142190
assert str(el) in query
143191

@@ -152,14 +200,18 @@ def test_get_sierra_patron_data_from_ids_unisolated(self, mocker):
152200
mock_sierra_client.execute_query.assert_called_once()
153201
mock_sierra_client.close_connection.assert_not_called()
154202

155-
def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker):
203+
def test_get_sierra_patron_data_from_ids_without_duplicates_pat_ids(
204+
self, mocker):
156205
RESULT = pd.DataFrame(
157-
[["1", "b1", 11, 12, "aa"], ["2", "b2", 21, 22, "bb"],
158-
["3", "b3", 31, 32, "cc"], ["33", "b3", 331, 332, "ccc"],
159-
["4", None, None, None, None], ["5", "b5", 51, 52, "dd"],
160-
["6", "b6", 61, 62, "ee"]],
161-
columns=["patron_id", "barcode", "ptype_code", "pcode3",
162-
"patron_home_library_code"])
206+
[["1", "1", "b1", 11, 12, "aa"],
207+
["2", "2", "b2", 21, 22, "bb"],
208+
["3", "3", "b3", 31, 32, "cc"],
209+
["33", "3", "b3", 331, 332, "ccc"],
210+
["4", None, None, None, None, None],
211+
["5", "5", "b5", 51, 52, "dd"],
212+
["6", "6", "b6", 61, 62, "ee"]],
213+
columns=["patron_id", "record_num", "barcode", "ptype_code",
214+
"pcode3", "patron_home_library_code"])
163215
RESULT["patron_id"] = RESULT["patron_id"].astype("string")
164216

165217
mock_sierra_client = mocker.MagicMock()
@@ -172,22 +224,49 @@ def test_get_sierra_patron_data_from_ids_without_duplicates(self, mocker):
172224
remove_duplicates=True
173225
))
174226

227+
def test_get_sierra_patron_data_from_ids_without_duplicates_record_nums(
228+
self, mocker):
229+
RESULT = pd.DataFrame(
230+
[["1", "1", "b1", 11., 12., "aa"],
231+
["2", "2", "b2", 21., 22., "bb"],
232+
["5", "5", "b5", 51., 52., "dd"],
233+
["6", "6", "b6", 61., 62., "ee"],
234+
["7", "7", "b7", 71., 72., "ff"],
235+
["7", "77", "b77", 771., 772., "ffff"]],
236+
columns=["patron_id", "record_num", "barcode", "ptype_code",
237+
"pcode3", "patron_home_library_code"],
238+
index=[0, 1, 5, 6, 8, 9])
239+
RESULT[["patron_id", "record_num"]] = RESULT[
240+
["patron_id", "record_num"]].astype("string")
241+
242+
mock_sierra_client = mocker.MagicMock()
243+
mock_sierra_client.execute_query.return_value = \
244+
_TEST_SIERRA_IDS_RESPONSE
245+
246+
assert_frame_equal(
247+
RESULT, get_sierra_patron_data_from_ids(
248+
mock_sierra_client, [str(el) for el in range(1, 9)],
249+
remove_duplicates=True, use_record_num=True,
250+
))
251+
175252
def test_get_sierra_patron_data_from_barcodes(self, mocker):
176253
RESULT = pd.DataFrame(
177-
[["b1", "1", 11, 12, "aa"],
178-
["b4", "4", 41, 42, "dd"],
179-
["b6", "6", None, None, None],
180-
["b9", "9", None, None, None],
181-
["b2", "2", 21, 22, "bb"],
182-
["b3", "3", None, None, None],
183-
["b5", "5", None, None, None],
184-
["b7", "7", 71, 72, "gg"],
185-
["b8", "8", 81, 82, "hh"]],
186-
columns=["barcode", "patron_id", "ptype_code", "pcode3",
187-
"patron_home_library_code"])
188-
RESULT["patron_id"] = RESULT["patron_id"].astype("string")
189-
TEST_BARCODES = [f"b{i}" for i in range(1, 11)] + ["b1",]
190-
TEST_IDS = pd.Series([str(i) for i in range(1, 10)],
254+
[["b1", "1", "1", 11, 12, "aa"],
255+
["b4", "4", "4", 41, 42, "dd"],
256+
["b6", "6", None, None, None, None],
257+
["b8", "8", "88", 81, 82, "hh"],
258+
["b10", "10", "10", None, None, None],
259+
["b2", "2", "2", 21, 22, "bb"],
260+
["b3", "3", None, None, None, None],
261+
["b5", "5", None, None, None, None],
262+
["b7", "7", "7", 71, 72, "gg"],
263+
["b9", "9", "9", 91, 92, "ii"]],
264+
columns=["barcode", "patron_id", "record_num", "ptype_code",
265+
"pcode3", "patron_home_library_code"])
266+
RESULT[["patron_id", "record_num"]] = RESULT[
267+
["patron_id", "record_num"]].astype("string")
268+
TEST_BARCODES = [f"b{i}" for i in range(1, 12)] + ["b1",]
269+
TEST_IDS = pd.Series([str(i) for i in range(1, 11)],
191270
dtype="string", name="patron_id")
192271
mocked_barcodes_method = mocker.patch(
193272
"nypl_py_utils.functions.patron_data_helper.barcodes_to_patron_ids", # noqa: E501

0 commit comments

Comments
 (0)