|
| 1 | +import pandas as pd |
| 2 | + |
| 3 | +from nypl_py_utils.functions.log_helper import create_log |
| 4 | + |
# Module-level logger shared by every helper in this file
logger = create_log("patron_data_helpers")

# Selects patron_id, postal_code, and geoid for a set of patron ids. The
# `table` and `ids` placeholders are filled in with str.format, so `ids` must
# already be a comma-separated list of quoted SQL string literals.
_REDSHIFT_QUERY = """
    SELECT patron_id, postal_code, geoid
    FROM {table}
    WHERE patron_id IN ({ids});"""

# Selects the concatenation of index_tag and index_entry along with the
# record_id from sierra_view.phrase_entry. The single positional placeholder
# takes a comma-separated list of quoted values, each of which is compared
# against that same concatenation.
_SIERRA_BARCODES_TO_IDS_QUERY = """
    SELECT index_tag || index_entry, record_id
    FROM sierra_view.phrase_entry
    WHERE index_tag || index_entry IN ({});"""

# Selects the standard patron fields from sierra_view.patron_view. The CASE
# expression returns NULL when the trimmed home_library_code is empty or is
# the literal 'none', and the trimmed code otherwise. The single positional
# placeholder takes a comma-separated list of ids, which are inserted unquoted.
_SIERRA_PATRON_DATA_QUERY = """
    SELECT id, barcode, ptype_code, pcode3,
        CASE WHEN LENGTH(TRIM(home_library_code)) = 0
            OR TRIM(home_library_code) = 'none' THEN NULL
            ELSE TRIM(home_library_code) END
    FROM sierra_view.patron_view
    WHERE id IN ({});"""
| 24 | + |
| 25 | + |
def barcodes_to_patron_ids(sierra_client, barcodes, isolate_connection=True,
                           remove_duplicates=True):
    """
    Converts barcodes into Sierra patron ids

    Parameters
    ----------
    sierra_client: PostgreSQLClient
        The client with which to query Sierra
    barcodes: sequence of strings
        The sequence of barcodes to be mapped. Must be iterable and without
        'None' entries. Each barcode is expected to be a string without a
        prepending 'b' character.
    isolate_connection: bool, optional
        Whether the database connection should be opened and closed within this
        method or whether it will be handled by the user
    remove_duplicates: bool, optional
        Whether barcodes that map to multiple patron ids should be removed.
        When True, every row for such a barcode is dropped, not just the
        extra ones.

    Returns
    -------
    DataFrame
        A pandas DataFrame with 'barcode' and 'patron_id' columns. The
        'patron_id' column is set to be a string. Rows where either value is
        null are dropped, and exact duplicate rows are collapsed.
    """
    unique_barcodes = set(barcodes)
    if unique_barcodes:
        logger.info(f"Mapping ({len(unique_barcodes)}) barcodes to patron ids")
        # Each barcode is prefixed with 'b' and wrapped in single quotes so it
        # can be compared against index_tag || index_entry.
        # NOTE(review): the values are interpolated into the SQL text as-is,
        # so callers must only pass trusted barcodes -- confirm.
        barcodes_str = "'b" + "','b".join(unique_barcodes) + "'"
        if isolate_connection:
            sierra_client.connect()
        # NOTE(review): if execute_query raises, this function does not close
        # the connection it opened -- confirm the client cleans up on error.
        raw_data = sierra_client.execute_query(
            _SIERRA_BARCODES_TO_IDS_QUERY.format(barcodes_str))
        if isolate_connection:
            sierra_client.close_connection()
    else:
        logger.info("No barcodes given with which to query Sierra")
        raw_data = []

    df = pd.DataFrame(raw_data, columns=["barcode", "patron_id"])
    # Take an explicit copy of the filtered frame so that the column
    # assignments below operate on an independent object instead of a slice
    # (which makes pandas emit SettingWithCopyWarning).
    df = df[pd.notnull(df[["barcode", "patron_id"]]).all(axis=1)].copy()
    # Remove only the single 'b' prefix added for the query. str.lstrip("b")
    # would strip *every* leading 'b' and so corrupt barcodes that themselves
    # begin with that character.
    df["barcode"] = df["barcode"].str.replace("^b", "", regex=True)
    # Round-trip through the nullable integer type so ids render as plain
    # integers (e.g. "123" rather than "123.0") before becoming strings
    df["patron_id"] = df["patron_id"].astype("Int64").astype("string")
    df = df.drop_duplicates()
    if remove_duplicates:
        # keep=False discards all rows for a barcode with more than one id
        return df.drop_duplicates("barcode", keep=False)
    else:
        return df
| 74 | + |
| 75 | + |
def get_sierra_patron_data_from_ids(sierra_client, patron_ids,
                                    isolate_connection=True,
                                    remove_duplicates=False):
    """
    Given Sierra patron ids, returns standard patron fields from Sierra

    Parameters
    ----------
    sierra_client: PostgreSQLClient
        The client with which to query Sierra
    patron_ids: sequence of strings
        The sequence of patron ids to be fetched. Must be iterable and without
        'None' entries. Each patron id is expected to be a string, though any
        value whose str() form is a valid id (e.g. an int) is accepted.
    isolate_connection: bool, optional
        Whether the database connection should be opened and closed within this
        method or whether it will be handled by the user
    remove_duplicates: bool, optional
        Whether patron ids that map to multiple rows with different values
        should be removed. Rows that differ only by barcode are first
        collapsed into one; any patron id that still has more than one row is
        then dropped entirely.

    Returns
    -------
    DataFrame
        A pandas DataFrame with 'patron_id', 'barcode', 'ptype_code',
        'pcode3', and 'patron_home_library_code' columns. The 'patron_id'
        column is set to be a string.
    """
    # Normalize to strings up front so that non-string ids (e.g. ints) can be
    # joined into the query instead of raising a TypeError
    unique_patron_ids = {str(patron_id) for patron_id in patron_ids}
    if unique_patron_ids:
        logger.info(
            f"Fetching Sierra patron data for ({len(unique_patron_ids)}) "
            "patrons")
        # NOTE(review): the ids are interpolated into the SQL text unquoted,
        # so callers must only pass trusted, numeric ids -- confirm.
        patron_ids_str = ",".join(unique_patron_ids)
        if isolate_connection:
            sierra_client.connect()
        # NOTE(review): if execute_query raises, this function does not close
        # the connection it opened -- confirm the client cleans up on error.
        raw_data = sierra_client.execute_query(
            _SIERRA_PATRON_DATA_QUERY.format(patron_ids_str))
        if isolate_connection:
            sierra_client.close_connection()
    else:
        logger.info("No patron ids given with which to query Sierra")
        raw_data = []

    df = pd.DataFrame(raw_data, columns=[
        "patron_id", "barcode", "ptype_code", "pcode3",
        "patron_home_library_code"])
    # Take an explicit copy of the filtered frame so that the column
    # assignment below operates on an independent object instead of a slice
    # (which makes pandas emit SettingWithCopyWarning).
    df = df[pd.notnull(df["patron_id"])].copy()
    df["patron_id"] = df["patron_id"].astype("Int64").astype("string")
    if remove_duplicates:
        # If one patron id maps to two rows that are identical except for the
        # barcode, arbitrarily delete one of the rows
        df = df.drop_duplicates(
            ["patron_id", "ptype_code", "pcode3", "patron_home_library_code"])
        # Any patron id still present more than once has conflicting data
        return df.drop_duplicates("patron_id", keep=False)
    else:
        return df.drop_duplicates()
| 131 | + |
| 132 | + |
def get_sierra_patron_data_from_barcodes(sierra_client, barcodes,
                                         isolate_connection=True):
    """
    Given barcodes, returns standard patron fields from Sierra. At most one
    row is returned per barcode; barcodes that map to more than one patron id
    are excluded by the barcode-to-id lookup.

    Parameters
    ----------
    sierra_client: PostgreSQLClient
        The client with which to query Sierra
    barcodes: sequence of strings
        The sequence of barcodes to be mapped. Must be iterable and without
        'None' entries. Each barcode is expected to be a string without a
        prepending 'b' character.
    isolate_connection: bool, optional
        Whether the database connection should be opened and closed within this
        method or whether it will be handled by the user

    Returns
    -------
    DataFrame
        A pandas DataFrame with barcodes plus the standard patron columns. The
        'patron_id' column is set to be a string. The patron fields are null
        for barcodes whose patron data is missing or ambiguous.
    """
    patron_fields = ["ptype_code", "pcode3", "patron_home_library_code"]

    # Both lookups share one connection, so neither is allowed to manage the
    # connection itself
    if isolate_connection:
        sierra_client.connect()
    id_map = barcodes_to_patron_ids(
        sierra_client, barcodes, isolate_connection=False,
        remove_duplicates=True)
    patron_rows = get_sierra_patron_data_from_ids(
        sierra_client, id_map["patron_id"], isolate_connection=False,
        remove_duplicates=False)
    if isolate_connection:
        sierra_client.close_connection()

    # Rows for the same patron id that differ only by barcode are collapsed
    # into a single, arbitrarily chosen row
    patron_rows = patron_rows.drop_duplicates(["patron_id"] + patron_fields)

    # First pass: require both the patron id and the barcode to line up
    merged = id_map.merge(
        patron_rows, how="left", on=["patron_id", "barcode"], indicator=True)
    is_exact = merged["_merge"] == "both"
    exact_matches = merged[is_exact].drop(columns=["_merge"])

    # Second pass: for everything else, settle for a patron id match alone
    unmatched = merged.loc[~is_exact, ["barcode", "patron_id"]]
    id_only_matches = unmatched.merge(
        patron_rows.drop(columns=["barcode"]), how="left", on="patron_id")

    combined = pd.concat([exact_matches, id_only_matches], ignore_index=True)

    # A barcode that still has several candidate rows is ambiguous: blank out
    # its patron fields and keep just one row for it
    ambiguous = combined.duplicated("barcode", keep=False)
    combined.loc[ambiguous, patron_fields] = None
    return combined.drop_duplicates("barcode")
| 186 | + |
| 187 | + |
def get_redshift_patron_data(redshift_client, obfuscated_patron_ids,
                             isolate_connection=True):
    """
    Given obfuscated patron ids, returns postal code and geoid from Redshift.
    One row is returned for each patron id that has exactly one row in
    Redshift; patron ids with more than one row are logged and dropped.

    Parameters
    ----------
    redshift_client: RedshiftClient
        The client with which to query Redshift
    obfuscated_patron_ids: sequence of strings
        The sequence of patron ids to be mapped. Must be iterable and without
        'None' entries. Each patron id is expected to have been obfuscated.
    isolate_connection: bool, optional
        Whether the database connection should be opened and closed within this
        method or whether it will be handled by the user

    Returns
    -------
    DataFrame
        A pandas DataFrame with 'patron_id', 'postal_code', and 'geoid' columns
    """
    unique_patron_ids = set(obfuscated_patron_ids)
    if unique_patron_ids:
        logger.info(f"Querying Redshift for ({len(unique_patron_ids)}) "
                    "patrons")
        # Any database other than production reads from a table suffixed with
        # the database name
        redshift_table = "patron_info"
        if redshift_client.database != "production":
            redshift_table += "_" + redshift_client.database
        # NOTE(review): the ids are interpolated into the SQL text as-is, so
        # callers must only pass trusted ids -- confirm.
        patron_ids_str = "'" + "','".join(unique_patron_ids) + "'"

        if isolate_connection:
            redshift_client.connect()
        # NOTE(review): if execute_query raises, this function does not close
        # the connection it opened -- confirm the client cleans up on error.
        raw_data = redshift_client.execute_query(
            _REDSHIFT_QUERY.format(table=redshift_table, ids=patron_ids_str))
        if isolate_connection:
            redshift_client.close_connection()
    else:
        logger.info("No patron ids given with which to query Redshift")
        raw_data = []

    df = pd.DataFrame(raw_data, columns=["patron_id", "postal_code", "geoid"])
    df = df[pd.notnull(df["patron_id"])]
    if not df["patron_id"].is_unique:
        # Report each offending id once, however many rows it has, and
        # stringify so the join cannot fail on non-string ids
        duplicates = df.loc[df.duplicated("patron_id"), "patron_id"].unique()
        logger.warning(
            "More than one Redshift row found for the following patron ids: "
            f"{', '.join(map(str, duplicates))}")
        # keep=False discards every row for an id with conflicting data
        return df.drop_duplicates("patron_id", keep=False)
    else:
        return df
0 commit comments