Skip to content

Commit b601927

Browse files
fix: improve _file_get to avoid requiring a filename; use the URL hash as a cache discriminator
1 parent 049504f commit b601927

3 files changed

Lines changed: 34 additions & 38 deletions

File tree

tests/test_profile.py

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,19 @@
2929
_ASYNC_RETRY_INTERVAL_SECONDS = 5
3030
_ASYNC_TIMEOUT_SECONDS = 60
3131

32+
NICO_PDF_URL = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
33+
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
34+
495c951bbae6b/profiles/1fed6e15b2df4465b1e406adabd0075d3214bc18/parsing/resume.pdf"""
35+
NICO_PNG_URL = """https://riminder-documents-eu-2019-12.s3.eu-west-1.amazonaws.com/team\
36+
s/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/7f61abfb4a0ea127ca1536136a0891c5948b\
37+
fb7f/files/035b6b44943877bae355a527efcb7b721dbcdde7/file-nico_durant.png"""
38+
NICO_DOCX_URL = """https://riminder-documents-eu-2019-12.s3.eu-west-1.amazonaws.com/tea\
39+
ms/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/7f61abfb4a0ea127ca1536136a0891c5948\
40+
bfb7f/files/73ad352f0e93a46c82591655edacaf01711141a6/file-nico_durant.docx"""
41+
JOHN_PDF_URL = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
42+
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/06d96aab2661b16eaf4d34d385d\
43+
3c2b0cf00c0eb/profiles/d79768fb63013a8bdd04e7e8742cc84afd428a87/parsing/resume.pdf"""
44+
3245

3346
@pytest.fixture(scope="module")
3447
def hrflow_client():
@@ -129,10 +142,7 @@ def _profile_get() -> t.Dict[str, t.Any]:
129142
@pytest.mark.parsing_file_sync
130143
@pytest.mark.quicksilver
131144
def test_profile_parsing_file_quicksilver_sync_basic(hrflow_client):
132-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
133-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
134-
495c951bbae6b/profiles/1fed6e15b2df4465b1e406adabd0075d3214bc18/parsing/resume.pdf"""
135-
file = _file_get(s3_url, "nico_durant.pdf")
145+
file = _file_get(NICO_PDF_URL)
136146
model = ProfileParsingFileResponse.parse_obj(
137147
hrflow_client.profile.parsing.add_file(
138148
source_key=_var_from_env_get("HRFLOW_SOURCE_KEY_QUICKSILVER_SYNC"),
@@ -222,10 +232,7 @@ def test_profile_parsing_file_quicksilver_sync_basic(hrflow_client):
222232
@pytest.mark.parsing_file_sync
223233
@pytest.mark.hawk
224234
def test_profile_parsing_file_hawk_sync_basic(hrflow_client):
225-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
226-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
227-
495c951bbae6b/profiles/1fed6e15b2df4465b1e406adabd0075d3214bc18/parsing/resume.pdf"""
228-
file = _file_get(s3_url, "nico_durant.pdf")
235+
file = _file_get(NICO_PDF_URL)
229236
model = ProfileParsingFileResponse.parse_obj(
230237
hrflow_client.profile.parsing.add_file(
231238
source_key=_var_from_env_get("HRFLOW_SOURCE_KEY_HAWK_SYNC"),
@@ -316,10 +323,7 @@ def test_profile_parsing_file_hawk_sync_basic(hrflow_client):
316323
@pytest.mark.quicksilver
317324
def test_profile_parsing_file_quicksilver_async_basic(hrflow_client):
318325
SOURCE_KEY = _var_from_env_get("HRFLOW_SOURCE_KEY_QUICKSILVER_ASYNC")
319-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
320-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/06d96aab2661b16eaf4d34d385d\
321-
3c2b0cf00c0eb/profiles/d79768fb63013a8bdd04e7e8742cc84afd428a87/parsing/resume.pdf"""
322-
file = _file_get(s3_url, "john_smith.pdf")
326+
file = _file_get(JOHN_PDF_URL)
323327
reference = str(uuid1())
324328
model = ProfileParsingFileResponse.parse_obj(
325329
hrflow_client.profile.parsing.add_file(
@@ -397,10 +401,7 @@ def test_profile_parsing_file_quicksilver_async_basic(hrflow_client):
397401
@pytest.mark.mozart
398402
def test_profile_parsing_file_mozart_async_basic(hrflow_client):
399403
SOURCE_KEY = _var_from_env_get("HRFLOW_SOURCE_KEY_MOZART_ASYNC")
400-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
401-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/06d96aab2661b16eaf4d34d385d\
402-
3c2b0cf00c0eb/profiles/d79768fb63013a8bdd04e7e8742cc84afd428a87/parsing/resume.pdf"""
403-
file = _file_get(s3_url, "john_smith.pdf")
404+
file = _file_get(JOHN_PDF_URL)
404405
reference = str(uuid1())
405406
model = ProfileParsingFileResponse.parse_obj(
406407
hrflow_client.profile.parsing.add_file(
@@ -652,10 +653,7 @@ def test_profile_editing_basic(hrflow_client):
652653
@pytest.mark.parsing
653654
def test_profile_parsing_hawk_sync_png(hrflow_client):
654655
SOURCE_KEY = _var_from_env_get("HRFLOW_SOURCE_KEY_HAWK_SYNC")
655-
s3_url = """https://riminder-documents-eu-2019-12.s3.eu-west-1.amazonaws.com/teams/\
656-
fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/7f61abfb4a0ea127ca1536136a0891c5948bfb\
657-
7f/files/035b6b44943877bae355a527efcb7b721dbcdde7/file-nico_durant.png"""
658-
file = _file_get(s3_url, "nico_durant.png")
656+
file = _file_get(NICO_PNG_URL)
659657
reference = str(uuid1())
660658
model = ProfileParsingFileResponse.parse_obj(
661659
hrflow_client.profile.parsing.add_file(
@@ -673,10 +671,7 @@ def test_profile_parsing_hawk_sync_png(hrflow_client):
673671
@pytest.mark.parsing
674672
def test_profile_parsing_hawk_sync_docx(hrflow_client):
675673
SOURCE_KEY = _var_from_env_get("HRFLOW_SOURCE_KEY_HAWK_SYNC")
676-
s3_url = """https://riminder-documents-eu-2019-12.s3.eu-west-1.amazonaws.com/teams/\
677-
fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/7f61abfb4a0ea127ca1536136a0891c5948bfb\
678-
7f/files/73ad352f0e93a46c82591655edacaf01711141a6/file-nico_durant.docx"""
679-
file = _file_get(s3_url, "nico_durant.docx")
674+
file = _file_get(NICO_DOCX_URL)
680675
reference = str(uuid1())
681676
model = ProfileParsingFileResponse.parse_obj(
682677
hrflow_client.profile.parsing.add_file(

tests/test_text.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
)
1818
from .utils.tools import _file_get, _var_from_env_get
1919

20+
MARY_PDF_URL = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
21+
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
22+
495c951bbae6b/profiles/52e3c23a5f21190c59f53c41b5630ecb5d414f94/parsing/resume.pdf"""
2023
TAGGING_TEXTS = [
2124
(
2225
"Data Insights Corp. is seeking a Senior Data Scientist for a"
@@ -357,10 +360,7 @@ def test_tagger_hrflow_labels_no_context(hrflow_client):
357360
@pytest.mark.text
358361
@pytest.mark.ocr
359362
def test_ocr_basic(hrflow_client):
360-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
361-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
362-
495c951bbae6b/profiles/52e3c23a5f21190c59f53c41b5630ecb5d414f94/parsing/resume.pdf"""
363-
file = _file_get(s3_url, "ocr.pdf")
363+
file = _file_get(MARY_PDF_URL)
364364
assert file is not None
365365
model = TextOCRResponse.parse_obj(hrflow_client.text.ocr.post(file=file))
366366
assert model.code == requests.codes.ok

tests/utils/tools.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import hashlib
12
import io
23
import os
34
import typing as t
@@ -51,13 +52,12 @@ def _iso8601_to_datetime(datestr: str) -> t.Optional[datetime]:
5152
pass
5253

5354

54-
def _file_get(
55-
url: str, file_name: t.Optional[str] = None
56-
) -> t.Optional[t.Union[io.BytesIO, io.BufferedReader]]:
55+
def _file_get(url: str) -> t.Optional[t.Union[io.BytesIO, io.BufferedReader]]:
5756
"""
58-
Gets the file corresponding to the specified `url`. If tests/assets/`file_name`
59-
does not exist, it will be downloaded from `url` and stored for reuse, basically,
60-
it will be locally cached.
57+
Gets the file corresponding to the specified `url`.
58+
This function avoids downloading the same file multiple times using caching based on
59+
the hash of the URL. If tests/assets/`<url_hash>` does not exist, it will
60+
be downloaded from `url` and stored for reuse.
6161
6262
Args:
6363
url (str): The download URL of the file.
@@ -66,17 +66,18 @@ def _file_get(
6666
Returns:
6767
The content of the file if it exists; otherwise, returns `None`.
6868
"""
69+
url_hash = hashlib.md5(url.encode()).hexdigest()
6970

70-
if file_name is None: # deduce from the url
71-
file_name = url[url.rfind("/") + 1 :]
71+
last_slash_index = url.rfind("/") + 1
72+
file_name = url[last_slash_index:]
7273

7374
# look up for its cached version
7475
dir_path = "tests/assets"
75-
file_path = os.path.join(dir_path, file_name)
76+
file_path = os.path.join(dir_path, url_hash)
7677
if os.path.isfile(file_path):
7778
with open(file_path, "rb") as file:
7879
file_object = io.BytesIO(file.read())
79-
file_object.name = file_path
80+
file_object.name = file_name
8081
return file_object
8182

8283
response = requests.get(url)

0 commit comments

Comments
 (0)