Skip to content

Commit b601927

Browse files
fix: improve _file_get to avoid requiring a filename; use the URL hash as a cache discriminator
1 parent 049504f commit b601927

3 files changed

Lines changed: 34 additions & 38 deletions

File tree

tests/test_profile.py

Lines changed: 19 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,19 @@
2929
_ASYNC_RETRY_INTERVAL_SECONDS = 5
3030
_ASYNC_TIMEOUT_SECONDS = 60
3131

32+
NICO_PDF_URL = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
33+
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
34+
495c951bbae6b/profiles/1fed6e15b2df4465b1e406adabd0075d3214bc18/parsing/resume.pdf"""
35+
NICO_PNG_URL = """https://riminder-documents-eu-2019-12.s3.eu-west-1.amazonaws.com/team\
36+
s/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/7f61abfb4a0ea127ca1536136a0891c5948b\
37+
fb7f/files/035b6b44943877bae355a527efcb7b721dbcdde7/file-nico_durant.png"""
38+
NICO_DOCX_URL = """https://riminder-documents-eu-2019-12.s3.eu-west-1.amazonaws.com/tea\
39+
ms/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/7f61abfb4a0ea127ca1536136a0891c5948\
40+
bfb7f/files/73ad352f0e93a46c82591655edacaf01711141a6/file-nico_durant.docx"""
41+
JOHN_PDF_URL = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
42+
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/06d96aab2661b16eaf4d34d385d\
43+
3c2b0cf00c0eb/profiles/d79768fb63013a8bdd04e7e8742cc84afd428a87/parsing/resume.pdf"""
44+
3245

3346
@pytest.fixture(scope="module")
3447
def hrflow_client():
@@ -129,10 +142,7 @@ def _profile_get() -> t.Dict[str, t.Any]:
129142
@pytest.mark.parsing_file_sync
130143
@pytest.mark.quicksilver
131144
def test_profile_parsing_file_quicksilver_sync_basic(hrflow_client):
132-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
133-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
134-
495c951bbae6b/profiles/1fed6e15b2df4465b1e406adabd0075d3214bc18/parsing/resume.pdf"""
135-
file = _file_get(s3_url, "nico_durant.pdf")
145+
file = _file_get(NICO_PDF_URL)
136146
model = ProfileParsingFileResponse.parse_obj(
137147
hrflow_client.profile.parsing.add_file(
138148
source_key=_var_from_env_get("HRFLOW_SOURCE_KEY_QUICKSILVER_SYNC"),
@@ -222,10 +232,7 @@ def test_profile_parsing_file_quicksilver_sync_basic(hrflow_client):
222232
@pytest.mark.parsing_file_sync
223233
@pytest.mark.hawk
224234
def test_profile_parsing_file_hawk_sync_basic(hrflow_client):
225-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
226-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
227-
495c951bbae6b/profiles/1fed6e15b2df4465b1e406adabd0075d3214bc18/parsing/resume.pdf"""
228-
file = _file_get(s3_url, "nico_durant.pdf")
235+
file = _file_get(NICO_PDF_URL)
229236
model = ProfileParsingFileResponse.parse_obj(
230237
hrflow_client.profile.parsing.add_file(
231238
source_key=_var_from_env_get("HRFLOW_SOURCE_KEY_HAWK_SYNC"),
@@ -316,10 +323,7 @@ def test_profile_parsing_file_hawk_sync_basic(hrflow_client):
316323
@pytest.mark.quicksilver
317324
def test_profile_parsing_file_quicksilver_async_basic(hrflow_client):
318325
SOURCE_KEY = _var_from_env_get("HRFLOW_SOURCE_KEY_QUICKSILVER_ASYNC")
319-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
320-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/06d96aab2661b16eaf4d34d385d\
321-
3c2b0cf00c0eb/profiles/d79768fb63013a8bdd04e7e8742cc84afd428a87/parsing/resume.pdf"""
322-
file = _file_get(s3_url, "john_smith.pdf")
326+
file = _file_get(JOHN_PDF_URL)
323327
reference = str(uuid1())
324328
model = ProfileParsingFileResponse.parse_obj(
325329
hrflow_client.profile.parsing.add_file(
@@ -397,10 +401,7 @@ def test_profile_parsing_file_quicksilver_async_basic(hrflow_client):
397401
@pytest.mark.mozart
398402
def test_profile_parsing_file_mozart_async_basic(hrflow_client):
399403
SOURCE_KEY = _var_from_env_get("HRFLOW_SOURCE_KEY_MOZART_ASYNC")
400-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
401-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/06d96aab2661b16eaf4d34d385d\
402-
3c2b0cf00c0eb/profiles/d79768fb63013a8bdd04e7e8742cc84afd428a87/parsing/resume.pdf"""
403-
file = _file_get(s3_url, "john_smith.pdf")
404+
file = _file_get(JOHN_PDF_URL)
404405
reference = str(uuid1())
405406
model = ProfileParsingFileResponse.parse_obj(
406407
hrflow_client.profile.parsing.add_file(
@@ -652,10 +653,7 @@ def test_profile_editing_basic(hrflow_client):
652653
@pytest.mark.parsing
653654
def test_profile_parsing_hawk_sync_png(hrflow_client):
654655
SOURCE_KEY = _var_from_env_get("HRFLOW_SOURCE_KEY_HAWK_SYNC")
655-
s3_url = """https://riminder-documents-eu-2019-12.s3.eu-west-1.amazonaws.com/teams/\
656-
fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/7f61abfb4a0ea127ca1536136a0891c5948bfb\
657-
7f/files/035b6b44943877bae355a527efcb7b721dbcdde7/file-nico_durant.png"""
658-
file = _file_get(s3_url, "nico_durant.png")
656+
file = _file_get(NICO_PNG_URL)
659657
reference = str(uuid1())
660658
model = ProfileParsingFileResponse.parse_obj(
661659
hrflow_client.profile.parsing.add_file(
@@ -673,10 +671,7 @@ def test_profile_parsing_hawk_sync_png(hrflow_client):
673671
@pytest.mark.parsing
674672
def test_profile_parsing_hawk_sync_docx(hrflow_client):
675673
SOURCE_KEY = _var_from_env_get("HRFLOW_SOURCE_KEY_HAWK_SYNC")
676-
s3_url = """https://riminder-documents-eu-2019-12.s3.eu-west-1.amazonaws.com/teams/\
677-
fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/7f61abfb4a0ea127ca1536136a0891c5948bfb\
678-
7f/files/73ad352f0e93a46c82591655edacaf01711141a6/file-nico_durant.docx"""
679-
file = _file_get(s3_url, "nico_durant.docx")
674+
file = _file_get(NICO_DOCX_URL)
680675
reference = str(uuid1())
681676
model = ProfileParsingFileResponse.parse_obj(
682677
hrflow_client.profile.parsing.add_file(

tests/test_text.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
)
1818
from .utils.tools import _file_get, _var_from_env_get
1919

20+
MARY_PDF_URL = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
21+
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
22+
495c951bbae6b/profiles/52e3c23a5f21190c59f53c41b5630ecb5d414f94/parsing/resume.pdf"""
2023
TAGGING_TEXTS = [
2124
(
2225
"Data Insights Corp. is seeking a Senior Data Scientist for a"
@@ -357,10 +360,7 @@ def test_tagger_hrflow_labels_no_context(hrflow_client):
357360
@pytest.mark.text
358361
@pytest.mark.ocr
359362
def test_ocr_basic(hrflow_client):
360-
s3_url = """https://riminder-documents-eu-2019-12.s3-eu-west-1.amazonaws.com/\
361-
teams/fc9d40fd60e679119130ea74ae1d34a3e22174f2/sources/07065e555609a231752a586afd6\
362-
495c951bbae6b/profiles/52e3c23a5f21190c59f53c41b5630ecb5d414f94/parsing/resume.pdf"""
363-
file = _file_get(s3_url, "ocr.pdf")
363+
file = _file_get(MARY_PDF_URL)
364364
assert file is not None
365365
model = TextOCRResponse.parse_obj(hrflow_client.text.ocr.post(file=file))
366366
assert model.code == requests.codes.ok

tests/utils/tools.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import hashlib
12
import io
23
import os
34
import typing as t
@@ -51,13 +52,12 @@ def _iso8601_to_datetime(datestr: str) -> t.Optional[datetime]:
5152
pass
5253

5354

54-
def _file_get(
55-
url: str, file_name: t.Optional[str] = None
56-
) -> t.Optional[t.Union[io.BytesIO, io.BufferedReader]]:
55+
def _file_get(url: str) -> t.Optional[t.Union[io.BytesIO, io.BufferedReader]]:
5756
"""
58-
Gets the file corresponding to the specified `url`. If tests/assets/`file_name`
59-
does not exist, it will be downloaded from `url` and stored for reuse, basically,
60-
it will be locally cached.
57+
Gets the file corresponding to the specified `url`.
58+
This function avoids downloading the same file multiple times using caching based on
59+
the hash of the URL. If tests/assets/`<url_hash>` does not exist, it will
60+
be downloaded from `url` and stored for reuse.
6161
6262
Args:
6363
url (str): The download URL of the file.
@@ -66,17 +66,18 @@ def _file_get(
6666
Returns:
6767
The content of the file if it exists; otherwise, returns `None`.
6868
"""
69+
url_hash = hashlib.md5(url.encode()).hexdigest()
6970

70-
if file_name is None: # deduce from the url
71-
file_name = url[url.rfind("/") + 1 :]
71+
last_slash_index = url.rfind("/") + 1
72+
file_name = url[last_slash_index:]
7273

7374
# look up for its cached version
7475
dir_path = "tests/assets"
75-
file_path = os.path.join(dir_path, file_name)
76+
file_path = os.path.join(dir_path, url_hash)
7677
if os.path.isfile(file_path):
7778
with open(file_path, "rb") as file:
7879
file_object = io.BytesIO(file.read())
79-
file_object.name = file_path
80+
file_object.name = file_name
8081
return file_object
8182

8283
response = requests.get(url)

0 commit comments

Comments
 (0)