Skip to content

Commit d5f7f95

Browse files
committed
Fix duplicate downloads (#450)
We want downloads to not overwrite files in the filestore, otherwise we risk overwriting a copy that is already linked elsewhere and thus creating a redundant copy from the old link. Instead, fail to write, but don't complain. This insures we use the filestore copy that already exists.
1 parent 54d8d6d commit d5f7f95

2 files changed

Lines changed: 26 additions & 4 deletions

File tree

hca/dss/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from atomicwrites import atomic_write
2222
from requests.exceptions import ChunkedEncodingError, ConnectionError, ReadTimeout
2323

24-
from hca.dss.util import iter_paths, object_name_builder, hardlink
24+
from hca.dss.util import iter_paths, object_name_builder, hardlink, atomic_overwrite
2525
from glob import escape as glob_escape
2626
from hca.util import tsv
2727
from ..util import SwaggerClient, DEFAULT_THREAD_COUNT
@@ -528,7 +528,7 @@ def _download_bundle_manifest(self, manifest_bytes, bundle_dir, dss_file):
528528
logger.info("Skipping download of '%s' because it already exists at '%s'.", dss_file.name, dest_path)
529529
else:
530530
self._make_dirs_if_necessary(dest_path)
531-
with atomic_write(dest_path, mode="wb", overwrite=True) as fh:
531+
with atomic_overwrite(dest_path, mode="wb") as fh:
532532
fh.write(manifest_bytes)
533533
file_path = os.path.join(bundle_dir, dss_file.name)
534534
self._make_dirs_if_necessary(file_path)
@@ -592,14 +592,14 @@ def _download_file(self, dss_file, dest_path):
592592
ranged get doesn't yield the correct header, then we start over.
593593
"""
594594
self._make_dirs_if_necessary(dest_path)
595-
with atomic_write(dest_path, mode="wb", overwrite=True) as fh:
595+
with atomic_overwrite(dest_path, mode="wb") as fh:
596596
if dss_file.size == 0:
597597
return
598598

599599
download_hash = self._do_download_file(dss_file, fh)
600600

601601
if download_hash.lower() != dss_file.sha256.lower():
602-
# No need to delete what's been written. atomic_write ensures we're cleaned up
602+
# No need to delete what's been written. atomic_overwrite ensures we're cleaned up
603603
logger.error("%s", "File {}: GET FAILED. Checksum mismatch.".format(dss_file.uuid))
604604
raise ValueError("Expected sha256 {} Received sha256 {}".format(
605605
dss_file.sha256.lower(), download_hash.lower()))

hca/dss/util/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import os
44
import shutil
55
from builtins import FileExistsError
6+
7+
import atomicwrites
8+
69
from ...util.compat import scandir
710

811
log = logging.getLogger(__name__)
@@ -61,3 +64,22 @@ def hardlink(source, link_name):
6164
shutil.copyfile(source, link_name)
6265
else:
6366
raise
67+
68+
69+
class atomic_overwrite:
70+
"""Atomically write, but don't complain if file already exists"""
71+
72+
def __init__(self, *args, **kwargs):
73+
self.writer = atomicwrites.atomic_write(*args, overwrite=False, **kwargs)
74+
75+
def __enter__(self):
76+
return self.writer.__enter__()
77+
78+
def __exit__(self, exc_type, exc_val, exc_tb):
79+
# During atomicwrites.atomic_write's commit() method which is called
80+
# while exiting, a FileExistsError may be raised. It's fine to catch
81+
# this since atomicwrites still cleans up after itself.
82+
try:
83+
return self.writer.__exit__(exc_type, exc_val, exc_tb)
84+
except FileExistsError:
85+
pass

0 commit comments

Comments
 (0)