Skip to content

Commit cd788e6

Browse files
gh-84353: Preserve non-UTF-8 filenames when appending to ZipFile
1 parent ba0aca3 commit cd788e6

3 files changed

Lines changed: 39 additions & 25 deletions

File tree

Lib/test/test_zipfile/test_core.py

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3639,29 +3639,23 @@ def test_read_with_unsuitable_metadata_encoding(self):
36393639

36403640
def test_read_after_append(self):
36413641
newname = '\u56db' # Han 'four'
3642-
expected_names = [name.encode('shift_jis').decode('cp437')
3643-
for name in self.file_names[:2]] + self.file_names[2:]
3644-
expected_names.append(newname)
3645-
expected_content = (*self.file_content, b"newcontent")
3642+
newname2 = 'fünf' # encodeable in cp437
3643+
expected_names = [*self.file_names, newname, newname2]
3644+
bad_expected_names = [name.encode('shift_jis').decode('cp437')
3645+
if i < 2 else name
3646+
for i, name in enumerate(expected_names)]
3647+
expected_content = (*self.file_content, b"newcontent", b"newcontent2")
36463648

36473649
with zipfile.ZipFile(TESTFN, "a") as zipfp:
36483650
zipfp.writestr(newname, "newcontent")
3649-
self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
3651+
zipfp.writestr(newname2, "newcontent2")
3652+
self.assertEqual(sorted(zipfp.namelist()), sorted(bad_expected_names))
36503653

36513654
with zipfile.ZipFile(TESTFN, "r") as zipfp:
3652-
self._test_read(zipfp, expected_names, expected_content)
3655+
self._test_read(zipfp, bad_expected_names, expected_content)
36533656

36543657
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
3655-
self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
3656-
for i, (name, content) in enumerate(zip(expected_names, expected_content)):
3657-
info = zipfp.getinfo(name)
3658-
self.assertEqual(info.filename, name)
3659-
self.assertEqual(info.file_size, len(content))
3660-
if i < 2:
3661-
with self.assertRaises(zipfile.BadZipFile):
3662-
zipfp.read(name)
3663-
else:
3664-
self.assertEqual(zipfp.read(name), content)
3658+
self._test_read(zipfp, expected_names, expected_content)
36653659

36663660
def test_write_with_metadata_encoding(self):
36673661
ZF = zipfile.ZipFile
@@ -3670,6 +3664,20 @@ def test_write_with_metadata_encoding(self):
36703664
"^metadata_encoding is only"):
36713665
ZF("nonesuch.zip", mode, metadata_encoding="shift_jis")
36723666

3667+
def test_add_comment(self):
3668+
with zipfile.ZipFile(TESTFN, "r") as zipfp:
3669+
bad_expected_names = zipfp.namelist()
3670+
3671+
with zipfile.ZipFile(TESTFN, "a") as zipfp:
3672+
zipfp.comment = b'comment'
3673+
self.assertEqual(zipfp.namelist(), bad_expected_names)
3674+
3675+
with zipfile.ZipFile(TESTFN, "r") as zipfp:
3676+
self._test_read(zipfp, bad_expected_names, self.file_content)
3677+
3678+
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
3679+
self._test_read(zipfp, self.file_names, self.file_content)
3680+
36733681
def test_cli_with_metadata_encoding(self):
36743682
errmsg = "Non-conforming encodings not supported with -c."
36753683
args = ["--metadata-encoding=shift_jis", "-c", "nonesuch", "nonesuch"]

Lib/zipfile/__init__.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,7 @@ def __repr__(self):
515515
result.append('>')
516516
return ''.join(result)
517517

518-
def FileHeader(self, zip64=None):
518+
def FileHeader(self, zip64=None, metadata_encoding=None):
519519
"""Return the per-file header as a bytes object.
520520
521521
When the optional zip64 arg is None rather than a bool, we will
@@ -557,17 +557,19 @@ def FileHeader(self, zip64=None):
557557

558558
self.extract_version = max(min_version, self.extract_version)
559559
self.create_version = max(min_version, self.create_version)
560-
filename, flag_bits = self._encodeFilenameFlags()
560+
filename, flag_bits = self._encodeFilenameFlags(metadata_encoding)
561561
header = struct.pack(structFileHeader, stringFileHeader,
562562
self.extract_version, self.reserved, flag_bits,
563563
self.compress_type, dostime, dosdate, CRC,
564564
compress_size, file_size,
565565
len(filename), len(extra))
566566
return header + filename + extra
567567

568-
def _encodeFilenameFlags(self):
568+
def _encodeFilenameFlags(self, encoding):
569+
if not encoding or self.flag_bits & _MASK_UTF_FILENAME:
570+
encoding = 'ascii'
569571
try:
570-
return self.filename.encode('ascii'), self.flag_bits
572+
return self.filename.encode(encoding), self.flag_bits & ~_MASK_UTF_FILENAME
571573
except UnicodeEncodeError:
572574
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
573575

@@ -1370,7 +1372,7 @@ def close(self):
13701372
# Preserve current position in file
13711373
self._zipfile.start_dir = self._fileobj.tell()
13721374
self._fileobj.seek(self._zinfo.header_offset)
1373-
self._fileobj.write(self._zinfo.FileHeader(self._zip64))
1375+
self._fileobj.write(self._zinfo.FileHeader(self._zip64, self._zipfile.metadata_encoding))
13741376
self._fileobj.seek(self._zipfile.start_dir)
13751377

13761378
# Successfully written: Add file to our caches
@@ -1571,6 +1573,8 @@ def _RealGetContents(self):
15711573
else:
15721574
# Historical ZIP filename encoding
15731575
filename = filename.decode(self.metadata_encoding or 'cp437')
1576+
if not self.metadata_encoding and not filename.isascii():
1577+
self.metadata_encoding = "cp437"
15741578
# Create ZipInfo instance to store file information
15751579
x = ZipInfo(filename)
15761580
x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
@@ -1808,7 +1812,7 @@ def _open_to_write(self, zinfo, force_zip64=False):
18081812
zinfo.compress_size = 0
18091813
zinfo.CRC = 0
18101814

1811-
zinfo.flag_bits = 0x00
1815+
zinfo.flag_bits = _MASK_UTF_FILENAME
18121816
if zinfo.compress_type == ZIP_LZMA:
18131817
# Compressed data includes an end-of-stream (EOS) marker
18141818
zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1
@@ -1830,7 +1834,7 @@ def _open_to_write(self, zinfo, force_zip64=False):
18301834
self._writecheck(zinfo)
18311835
self._didModify = True
18321836

1833-
self.fp.write(zinfo.FileHeader(zip64))
1837+
self.fp.write(zinfo.FileHeader(zip64, self.metadata_encoding))
18341838

18351839
self._writing = True
18361840
return _ZipWriteFile(self, zinfo, zip64)
@@ -2062,7 +2066,7 @@ def mkdir(self, zinfo_or_directory_name, mode=511):
20622066

20632067
self.filelist.append(zinfo)
20642068
self.NameToInfo[zinfo.filename] = zinfo
2065-
self.fp.write(zinfo.FileHeader(False))
2069+
self.fp.write(zinfo.FileHeader(False, self.metadata_encoding))
20662070
self.start_dir = self.fp.tell()
20672071

20682072
def __del__(self):
@@ -2133,7 +2137,7 @@ def _write_end_record(self):
21332137

21342138
extract_version = max(min_version, zinfo.extract_version)
21352139
create_version = max(min_version, zinfo.create_version)
2136-
filename, flag_bits = zinfo._encodeFilenameFlags()
2140+
filename, flag_bits = zinfo._encodeFilenameFlags(self.metadata_encoding)
21372141
centdir = struct.pack(structCentralDir,
21382142
stringCentralDir, create_version,
21392143
zinfo.create_system, extract_version, zinfo.reserved,
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Preserve non-ASCII filenames that are not encoded in UTF-8 when appending to
2+
:class:`zipfile.ZipFile`.

0 commit comments

Comments
 (0)