From 671e241e8fa37fff088643e3d2dbd46a72f7fb7c Mon Sep 17 00:00:00 2001 From: Andrea Giudiceandrea Date: Fri, 11 Dec 2020 04:34:30 +0900 Subject: [PATCH 01/11] Add support for Unicode Path Extra Field in ZipFile --- Lib/zipfile/__init__.py | 50 ++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 6e6211de6b1684..e0e7be1feb05ca 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -338,6 +338,22 @@ def _EndRecData(fpin): # Unable to find a valid end of central directory structure return None +def _sanitize_filename(filename): + """Terminate the file name at the first null byte and + ensure paths always use forward slashes as the directory separator.""" + + # Terminate the file name at the first null byte. Null bytes in file + # names are used as tricks by viruses in archives. + null_byte = filename.find(chr(0)) + if null_byte >= 0: + filename = filename[0:null_byte] + # This is used to ensure paths in generated ZIP files always use + # forward slashes as the directory separator, as required by the + # ZIP format specification. + if os.sep != "/" and os.sep in filename: + filename = filename.replace(os.sep, "/") + return filename + class ZipInfo (object): """Class with attributes describing each file in the ZIP archive.""" @@ -360,6 +376,7 @@ class ZipInfo (object): 'external_attr', 'header_offset', 'CRC', + 'orig_filename_crc', 'compress_size', 'file_size', '_raw_time', @@ -368,16 +385,9 @@ class ZipInfo (object): def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): self.orig_filename = filename # Original file name in archive - # Terminate the file name at the first null byte. Null bytes in file - # names are used as tricks by viruses in archives. - null_byte = filename.find(chr(0)) - if null_byte >= 0: - filename = filename[0:null_byte] - # This is used to ensure paths in generated ZIP files always use - # forward slashes as the directory separator, as required by the - # ZIP format specification. - if os.sep != "/" and os.sep in filename: - filename = filename.replace(os.sep, "/") + # Terminate the file name at the first null byte and + # ensure paths always use forward slashes as the directory separator. + filename = _sanitize_filename(filename) self.filename = filename # Normalized file name self.date_time = date_time # year, month, day, hour, min, sec @@ -508,6 +518,24 @@ def _decodeExtra(self): except struct.error: raise BadZipFile(f"Corrupt zip64 extra field. " f"{field} not found.") from None + elif tp == 0x7075: + data = extra[4:ln+4] + # Unicode Path Extra Field + try: + up_version, up_name_crc = unpack(' 2: print(centdir) filename = fp.read(centdir[_CD_FILENAME_LENGTH]) + orig_filename_crc = crc32(filename) flags = centdir[_CD_FLAG_BITS] if flags & _MASK_UTF_FILENAME: # UTF-8 file names extension @@ -1433,6 +1462,7 @@ def _RealGetContents(self): x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) + x.orig_filename_crc = orig_filename_crc x._decodeExtra() x.header_offset = x.header_offset + concat self.filelist.append(x) From 045806c7ac955cce010b23734c6da1aeb63cb4b0 Mon Sep 17 00:00:00 2001 From: Yeojin Kim Date: Fri, 10 Mar 2023 04:47:50 +0900 Subject: [PATCH 02/11] Add testcase for Unicode Path Extra Field in ZipFile --- Lib/test/test_zipfile/test_core.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index e23f5c2a8556f2..9684635b0b5c2e 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1616,6 +1616,25 @@ def test_write_unicode_filenames(self): self.assertEqual(zf.filelist[0].filename, "foo.txt") self.assertEqual(zf.filelist[1].filename, "\xf6.txt") + def test_read_zipfile_containing_unicode_path_extra_field(self): + with zipfile.ZipFile(TESTFN, mode='w') as zf: + # create a file with a non-ASCII name + filename = '이름.txt' + with open(filename, mode='w') as myfile: + myfile.write('Hello, world!') + + # create a ZipInfo object with Unicode path extra field + zip_info = zipfile.ZipInfo(filename) + extra_data = filename.encode('utf-8') + b'\x00' # path in UTF-8, null-terminated + zip_info.extra = b'\x75\x70' + len(extra_data).to_bytes(2, 'little') + extra_data + + # add the file to the ZIP archive + with open(filename, mode='r') as myfile: + zf.writestr(zip_info, myfile.read()) + + with zipfile.ZipFile(TESTFN, "r") as zf: + self.assertEqual(zf.filelist[0].filename, "이름.txt") + def test_read_after_write_unicode_filenames(self): with zipfile.ZipFile(TESTFN2, 'w') as zipfp: zipfp.writestr('приклад', b'sample') From e389807eefc925c518a83b1c180940284db11eab Mon Sep 17 00:00:00 2001 From: Yeojin Kim Date: Fri, 10 Mar 2023 05:03:00 +0900 Subject: [PATCH 03/11] Add news --- Misc/ACKS | 1 + .../Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst diff --git a/Misc/ACKS b/Misc/ACKS index 7bbde3af99782b..89a4ed40a24998 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -627,6 +627,7 @@ Julian Gindi Yannick Gingras Neil Girdhar Matt Giuca +Andrea Giudiceandrea Franz Glasner Wim Glenn Michael Goderbauer diff --git a/Misc/NEWS.d/next/Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst b/Misc/NEWS.d/next/Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst new file mode 100644 index 00000000000000..39461f3f84c9ac --- /dev/null +++ b/Misc/NEWS.d/next/Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst @@ -0,0 +1,2 @@ +Add support for Unicode Path Extra Field in ZipFile. Patch by Yeojin Kim +and Andrea Giudiceandrea From a9502d73d9246968a39f62485059bb018ea36867 Mon Sep 17 00:00:00 2001 From: Yeojin Kim Date: Fri, 10 Mar 2023 05:57:33 +0900 Subject: [PATCH 04/11] Do not create a redundant file --- Lib/test/test_zipfile/test_core.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 9684635b0b5c2e..e0b6152ec521e8 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1620,8 +1620,6 @@ def test_read_zipfile_containing_unicode_path_extra_field(self): with zipfile.ZipFile(TESTFN, mode='w') as zf: # create a file with a non-ASCII name filename = '이름.txt' - with open(filename, mode='w') as myfile: - myfile.write('Hello, world!') # create a ZipInfo object with Unicode path extra field zip_info = zipfile.ZipInfo(filename) @@ -1629,8 +1627,7 @@ def test_read_zipfile_containing_unicode_path_extra_field(self): zip_info.extra = b'\x75\x70' + len(extra_data).to_bytes(2, 'little') + extra_data # add the file to the ZIP archive - with open(filename, mode='r') as myfile: - zf.writestr(zip_info, myfile.read()) + zf.writestr(zip_info, b'Hello World!') with zipfile.ZipFile(TESTFN, "r") as zf: self.assertEqual(zf.filelist[0].filename, "이름.txt") From a094a35e11aca7569e9f74fa39bcdcc0bf7a8cf0 Mon Sep 17 00:00:00 2001 From: Yeojin Kim Date: Wed, 15 Mar 2023 22:07:35 +0900 Subject: [PATCH 05/11] Raise BadZipFile instead of warning --- Lib/zipfile/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index e0e7be1feb05ca..307fed4f7a00aa 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -528,14 +528,11 @@ def _decodeExtra(self): if up_unicode_name: self.filename = _sanitize_filename(up_unicode_name) else: - import warnings warnings.warn("Empty unicode path extra field (0x7075)", stacklevel=2) - except struct.error: - import warnings - warnings.warn("Corrupt unicode path extra field (0x7075)", stacklevel=2) - except UnicodeDecodeError: - import warnings - warnings.warn('Corrupt unicode path extra field (0x7075): invalid utf-8 bytes', stacklevel=2) + except struct.error as e: + raise BadZipFile("Corrupt unicode path extra field (0x7075)") from e + except UnicodeDecodeError as e: + raise BadZipFile('Corrupt unicode path extra field (0x7075): invalid utf-8 bytes') from e extra = extra[ln+4:] From 76300747d5b4fb06b7f759ed377be943aa8899e8 Mon Sep 17 00:00:00 2001 From: Yeojin Kim Date: Wed, 15 Mar 2023 22:08:56 +0900 Subject: [PATCH 06/11] Delete orig_filename_crc --- Lib/zipfile/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 307fed4f7a00aa..1ab14d94f6a992 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -376,7 +376,6 @@ class ZipInfo (object): 'external_attr', 'header_offset', 'CRC', - 'orig_filename_crc', 'compress_size', 'file_size', '_raw_time', @@ -523,7 +522,7 @@ def _decodeExtra(self): # Unicode Path Extra Field try: up_version, up_name_crc = unpack(' 2: print(centdir) filename = fp.read(centdir[_CD_FILENAME_LENGTH]) - orig_filename_crc = crc32(filename) flags = centdir[_CD_FLAG_BITS] if flags & _MASK_UTF_FILENAME: # UTF-8 file names extension @@ -1459,7 +1457,6 @@ def _RealGetContents(self): x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) - x.orig_filename_crc = orig_filename_crc x._decodeExtra() x.header_offset = x.header_offset + concat self.filelist.append(x) From b7167c96f0b9368d0db1a560a2dbab950d4dab42 Mon Sep 17 00:00:00 2001 From: Yeojin Kim Date: Thu, 16 Mar 2023 10:54:52 +0900 Subject: [PATCH 07/11] Fix testcase --- Lib/test/test_zipfile/test_core.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index e0b6152ec521e8..7c3671a360aaa8 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1616,15 +1616,23 @@ def test_write_unicode_filenames(self): self.assertEqual(zf.filelist[0].filename, "foo.txt") self.assertEqual(zf.filelist[1].filename, "\xf6.txt") + @requires_zlib() def test_read_zipfile_containing_unicode_path_extra_field(self): with zipfile.ZipFile(TESTFN, mode='w') as zf: # create a file with a non-ASCII name filename = '이름.txt' + filename_encoded = filename.encode('utf-8') # create a ZipInfo object with Unicode path extra field zip_info = zipfile.ZipInfo(filename) - extra_data = filename.encode('utf-8') + b'\x00' # path in UTF-8, null-terminated - zip_info.extra = b'\x75\x70' + len(extra_data).to_bytes(2, 'little') + extra_data + + tag_for_unicode_path = b'\x75\x70' + version_of_unicode_path = b'\x01' + filename_crc = struct.pack(' Date: Thu, 16 Mar 2023 16:49:33 +0900 Subject: [PATCH 08/11] Revert "Delete orig_filename_crc" This reverts commit 76300747d5b4fb06b7f759ed377be943aa8899e8. --- Lib/zipfile/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 1ab14d94f6a992..307fed4f7a00aa 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -376,6 +376,7 @@ class ZipInfo (object): 'external_attr', 'header_offset', 'CRC', + 'orig_filename_crc', 'compress_size', 'file_size', '_raw_time', @@ -522,7 +523,7 @@ def _decodeExtra(self): # Unicode Path Extra Field try: up_version, up_name_crc = unpack(' 2: print(centdir) filename = fp.read(centdir[_CD_FILENAME_LENGTH]) + orig_filename_crc = crc32(filename) flags = centdir[_CD_FLAG_BITS] if flags & _MASK_UTF_FILENAME: # UTF-8 file names extension @@ -1457,6 +1459,7 @@ def _RealGetContents(self): x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) + x.orig_filename_crc = orig_filename_crc x._decodeExtra() x.header_offset = x.header_offset + concat self.filelist.append(x) From c75e016665de2cdd8705b32687fa8937a1c27796 Mon Sep 17 00:00:00 2001 From: Yeojin Kim Date: Thu, 16 Mar 2023 16:51:44 +0900 Subject: [PATCH 09/11] Add import --- Lib/test/test_zipfile/test_core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 7c3671a360aaa8..73c6b0185a1a0e 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1628,7 +1628,10 @@ def test_read_zipfile_containing_unicode_path_extra_field(self): tag_for_unicode_path = b'\x75\x70' version_of_unicode_path = b'\x01' + + import zlib filename_crc = struct.pack(' Date: Wed, 5 Apr 2023 18:08:56 +0900 Subject: [PATCH 10/11] Use orig_filename_crc as local variable --- Lib/zipfile/__init__.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 307fed4f7a00aa..ab1dfea99bf15e 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -94,7 +94,7 @@ class LargeZipFile(Exception): # The "central directory" structure, magic number, size, and indices # of entries in the structure (section V.F in the format document) -structCentralDir = "<4s4B4HL2L5H2L" +structCentralDir = "<4s4B4HL4L2HL" stringCentralDir = b"PK\001\002" sizeCentralDir = struct.calcsize(structCentralDir) @@ -376,7 +376,6 @@ class ZipInfo (object): 'external_attr', 'header_offset', 'CRC', - 'orig_filename_crc', 'compress_size', 'file_size', '_raw_time', @@ -492,7 +491,7 @@ def _encodeFilenameFlags(self): except UnicodeEncodeError: return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME - def _decodeExtra(self): + def _decodeExtra(self, filename_crc): # Try to decode the extra field. extra = self.extra unpack = struct.unpack @@ -523,7 +522,7 @@ def _decodeExtra(self): # Unicode Path Extra Field try: up_version, up_name_crc = unpack('>9)+1980, (d>>5)&0xF, d&0x1F, t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) - - x.orig_filename_crc = orig_filename_crc - x._decodeExtra() + x._decodeExtra(orig_filename_crc) x.header_offset = x.header_offset + concat self.filelist.append(x) self.NameToInfo[x.filename] = x From 84f7359c7aa6c4c7d59d2137f38a8ba08d80b916 Mon Sep 17 00:00:00 2001 From: Yeojin Kim Date: Wed, 5 Apr 2023 18:52:46 +0900 Subject: [PATCH 11/11] Rollback structCentralDir --- Lib/zipfile/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index ab1dfea99bf15e..95c047991f872b 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -94,7 +94,7 @@ class LargeZipFile(Exception): # The "central directory" structure, magic number, size, and indices # of entries in the structure (section V.F in the format document) -structCentralDir = "<4s4B4HL4L2HL" +structCentralDir = "<4s4B4HL2L5H2L" stringCentralDir = b"PK\001\002" sizeCentralDir = struct.calcsize(structCentralDir)