From 6aed859adec5134ab756520c0dc5f7feeae6cab8 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 24 May 2025 17:34:26 +0800 Subject: [PATCH 01/64] Add `remove()` and `repack()` to `ZipFile` --- Doc/library/zipfile.rst | 23 + Lib/test/test_zipfile/test_core.py | 725 +++++++++++++++++++++++++++++ Lib/test/test_zipfile64.py | 129 +++++ Lib/zipfile/__init__.py | 374 +++++++++++++++ 4 files changed, 1251 insertions(+) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 6a4fa67332e179..047512bc88e07c 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -518,6 +518,29 @@ ZipFile Objects .. versionadded:: 3.11 +.. method:: ZipFile.remove(zinfo_or_arcname) + + Removes a member from the archive. *zinfo_or_arcname* is either the full + path of the member, or a :class:`ZipInfo` instance. + + The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``. + + Calling :meth:`remove` on a closed ZipFile will raise a :exc:`ValueError`. + + .. versionadded:: next + + +.. method:: ZipFile.repack() + + Repack a zip file and physically remove non-referenced file entries. + + The archive must be opened with mode ``'a'``. + + Calling :meth:`repack` on a closed ZipFile will raise a :exc:`ValueError`. + + .. versionadded:: next + + The following data attributes are also available: .. attribute:: ZipFile.filename diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index ada96813709aea..2ba2d296c44aa9 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -13,6 +13,7 @@ import time import unittest import unittest.mock as mock +import warnings import zipfile @@ -1360,6 +1361,730 @@ class LzmaWriterTests(AbstractWriterTests, unittest.TestCase): class ZstdWriterTests(AbstractWriterTests, unittest.TestCase): compression = zipfile.ZIP_ZSTANDARD + +def ComparableZipInfo(zinfo): + return (zinfo.filename, zinfo.header_offset, zinfo.compress_size, zinfo.CRC) + +_struct_pack = struct.pack + +def struct_pack_no_dd_sig(fmt, *values): + """A mock side_effect for native `struct.pack` to not generate a + signature for data descriptors.""" + # suppress BytesWarning etc. 
+ with warnings.catch_warnings(): + warnings.simplefilter("ignore") + if values[0] == zipfile._DD_SIGNATURE: + return _struct_pack(fmt[0:1] + fmt[2:], *values[1:]) + return _struct_pack(fmt, *values) + +class RepackHelperMixin: + """Common helpers for remove and repack.""" + def _prepare_zip_from_test_files(self, zfname, test_files, force_zip64=False): + zinfos = [] + with zipfile.ZipFile(zfname, 'w', self.compression) as zh: + for file, data in test_files: + with zh.open(file, 'w', force_zip64=force_zip64) as fh: + fh.write(data) + zinfo = zh.getinfo(file) + zinfos.append(ComparableZipInfo(zinfo)) + return zinfos + +class AbstractRemoveTests(RepackHelperMixin): + def test_remove_by_name(self): + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for i in range(0, 3): + with self.subTest(i=i, filename=test_files[i][0]): + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zh.remove(test_files[i][0]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zi for j, zi in enumerate(zinfos) if j != i], + ) + + # check NameToInfo cache + with self.assertRaises(KeyError): + zh.getinfo(test_files[i][0]) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + def test_remove_by_zinfo(self): + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for i in range(0, 3): + with self.subTest(i=i, filename=test_files[i][0]): + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zh.remove(zh.infolist()[i]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zi for j, zi in enumerate(zinfos) if j != i], + ) + + # check NameToInfo cache + with self.assertRaises(KeyError): + zh.getinfo(test_files[i][0]) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + def test_remove_by_name_nonexist(self): + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + with self.assertRaises(KeyError): + zh.remove('nonexist.txt') + + def test_remove_by_zinfo_nonexist(self): + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + with self.assertRaises(KeyError): + zh.remove(zipfile.ZipInfo('nonexist.txt')) + + def test_remove_by_name_duplicated(self): + test_files = [ + ('file.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + 
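+            # the next entry deliberately reuses the same arcname to create a duplicate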
('file.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file1.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + # suppress duplicated name warning + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zh.remove('file.txt') + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zinfos[0], zinfos[2]], + ) + + # check NameToInfo cache + self.assertEqual( + ComparableZipInfo(zh.getinfo('file.txt')), + zinfos[0], + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zh.remove('file.txt') + zh.remove('file.txt') + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zinfos[2]], + ) + + # check NameToInfo cache + with self.assertRaises(KeyError): + zh.getinfo('file.txt') + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + def test_remove_by_zinfo_duplicated(self): + test_files = [ + ('file.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file1.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + # suppress duplicated name warning + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zh.remove(zh.infolist()[0]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zinfos[1], zinfos[2]], + ) + + # check NameToInfo cache + self.assertEqual( + ComparableZipInfo(zh.getinfo('file.txt')), + zinfos[1], + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zh.remove(zh.infolist()[1]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zinfos[0], zinfos[2]], + ) + + # check NameToInfo cache + self.assertEqual( + ComparableZipInfo(zh.getinfo('file.txt')), + zinfos[0], + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + infolist = zh.infolist().copy() + zh.remove(infolist[0]) + zh.remove(infolist[1]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zinfos[2]], + ) + + # check NameToInfo cache + with self.assertRaises(KeyError): + zh.getinfo('file.txt') + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + def test_remove_zip64(self): + test_files = [ + ('pre.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('datafile', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ('post.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ] + + for i in range(0, 3): + with self.subTest(i=i, filename=test_files[i][0]): + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files, force_zip64=True) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as 
zh: + zh.remove(zh.infolist()[i]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zi for j, zi in enumerate(zinfos) if j != i], + ) + + # check NameToInfo cache + with self.assertRaises(KeyError): + zh.getinfo(test_files[i][0]) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + def test_remove_validate(self): + file = 'datafile.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + + # closed: error out and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'a') as zh: + zh.close() + with self.assertRaises(ValueError): + zh.remove(file) + + # writing: error out and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'a') as zh: + with zh.open('newfile.txt', 'w') as fh: + with self.assertRaises(ValueError): + zh.remove(file) + + # mode 'r': error out and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'r') as zh: + with self.assertRaises(ValueError): + zh.remove(file) + +class StoredRemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_STORED + +@requires_zlib() +class DeflateRemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_DEFLATED + +@requires_bz2() +class Bzip2RemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_BZIP2 + +@requires_lzma() +class LzmaRemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_LZMA + +@requires_zstd() +class ZstdRemoveTests(AbstractRemoveTests, unittest.TestCase): + compression = zipfile.ZIP_ZSTANDARD + +class AbstractRepackTests(RepackHelperMixin): + def test_repack_basic(self): + """Should remove local file entries for deleted files.""" + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + ln = len(test_files) + iii = (ii for n in range(1, ln + 1) for ii in itertools.combinations(range(ln), n)) + for ii in iii: + with self.subTest(remove=ii): + # calculate the expected results + _test_files = [data for j, data in enumerate(test_files) if j not in ii] + expected_zinfos = self._prepare_zip_from_test_files(TESTFN, _test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_bytes_before_first_file(self): + """Should preserve random bytes before the first recorded local file entry.""" + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for ii in ([], [0], [0, 1], [0, 1, 2]): + with self.subTest(remove=ii): + # calculate the expected results 
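+                # (build a reference zip containing only the files that will
+                #  remain, and record its infolist and size for comparison
+                #  against the archive after repack())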
+ _test_files = [data for j, data in enumerate(test_files) if j not in ii] + with open(TESTFN, 'wb') as fh: + fh.write(b'dummy') + with open(TESTFN, 'r+b') as fh: + fh.seek(0, 2) + expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + fh.write(b'dummy') + with open(TESTFN, 'r+b') as fh: + fh.seek(0, 2) + zinfos = self._prepare_zip_from_test_files(fh, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_magic_before_first_file(self): + """Should preserve random signature bytes not forming a valid file entry + before the first recorded local file entry.""" + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for ii in ([], [0], [0, 1], [0, 1, 2]): + with self.subTest(remove=ii): + # calculate the expected results + _test_files = [data for j, data in enumerate(test_files) if j not in ii] + with open(TESTFN, 'wb') as fh: + fh.write(b'PK\003\004 ') + with open(TESTFN, 'r+b') as fh: + fh.seek(0, 2) + expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + fh.write(b'PK\003\004 ') + with open(TESTFN, 'r+b') as fh: + fh.seek(0, 2) + zinfos = self._prepare_zip_from_test_files(fh, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_file_entry_before_first_file(self): + """Should preserve seeming valid file entries not forming consecutive + valid file entries until the first recorded local file entry. + + This may happen whan a self-extractor contains an uncompressed ZIP + library. 
(simulated by writing a ZIP file in this test) + """ + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for ii in ([], [0], [0, 1], [0, 1, 2]): + with self.subTest(remove=ii): + # calculate the expected results + _test_files = [data for j, data in enumerate(test_files) if j not in ii] + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w') as zh: + zh.writestr('file.txt', b'dummy') + fh.write(b' ') + with open(TESTFN, 'r+b') as fh: + fh.seek(0, 2) + expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w') as zh: + zh.writestr('file.txt', b'dummy') + fh.write(b' ') + with open(TESTFN, 'r+b') as fh: + fh.seek(0, 2) + zinfos = self._prepare_zip_from_test_files(fh, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_zip64(self): + """Should correctly handle file entries with zip64.""" + test_files = [ + ('pre.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('datafile', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ('post.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ] + + for ii in ([0], [0, 1], [1], [2]): + with self.subTest(remove=ii): + # calculate the expected results + _test_files = [data for j, data in enumerate(test_files) if j not in ii] + expected_zinfos = self._prepare_zip_from_test_files(TESTFN, _test_files, force_zip64=True) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + zinfos = self._prepare_zip_from_test_files(TESTFN, test_files, force_zip64=True) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_data_descriptor(self): + """Should correctly handle file entries using data descriptor.""" + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for ii in ([0], [0, 1], [1], [2]): + with self.subTest(remove=ii): + # calculate the expected results + _test_files = [data for j, data in enumerate(test_files) if j not in ii] + with open(TESTFN, 'wb') as fh: + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), _test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) + with zipfile.ZipFile(TESTFN, 'a', 
self.compression) as zh: + # make sure data descriptor bit is really set (by making zipfile unseekable) + for zi in zh.infolist(): + self.assertTrue(zi.flag_bits & 8, f'data descriptor not used: {zi.filename}') + + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_data_descriptor_and_zip64(self): + """Should correctly handle file entries using data descriptor and zip64.""" + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for ii in ([0], [0, 1], [1], [2]): + with self.subTest(remove=ii): + # calculate the expected results + _test_files = [data for j, data in enumerate(test_files) if j not in ii] + with open(TESTFN, 'wb') as fh: + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), _test_files, force_zip64=True) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + # make sure data descriptor bit is really set (by making zipfile unseekable) + for zi in zh.infolist(): + self.assertTrue(zi.flag_bits & 8, f'data descriptor not used: {zi.filename}') + + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_data_descriptor_no_sig(self): + """Should correctly handle file entries using data descriptor without signature.""" + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for ii in ([0], [0, 1], [1], [2]): + with self.subTest(remove=ii): + # calculate the expected results + _test_files = [data for j, data in enumerate(test_files) if j not in ii] + with open(TESTFN, 'wb') as fh: + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), _test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + # make sure data descriptor bit is really set (by making zipfile unseekable) + for zi in zh.infolist(): + self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') + + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is 
still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_data_descriptor_no_sig_and_zip64(self): + """Should correctly handle file entries using data descriptor without signature and zip64.""" + test_files = [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + for ii in ([0], [0, 1], [1], [2]): + with self.subTest(remove=ii): + # calculate the expected results + _test_files = [data for j, data in enumerate(test_files) if j not in ii] + with open(TESTFN, 'wb') as fh: + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), _test_files, force_zip64=True) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + # make sure data descriptor bit is really set (by making zipfile unseekable) + for zi in zh.infolist(): + self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') + + for i in ii: + zh.remove(test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + def test_repack_validate(self): + file = 'datafile.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + + # closed: error out and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'a') as zh: + zh.close() + with self.assertRaises(ValueError): + zh.repack() + + # writing: error out and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'a') as zh: + with zh.open('newfile.txt', 'w') as fh: + with self.assertRaises(ValueError): + zh.repack() + + # mode 'r': error out and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'r') as zh: + with self.assertRaises(ValueError): + zh.repack() + + # mode 'w': error out and do nothing + with zipfile.ZipFile(TESTFN, 'w') as zh: + zh.writestr(file, data) + with zipfile.ZipFile(TESTFN, 'w') as zh: + with self.assertRaises(ValueError): + zh.repack() + + # mode 'x': error out and do nothing + os.remove(TESTFN) + with zipfile.ZipFile(TESTFN, 'x') as zh: + with self.assertRaises(ValueError): + zh.repack() + +class StoredRepackTests(AbstractRepackTests, unittest.TestCase): + compression = zipfile.ZIP_STORED + +@requires_zlib() +class DeflateRepackTests(AbstractRepackTests, unittest.TestCase): + compression = zipfile.ZIP_DEFLATED + +@requires_bz2() +class Bzip2RepackTests(AbstractRepackTests, unittest.TestCase): + compression = zipfile.ZIP_BZIP2 + +@requires_lzma() +class LzmaRepackTests(AbstractRepackTests, unittest.TestCase): + compression = zipfile.ZIP_LZMA + +@requires_zstd() +class ZstdRepackTests(AbstractRepackTests, unittest.TestCase): + compression = 
zipfile.ZIP_ZSTANDARD + + class PyZipFileTests(unittest.TestCase): def assertCompiledIn(self, name, namelist): if name + 'o' not in namelist: diff --git a/Lib/test/test_zipfile64.py b/Lib/test/test_zipfile64.py index 2e1affe0252858..ba943719fcc64e 100644 --- a/Lib/test/test_zipfile64.py +++ b/Lib/test/test_zipfile64.py @@ -14,11 +14,14 @@ import zipfile, unittest import time import sys +import unittest.mock as mock from tempfile import TemporaryFile from test.support import os_helper from test.support import requires_zlib +from test.test_zipfile.test_core import Unseekable +from test.test_zipfile.test_core import struct_pack_no_dd_sig TESTFN = os_helper.TESTFN TESTFN2 = TESTFN + "2" @@ -87,6 +90,132 @@ def tearDown(self): os_helper.unlink(TESTFN2) +class TestRepack(unittest.TestCase): + def setUp(self): + # Create test data. + line_gen = ("Test of zipfile line %d." % i for i in range(1000000)) + self.data = '\n'.join(line_gen).encode('ascii') + + # It will contain enough copies of self.data to reach about 8 GiB. + self.datacount = 8*1024**3 // len(self.data) + + def _write_large_file(self, fh): + next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL + for num in range(self.datacount): + fh.write(self.data) + # Print still working message since this test can be really slow + if next_time <= time.monotonic(): + next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL + print(( + ' writing %d of %d, be patient...' % + (num, self.datacount)), file=sys.__stdout__) + sys.__stdout__.flush() + + def test_clean_removed_large_file(self): + """Should move the physical data of a file positioned after a large + removed file without causing a memory issue.""" + # Try the temp file. If we do TESTFN2, then it hogs + # gigabytes of disk space for the duration of the test. + with TemporaryFile() as f: + self._test_clean_removed_large_file(f) + self.assertFalse(f.closed) + + def _test_clean_removed_large_file(self, f): + file = 'file.txt' + file1 = 'largefile.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + with zipfile.ZipFile(f, 'w') as zh: + with zh.open(file1, 'w', force_zip64=True) as fh: + self._write_large_file(fh) + zh.writestr(file, data) + + with zipfile.ZipFile(f, 'a') as zh: + zh.remove(file1) + zh.repack() + self.assertIsNone(zh.testzip()) + + def test_clean_removed_file_before_large_file(self): + """Should move the physical data of a large file positioned after a + removed file without causing a memory issue.""" + # Try the temp file. If we do TESTFN2, then it hogs + # gigabytes of disk space for the duration of the test. + with TemporaryFile() as f: + self._test_clean_removed_file_before_large_file(f) + self.assertFalse(f.closed) + + def _test_clean_removed_file_before_large_file(self, f): + file = 'file.txt' + file1 = 'largefile.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + with zipfile.ZipFile(f, 'w') as zh: + zh.writestr(file, data) + with zh.open(file1, 'w', force_zip64=True) as fh: + self._write_large_file(fh) + + with zipfile.ZipFile(f, 'a') as zh: + zh.remove(file) + zh.repack() + self.assertIsNone(zh.testzip()) + + def test_clean_removed_large_file_with_dd(self): + """Should scan for the data descriptor of a removed large file without + causing a memory issue.""" + # Try the temp file. If we do TESTFN2, then it hogs + # gigabytes of disk space for the duration of the test. 
+ with TemporaryFile() as f: + self._test_clean_removed_large_file_with_dd(f) + self.assertFalse(f.closed) + + def _test_clean_removed_large_file_with_dd(self, f): + file = 'file.txt' + file1 = 'largefile.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + with zipfile.ZipFile(Unseekable(f), 'w') as zh: + with zh.open(file1, 'w', force_zip64=True) as fh: + self._write_large_file(fh) + zh.writestr(file, data) + + with zipfile.ZipFile(f, 'a') as zh: + # make sure data descriptor bit is really set (by making zip file unseekable) + for zi in zh.infolist(): + self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') + + zh.remove(file1) + zh.repack() + self.assertIsNone(zh.testzip()) + + def test_clean_removed_large_file_with_dd_no_sig(self): + """Should scan for the data descriptor (without signature) of a removed + large file without causing a memory issue.""" + # Try the temp file. If we do TESTFN2, then it hogs + # gigabytes of disk space for the duration of the test. + with TemporaryFile() as f: + self._test_clean_removed_large_file_with_dd_no_sig(f) + self.assertFalse(f.closed) + + def _test_clean_removed_large_file_with_dd_no_sig(self, f): + # Reduce data to 400 MiB for this test, as it's especially slow... + self.datacount = 400*1024**2 // len(self.data) + + file = 'file.txt' + file1 = 'largefile.txt' + data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with zipfile.ZipFile(Unseekable(f), 'w') as zh: + with zh.open(file1, 'w', force_zip64=True) as fh: + self._write_large_file(fh) + zh.writestr(file, data) + + with zipfile.ZipFile(f, 'a') as zh: + # make sure data descriptor bit is really set (by making zip file unseekable) + for zi in zh.infolist(): + self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') + + zh.remove(file1) + zh.repack() + self.assertIsNone(zh.testzip()) + + class OtherTests(unittest.TestCase): def testMoreThan64kFiles(self): # This test checks that more than 64k files can be added to an archive, diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 18caeb3e04a2b5..54f3b1f3f6739d 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1367,6 +1367,322 @@ def close(self): self._zipfile._writing = False +class _ZipRepacker: + """Class for ZipFile repacking.""" + def __init__(self, debug=0): + self.debug = debug # Level of printing: 0 through 3 + + def repack(self, zfile): + """ + Repack the ZIP file, removing unrecorded local file entries and random + bytes not listed in the central directory. + + Assumes that local file entries are written consecutively without gaps. + + Truncation is applied in two phases: + + 1. Before the first recorded file entry: + - If a sequence of valid local file entries (starting with + `PK\x03\x04`) is found leading up to the first recorded entry, + it is truncated. + - Otherwise, all leading bytes are preserved (e.g., in cases such + as self-extracting code or embedded ZIP libraries). + + 2. Between or after the recorded entries: + - Any data between two recorded entries, or after the last recorded + entry but before the central directory, is removed—regardless of + whether it resembles a valid entry. 
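+
+        Only local file entry data is moved here; the central directory itself
+        is rewritten when the archive is closed, as with other modifications.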
+ + ### Examples + + Truncation before first recorded entry: + + [random bytes] + [unrecorded local file entry 1] + [unrecorded local file entry 2] + [random bytes] + <- truncation start + [unrecorded local file entry 3] + [unrecorded local file entry 4] + <- truncation end + [recorded local file entry 1] + ... + [central directory] + + Truncation between recorded entries: + + ... + [recorded local file entry 5] + <- truncation start + [random bytes] + [unrecorded local file entry] + [random bytes] + <- truncation end + [recorded local file entry 6] + ... + [recorded local file entry n] + <- truncation start + [unrecorded local file entry] + <- truncation end + [central directory] + + No truncation case: + + [unrecorded local file entry 1] + [unrecorded local file entry 2] + ... + [unrecorded local file entry n] + [random bytes] + [recorded local file entry 1] + ... + """ + with zfile._lock: + self._repack(zfile) + + def _repack(self, zfile, *, chunk_size=2**20): + fp = zfile.fp + + # get a sorted filelist by header offset, in case the dir order + # doesn't match the actual entry order + filelist = sorted(zfile.filelist, key=lambda x: x.header_offset) + + # calculate the starting entry offset (bytes to skip) + entry_offset = 0 + + try: + data_offset = filelist[0].header_offset + except IndexError: + data_offset = zfile.start_dir + + if data_offset > 0: + if self.debug > 2: + print('scanning file signatures before:', data_offset) + for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset): + if self._starts_consecutive_file_entries(fp, pos, data_offset): + entry_offset = data_offset - pos + break + + # move file entries + for i, info in enumerate(filelist): + # get the total size of the entry + try: + offset = filelist[i + 1].header_offset + except IndexError: + offset = zfile.start_dir + entry_size = offset - info.header_offset + + used_entry_size = self._calc_local_file_entry_size(fp, info) + + # update the header and move entry data to the new position + if entry_offset > 0: + old_header_offset = info.header_offset + info.header_offset -= entry_offset + read_size = 0 + while read_size < used_entry_size: + fp.seek(old_header_offset + read_size) + data = fp.read(min(used_entry_size - read_size, chunk_size)) + fp.seek(info.header_offset + read_size) + fp.write(data) + fp.flush() + read_size += len(data) + + if info._end_offset is not None: + info._end_offset = info.header_offset + used_entry_size + + # update entry_offset for subsequent files to follow + if used_entry_size < entry_size: + entry_offset += entry_size - used_entry_size + + # Avoid missing entry if entries have a duplicated name. + # Reverse the order as NameToInfo normally stores the last added one. 
+ for info in reversed(zfile.filelist): + zfile.NameToInfo.setdefault(info.filename, info) + + # update state + zfile.start_dir -= entry_offset + zfile._didModify = True + + def _iter_scan_signature(self, fp, signature, start_offset, end_offset, chunk_size=4096): + sig_len = len(signature) + remainder = b'' + pos = start_offset + + fp.seek(start_offset) + while pos < end_offset: + read_size = min(chunk_size, end_offset - pos) + chunk = remainder + fp.read(read_size) + if not chunk: + break + + idx = 0 + while True: + idx = chunk.find(signature, idx) + if idx == -1 or idx + sig_len > len(chunk): + break + + abs_pos = pos - len(remainder) + idx + yield abs_pos + idx += 1 + + remainder = chunk[-(sig_len - 1):] + pos += read_size + + def _starts_consecutive_file_entries(self, fp, start_offset, end_offset): + offset = start_offset + + while offset < end_offset: + if self.debug > 2: + print('checking local file entry:', offset) + + fp.seek(offset) + try: + fheader = self._read_local_file_header(fp) + except BadZipFile: + return False + + # Create a dummy ZipInfo to utilize parsing. + # Flush only the required information. + zinfo = ZipInfo() + zinfo.header_offset = offset + zinfo.flag_bits = fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] + zinfo.compress_size = fheader[_FH_COMPRESSED_SIZE] + zinfo.file_size = fheader[_FH_UNCOMPRESSED_SIZE] + zinfo.CRC = fheader[_FH_CRC] + + filename = fp.read(fheader[_FH_FILENAME_LENGTH]) + zinfo.extra = fp.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + pos = fp.tell() + + if pos > end_offset: + return False + + try: + zinfo._decodeExtra(crc32(filename)) # parse zip64 + except BadZipFile: + return False + + data_descriptor_size = 0 + + if zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: + # According to the spec, these fields should be zero when data + # descriptor is used. Otherwise treat as a false positive on + # random bytes to return early, as scanning for data descriptor + # is rather intensive. 
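+                # (per the ZIP appnote, when bit 3 is set the CRC-32 and both
+                #  size fields are written as zero in the local file header)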
+ if not (zinfo.CRC == zinfo.compress_size == zinfo.file_size == 0): + return False + + zip64 = ( + fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff or + fheader[_FH_COMPRESSED_SIZE] == 0xffffffff + ) + + dd = self._scan_data_descriptor(fp, pos, end_offset, zip64) + + if dd is None: + return False + + crc, compress_size, file_size, data_descriptor_size = dd + zinfo.CRC = crc + zinfo.compress_size = compress_size + zinfo.file_size = file_size + + offset += ( + sizeFileHeader + + fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] + + zinfo.compress_size + + data_descriptor_size + ) + + if self.debug > 2: + print('next', offset) + + return offset == end_offset + + def _read_local_file_header(self, fp): + fheader = fp.read(sizeFileHeader) + if len(fheader) != sizeFileHeader: + raise BadZipFile("Truncated file header") + fheader = struct.unpack(structFileHeader, fheader) + if fheader[_FH_SIGNATURE] != stringFileHeader: + raise BadZipFile("Bad magic number for file header") + return fheader + + def _scan_data_descriptor(self, fp, offset, end_offset, zip64): + dd_fmt = ' Date: Sat, 24 May 2025 11:17:39 +0000 Subject: [PATCH 02/64] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst new file mode 100644 index 00000000000000..f9165d4d280bfe --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst @@ -0,0 +1 @@ +Add ``ZipFile.remove()`` and ``ZipFile.repack()`` From 80ab2e27e4bb63cea3e7aa3c022236a76e05af68 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 24 May 2025 22:22:28 +0800 Subject: [PATCH 03/64] Fix and optimize test code --- Lib/test/test_zipfile/test_core.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 2ba2d296c44aa9..b6ec4aff30e404 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1704,17 +1704,13 @@ def test_repack_bytes_before_first_file(self): # calculate the expected results _test_files = [data for j, data in enumerate(test_files) if j not in ii] with open(TESTFN, 'wb') as fh: - fh.write(b'dummy') - with open(TESTFN, 'r+b') as fh: - fh.seek(0, 2) + fh.write(b'dummy ') expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: - fh.write(b'dummy') - with open(TESTFN, 'r+b') as fh: - fh.seek(0, 2) + fh.write(b'dummy ') zinfos = self._prepare_zip_from_test_files(fh, test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: @@ -1748,16 +1744,12 @@ def test_repack_magic_before_first_file(self): _test_files = [data for j, data in enumerate(test_files) if j not in ii] with open(TESTFN, 'wb') as fh: fh.write(b'PK\003\004 ') - with open(TESTFN, 'r+b') as fh: - fh.seek(0, 2) expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: 
fh.write(b'PK\003\004 ') - with open(TESTFN, 'r+b') as fh: - fh.seek(0, 2) zinfos = self._prepare_zip_from_test_files(fh, test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: @@ -1777,7 +1769,7 @@ def test_repack_magic_before_first_file(self): self.assertEqual(os.path.getsize(TESTFN), expected_size) def test_repack_file_entry_before_first_file(self): - """Should preserve seeming valid file entries not forming consecutive + """Should preserve seemingly valid file entries not forming consecutive valid file entries until the first recorded local file entry. This may happen whan a self-extractor contains an uncompressed ZIP @@ -1797,8 +1789,6 @@ def test_repack_file_entry_before_first_file(self): with zipfile.ZipFile(fh, 'w') as zh: zh.writestr('file.txt', b'dummy') fh.write(b' ') - with open(TESTFN, 'r+b') as fh: - fh.seek(0, 2) expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) expected_size = os.path.getsize(TESTFN) @@ -1807,8 +1797,6 @@ def test_repack_file_entry_before_first_file(self): with zipfile.ZipFile(fh, 'w') as zh: zh.writestr('file.txt', b'dummy') fh.write(b' ') - with open(TESTFN, 'r+b') as fh: - fh.seek(0, 2) zinfos = self._prepare_zip_from_test_files(fh, test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: From 72c2a6678f48454761edc4fd59f57862dd5d1b6c Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 24 May 2025 22:29:55 +0800 Subject: [PATCH 04/64] Handle common setups with `setUpClass` --- Lib/test/test_zipfile/test_core.py | 184 +++++++++-------------------- 1 file changed, 58 insertions(+), 126 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index b6ec4aff30e404..a975a43543da20 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1390,18 +1390,20 @@ def _prepare_zip_from_test_files(self, zfname, test_files, force_zip64=False): return zinfos class AbstractRemoveTests(RepackHelperMixin): - def test_remove_by_name(self): - test_files = [ + @classmethod + def setUpClass(self): + self.test_files = [ ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), ] + def test_remove_by_name(self): for i in range(0, 3): - with self.subTest(i=i, filename=test_files[i][0]): - zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with self.subTest(i=i, filename=self.test_files[i][0]): + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) # check infolist self.assertEqual( @@ -1411,21 +1413,15 @@ def test_remove_by_name(self): # check NameToInfo cache with self.assertRaises(KeyError): - zh.getinfo(test_files[i][0]) + zh.getinfo(self.test_files[i][0]) # make sure the zip file is still valid self.assertIsNone(zh.testzip()) def test_remove_by_zinfo(self): - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - for i in range(0, 3): - with self.subTest(i=i, filename=test_files[i][0]): - zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with self.subTest(i=i, 
filename=self.test_files[i][0]): + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: zh.remove(zh.infolist()[i]) @@ -1437,31 +1433,19 @@ def test_remove_by_zinfo(self): # check NameToInfo cache with self.assertRaises(KeyError): - zh.getinfo(test_files[i][0]) + zh.getinfo(self.test_files[i][0]) # make sure the zip file is still valid self.assertIsNone(zh.testzip()) def test_remove_by_name_nonexist(self): - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - - zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with self.assertRaises(KeyError): zh.remove('nonexist.txt') def test_remove_by_zinfo_nonexist(self): - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - - zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with self.assertRaises(KeyError): zh.remove(zipfile.ZipInfo('nonexist.txt')) @@ -1583,15 +1567,9 @@ def test_remove_by_zinfo_duplicated(self): self.assertIsNone(zh.testzip()) def test_remove_zip64(self): - test_files = [ - ('pre.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('datafile', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ('post.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ] - for i in range(0, 3): - with self.subTest(i=i, filename=test_files[i][0]): - zinfos = self._prepare_zip_from_test_files(TESTFN, test_files, force_zip64=True) + with self.subTest(i=i, filename=self.test_files[i][0]): + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files, force_zip64=True) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: zh.remove(zh.infolist()[i]) @@ -1603,7 +1581,7 @@ def test_remove_zip64(self): # check NameToInfo cache with self.assertRaises(KeyError): - zh.getinfo(test_files[i][0]) + zh.getinfo(self.test_files[i][0]) # make sure the zip file is still valid self.assertIsNone(zh.testzip()) @@ -1655,28 +1633,30 @@ class ZstdRemoveTests(AbstractRemoveTests, unittest.TestCase): compression = zipfile.ZIP_ZSTANDARD class AbstractRepackTests(RepackHelperMixin): - def test_repack_basic(self): - """Should remove local file entries for deleted files.""" - test_files = [ + @classmethod + def setUpClass(self): + self.test_files = [ ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), ] - ln = len(test_files) + def test_repack_basic(self): + """Should remove local file entries for deleted files.""" + ln = len(self.test_files) iii = (ii for n in range(1, ln + 1) for ii in itertools.combinations(range(ln), n)) for ii in iii: with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in 
enumerate(test_files) if j not in ii] - expected_zinfos = self._prepare_zip_from_test_files(TESTFN, _test_files) + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] + expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result - zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist @@ -1693,28 +1673,22 @@ def test_repack_basic(self): def test_repack_bytes_before_first_file(self): """Should preserve random bytes before the first recorded local file entry.""" - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - for ii in ([], [0], [0, 1], [0, 1, 2]): with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in enumerate(test_files) if j not in ii] + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: fh.write(b'dummy ') - expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) + expected_zinfos = self._prepare_zip_from_test_files(fh, test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: fh.write(b'dummy ') - zinfos = self._prepare_zip_from_test_files(fh, test_files) + zinfos = self._prepare_zip_from_test_files(fh, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist @@ -1732,28 +1706,22 @@ def test_repack_bytes_before_first_file(self): def test_repack_magic_before_first_file(self): """Should preserve random signature bytes not forming a valid file entry before the first recorded local file entry.""" - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - for ii in ([], [0], [0, 1], [0, 1, 2]): with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in enumerate(test_files) if j not in ii] + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: fh.write(b'PK\003\004 ') - expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) + expected_zinfos = self._prepare_zip_from_test_files(fh, test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: fh.write(b'PK\003\004 ') - zinfos = self._prepare_zip_from_test_files(fh, test_files) + zinfos = self._prepare_zip_from_test_files(fh, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist @@ -1775,21 +1743,15 @@ def test_repack_file_entry_before_first_file(self): This may happen whan a self-extractor contains an uncompressed ZIP library. 
(simulated by writing a ZIP file in this test) """ - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - for ii in ([], [0], [0, 1], [0, 1, 2]): with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in enumerate(test_files) if j not in ii] + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: with zipfile.ZipFile(fh, 'w') as zh: zh.writestr('file.txt', b'dummy') fh.write(b' ') - expected_zinfos = self._prepare_zip_from_test_files(fh, _test_files) + expected_zinfos = self._prepare_zip_from_test_files(fh, test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result @@ -1797,10 +1759,10 @@ def test_repack_file_entry_before_first_file(self): with zipfile.ZipFile(fh, 'w') as zh: zh.writestr('file.txt', b'dummy') fh.write(b' ') - zinfos = self._prepare_zip_from_test_files(fh, test_files) + zinfos = self._prepare_zip_from_test_files(fh, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist @@ -1817,24 +1779,18 @@ def test_repack_file_entry_before_first_file(self): def test_repack_zip64(self): """Should correctly handle file entries with zip64.""" - test_files = [ - ('pre.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('datafile', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ('post.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ] - for ii in ([0], [0, 1], [1], [2]): with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in enumerate(test_files) if j not in ii] - expected_zinfos = self._prepare_zip_from_test_files(TESTFN, _test_files, force_zip64=True) + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] + expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files, force_zip64=True) expected_size = os.path.getsize(TESTFN) # do the removal and check the result - zinfos = self._prepare_zip_from_test_files(TESTFN, test_files, force_zip64=True) + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files, force_zip64=True) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist @@ -1851,30 +1807,24 @@ def test_repack_zip64(self): def test_repack_data_descriptor(self): """Should correctly handle file entries using data descriptor.""" - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - for ii in ([0], [0, 1], [1], [2]): with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in enumerate(test_files) if j not in ii] + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: - expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), _test_files) + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) expected_size = 
os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): self.assertTrue(zi.flag_bits & 8, f'data descriptor not used: {zi.filename}') for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist @@ -1891,30 +1841,24 @@ def test_repack_data_descriptor(self): def test_repack_data_descriptor_and_zip64(self): """Should correctly handle file entries using data descriptor and zip64.""" - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - for ii in ([0], [0, 1], [1], [2]): with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in enumerate(test_files) if j not in ii] + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: - expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), _test_files, force_zip64=True) + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, force_zip64=True) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): self.assertTrue(zi.flag_bits & 8, f'data descriptor not used: {zi.filename}') for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist @@ -1931,32 +1875,26 @@ def test_repack_data_descriptor_and_zip64(self): def test_repack_data_descriptor_no_sig(self): """Should correctly handle file entries using data descriptor without signature.""" - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - for ii in ([0], [0, 1], [1], [2]): with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in enumerate(test_files) if j not in ii] + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): - expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), _test_files) + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) 
as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist @@ -1973,32 +1911,26 @@ def test_repack_data_descriptor_no_sig(self): def test_repack_data_descriptor_no_sig_and_zip64(self): """Should correctly handle file entries using data descriptor without signature and zip64.""" - test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] - for ii in ([0], [0, 1], [1], [2]): with self.subTest(remove=ii): # calculate the expected results - _test_files = [data for j, data in enumerate(test_files) if j not in ii] + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): - expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), _test_files, force_zip64=True) + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, force_zip64=True) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') for i in ii: - zh.remove(test_files[i][0]) + zh.remove(self.test_files[i][0]) zh.repack() # check infolist From a4b410b9f6b6160092f073eb4f3aa922f18c8517 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 24 May 2025 23:01:15 +0800 Subject: [PATCH 05/64] Add tests for mode `w` and `x` for `remove()` --- Lib/test/test_zipfile/test_core.py | 61 ++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index a975a43543da20..d884a9042237ff 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1587,31 +1587,68 @@ def test_remove_zip64(self): self.assertIsNone(zh.testzip()) def test_remove_validate(self): - file = 'datafile.txt' - data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' - # closed: error out and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: - zh.writestr(file, data) + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: zh.close() with self.assertRaises(ValueError): - zh.remove(file) + zh.remove(self.test_files[0][0]) # writing: error out and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: - zh.writestr(file, data) + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: with zh.open('newfile.txt', 'w') as fh: with self.assertRaises(ValueError): - zh.remove(file) + zh.remove(self.test_files[0][0]) # mode 'r': error out and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: 
- zh.writestr(file, data) + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'r') as zh: with self.assertRaises(ValueError): - zh.remove(file) + zh.remove(self.test_files[0][0]) + + def test_remove_mode_w(self): + with zipfile.ZipFile(TESTFN, 'w') as zh: + for file, data in self.test_files: + zh.writestr(file, data) + zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + + zh.remove(self.test_files[0][0]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zinfos[1], zinfos[2]], + ) + + # check NameToInfo cache + with self.assertRaises(KeyError): + zh.getinfo(self.test_files[0][0]) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + def test_remove_mode_x(self): + os.remove(TESTFN) + with zipfile.ZipFile(TESTFN, 'w') as zh: + for file, data in self.test_files: + zh.writestr(file, data) + zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + + zh.remove(self.test_files[0][0]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [zinfos[1], zinfos[2]], + ) + + # check NameToInfo cache + with self.assertRaises(KeyError): + zh.getinfo(self.test_files[0][0]) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) class StoredRemoveTests(AbstractRemoveTests, unittest.TestCase): compression = zipfile.ZIP_STORED From a9e85c654c8ba7d3b4be5cbe368c7a1934b0f16c Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 06:03:08 +0800 Subject: [PATCH 06/64] Introduce `_calc_initial_entry_offset` and refactor --- Lib/zipfile/__init__.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 54f3b1f3f6739d..a68d46649be087 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1446,21 +1446,13 @@ def _repack(self, zfile, *, chunk_size=2**20): # doesn't match the actual entry order filelist = sorted(zfile.filelist, key=lambda x: x.header_offset) - # calculate the starting entry offset (bytes to skip) - entry_offset = 0 - try: data_offset = filelist[0].header_offset except IndexError: data_offset = zfile.start_dir - if data_offset > 0: - if self.debug > 2: - print('scanning file signatures before:', data_offset) - for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset): - if self._starts_consecutive_file_entries(fp, pos, data_offset): - entry_offset = data_offset - pos - break + # calculate the starting entry offset (bytes to skip) + entry_offset = self._calc_initial_entry_offset(fp, data_offset) # move file entries for i, info in enumerate(filelist): @@ -1502,6 +1494,15 @@ def _repack(self, zfile, *, chunk_size=2**20): zfile.start_dir -= entry_offset zfile._didModify = True + def _calc_initial_entry_offset(self, fp, data_offset): + if data_offset > 0: + if self.debug > 2: + print('scanning file signatures before:', data_offset) + for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset): + if self._starts_consecutive_file_entries(fp, pos, data_offset): + return data_offset - pos + return 0 + def _iter_scan_signature(self, fp, signature, start_offset, end_offset, chunk_size=4096): sig_len = len(signature) remainder = b'' From 236cd06084176266bd885b4e9361cb3acb00a298 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 06:17:38 +0800 Subject: [PATCH 07/64] Optimize `_calc_initial_entry_offset` by introducing cache --- Lib/test/test_zipfile/test_core.py | 4 ++++ 
Lib/zipfile/__init__.py | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index d884a9042237ff..6e1bb48d214fbc 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1787,6 +1787,8 @@ def test_repack_file_entry_before_first_file(self): with open(TESTFN, 'wb') as fh: with zipfile.ZipFile(fh, 'w') as zh: zh.writestr('file.txt', b'dummy') + zh.writestr('file2.txt', b'dummy') + zh.writestr('file3.txt', b'dummy') fh.write(b' ') expected_zinfos = self._prepare_zip_from_test_files(fh, test_files) expected_size = os.path.getsize(TESTFN) @@ -1795,6 +1797,8 @@ def test_repack_file_entry_before_first_file(self): with open(TESTFN, 'wb') as fh: with zipfile.ZipFile(fh, 'w') as zh: zh.writestr('file.txt', b'dummy') + zh.writestr('file2.txt', b'dummy') + zh.writestr('file3.txt', b'dummy') fh.write(b' ') zinfos = self._prepare_zip_from_test_files(fh, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index a68d46649be087..4723e4de1b8652 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1495,11 +1495,12 @@ def _repack(self, zfile, *, chunk_size=2**20): zfile._didModify = True def _calc_initial_entry_offset(self, fp, data_offset): + checked_offsets = set() if data_offset > 0: if self.debug > 2: print('scanning file signatures before:', data_offset) for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset): - if self._starts_consecutive_file_entries(fp, pos, data_offset): + if self._starts_consecutive_file_entries(fp, pos, data_offset, checked_offsets): return data_offset - pos return 0 @@ -1528,13 +1529,23 @@ def _iter_scan_signature(self, fp, signature, start_offset, end_offset, chunk_si remainder = chunk[-(sig_len - 1):] pos += read_size - def _starts_consecutive_file_entries(self, fp, start_offset, end_offset): + def _starts_consecutive_file_entries(self, fp, start_offset, end_offset, checked_offsets): offset = start_offset while offset < end_offset: if self.debug > 2: print('checking local file entry:', offset) + # Cache checked offsets to improve performance by failing + # subsequent (possible) file entry offsets early. They are + # rechecked only when proven false eventually. 
+ if offset in checked_offsets: + if self.debug > 2: + print('skipping checked:', offset) + return False + else: + checked_offsets.add(offset) + fp.seek(offset) try: fheader = self._read_local_file_header(fp) From bdc58c7111f5346cc1e20800fb4b7dd69f31eeed Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 06:51:51 +0800 Subject: [PATCH 08/64] Introduce `_validate_local_file_entry` and refactor --- Lib/zipfile/__init__.py | 111 +++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 53 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 4723e4de1b8652..2b5115ad6dd0f4 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1500,7 +1500,9 @@ def _calc_initial_entry_offset(self, fp, data_offset): if self.debug > 2: print('scanning file signatures before:', data_offset) for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset): - if self._starts_consecutive_file_entries(fp, pos, data_offset, checked_offsets): + if self.debug > 2: + print('checking file signature at:', pos) + if self._validate_local_file_entry_sequence(fp, pos, data_offset, checked_offsets): return data_offset - pos return 0 @@ -1529,12 +1531,12 @@ def _iter_scan_signature(self, fp, signature, start_offset, end_offset, chunk_si remainder = chunk[-(sig_len - 1):] pos += read_size - def _starts_consecutive_file_entries(self, fp, start_offset, end_offset, checked_offsets): + def _validate_local_file_entry_sequence(self, fp, start_offset, end_offset, checked_offsets): offset = start_offset while offset < end_offset: if self.debug > 2: - print('checking local file entry:', offset) + print('checking local file entry at:', offset) # Cache checked offsets to improve performance by failing # subsequent (possible) file entry offsets early. They are @@ -1546,69 +1548,72 @@ def _starts_consecutive_file_entries(self, fp, start_offset, end_offset, checked else: checked_offsets.add(offset) - fp.seek(offset) - try: - fheader = self._read_local_file_header(fp) - except BadZipFile: + entry_size = self._validate_local_file_entry(fp, offset, end_offset) + if entry_size is None: return False + offset += entry_size - # Create a dummy ZipInfo to utilize parsing. - # Flush only the required information. - zinfo = ZipInfo() - zinfo.header_offset = offset - zinfo.flag_bits = fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] - zinfo.compress_size = fheader[_FH_COMPRESSED_SIZE] - zinfo.file_size = fheader[_FH_UNCOMPRESSED_SIZE] - zinfo.CRC = fheader[_FH_CRC] - - filename = fp.read(fheader[_FH_FILENAME_LENGTH]) - zinfo.extra = fp.read(fheader[_FH_EXTRA_FIELD_LENGTH]) - pos = fp.tell() + return offset == end_offset - if pos > end_offset: - return False + def _validate_local_file_entry(self, fp, offset, end_offset): + fp.seek(offset) + try: + fheader = self._read_local_file_header(fp) + except BadZipFile: + return None - try: - zinfo._decodeExtra(crc32(filename)) # parse zip64 - except BadZipFile: - return False + # Create a dummy ZipInfo to utilize parsing. + # Flush only the required information. 
+ zinfo = ZipInfo() + zinfo.header_offset = offset + zinfo.flag_bits = fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] + zinfo.compress_size = fheader[_FH_COMPRESSED_SIZE] + zinfo.file_size = fheader[_FH_UNCOMPRESSED_SIZE] + zinfo.CRC = fheader[_FH_CRC] - data_descriptor_size = 0 + filename = fp.read(fheader[_FH_FILENAME_LENGTH]) + zinfo.extra = fp.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + pos = fp.tell() - if zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: - # According to the spec, these fields should be zero when data - # descriptor is used. Otherwise treat as a false positive on - # random bytes to return early, as scanning for data descriptor - # is rather intensive. - if not (zinfo.CRC == zinfo.compress_size == zinfo.file_size == 0): - return False + if pos > end_offset: + return None - zip64 = ( - fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff or - fheader[_FH_COMPRESSED_SIZE] == 0xffffffff - ) + try: + zinfo._decodeExtra(crc32(filename)) # parse zip64 + except BadZipFile: + return None - dd = self._scan_data_descriptor(fp, pos, end_offset, zip64) + data_descriptor_size = 0 - if dd is None: - return False + if zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: + # According to the spec, these fields should be zero when data + # descriptor is used. Otherwise treat as a false positive on + # random bytes to return early, as scanning for data descriptor + # is rather intensive. + if not (zinfo.CRC == zinfo.compress_size == zinfo.file_size == 0): + return None + + zip64 = ( + fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff or + fheader[_FH_COMPRESSED_SIZE] == 0xffffffff + ) - crc, compress_size, file_size, data_descriptor_size = dd - zinfo.CRC = crc - zinfo.compress_size = compress_size - zinfo.file_size = file_size + dd = self._scan_data_descriptor(fp, pos, end_offset, zip64) - offset += ( - sizeFileHeader + - fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] + - zinfo.compress_size + - data_descriptor_size - ) + if dd is None: + return None - if self.debug > 2: - print('next', offset) + crc, compress_size, file_size, data_descriptor_size = dd + zinfo.CRC = crc + zinfo.compress_size = compress_size + zinfo.file_size = file_size - return offset == end_offset + return ( + sizeFileHeader + + fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] + + zinfo.compress_size + + data_descriptor_size + ) def _read_local_file_header(self, fp): fheader = fp.read(sizeFileHeader) From c3c834505ff0017a2ffd56c264ed81444cd65780 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 07:30:36 +0800 Subject: [PATCH 09/64] Introduce `_debug` and refactor --- Lib/zipfile/__init__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 2b5115ad6dd0f4..031bd35e062349 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1372,6 +1372,10 @@ class _ZipRepacker: def __init__(self, debug=0): self.debug = debug # Level of printing: 0 through 3 + def _debug(self, level, *msg): + if self.debug >= level: + print(*msg) + def repack(self, zfile): """ Repack the ZIP file, removing unrecorded local file entries and random @@ -1497,11 +1501,9 @@ def _repack(self, zfile, *, chunk_size=2**20): def _calc_initial_entry_offset(self, fp, data_offset): checked_offsets = set() if data_offset > 0: - if self.debug > 2: - print('scanning file signatures before:', data_offset) + self._debug(3, 'scanning file signatures before:', data_offset) for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset): - if 
self.debug > 2: - print('checking file signature at:', pos) + self._debug(3, 'checking file signature at:', pos) if self._validate_local_file_entry_sequence(fp, pos, data_offset, checked_offsets): return data_offset - pos return 0 @@ -1535,15 +1537,13 @@ def _validate_local_file_entry_sequence(self, fp, start_offset, end_offset, chec offset = start_offset while offset < end_offset: - if self.debug > 2: - print('checking local file entry at:', offset) + self._debug(3, 'checking local file entry at:', offset) # Cache checked offsets to improve performance by failing # subsequent (possible) file entry offsets early. They are # rechecked only when proven false eventually. if offset in checked_offsets: - if self.debug > 2: - print('skipping checked:', offset) + self._debug(3, 'skipping checked:', offset) return False else: checked_offsets.add(offset) From 1b7d75a4ea4e3f8db516bfa74fb56ac2ccd61cf2 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 08:14:37 +0800 Subject: [PATCH 10/64] Introduce `_move_entry_data` and rework chunk_size passing --- Lib/zipfile/__init__.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 031bd35e062349..ee5949d58ee578 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1369,8 +1369,9 @@ def close(self): class _ZipRepacker: """Class for ZipFile repacking.""" - def __init__(self, debug=0): + def __init__(self, *, chunk_size=2**20, debug=0): self.debug = debug # Level of printing: 0 through 3 + self.chunk_size = chunk_size def _debug(self, level, *msg): if self.debug >= level: @@ -1443,7 +1444,7 @@ def repack(self, zfile): with zfile._lock: self._repack(zfile) - def _repack(self, zfile, *, chunk_size=2**20): + def _repack(self, zfile): fp = zfile.fp # get a sorted filelist by header offset, in case the dir order @@ -1473,14 +1474,7 @@ def _repack(self, zfile, *, chunk_size=2**20): if entry_offset > 0: old_header_offset = info.header_offset info.header_offset -= entry_offset - read_size = 0 - while read_size < used_entry_size: - fp.seek(old_header_offset + read_size) - data = fp.read(min(used_entry_size - read_size, chunk_size)) - fp.seek(info.header_offset + read_size) - fp.write(data) - fp.flush() - read_size += len(data) + self._move_entry_data(fp, old_header_offset, info.header_offset, used_entry_size) if info._end_offset is not None: info._end_offset = info.header_offset + used_entry_size @@ -1700,6 +1694,16 @@ def _calc_local_file_entry_size(self, fp, zinfo): dd_size ) + def _move_entry_data(self, fp, old_offset, new_offset, size): + read_size = 0 + while read_size < size: + fp.seek(old_offset + read_size) + data = fp.read(min(size - read_size, self.chunk_size)) + fp.seek(new_offset + read_size) + fp.write(data) + fp.flush() + read_size += len(data) + class ZipFile: """ Class with methods to open, read, write, close, list zip files. 
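As a standalone illustration of the chunked copy that the new `_move_entry_data` helper performs (read a chunk at the old offset, write it at the new, lower offset, repeat until `size` bytes are moved), here is a minimal sketch under the same assumptions as the patch; the `move_data` name and the `io.BytesIO` demo are illustrative only and not part of the patch:

import io

def move_data(fp, old_offset, new_offset, size, chunk_size=2**20):
    # Copy `size` bytes from old_offset to new_offset in chunks.
    # Front-to-back copying is safe here because repacking only ever
    # moves entry data toward the start of the file (new_offset < old_offset).
    copied = 0
    while copied < size:
        fp.seek(old_offset + copied)
        data = fp.read(min(size - copied, chunk_size))
        fp.seek(new_offset + copied)
        fp.write(data)
        fp.flush()
        copied += len(data)

buf = io.BytesIO(b'....PK-entry-bytes')
move_data(buf, 4, 0, 14, chunk_size=4)
assert buf.getvalue()[:14] == b'PK-entry-bytes'

Passing `chunk_size` at construction time (rather than per call) keeps the buffer size a single repacker-wide setting, which is what the reworked `__init__` signature in this patch provides.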
From 51c9254cfb738eddec9e14d82e301aa0456e02c5 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 09:24:27 +0800 Subject: [PATCH 11/64] Refactor `_validate_local_file_entry` --- Lib/zipfile/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index ee5949d58ee578..3dd511c7e8e62c 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1593,7 +1593,8 @@ def _validate_local_file_entry(self, fp, offset, end_offset): ) dd = self._scan_data_descriptor(fp, pos, end_offset, zip64) - + if dd is None: + dd = self._scan_data_descriptor_no_sig(fp, pos, end_offset, zip64) if dd is None: return None @@ -1636,7 +1637,7 @@ def _scan_data_descriptor(self, fp, offset, end_offset, zip64): return crc, compress_size, file_size, dd_size - return self._scan_data_descriptor_no_sig(fp, offset, end_offset, zip64) + return None def _scan_data_descriptor_no_sig(self, fp, offset, end_offset, zip64, chunk_size=8192): dd_fmt = ' Date: Sun, 25 May 2025 09:39:52 +0800 Subject: [PATCH 12/64] Add `strict_descriptor` option --- Lib/test/test_zipfile/test_core.py | 39 ++++++++++++++++++++++++++++++ Lib/zipfile/__init__.py | 5 ++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 6e1bb48d214fbc..2395c60befd5b0 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1950,6 +1950,45 @@ def test_repack_data_descriptor_no_sig(self): # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + def test_repack_data_descriptor_no_sig_strict(self): + """Should skip data descriptor without signature when `strict_descriptor` is set.""" + for ii in ([0], [0, 1]): + with self.subTest(remove=ii): + # calculate the expected results + with open(TESTFN, 'wb') as fh: + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) + with zipfile.ZipFile(TESTFN, 'a') as zh: + for i in ii: + zh.remove(self.test_files[i][0]) + expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + # make sure data descriptor bit is really set (by making zipfile unseekable) + for zi in zh.infolist(): + self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') + + for i in ii: + zh.remove(self.test_files[i][0]) + zh.repack(strict_descriptor=True) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + def test_repack_data_descriptor_no_sig_and_zip64(self): """Should correctly handle file entries using data descriptor without signature and zip64.""" for ii in ([0], [0, 1], [1], [2]): diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 3dd511c7e8e62c..3a7aaa1d0f9459 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1369,9 +1369,10 @@ def close(self): class _ZipRepacker: """Class for ZipFile repacking.""" - def __init__(self, 
*, chunk_size=2**20, debug=0): + def __init__(self, *, chunk_size=2**20, strict_descriptor=False, debug=0): self.debug = debug # Level of printing: 0 through 3 self.chunk_size = chunk_size + self.strict_descriptor = strict_descriptor def _debug(self, level, *msg): if self.debug >= level: @@ -1593,7 +1594,7 @@ def _validate_local_file_entry(self, fp, offset, end_offset): ) dd = self._scan_data_descriptor(fp, pos, end_offset, zip64) - if dd is None: + if dd is None and not self.strict_descriptor: dd = self._scan_data_descriptor_no_sig(fp, pos, end_offset, zip64) if dd is None: return None From 8f0a504b205bc4e84f9a7b12497d03edd91064f5 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 15:48:21 +0800 Subject: [PATCH 13/64] Fix and improve validation tests - Separate individual validation tests. - Check underlying repacker not called in validation. - Use `unlink` to prevent FileNotFoundError. - Fix mode 'x' test. --- Lib/test/test_zipfile/test_core.py | 54 +++++++++++++++--------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 2395c60befd5b0..563ce2c6154b60 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1586,23 +1586,22 @@ def test_remove_zip64(self): # make sure the zip file is still valid self.assertIsNone(zh.testzip()) - def test_remove_validate(self): - # closed: error out and do nothing - zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) + def test_remove_closed(self): + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: zh.close() with self.assertRaises(ValueError): zh.remove(self.test_files[0][0]) - # writing: error out and do nothing - zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) + def test_remove_writing(self): + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: with zh.open('newfile.txt', 'w') as fh: with self.assertRaises(ValueError): zh.remove(self.test_files[0][0]) - # mode 'r': error out and do nothing - zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) + def test_remove_mode_r(self): + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'r') as zh: with self.assertRaises(ValueError): zh.remove(self.test_files[0][0]) @@ -1629,8 +1628,8 @@ def test_remove_mode_w(self): self.assertIsNone(zh.testzip()) def test_remove_mode_x(self): - os.remove(TESTFN) - with zipfile.ZipFile(TESTFN, 'w') as zh: + unlink(TESTFN) + with zipfile.ZipFile(TESTFN, 'x') as zh: for file, data in self.test_files: zh.writestr(file, data) zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] @@ -2025,45 +2024,46 @@ def test_repack_data_descriptor_no_sig_and_zip64(self): # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) - def test_repack_validate(self): - file = 'datafile.txt' - data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' - - # closed: error out and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: - zh.writestr(file, data) + @mock.patch('zipfile._ZipRepacker') + def test_repack_closed(self, m_repack): + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: zh.close() with self.assertRaises(ValueError): zh.repack() + m_repack.assert_not_called() - # writing: error out and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: - zh.writestr(file, data) + 
@mock.patch('zipfile._ZipRepacker') + def test_repack_writing(self, m_repack): + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: with zh.open('newfile.txt', 'w') as fh: with self.assertRaises(ValueError): zh.repack() + m_repack.assert_not_called() - # mode 'r': error out and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: - zh.writestr(file, data) + @mock.patch('zipfile._ZipRepacker') + def test_repack_mode_r(self, m_repack): + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'r') as zh: with self.assertRaises(ValueError): zh.repack() + m_repack.assert_not_called() - # mode 'w': error out and do nothing - with zipfile.ZipFile(TESTFN, 'w') as zh: - zh.writestr(file, data) + @mock.patch('zipfile._ZipRepacker') + def test_repack_mode_w(self, m_repack): with zipfile.ZipFile(TESTFN, 'w') as zh: with self.assertRaises(ValueError): zh.repack() + m_repack.assert_not_called() - # mode 'x': error out and do nothing - os.remove(TESTFN) + @mock.patch('zipfile._ZipRepacker') + def test_repack_mode_x(self, m_repack): + unlink(TESTFN) with zipfile.ZipFile(TESTFN, 'x') as zh: with self.assertRaises(ValueError): zh.repack() + m_repack.assert_not_called() class StoredRepackTests(AbstractRepackTests, unittest.TestCase): compression = zipfile.ZIP_STORED From 0cb8682dfa1fdc14c0b3167ea938db44dc4f154d Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 20:20:08 +0800 Subject: [PATCH 14/64] Remove obsolete NameToInfo updating --- Lib/zipfile/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 3a7aaa1d0f9459..5b15c02fd619ca 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1484,11 +1484,6 @@ def _repack(self, zfile): if used_entry_size < entry_size: entry_offset += entry_size - used_entry_size - # Avoid missing entry if entries have a duplicated name. - # Reverse the order as NameToInfo normally stores the last added one. 
- for info in reversed(zfile.filelist): - zfile.NameToInfo.setdefault(info.filename, info) - # update state zfile.start_dir -= entry_offset zfile._didModify = True From a788a001a4ba5b1453ad02268060629c72ccbc0b Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 17:26:28 +0800 Subject: [PATCH 15/64] Use `zinfo` rather than `info` --- Lib/zipfile/__init__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 5b15c02fd619ca..7adced683f8e9f 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1461,24 +1461,24 @@ def _repack(self, zfile): entry_offset = self._calc_initial_entry_offset(fp, data_offset) # move file entries - for i, info in enumerate(filelist): + for i, zinfo in enumerate(filelist): # get the total size of the entry try: offset = filelist[i + 1].header_offset except IndexError: offset = zfile.start_dir - entry_size = offset - info.header_offset + entry_size = offset - zinfo.header_offset - used_entry_size = self._calc_local_file_entry_size(fp, info) + used_entry_size = self._calc_local_file_entry_size(fp, zinfo) # update the header and move entry data to the new position if entry_offset > 0: - old_header_offset = info.header_offset - info.header_offset -= entry_offset - self._move_entry_data(fp, old_header_offset, info.header_offset, used_entry_size) + old_header_offset = zinfo.header_offset + zinfo.header_offset -= entry_offset + self._move_entry_data(fp, old_header_offset, zinfo.header_offset, used_entry_size) - if info._end_offset is not None: - info._end_offset = info.header_offset + used_entry_size + if zinfo._end_offset is not None: + zinfo._end_offset = zinfo.header_offset + used_entry_size # update entry_offset for subsequent files to follow if used_entry_size < entry_size: From ae01b8cd69f53a509b5782ad080c9254ae13edb1 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 17:28:40 +0800 Subject: [PATCH 16/64] Raise on overlapping file blocks --- Lib/test/test_zipfile/test_core.py | 15 +++++++++++++ Lib/zipfile/__init__.py | 35 +++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 563ce2c6154b60..8fddf5ab1c51b9 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2024,6 +2024,21 @@ def test_repack_data_descriptor_no_sig_and_zip64(self): # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + def test_repack_overlapping_blocks(self): + for ii in ([0], [1], [2]): + with self.subTest(remove=ii): + self._prepare_zip_from_test_files(TESTFN, self.test_files) + with open(TESTFN, 'r+b') as fh: + with zipfile.ZipFile(fh, 'a') as zh: + zh.writestr('file.txt', b'dummy') + for i in ii: + zh.infolist()[i].file_size += 50 + zh.infolist()[i].compress_size += 50 + + with zipfile.ZipFile(TESTFN, 'a') as zh: + with self.assertRaises(zipfile.BadZipFile): + zh.repack() + @mock.patch('zipfile._ZipRepacker') def test_repack_closed(self, m_repack): self._prepare_zip_from_test_files(TESTFN, self.test_files) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 7adced683f8e9f..16636bf1bb7d6d 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1452,25 +1452,40 @@ def _repack(self, zfile): # doesn't match the actual entry order filelist = sorted(zfile.filelist, key=lambda x: x.header_offset) - try: - data_offset = filelist[0].header_offset - except IndexError: - 
data_offset = zfile.start_dir - - # calculate the starting entry offset (bytes to skip) - entry_offset = self._calc_initial_entry_offset(fp, data_offset) - - # move file entries + # calculate each entry size and validate + entry_size_list = [] + used_entry_size_list = [] for i, zinfo in enumerate(filelist): - # get the total size of the entry try: offset = filelist[i + 1].header_offset except IndexError: offset = zfile.start_dir entry_size = offset - zinfo.header_offset + # may raise on an invalid local file header used_entry_size = self._calc_local_file_entry_size(fp, zinfo) + self._debug(3, i, zinfo.orig_filename, entry_size, used_entry_size) + if used_entry_size > entry_size: + raise BadZipFile( + f"Overlapped entries: {zinfo.orig_filename!r} " + f"(possible zip bomb)") + + entry_size_list.append(entry_size) + used_entry_size_list.append(used_entry_size) + + # calculate the starting entry offset (bytes to skip) + try: + data_offset = filelist[0].header_offset + except IndexError: + data_offset = zfile.start_dir + entry_offset = self._calc_initial_entry_offset(fp, data_offset) + + # move file entries + for i, zinfo in enumerate(filelist): + entry_size = entry_size_list[i] + used_entry_size = used_entry_size_list[i] + # update the header and move entry data to the new position if entry_offset > 0: old_header_offset = zinfo.header_offset From edee2033c414f07dac405168a98c9f87104fded8 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 25 May 2025 21:31:15 +0800 Subject: [PATCH 17/64] Rework writing protection - Set `_writing` to prevent `open('w').write()` during repacking. - Move the protection logic to `ZipFile.repack()`. --- Lib/zipfile/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 16636bf1bb7d6d..74f5108cea1eca 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1442,10 +1442,6 @@ def repack(self, zfile): [recorded local file entry 1] ... """ - with zfile._lock: - self._repack(zfile) - - def _repack(self, zfile): fp = zfile.fp # get a sorted filelist by header offset, in case the dir order @@ -2271,7 +2267,12 @@ def repack(self, **opts): "Can't write to ZIP archive while an open writing handle exists" ) - _ZipRepacker(**opts).repack(self) + with self._lock: + self._writing = True + try: + _ZipRepacker(**opts).repack(self) + finally: + self._writing = False @classmethod def _sanitize_windows_name(cls, arcname, pathsep): From 555ac78aec9487131785d0645cba935c1cd411d2 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Mon, 26 May 2025 00:22:21 +0800 Subject: [PATCH 18/64] Update doc --- Doc/library/zipfile.rst | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 047512bc88e07c..e09f81366597bb 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -520,8 +520,15 @@ ZipFile Objects .. method:: ZipFile.remove(zinfo_or_arcname) - Removes a member from the archive. *zinfo_or_arcname* is either the full - path of the member, or a :class:`ZipInfo` instance. + Removes a member from the archive. *zinfo_or_arcname* may be the full path + of the member or a :class:`ZipInfo` instance. + + If multiple members share the same full path, only one is removed when + a path is provided. + + This does not physically remove the local file entry from the archive; + the ZIP file size remains unchanged. Use :meth:`ZipFile.repack` afterwards + to reclaim space. 
The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``. @@ -530,9 +537,20 @@ ZipFile Objects .. versionadded:: next -.. method:: ZipFile.repack() +.. method:: ZipFile.repack(*, strict_descriptor=False[, chunk_size]) + + Rewrites the archive to remove local file entries that are no longer + referenced, shrinking the ZIP file size. + + ``strict_descriptor=True`` can be provided to skip the slower scan for an + unsigned data descriptor (deprecated in the latest ZIP specification and is + only used by legacy tools) when checking for bytes resembling a valid local + file entry before the first referenced entry. This improves performance, + but may cause some stale local file entries to be preserved, as any entry + using an unsigned descriptor cannot be detected. - Repack a zip file and physically remove non-referenced file entries. + *chunk_size* may be specified to control the buffer size when moving + entry data (default is 1 MiB). The archive must be opened with mode ``'a'``. From 95fde316ab56be7099c3454b741dade89509a96c Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Mon, 26 May 2025 22:22:35 +0800 Subject: [PATCH 19/64] Fix typo --- Lib/test/test_zipfile/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 8fddf5ab1c51b9..91e969f3372258 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1776,7 +1776,7 @@ def test_repack_file_entry_before_first_file(self): """Should preserve seemingly valid file entries not forming consecutive valid file entries until the first recorded local file entry. - This may happen whan a self-extractor contains an uncompressed ZIP + This may happen when a self-extractor contains an uncompressed ZIP library. 
(simulated by writing a ZIP file in this test) """ for ii in ([], [0], [0, 1], [0, 1, 2]): From 8a448e452b761bfa29c779766bc46efade541bb5 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Mon, 26 May 2025 22:32:50 +0800 Subject: [PATCH 20/64] Add test for bytes between file entries --- Lib/test/test_zipfile/test_core.py | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 91e969f3372258..2ad756511b1bd1 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1817,6 +1817,38 @@ def test_repack_file_entry_before_first_file(self): # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + def test_repack_bytes_between_files(self): + """Should remove bytes between local file entries.""" + for ii in ([1], [1, 2], [2]): + with self.subTest(remove=ii): + # calculate the expected results + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] + expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w', self.compression) as zh: + for i, (file, data) in enumerate(self.test_files): + zh.writestr(file, data) + fh.write(b' dummy bytes ') + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + for i in ii: + zh.remove(self.test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # make sure the zip file is still valid + self.assertIsNone(zh.testzip()) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + def test_repack_zip64(self): """Should correctly handle file entries with zip64.""" for ii in ([0], [0, 1], [1], [2]): From 4c35eb262d8727846e03bfa121ba13e62c54faa3 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Mon, 26 May 2025 22:54:09 +0800 Subject: [PATCH 21/64] Check `testzip()` after zip file closed --- Lib/test/test_zipfile/test_core.py | 109 +++++++++++++++++------------ 1 file changed, 65 insertions(+), 44 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 2ad756511b1bd1..4ab38a0cb4cf3f 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1415,7 +1415,8 @@ def test_remove_by_name(self): with self.assertRaises(KeyError): zh.getinfo(self.test_files[i][0]) - # make sure the zip file is still valid + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) def test_remove_by_zinfo(self): @@ -1435,7 +1436,8 @@ def test_remove_by_zinfo(self): with self.assertRaises(KeyError): zh.getinfo(self.test_files[i][0]) - # make sure the zip file is still valid + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) def test_remove_by_name_nonexist(self): @@ -1477,7 +1479,8 @@ def test_remove_by_name_duplicated(self): zinfos[0], ) - # make sure the zip file is still valid + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) @@ -1495,7 +1498,8 @@ def test_remove_by_name_duplicated(self): with self.assertRaises(KeyError): zh.getinfo('file.txt') - # make sure the zip file is still valid + # make sure the zip file is still valid + 
with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) def test_remove_by_zinfo_duplicated(self): @@ -1525,7 +1529,8 @@ def test_remove_by_zinfo_duplicated(self): zinfos[1], ) - # make sure the zip file is still valid + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) @@ -1544,7 +1549,8 @@ def test_remove_by_zinfo_duplicated(self): zinfos[0], ) - # make sure the zip file is still valid + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) @@ -1563,7 +1569,8 @@ def test_remove_by_zinfo_duplicated(self): with self.assertRaises(KeyError): zh.getinfo('file.txt') - # make sure the zip file is still valid + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) def test_remove_zip64(self): @@ -1583,8 +1590,9 @@ def test_remove_zip64(self): with self.assertRaises(KeyError): zh.getinfo(self.test_files[i][0]) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) def test_remove_closed(self): self._prepare_zip_from_test_files(TESTFN, self.test_files) @@ -1624,7 +1632,8 @@ def test_remove_mode_w(self): with self.assertRaises(KeyError): zh.getinfo(self.test_files[0][0]) - # make sure the zip file is still valid + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) def test_remove_mode_x(self): @@ -1646,7 +1655,8 @@ def test_remove_mode_x(self): with self.assertRaises(KeyError): zh.getinfo(self.test_files[0][0]) - # make sure the zip file is still valid + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) class StoredRemoveTests(AbstractRemoveTests, unittest.TestCase): @@ -1701,12 +1711,13 @@ def test_repack_basic(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_bytes_before_first_file(self): """Should preserve random bytes before the first recorded local file entry.""" for ii in ([], [0], [0, 1], [0, 1, 2]): @@ -1733,12 +1744,13 @@ def test_repack_bytes_before_first_file(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_magic_before_first_file(self): """Should preserve random signature bytes not forming a valid file entry before the first recorded local file entry.""" @@ -1766,12 +1778,13 @@ def test_repack_magic_before_first_file(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_file_entry_before_first_file(self): """Should preserve seemingly valid file entries not forming 
consecutive valid file entries until the first recorded local file entry. @@ -1811,12 +1824,13 @@ def test_repack_file_entry_before_first_file(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_bytes_between_files(self): """Should remove bytes between local file entries.""" for ii in ([1], [1, 2], [2]): @@ -1843,12 +1857,13 @@ def test_repack_bytes_between_files(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_zip64(self): """Should correctly handle file entries with zip64.""" for ii in ([0], [0, 1], [1], [2]): @@ -1871,12 +1886,13 @@ def test_repack_zip64(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_data_descriptor(self): """Should correctly handle file entries using data descriptor.""" for ii in ([0], [0, 1], [1], [2]): @@ -1905,12 +1921,13 @@ def test_repack_data_descriptor(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_data_descriptor_and_zip64(self): """Should correctly handle file entries using data descriptor and zip64.""" for ii in ([0], [0, 1], [1], [2]): @@ -1939,12 +1956,13 @@ def test_repack_data_descriptor_and_zip64(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_data_descriptor_no_sig(self): """Should correctly handle file entries using data descriptor without signature.""" for ii in ([0], [0, 1], [1], [2]): @@ -1975,12 +1993,13 @@ def test_repack_data_descriptor_no_sig(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_data_descriptor_no_sig_strict(self): """Should skip data descriptor without signature when `strict_descriptor` is set.""" for ii in ([0], [0, 1]): @@ -2014,12 +2033,13 @@ def test_repack_data_descriptor_no_sig_strict(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_data_descriptor_no_sig_and_zip64(self): """Should correctly handle file entries using data 
descriptor without signature and zip64.""" for ii in ([0], [0, 1], [1], [2]): @@ -2050,12 +2070,13 @@ def test_repack_data_descriptor_no_sig_and_zip64(self): expected_zinfos, ) - # make sure the zip file is still valid - self.assertIsNone(zh.testzip()) - # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_overlapping_blocks(self): for ii in ([0], [1], [2]): with self.subTest(remove=ii): From 926338cbc1c7ebb6bcaf8107fd35ce9b6931b807 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Mon, 26 May 2025 20:49:28 +0800 Subject: [PATCH 22/64] Support `repack(removed)` --- Doc/library/zipfile.rst | 21 ++++-- Lib/test/test_zipfile/test_core.py | 112 +++++++++++++++++++++++++++++ Lib/zipfile/__init__.py | 60 +++++++++++----- 3 files changed, 171 insertions(+), 22 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index e09f81366597bb..72fb35b08cf19d 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -527,20 +527,33 @@ ZipFile Objects a path is provided. This does not physically remove the local file entry from the archive; - the ZIP file size remains unchanged. Use :meth:`ZipFile.repack` afterwards + the ZIP file size remains unchanged. Call :meth:`ZipFile.repack` afterwards to reclaim space. The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``. + Returns the removed :class:`ZipInfo` instance. + Calling :meth:`remove` on a closed ZipFile will raise a :exc:`ValueError`. .. versionadded:: next -.. method:: ZipFile.repack(*, strict_descriptor=False[, chunk_size]) +.. method:: ZipFile.repack(removed=None, *, \ + strict_descriptor=False[, chunk_size]) + + Rewrites the archive to remove stale local file entries, shrinking the ZIP + file size. + + If *removed* is provided, it must be a sequence of :class:`ZipInfo` objects + representing removed entries; only their corresponding local file entries + will be removed. - Rewrites the archive to remove local file entries that are no longer - referenced, shrinking the ZIP file size. + If *removed* is not provided, local file entries no longer referenced in the + central directory will be removed. The algorithm assumes that local file + entries are stored consecutively. Extra bytes between entries will also be + removed. Data before the first referenced entry is preserved unless it + appears to be a sequence of consecutive local file entries. 
``strict_descriptor=True`` can be provided to skip the slower scan for an unsigned data descriptor (deprecated in the latest ZIP specification and is diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 4ab38a0cb4cf3f..28840e62db96f4 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2092,6 +2092,118 @@ def test_repack_overlapping_blocks(self): with self.assertRaises(zipfile.BadZipFile): zh.repack() + def test_repack_removed_basic(self): + """Should remove local file entries for provided deleted files.""" + ln = len(self.test_files) + iii = (ii for n in range(1, ln + 1) for ii in itertools.combinations(range(ln), n)) + for ii in iii: + with self.subTest(remove=ii): + # calculate the expected results + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] + expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zinfos = [zh.remove(self.test_files[i][0]) for i in ii] + zh.repack(zinfos) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + + def test_repack_removed_partial(self): + """Should remove local file entries only for provided deleted files.""" + ln = len(self.test_files) + iii = (ii for n in range(1, ln + 1) for ii in itertools.combinations(range(ln), n)) + for ii in iii: + with self.subTest(removed=ii): + # calculate the expected results + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] + expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + for zi in zh.infolist().copy(): + zh.remove(zi) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zinfos = [zh.remove(self.test_files[i][0]) for i, _ in enumerate(self.test_files)] + zh.repack([zinfos[i] for i in ii]) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + [], + ) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + + def test_repack_removed_bytes_between_files(self): + """Should not remove bytes between local file entries.""" + # calculate the expected results + for ii in ([0], [1], [2]): + with self.subTest(removed=ii): + expected_zinfos = [] + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w', self.compression) as zh: + for j, (file, data) in enumerate(self.test_files): + if j not in ii: + zh.writestr(file, data) + expected_zinfos.append(ComparableZipInfo(zh.getinfo(file))) + fh.write(b' dummy bytes ') + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w', self.compression) as zh: + for i, (file, data) in enumerate(self.test_files): + zh.writestr(file, data) + fh.write(b' dummy bytes 
') + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zinfos = [zh.remove(self.test_files[i][0]) for i in ii] + zh.repack(zinfos) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + + def test_repack_removed_bad_removed_zinfos(self): + """Should raise when providing non-removed zinfos.""" + # calculate the expected results + for ii in ([0], [1], [2]): + with self.subTest(removed=ii): + self._prepare_zip_from_test_files(TESTFN, self.test_files) + with zipfile.ZipFile(TESTFN, 'a') as zh: + zinfos = [zh.getinfo(self.test_files[i][0]) for i in ii] + with self.assertRaises(zipfile.BadZipFile): + zh.repack(zinfos) + @mock.patch('zipfile._ZipRepacker') def test_repack_closed(self, m_repack): self._prepare_zip_from_test_files(TESTFN, self.test_files) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 74f5108cea1eca..6ad4a3bd5f91b7 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1378,7 +1378,7 @@ def _debug(self, level, *msg): if self.debug >= level: print(*msg) - def repack(self, zfile): + def repack(self, zfile, removed=None): """ Repack the ZIP file, removing unrecorded local file entries and random bytes not listed in the central directory. @@ -1442,11 +1442,14 @@ def repack(self, zfile): [recorded local file entry 1] ... """ + removed_zinfos = set(removed or ()) + fp = zfile.fp # get a sorted filelist by header offset, in case the dir order # doesn't match the actual entry order - filelist = sorted(zfile.filelist, key=lambda x: x.header_offset) + filelist = (*zfile.filelist, *removed_zinfos) + filelist = sorted(filelist, key=lambda x: x.header_offset) # calculate each entry size and validate entry_size_list = [] @@ -1464,18 +1467,20 @@ def repack(self, zfile): self._debug(3, i, zinfo.orig_filename, entry_size, used_entry_size) if used_entry_size > entry_size: raise BadZipFile( - f"Overlapped entries: {zinfo.orig_filename!r} " - f"(possible zip bomb)") + f"Overlapped entries: {zinfo.orig_filename!r} ") entry_size_list.append(entry_size) used_entry_size_list.append(used_entry_size) # calculate the starting entry offset (bytes to skip) - try: - data_offset = filelist[0].header_offset - except IndexError: - data_offset = zfile.start_dir - entry_offset = self._calc_initial_entry_offset(fp, data_offset) + if removed is None: + try: + data_offset = filelist[0].header_offset + except IndexError: + data_offset = zfile.start_dir + entry_offset = self._calc_initial_entry_offset(fp, data_offset) + else: + entry_offset = 0 # move file entries for i, zinfo in enumerate(filelist): @@ -1483,17 +1488,34 @@ def repack(self, zfile): used_entry_size = used_entry_size_list[i] # update the header and move entry data to the new position - if entry_offset > 0: + if zinfo in removed_zinfos: old_header_offset = zinfo.header_offset zinfo.header_offset -= entry_offset - self._move_entry_data(fp, old_header_offset, zinfo.header_offset, used_entry_size) + self._move_entry_data( + fp, + old_header_offset + used_entry_size, + zinfo.header_offset, + entry_size - used_entry_size + ) - if zinfo._end_offset is not None: - zinfo._end_offset = zinfo.header_offset + used_entry_size + if zinfo._end_offset is not None: + zinfo._end_offset = zinfo.header_offset - # update entry_offset for subsequent files to follow - if 
used_entry_size < entry_size: - entry_offset += entry_size - used_entry_size + # update entry_offset for subsequent files to follow + entry_offset += used_entry_size + + else: + if entry_offset > 0: + old_header_offset = zinfo.header_offset + zinfo.header_offset -= entry_offset + self._move_entry_data(fp, old_header_offset, zinfo.header_offset, used_entry_size) + + if zinfo._end_offset is not None: + zinfo._end_offset = zinfo.header_offset + used_entry_size + + # update entry_offset for subsequent files to follow + if used_entry_size < entry_size: + entry_offset += entry_size - used_entry_size # update state zfile.start_dir -= entry_offset @@ -2250,7 +2272,9 @@ def remove(self, zinfo_or_arcname): self._didModify = True - def repack(self, **opts): + return zinfo + + def repack(self, removed=None, **opts): """Repack a zip file, removing non-referenced file entries. The archive must be opened with mode 'a', as mode 'w'/'x' do not @@ -2270,7 +2294,7 @@ def repack(self, **opts): with self._lock: self._writing = True try: - _ZipRepacker(**opts).repack(self) + _ZipRepacker(**opts).repack(self, removed) finally: self._writing = False From e76f9a1bc10ac09faffcc09ca254403cd0623856 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Tue, 27 May 2025 00:18:52 +0800 Subject: [PATCH 23/64] Fix bytes between entries be removed when `removed` is passed --- Lib/test/test_zipfile/test_core.py | 3 +++ Lib/zipfile/__init__.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 28840e62db96f4..24c466132c571d 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1846,6 +1846,7 @@ def test_repack_bytes_between_files(self): for i, (file, data) in enumerate(self.test_files): zh.writestr(file, data) fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: zh.remove(self.test_files[i][0]) @@ -2168,6 +2169,7 @@ def test_repack_removed_bytes_between_files(self): zh.writestr(file, data) expected_zinfos.append(ComparableZipInfo(zh.getinfo(file))) fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() expected_size = os.path.getsize(TESTFN) # do the removal and check the result @@ -2176,6 +2178,7 @@ def test_repack_removed_bytes_between_files(self): for i, (file, data) in enumerate(self.test_files): zh.writestr(file, data) fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: zinfos = [zh.remove(self.test_files[i][0]) for i in ii] zh.repack(zinfos) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 6ad4a3bd5f91b7..1a27fbeb354490 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1469,6 +1469,9 @@ def repack(self, zfile, removed=None): raise BadZipFile( f"Overlapped entries: {zinfo.orig_filename!r} ") + if removed is not None and zinfo not in removed_zinfos: + used_entry_size = entry_size + entry_size_list.append(entry_size) used_entry_size_list.append(used_entry_size) From 93f4c25d393423c00ba400f7b010d912b836e7d3 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Tue, 27 May 2025 07:48:07 +0800 Subject: [PATCH 24/64] Fix bad test code --- Lib/test/test_zipfile/test_core.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 24c466132c571d..3075461d7001fb 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ 
-1590,9 +1590,9 @@ def test_remove_zip64(self): with self.assertRaises(KeyError): zh.getinfo(self.test_files[i][0]) - # make sure the zip file is still valid - with zipfile.ZipFile(TESTFN) as zh: - self.assertIsNone(zh.testzip()) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) def test_remove_closed(self): self._prepare_zip_from_test_files(TESTFN, self.test_files) @@ -1861,9 +1861,9 @@ def test_repack_bytes_between_files(self): # check file size self.assertEqual(os.path.getsize(TESTFN), expected_size) - # make sure the zip file is still valid - with zipfile.ZipFile(TESTFN) as zh: - self.assertIsNone(zh.testzip()) + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) def test_repack_zip64(self): """Should correctly handle file entries with zip64.""" @@ -2158,9 +2158,9 @@ def test_repack_removed_partial(self): def test_repack_removed_bytes_between_files(self): """Should not remove bytes between local file entries.""" - # calculate the expected results for ii in ([0], [1], [2]): with self.subTest(removed=ii): + # calculate the expected results expected_zinfos = [] with open(TESTFN, 'wb') as fh: with zipfile.ZipFile(fh, 'w', self.compression) as zh: @@ -2198,7 +2198,6 @@ def test_repack_removed_bytes_between_files(self): def test_repack_removed_bad_removed_zinfos(self): """Should raise when providing non-removed zinfos.""" - # calculate the expected results for ii in ([0], [1], [2]): with self.subTest(removed=ii): self._prepare_zip_from_test_files(TESTFN, self.test_files) From 9e94209e94e65d7ddd73c6c691d8af9a94f039c8 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Tue, 27 May 2025 08:27:17 +0800 Subject: [PATCH 25/64] Revise docstring --- Lib/zipfile/__init__.py | 62 ++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 1a27fbeb354490..e23ed0809e8081 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1380,67 +1380,71 @@ def _debug(self, level, *msg): def repack(self, zfile, removed=None): """ - Repack the ZIP file, removing unrecorded local file entries and random - bytes not listed in the central directory. + Repack the ZIP file, stripping unreferenced local file entries. - Assumes that local file entries are written consecutively without gaps. + Assumes that local file entries are stored consecutively, with no gaps + or overlaps. - Truncation is applied in two phases: + Stripping occurs in two phases: 1. Before the first recorded file entry: - If a sequence of valid local file entries (starting with - `PK\x03\x04`) is found leading up to the first recorded entry, - it is truncated. + `PK\x03\x04`) is found immediately before the first recorded + entry, it is stripped. - Otherwise, all leading bytes are preserved (e.g., in cases such - as self-extracting code or embedded ZIP libraries). + as self-extracting archives or embedded ZIP payloads). 2. Between or after the recorded entries: - - Any data between two recorded entries, or after the last recorded - entry but before the central directory, is removed—regardless of - whether it resembles a valid entry. + - Any bytes between two recorded entries, or between the last + recorded and the central directory, are removed—regardless of + whether they resemble valid entries. 
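The checks described above start from hits of the `PK\x03\x04` local file header signature. A minimal standalone sketch of that scanning step (illustrative only, not the helper used by this patch):

    import io

    LFH_SIG = b'PK\x03\x04'  # local file header signature

    def scan_signatures(fp, sig=LFH_SIG):
        # whole-buffer sketch; the real scanner reads fixed-size chunks and
        # keeps a small overlap so hits straddling chunk boundaries survive
        data = fp.read()
        pos = data.find(sig)
        while pos != -1:
            yield pos
            pos = data.find(sig, pos + 1)

    print(list(scan_signatures(io.BytesIO(b'PK\x03\x04..junk..PK\x03\x04'))))  # [0, 12]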
- ### Examples + Examples: - Truncation before first recorded entry: + Stripping before first recorded entry: [random bytes] - [unrecorded local file entry 1] - [unrecorded local file entry 2] + [unreferenced local file entry 1] + [unreferenced local file entry 2] [random bytes] - <- truncation start - [unrecorded local file entry 3] - [unrecorded local file entry 4] - <- truncation end + <-- stripping start + [unreferenced local file entry 3] + [unreferenced local file entry 4] + <-- stripping end [recorded local file entry 1] ... [central directory] - Truncation between recorded entries: + Stripping between recorded entries: ... [recorded local file entry 5] - <- truncation start + <-- stripping start [random bytes] - [unrecorded local file entry] + [unreferenced local file entry] [random bytes] - <- truncation end + <-- stripping end [recorded local file entry 6] ... [recorded local file entry n] - <- truncation start - [unrecorded local file entry] - <- truncation end + <-- stripping start + [unreferenced local file entry] + <-- stripping end [central directory] - No truncation case: + No stripping: - [unrecorded local file entry 1] - [unrecorded local file entry 2] + [unreferenced local file entry 1] + [unreferenced local file entry 2] ... - [unrecorded local file entry n] + [unreferenced local file entry n] [random bytes] [recorded local file entry 1] ... + + removed: None or a sequence of ZipInfo instances representing removed + entries. When provided, only their corresponding local file + entries are stripped. """ removed_zinfos = set(removed or ()) From 3ef72c68be70ad0f63f3cc864121a03049dd7fda Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Wed, 28 May 2025 11:37:02 +0800 Subject: [PATCH 26/64] Add `tearDown` for tests --- Lib/test/test_zipfile/test_core.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 3075461d7001fb..bbd62804b61f37 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1398,6 +1398,9 @@ def setUpClass(self): ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), ] + def tearDown(self): + unlink(TESTFN) + def test_remove_by_name(self): for i in range(0, 3): with self.subTest(i=i, filename=self.test_files[i][0]): @@ -1637,7 +1640,6 @@ def test_remove_mode_w(self): self.assertIsNone(zh.testzip()) def test_remove_mode_x(self): - unlink(TESTFN) with zipfile.ZipFile(TESTFN, 'x') as zh: for file, data in self.test_files: zh.writestr(file, data) @@ -1687,6 +1689,9 @@ def setUpClass(self): ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), ] + def tearDown(self): + unlink(TESTFN) + def test_repack_basic(self): """Should remove local file entries for deleted files.""" ln = len(self.test_files) @@ -2241,7 +2246,6 @@ def test_repack_mode_w(self, m_repack): @mock.patch('zipfile._ZipRepacker') def test_repack_mode_x(self, m_repack): - unlink(TESTFN) with zipfile.ZipFile(TESTFN, 'x') as zh: with self.assertRaises(ValueError): zh.repack() From fbf75888fcec8589029f13a0af16022c5c318bc0 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Wed, 28 May 2025 11:44:36 +0800 Subject: [PATCH 27/64] Rename methods and parameters --- Lib/test/test_zipfile/test_core.py | 8 ++++---- Lib/zipfile/__init__.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index bbd62804b61f37..26081a860157d6 100644 --- 
a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1391,8 +1391,8 @@ def _prepare_zip_from_test_files(self, zfname, test_files, force_zip64=False): class AbstractRemoveTests(RepackHelperMixin): @classmethod - def setUpClass(self): - self.test_files = [ + def setUpClass(cls): + cls.test_files = [ ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), @@ -1682,8 +1682,8 @@ class ZstdRemoveTests(AbstractRemoveTests, unittest.TestCase): class AbstractRepackTests(RepackHelperMixin): @classmethod - def setUpClass(self): - self.test_files = [ + def setUpClass(cls): + cls.test_files = [ ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index e23ed0809e8081..9be6e862c9717f 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1498,7 +1498,7 @@ def repack(self, zfile, removed=None): if zinfo in removed_zinfos: old_header_offset = zinfo.header_offset zinfo.header_offset -= entry_offset - self._move_entry_data( + self._copy_bytes( fp, old_header_offset + used_entry_size, zinfo.header_offset, @@ -1515,7 +1515,7 @@ def repack(self, zfile, removed=None): if entry_offset > 0: old_header_offset = zinfo.header_offset zinfo.header_offset -= entry_offset - self._move_entry_data(fp, old_header_offset, zinfo.header_offset, used_entry_size) + self._copy_bytes(fp, old_header_offset, zinfo.header_offset, used_entry_size) if zinfo._end_offset is not None: zinfo._end_offset = zinfo.header_offset + used_entry_size @@ -1731,7 +1731,7 @@ def _calc_local_file_entry_size(self, fp, zinfo): dd_size ) - def _move_entry_data(self, fp, old_offset, new_offset, size): + def _copy_bytes(self, fp, old_offset, new_offset, size): read_size = 0 while read_size < size: fp.seek(old_offset + read_size) From 81a419aaf79b411da88aaa0df577b038d96ffe0f Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 29 May 2025 00:03:43 +0800 Subject: [PATCH 28/64] Adjust parameter order --- Lib/zipfile/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 9be6e862c9717f..5260be685aec88 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1369,7 +1369,7 @@ def close(self): class _ZipRepacker: """Class for ZipFile repacking.""" - def __init__(self, *, chunk_size=2**20, strict_descriptor=False, debug=0): + def __init__(self, *, strict_descriptor=False, chunk_size=2**20, debug=0): self.debug = debug # Level of printing: 0 through 3 self.chunk_size = chunk_size self.strict_descriptor = strict_descriptor From c62a4556ff9d86c3a969fa932886520327d2a62c Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 29 May 2025 01:15:52 +0800 Subject: [PATCH 29/64] Optimize code and revise comment - According to ZIP spec, both uncompressed and compressed size should be 0xffffffff when zip64 is used. 
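A small self-contained illustration of the rule cited above (the header values are invented; this is not code from the patch). Both 32-bit size fields carry the 0xffffffff sentinel when ZIP64 is in effect, so checking a single field is enough:

    import struct

    # 30-byte local file header: signature, version, flags, method, time, date,
    # CRC, compressed size, uncompressed size, name length, extra field length
    fheader = struct.pack('<4s5H3L2H', b'PK\x03\x04', 45, 0x08, 8, 0, 0,
                          0, 0xffffffff, 0xffffffff, 8, 0)
    csize, usize = struct.unpack_from('<LL', fheader, 18)  # size fields start at offset 18
    assert csize == usize == 0xffffffff  # the real sizes live in the ZIP64 extra field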
--- Lib/zipfile/__init__.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 5260be685aec88..c1256a93fcac4c 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1619,14 +1619,11 @@ def _validate_local_file_entry(self, fp, offset, end_offset): # According to the spec, these fields should be zero when data # descriptor is used. Otherwise treat as a false positive on # random bytes to return early, as scanning for data descriptor - # is rather intensive. + # is rather expensive. if not (zinfo.CRC == zinfo.compress_size == zinfo.file_size == 0): return None - zip64 = ( - fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff or - fheader[_FH_COMPRESSED_SIZE] == 0xffffffff - ) + zip64 = fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff dd = self._scan_data_descriptor(fp, pos, end_offset, zip64) if dd is None and not self.strict_descriptor: From a05353c47674af813388ffcb415e67dbc3146e11 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 30 May 2025 01:43:25 +0800 Subject: [PATCH 30/64] Improve debug for `_ZipRepacker.repack()` --- Lib/zipfile/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index c1256a93fcac4c..29cedfc92a3675 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1468,7 +1468,7 @@ def repack(self, zfile, removed=None): # may raise on an invalid local file header used_entry_size = self._calc_local_file_entry_size(fp, zinfo) - self._debug(3, i, zinfo.orig_filename, entry_size, used_entry_size) + self._debug(3, i, zinfo.orig_filename, zinfo.header_offset, entry_size, used_entry_size) if used_entry_size > entry_size: raise BadZipFile( f"Overlapped entries: {zinfo.orig_filename!r} ") From 3d0240c1a3eab2890509b1d0d96e980a2b744d40 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Thu, 29 May 2025 20:09:06 +0800 Subject: [PATCH 31/64] Rework `_validate_local_file_entry_sequence` to return size or None --- Lib/zipfile/__init__.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 29cedfc92a3675..f17b3eee539a2c 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1529,13 +1529,14 @@ def repack(self, zfile, removed=None): zfile._didModify = True def _calc_initial_entry_offset(self, fp, data_offset): - checked_offsets = set() + checked_offsets = {} if data_offset > 0: self._debug(3, 'scanning file signatures before:', data_offset) for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset): self._debug(3, 'checking file signature at:', pos) - if self._validate_local_file_entry_sequence(fp, pos, data_offset, checked_offsets): - return data_offset - pos + entry_size = self._validate_local_file_entry_sequence(fp, pos, data_offset, checked_offsets) + if entry_size == data_offset - pos: + return entry_size return 0 def _iter_scan_signature(self, fp, signature, start_offset, end_offset, chunk_size=4096): @@ -1569,21 +1570,20 @@ def _validate_local_file_entry_sequence(self, fp, start_offset, end_offset, chec while offset < end_offset: self._debug(3, 'checking local file entry at:', offset) - # Cache checked offsets to improve performance by failing - # subsequent (possible) file entry offsets early. They are - # rechecked only when proven false eventually. + # Cache checked offsets to improve performance. 
if offset in checked_offsets: - self._debug(3, 'skipping checked:', offset) - return False + self._debug(3, 'read from checked cache:', offset) + entry_size = checked_offsets[offset] else: - checked_offsets.add(offset) + entry_size = self._validate_local_file_entry(fp, offset, end_offset) + checked_offsets[offset] = entry_size - entry_size = self._validate_local_file_entry(fp, offset, end_offset) if entry_size is None: - return False + break + offset += entry_size - return offset == end_offset + return offset - start_offset def _validate_local_file_entry(self, fp, offset, end_offset): fp.seek(offset) From 31c4c936c6e5451f939e52818c1c35a742b31130 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 30 May 2025 01:46:06 +0800 Subject: [PATCH 32/64] Rework `_validate_local_file_entry_sequence` to allow passing no `checked_offsets` --- Lib/zipfile/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index f17b3eee539a2c..9c645ac731d170 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1564,19 +1564,21 @@ def _iter_scan_signature(self, fp, signature, start_offset, end_offset, chunk_si remainder = chunk[-(sig_len - 1):] pos += read_size - def _validate_local_file_entry_sequence(self, fp, start_offset, end_offset, checked_offsets): + def _validate_local_file_entry_sequence(self, fp, start_offset, end_offset, checked_offsets=None): offset = start_offset while offset < end_offset: self._debug(3, 'checking local file entry at:', offset) # Cache checked offsets to improve performance. - if offset in checked_offsets: - self._debug(3, 'read from checked cache:', offset) + try: entry_size = checked_offsets[offset] - else: + except (KeyError, TypeError): entry_size = self._validate_local_file_entry(fp, offset, end_offset) - checked_offsets[offset] = entry_size + if checked_offsets is not None: + checked_offsets[offset] = entry_size + else: + self._debug(3, 'read from checked cache:', offset) if entry_size is None: break From f8fade17a0335e3d4139788f84855907cf595424 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 30 May 2025 20:29:27 +0800 Subject: [PATCH 33/64] Introduce `_scan_data_descriptor_no_sig_by_decompression` --- Lib/test/test_zipfile/test_core.py | 44 ++++++++++++++++++++++ Lib/zipfile/__init__.py | 60 +++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 26081a860157d6..8a834f53f67e58 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2008,6 +2008,9 @@ def test_repack_data_descriptor_no_sig(self): def test_repack_data_descriptor_no_sig_strict(self): """Should skip data descriptor without signature when `strict_descriptor` is set.""" + if self.compression not in (zipfile.ZIP_STORED, zipfile.ZIP_LZMA): + self.skipTest('require unsupported decompression method') + for ii in ([0], [0, 1]): with self.subTest(remove=ii): # calculate the expected results @@ -2046,6 +2049,47 @@ def test_repack_data_descriptor_no_sig_strict(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) + def test_repack_data_descriptor_no_sig_strict_by_decompressoin(self): + """Should correctly handle file entries using data descriptor without signature + through decompression.""" + if self.compression in (zipfile.ZIP_STORED, zipfile.ZIP_LZMA): + self.skipTest('require supported decompression method') + + for ii in ([0], [0, 1]): + with 
self.subTest(remove=ii): + # calculate the expected results + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] + with open(TESTFN, 'wb') as fh: + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + # make sure data descriptor bit is really set (by making zipfile unseekable) + for zi in zh.infolist(): + self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') + + for i in ii: + zh.remove(self.test_files[i][0]) + zh.repack(strict_descriptor=True) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_data_descriptor_no_sig_and_zip64(self): """Should correctly handle file entries using data descriptor without signature and zip64.""" for ii in ([0], [0, 1], [1], [2]): diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 9c645ac731d170..5098799a9ab6f2 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1628,8 +1628,14 @@ def _validate_local_file_entry(self, fp, offset, end_offset): zip64 = fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff dd = self._scan_data_descriptor(fp, pos, end_offset, zip64) - if dd is None and not self.strict_descriptor: - dd = self._scan_data_descriptor_no_sig(fp, pos, end_offset, zip64) + if dd is None: + dd = self._scan_data_descriptor_no_sig_by_decompression( + fp, pos, end_offset, zip64, fheader[_FH_COMPRESSION_METHOD]) + if dd is False: + if not self.strict_descriptor: + dd = self._scan_data_descriptor_no_sig(fp, pos, end_offset, zip64) + else: + dd = None if dd is None: return None @@ -1705,6 +1711,56 @@ def _scan_data_descriptor_no_sig(self, fp, offset, end_offset, zip64, chunk_size return None + def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, zip64, method): + dd_fmt = ' end_offset: + return False + + try: + decompressor = _get_decompressor(method) + except NotImplementedError: + return False + + if decompressor is None: + return False + + # Current LZMADecompressor is unreliable since it's `.eof` is usually + # not set as expected. 
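The helper above relies on the decompressor reporting end-of-stream. A standalone sketch of that idea using raw deflate via zlib (an illustration under that assumption, not the patch's code):

    import zlib

    comp = zlib.compressobj(wbits=-15)      # raw deflate, as stored in ZIP entries
    blob = comp.compress(b'hello world' * 100) + comp.flush()
    stream = blob + b'TRAILING'             # e.g. a data descriptor follows the data

    decomp = zlib.decompressobj(wbits=-15)
    fed, end = 0, None
    while fed < len(stream) and end is None:
        chunk = stream[fed:fed + 16]        # feed small chunks, as a streaming scan would
        decomp.decompress(chunk)
        fed += len(chunk)
        if decomp.eof:                      # compressed stream finished inside this chunk
            end = fed - len(decomp.unused_data)

    assert end == len(blob)                 # the compressed data ends exactly here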
+ if isinstance(decompressor, LZMADecompressor): + return False + + try: + pos = self._find_compression_end_offset(fp, offset, end_offset - dd_size, decompressor) + except Exception: + return None + + fp.seek(pos) + dd = fp.read(dd_size) + crc, compress_size, file_size = struct.unpack(dd_fmt, dd) + if pos - offset != compress_size: + return None + + return crc, compress_size, file_size, dd_size + + def _find_compression_end_offset(self, fp, offset, end_offset, decompressor, chunk_size=4096): + fp.seek(offset) + read_size = 0 + while True: + chunk = fp.read(min(chunk_size, end_offset - offset - read_size)) + if not chunk: + raise EOFError('Unexpected EOF while decompressing') + + # may raise on error + decompressor.decompress(chunk) + + read_size += len(chunk) + + if decompressor.eof: + unused_len = len(decompressor.unused_data) + return offset + read_size - unused_len + def _calc_local_file_entry_size(self, fp, zinfo): fp.seek(zinfo.header_offset) fheader = self._read_local_file_header(fp) From c80d21bf991dff0369178f504212642f4000f769 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 30 May 2025 01:47:55 +0800 Subject: [PATCH 34/64] Strip only entries immediately following a referenced entry - The previous implementation might cause [archive decryption header] and/or [archive extra data record] preceeding [central directory] be stripped. --- Doc/library/zipfile.rst | 18 +++-- Lib/test/test_zipfile/test_core.py | 106 +++++++++++++++++++++++++--- Lib/zipfile/__init__.py | 109 ++++++++++++++++++----------- 3 files changed, 178 insertions(+), 55 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 72fb35b08cf19d..382e6d6eee4274 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -550,17 +550,21 @@ ZipFile Objects will be removed. If *removed* is not provided, local file entries no longer referenced in the - central directory will be removed. The algorithm assumes that local file - entries are stored consecutively. Extra bytes between entries will also be - removed. Data before the first referenced entry is preserved unless it - appears to be a sequence of consecutive local file entries. + central directory will be removed. The algorithm assumes that local file + entries are stored consecutively: + #. Data before the first referenced entry is removed only when it appears to + be a sequence of consecutive entries with no extra following bytes; extra + preceeding bytes are preserved. + #. Data between referenced entries is removed only when it appears to + be a sequence of consecutive entries with no extra preceding bytes; extra + following bytes are preserved. ``strict_descriptor=True`` can be provided to skip the slower scan for an unsigned data descriptor (deprecated in the latest ZIP specification and is only used by legacy tools) when checking for bytes resembling a valid local - file entry before the first referenced entry. This improves performance, - but may cause some stale local file entries to be preserved, as any entry - using an unsigned descriptor cannot be detected. + file entry. This improves performance, but may cause some stale local file + entries to be preserved, as any entry using an unsigned descriptor cannot + be detected. *chunk_size* may be specified to control the buffer size when moving entry data (default is 1 MiB). 
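To make the documented flow concrete, a hypothetical usage sketch (the archive and member names are invented for illustration):

    import zipfile

    with zipfile.ZipFile('archive.zip', 'a') as zh:
        # remove() drops the central directory records and returns the ZipInfo
        removed = [zh.remove('stale1.txt'), zh.remove('stale2.txt')]
        # reclaim only the space of those entries...
        zh.repack(removed)
        # ...or call zh.repack() with no argument to strip every local file
        # entry that is no longer referenced by the central directory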
diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 8a834f53f67e58..3c0d9218ceead9 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1836,22 +1836,31 @@ def test_repack_file_entry_before_first_file(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) - def test_repack_bytes_between_files(self): - """Should remove bytes between local file entries.""" + def test_repack_bytes_before_removed_files(self): + """Should preserve if there are bytes before stale local file entries.""" for ii in ([1], [1, 2], [2]): with self.subTest(remove=ii): # calculate the expected results - test_files = [data for j, data in enumerate(self.test_files) if j not in ii] - expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w', self.compression) as zh: + for i, (file, data) in enumerate(self.test_files): + if i == ii[0]: + fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() + zh.writestr(file, data) + for i in ii: + zh.remove(self.test_files[i][0]) + expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: with zipfile.ZipFile(fh, 'w', self.compression) as zh: for i, (file, data) in enumerate(self.test_files): + if i == ii[0]: + fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() zh.writestr(file, data) - fh.write(b' dummy bytes ') - zh.start_dir = fh.tell() with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: zh.remove(self.test_files[i][0]) @@ -1870,6 +1879,87 @@ def test_repack_bytes_between_files(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) + def test_repack_bytes_after_removed_files(self): + """Should keep extra bytes if there are bytes after stale local file entries.""" + for ii in ([1], [1, 2], [2]): + with self.subTest(remove=ii): + # calculate the expected results + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w', self.compression) as zh: + for i, (file, data) in enumerate(self.test_files): + if i not in ii: + zh.writestr(file, data) + if i == ii[-1]: + fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() + expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w', self.compression) as zh: + for i, (file, data) in enumerate(self.test_files): + zh.writestr(file, data) + if i == ii[-1]: + fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + for i in ii: + zh.remove(self.test_files[i][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + + def test_repack_bytes_between_removed_files(self): + """Should strip only local file entries before random bytes.""" + # calculate the expected results + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w', self.compression) as zh: + zh.writestr(*self.test_files[0]) + fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() + zh.writestr(*self.test_files[2]) + zh.remove(self.test_files[2][0]) + expected_zinfos = 
[ComparableZipInfo(zi) for zi in zh.infolist()] + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + with open(TESTFN, 'wb') as fh: + with zipfile.ZipFile(fh, 'w', self.compression) as zh: + zh.writestr(*self.test_files[0]) + zh.writestr(*self.test_files[1]) + fh.write(b' dummy bytes ') + zh.start_dir = fh.tell() + zh.writestr(*self.test_files[2]) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zh.remove(self.test_files[1][0]) + zh.remove(self.test_files[2][0]) + zh.repack() + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + def test_repack_zip64(self): """Should correctly handle file entries with zip64.""" for ii in ([0], [0, 1], [1], [2]): @@ -2011,7 +2101,7 @@ def test_repack_data_descriptor_no_sig_strict(self): if self.compression not in (zipfile.ZIP_STORED, zipfile.ZIP_LZMA): self.skipTest('require unsupported decompression method') - for ii in ([0], [0, 1]): + for ii in ([0], [0, 1], [1], [2]): with self.subTest(remove=ii): # calculate the expected results with open(TESTFN, 'wb') as fh: @@ -2055,7 +2145,7 @@ def test_repack_data_descriptor_no_sig_strict_by_decompressoin(self): if self.compression in (zipfile.ZIP_STORED, zipfile.ZIP_LZMA): self.skipTest('require supported decompression method') - for ii in ([0], [0, 1]): + for ii in ([0], [0, 1], [1], [2]): with self.subTest(remove=ii): # calculate the expected results test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 5098799a9ab6f2..e6c1f5827c48ee 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1385,66 +1385,78 @@ def repack(self, zfile, removed=None): Assumes that local file entries are stored consecutively, with no gaps or overlaps. - Stripping occurs in two phases: + Behavior: - 1. Before the first recorded file entry: - - If a sequence of valid local file entries (starting with - `PK\x03\x04`) is found immediately before the first recorded - entry, it is stripped. - - Otherwise, all leading bytes are preserved (e.g., in cases such - as self-extracting archives or embedded ZIP payloads). + 1. If any referenced entry overlaps with another, a `BadZipFile` error + is raised since safe repacking cannot be guaranteed. - 2. Between or after the recorded entries: - - Any bytes between two recorded entries, or between the last - recorded and the central directory, are removed—regardless of - whether they resemble valid entries. + 2. Data before the first referenced entry is stripped only when it + appears to be a sequence of consecutive entries with no extra + following bytes; extra preceding bytes are preserved. + + 3. Data between referenced entries is stripped only when it appears to + be a sequence of consecutive entries with no extra preceding bytes; + extra following bytes are preserved. + + 4. This is to prevent an unexpected data removal (false positive), + though a false negative may happen in certain rare cases.
Examples: - Stripping before first recorded entry: + Stripping before the first referenced entry: [random bytes] - [unreferenced local file entry 1] - [unreferenced local file entry 2] + [unreferenced local file entry] [random bytes] <-- stripping start - [unreferenced local file entry 3] - [unreferenced local file entry 4] + [unreferenced local file entry] + [unreferenced local file entry] <-- stripping end - [recorded local file entry 1] + [local file entry 1] (or central directory) ... - [central directory] - Stripping between recorded entries: + Stripping between referenced entries: ... - [recorded local file entry 5] + [local file entry] <-- stripping start + [unreferenced local file entry] + [unreferenced local file entry] + <-- stripping end [random bytes] [unreferenced local file entry] [random bytes] - <-- stripping end - [recorded local file entry 6] + [local file entry] (or central directory) ... - [recorded local file entry n] - <-- stripping start + + No stripping: + [unreferenced local file entry] - <-- stripping end - [central directory] + [random bytes] + [local file entry 1] (or central directory) + ... No stripping: - [unreferenced local file entry 1] - [unreferenced local file entry 2] ... - [unreferenced local file entry n] + [local file entry] [random bytes] - [recorded local file entry 1] + [unreferenced local file entry] + [local file entry] (or central directory) ... - removed: None or a sequence of ZipInfo instances representing removed - entries. When provided, only their corresponding local file - entries are stripped. + Side effects: + - Modifies the ZIP file in place. + - Updates zfile.start_dir to account for removed data. + - Sets zfile._didModify to True. + - Adjusts header_offset and _end_offset of referenced ZipInfo + instances. + + Parameters: + zfile: A ZipFile object representing the archive to repack. + removed: Optional. A sequence of ZipInfo instances representing + the previously removed entries. When provided, only their + corresponding local file entries are stripped. 
""" removed_zinfos = set(removed or ()) @@ -1512,17 +1524,34 @@ def repack(self, zfile, removed=None): entry_offset += used_entry_size else: + old_header_offset = zinfo.header_offset + zinfo.header_offset -= entry_offset + if entry_offset > 0: - old_header_offset = zinfo.header_offset - zinfo.header_offset -= entry_offset self._copy_bytes(fp, old_header_offset, zinfo.header_offset, used_entry_size) - if zinfo._end_offset is not None: - zinfo._end_offset = zinfo.header_offset + used_entry_size - - # update entry_offset for subsequent files to follow if used_entry_size < entry_size: - entry_offset += entry_size - used_entry_size + stale_entry_size = self._validate_local_file_entry_sequence( + fp, + old_header_offset + used_entry_size, + old_header_offset + entry_size, + ) + else: + stale_entry_size = 0 + + if stale_entry_size > 0: + self._copy_bytes( + fp, + old_header_offset + used_entry_size + stale_entry_size, + zinfo.header_offset + used_entry_size, + entry_size - used_entry_size - stale_entry_size, + ) + + # update entry_offset for subsequent files to follow + entry_offset += stale_entry_size + + if zinfo._end_offset is not None: + zinfo._end_offset = zinfo.header_offset + entry_size - stale_entry_size # update state zfile.start_dir -= entry_offset From e1caea9b30a917e375e3dd481ba53c211afcb2b0 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 30 May 2025 20:59:52 +0800 Subject: [PATCH 35/64] Adjust method names --- Lib/test/test_zipfile64.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_zipfile64.py b/Lib/test/test_zipfile64.py index ba943719fcc64e..35a290a1ac3436 100644 --- a/Lib/test/test_zipfile64.py +++ b/Lib/test/test_zipfile64.py @@ -111,16 +111,16 @@ def _write_large_file(self, fh): (num, self.datacount)), file=sys.__stdout__) sys.__stdout__.flush() - def test_clean_removed_large_file(self): + def test_strip_removed_large_file(self): """Should move the physical data of a file positioned after a large removed file without causing a memory issue.""" # Try the temp file. If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. with TemporaryFile() as f: - self._test_clean_removed_large_file(f) + self._test_strip_removed_large_file(f) self.assertFalse(f.closed) - def _test_clean_removed_large_file(self, f): + def _test_strip_removed_large_file(self, f): file = 'file.txt' file1 = 'largefile.txt' data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' @@ -134,16 +134,16 @@ def _test_clean_removed_large_file(self, f): zh.repack() self.assertIsNone(zh.testzip()) - def test_clean_removed_file_before_large_file(self): + def test_strip_removed_file_before_large_file(self): """Should move the physical data of a large file positioned after a removed file without causing a memory issue.""" # Try the temp file. If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. 
with TemporaryFile() as f: - self._test_clean_removed_file_before_large_file(f) + self._test_strip_removed_file_before_large_file(f) self.assertFalse(f.closed) - def _test_clean_removed_file_before_large_file(self, f): + def _test_strip_removed_file_before_large_file(self, f): file = 'file.txt' file1 = 'largefile.txt' data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' @@ -157,16 +157,16 @@ def _test_clean_removed_file_before_large_file(self, f): zh.repack() self.assertIsNone(zh.testzip()) - def test_clean_removed_large_file_with_dd(self): + def test_strip_removed_large_file_with_dd(self): """Should scan for the data descriptor of a removed large file without causing a memory issue.""" # Try the temp file. If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. with TemporaryFile() as f: - self._test_clean_removed_large_file_with_dd(f) + self._test_strip_removed_large_file_with_dd(f) self.assertFalse(f.closed) - def _test_clean_removed_large_file_with_dd(self, f): + def _test_strip_removed_large_file_with_dd(self, f): file = 'file.txt' file1 = 'largefile.txt' data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem' @@ -184,16 +184,16 @@ def _test_clean_removed_large_file_with_dd(self, f): zh.repack() self.assertIsNone(zh.testzip()) - def test_clean_removed_large_file_with_dd_no_sig(self): + def test_strip_removed_large_file_with_dd_no_sig(self): """Should scan for the data descriptor (without signature) of a removed large file without causing a memory issue.""" # Try the temp file. If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. with TemporaryFile() as f: - self._test_clean_removed_large_file_with_dd_no_sig(f) + self._test_strip_removed_large_file_with_dd_no_sig(f) self.assertFalse(f.closed) - def _test_clean_removed_large_file_with_dd_no_sig(self, f): + def _test_strip_removed_large_file_with_dd_no_sig(self, f): # Reduce data to 400 MiB for this test, as it's especially slow... self.datacount = 400*1024**2 // len(self.data) From 2b23d46d9503cc6afb5d6927737603d3f3ab16d8 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 30 May 2025 21:05:52 +0800 Subject: [PATCH 36/64] Add memory usage test --- Lib/test/test_zipfile64.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Lib/test/test_zipfile64.py b/Lib/test/test_zipfile64.py index 35a290a1ac3436..f03adf91d5eff3 100644 --- a/Lib/test/test_zipfile64.py +++ b/Lib/test/test_zipfile64.py @@ -13,6 +13,7 @@ import zipfile, unittest import time +import tracemalloc import sys import unittest.mock as mock @@ -99,6 +100,9 @@ def setUp(self): # It will contain enough copies of self.data to reach about 8 GiB. self.datacount = 8*1024**3 // len(self.data) + # memory usage should not exceed 10 MiB + self.allowed_memory = 10*1024**2 + def _write_large_file(self, fh): next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL for num in range(self.datacount): @@ -117,8 +121,12 @@ def test_strip_removed_large_file(self): # Try the temp file. If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. with TemporaryFile() as f: + tracemalloc.start() self._test_strip_removed_large_file(f) self.assertFalse(f.closed) + current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + self.assertLess(peak, self.allowed_memory) def _test_strip_removed_large_file(self, f): file = 'file.txt' @@ -140,8 +148,12 @@ def test_strip_removed_file_before_large_file(self): # Try the temp file. 
If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. with TemporaryFile() as f: + tracemalloc.start() self._test_strip_removed_file_before_large_file(f) self.assertFalse(f.closed) + current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + self.assertLess(peak, self.allowed_memory) def _test_strip_removed_file_before_large_file(self, f): file = 'file.txt' @@ -163,8 +175,12 @@ def test_strip_removed_large_file_with_dd(self): # Try the temp file. If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. with TemporaryFile() as f: + tracemalloc.start() self._test_strip_removed_large_file_with_dd(f) self.assertFalse(f.closed) + current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + self.assertLess(peak, self.allowed_memory) def _test_strip_removed_large_file_with_dd(self, f): file = 'file.txt' @@ -190,8 +206,12 @@ def test_strip_removed_large_file_with_dd_no_sig(self): # Try the temp file. If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. with TemporaryFile() as f: + tracemalloc.start() self._test_strip_removed_large_file_with_dd_no_sig(f) self.assertFalse(f.closed) + current, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + self.assertLess(peak, self.allowed_memory) def _test_strip_removed_large_file_with_dd_no_sig(self, f): # Reduce data to 400 MiB for this test, as it's especially slow... From de4f15bb4aebe5a346d5b143d10e3881c7f5a454 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 30 May 2025 22:49:14 +0800 Subject: [PATCH 37/64] Fix rst --- Doc/library/zipfile.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 382e6d6eee4274..878c92e9dcdecf 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -552,6 +552,7 @@ ZipFile Objects If *removed* is not provided, local file entries no longer referenced in the central directory will be removed. The algorithm assumes that local file entries are stored consecutively: + #. Data before the first referenced entry is removed only when it appears to be a sequence of consecutive entries with no extra following bytes; extra preceeding bytes are preserved. 
From ea3259fba7208ca588e34bc343842bc459cce1c1 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 09:41:57 +0800 Subject: [PATCH 38/64] Optimize code --- Lib/test/test_zipfile/test_core.py | 2 +- Lib/zipfile/__init__.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 3c0d9218ceead9..6cd0d0760bc8f4 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1374,7 +1374,7 @@ def struct_pack_no_dd_sig(fmt, *values): with warnings.catch_warnings(): warnings.simplefilter("ignore") if values[0] == zipfile._DD_SIGNATURE: - return _struct_pack(fmt[0:1] + fmt[2:], *values[1:]) + return _struct_pack(fmt[:1] + fmt[2:], *values[1:]) return _struct_pack(fmt, *values) class RepackHelperMixin: diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index e6c1f5827c48ee..7f3568521e5fd9 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1494,10 +1494,10 @@ def repack(self, zfile, removed=None): # calculate the starting entry offset (bytes to skip) if removed is None: try: - data_offset = filelist[0].header_offset + offset = filelist[0].header_offset except IndexError: - data_offset = zfile.start_dir - entry_offset = self._calc_initial_entry_offset(fp, data_offset) + offset = zfile.start_dir + entry_offset = self._calc_initial_entry_offset(fp, offset) else: entry_offset = 0 @@ -1644,7 +1644,7 @@ def _validate_local_file_entry(self, fp, offset, end_offset): except BadZipFile: return None - data_descriptor_size = 0 + dd_size = 0 if zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: # According to the spec, these fields should be zero when data @@ -1668,16 +1668,13 @@ def _validate_local_file_entry(self, fp, offset, end_offset): if dd is None: return None - crc, compress_size, file_size, data_descriptor_size = dd - zinfo.CRC = crc - zinfo.compress_size = compress_size - zinfo.file_size = file_size + zinfo.CRC, zinfo.compress_size, zinfo.file_size, dd_size = dd return ( sizeFileHeader + fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] + zinfo.compress_size + - data_descriptor_size + dd_size ) def _read_local_file_header(self, fp): From fef92c45a308fe730ddce433286c2ebbfacaac0f Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 11:54:37 +0800 Subject: [PATCH 39/64] Fix and optimize `_iter_scan_signature` --- Lib/zipfile/__init__.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 7f3568521e5fd9..be282abbcda326 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1573,25 +1573,24 @@ def _iter_scan_signature(self, fp, signature, start_offset, end_offset, chunk_si remainder = b'' pos = start_offset - fp.seek(start_offset) while pos < end_offset: - read_size = min(chunk_size, end_offset - pos) - chunk = remainder + fp.read(read_size) - if not chunk: - break + # required for each loop since fp may be changed during each yield + fp.seek(pos) + + chunk = remainder + fp.read(min(chunk_size, end_offset - pos)) + delta = pos - len(remainder) idx = 0 while True: idx = chunk.find(signature, idx) - if idx == -1 or idx + sig_len > len(chunk): + if idx == -1: break - abs_pos = pos - len(remainder) + idx - yield abs_pos + yield delta + idx idx += 1 remainder = chunk[-(sig_len - 1):] - pos += read_size + pos += chunk_size def _validate_local_file_entry_sequence(self, fp, start_offset, end_offset, checked_offsets=None): 
offset = start_offset From 8067b0c4894fcdf72f8723340e11be17721df606 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 10:26:22 +0800 Subject: [PATCH 40/64] Fix `_scan_data_descriptor` --- Lib/zipfile/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index be282abbcda326..e014053bd2e91b 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1694,8 +1694,11 @@ def _scan_data_descriptor(self, fp, offset, end_offset, zip64): fp, struct.pack(' Date: Sun, 1 Jun 2025 11:14:19 +0800 Subject: [PATCH 41/64] Fix and optimize `_scan_data_descriptor_no_sig` --- Lib/zipfile/__init__.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index e014053bd2e91b..a53b5f38c861e8 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1712,30 +1712,28 @@ def _scan_data_descriptor_no_sig(self, fp, offset, end_offset, zip64, chunk_size dd_fmt = ' Date: Sun, 1 Jun 2025 14:13:24 +0800 Subject: [PATCH 42/64] Rename `_trace_compressed_block_end` --- Lib/zipfile/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index a53b5f38c861e8..e9feae02b7add9 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1758,7 +1758,7 @@ def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, return False try: - pos = self._find_compression_end_offset(fp, offset, end_offset - dd_size, decompressor) + pos = self._trace_compressed_block_end(fp, offset, end_offset - dd_size, decompressor) except Exception: return None @@ -1770,7 +1770,7 @@ def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, return crc, compress_size, file_size, dd_size - def _find_compression_end_offset(self, fp, offset, end_offset, decompressor, chunk_size=4096): + def _trace_compressed_block_end(self, fp, offset, end_offset, decompressor, chunk_size=4096): fp.seek(offset) read_size = 0 while True: From 1d5ec6133715cb21bdd8202551c452ce3eef3bd9 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 14:05:09 +0800 Subject: [PATCH 43/64] Fix `_scan_data_descriptor_no_sig_by_decompression` --- Lib/zipfile/__init__.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index e9feae02b7add9..3f65e2712bdb96 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1738,12 +1738,6 @@ def _scan_data_descriptor_no_sig(self, fp, offset, end_offset, zip64, chunk_size return None def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, zip64, method): - dd_fmt = ' end_offset: - return False - try: decompressor = _get_decompressor(method) except NotImplementedError: @@ -1757,6 +1751,12 @@ def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, if isinstance(decompressor, LZMADecompressor): return False + dd_fmt = ' Date: Sun, 1 Jun 2025 09:20:42 +0800 Subject: [PATCH 44/64] Add tests for `_ZipRepacker` --- Lib/test/test_zipfile/test_core.py | 399 +++++++++++++++++++++++++++++ 1 file changed, 399 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 6cd0d0760bc8f4..43e17337a7a8de 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2404,6 +2404,405 @@ class 
LzmaRepackTests(AbstractRepackTests, unittest.TestCase): class ZstdRepackTests(AbstractRepackTests, unittest.TestCase): compression = zipfile.ZIP_ZSTANDARD +class ZipRepackerTests(unittest.TestCase): + def test_iter_scan_signature(self): + bytes_ = b'sig__sig__sig__sig' + ln = len(bytes_) + fp = io.BytesIO(bytes_) + repacker = zipfile._ZipRepacker() + + # basic + self.assertEqual( + list(repacker._iter_scan_signature(fp, b'sig', 0, ln)), + [0, 5, 10, 15], + ) + + # start_offset + self.assertEqual( + list(repacker._iter_scan_signature(fp, b'sig', 1, ln)), + [5, 10, 15], + ) + self.assertEqual( + list(repacker._iter_scan_signature(fp, b'sig', 6, ln)), + [10, 15], + ) + self.assertEqual( + list(repacker._iter_scan_signature(fp, b'sig', 16, ln)), + [], + ) + + # end_offset + self.assertEqual( + list(repacker._iter_scan_signature(fp, b'sig', 0, ln - 1)), + [0, 5, 10], + ) + self.assertEqual( + list(repacker._iter_scan_signature(fp, b'sig', 0, ln - 6)), + [0, 5], + ) + + # chunk_size + self.assertEqual( + list(repacker._iter_scan_signature(fp, b'sig', 0, ln, 3)), + [0, 5, 10, 15], + ) + self.assertEqual( + list(repacker._iter_scan_signature(fp, b'sig', 0, ln, 1)), + [0, 5, 10, 15], + ) + + def test_scan_data_descriptor(self): + import zlib + repacker = zipfile._ZipRepacker() + + # basic + bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), + (zlib.crc32(b'dummy'), 5, 5, 16), + ) + + # return None if no signature + bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), + None, + ) + + # return None if not unpackable + bytes_ = b'PK\x07\x08' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), + None, + ) + + # return None if compressed size not match + bytes_ = b'dummPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), + None, + ) + + # zip64 + bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), True), + (zlib.crc32(b'dummy'), 5, 5, 24), + ) + + # offset + bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 1, len(bytes_), False), + None, + ) + + bytes_ = b'123dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), + None, + ) + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 3, len(bytes_), False), + (zlib.crc32(b'dummy'), 5, 5, 16), + ) + + # end_offset + bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_) - 1, False), + None, + ) + + bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00123' + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_) - 3, False), + (zlib.crc32(b'dummy'), 5, 5, 16), + ) + + def test_scan_data_descriptor_no_sig(self): + import zlib + repacker = zipfile._ZipRepacker() + + # basic + bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + 
repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), + (zlib.crc32(b'dummy'), 5, 5, 12), + ) + + # return None if compressed size not match + bytes_ = b'dumm\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), + None, + ) + + # zip64 + bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), True), + (zlib.crc32(b'dummy'), 5, 5, 20), + ) + + # offset + bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 1, len(bytes_), False), + None, + ) + + bytes_ = b'123dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), + None, + ) + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 3, len(bytes_), False), + (zlib.crc32(b'dummy'), 5, 5, 12), + ) + + # end_offset + bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_) - 1, False), + None, + ) + + bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00123' + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_) - 3, False), + (zlib.crc32(b'dummy'), 5, 5, 12), + ) + + # chunk_size + bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False, 12), + (zlib.crc32(b'dummy'), 5, 5, 12), + ) + self.assertEqual( + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False, 1), + (zlib.crc32(b'dummy'), 5, 5, 12), + ) + + def test_scan_data_descriptor_no_sig_by_decompression(self): + import zlib + import compression.zstd + repacker = zipfile._ZipRepacker() + + for method in ( + zipfile.ZIP_DEFLATED, + zipfile.ZIP_BZIP2, + zipfile.ZIP_ZSTANDARD, + ): + compressor = zipfile._get_compressor(method) + with self.subTest(method=method, compressor=compressor): + comp_bytes = compressor.compress(b'dummy') + comp_bytes += compressor.flush() + comp_len = len(comp_bytes) + + # basic + bytes_ = comp_bytes + b'\x3f\xf2\xf4\x4f' + struct.pack(' Date: Sun, 1 Jun 2025 15:21:55 +0800 Subject: [PATCH 45/64] Remove unneeded import --- Lib/test/test_zipfile/test_core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 43e17337a7a8de..17c84d4ea7e1ab 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2588,7 +2588,6 @@ def test_scan_data_descriptor_no_sig(self): def test_scan_data_descriptor_no_sig_by_decompression(self): import zlib - import compression.zstd repacker = zipfile._ZipRepacker() for method in ( From 578c7c8b6a14f8ba022e49f41b531222a7067cc7 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 15:40:07 +0800 Subject: [PATCH 46/64] Add requirements --- Lib/test/test_zipfile/test_core.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 17c84d4ea7e1ab..ae76790cfe8c1c 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2451,6 +2451,7 @@ def 
test_iter_scan_signature(self): [0, 5, 10, 15], ) + @requires_zlib() def test_scan_data_descriptor(self): import zlib repacker = zipfile._ZipRepacker() @@ -2520,6 +2521,7 @@ def test_scan_data_descriptor(self): (zlib.crc32(b'dummy'), 5, 5, 16), ) + @requires_zlib() def test_scan_data_descriptor_no_sig(self): import zlib repacker = zipfile._ZipRepacker() @@ -2586,6 +2588,9 @@ def test_scan_data_descriptor_no_sig(self): (zlib.crc32(b'dummy'), 5, 5, 12), ) + @requires_zlib() + @requires_bz2() + @requires_zstd() def test_scan_data_descriptor_no_sig_by_decompression(self): import zlib repacker = zipfile._ZipRepacker() @@ -2686,6 +2691,9 @@ def test_scan_data_descriptor_no_sig_by_decompression_invalid(self): False, ) + @requires_zlib() + @requires_bz2() + @requires_zstd() def test_trace_compressed_block_end(self): import zlib import compression.zstd From c470c33389d71a9c2a5ebbb0410f5dd1d2158f7d Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 15:48:01 +0800 Subject: [PATCH 47/64] Fix `_scan_data_descriptor_no_sig_by_decompression` when library not available --- Lib/zipfile/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 3f65e2712bdb96..e38cbcbe1dd0d9 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1740,7 +1740,7 @@ def _scan_data_descriptor_no_sig(self, fp, offset, end_offset, zip64, chunk_size def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, zip64, method): try: decompressor = _get_decompressor(method) - except NotImplementedError: + except (NotImplementedError, RuntimeError): return False if decompressor is None: From b1dcb07f33a85ff88f0d3c5d2a117af53a33b4d4 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 15:54:37 +0800 Subject: [PATCH 48/64] Test with pre-calculated CRC --- Lib/test/test_zipfile/test_core.py | 32 +++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index ae76790cfe8c1c..406167ae47c207 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2451,16 +2451,14 @@ def test_iter_scan_signature(self): [0, 5, 10, 15], ) - @requires_zlib() def test_scan_data_descriptor(self): - import zlib repacker = zipfile._ZipRepacker() # basic bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), - (zlib.crc32(b'dummy'), 5, 5, 16), + (0x4ff4f23f, 5, 5, 16), ) # return None if no signature @@ -2488,7 +2486,7 @@ def test_scan_data_descriptor(self): bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00' self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), True), - (zlib.crc32(b'dummy'), 5, 5, 24), + (0x4ff4f23f, 5, 5, 24), ) # offset @@ -2505,7 +2503,7 @@ def test_scan_data_descriptor(self): ) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 3, len(bytes_), False), - (zlib.crc32(b'dummy'), 5, 5, 16), + (0x4ff4f23f, 5, 5, 16), ) # end_offset @@ -2518,19 +2516,17 @@ def test_scan_data_descriptor(self): bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00123' self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_) - 3, False), - (zlib.crc32(b'dummy'), 5, 5, 16), + (0x4ff4f23f, 5, 5, 16), ) - @requires_zlib() def 
test_scan_data_descriptor_no_sig(self): - import zlib repacker = zipfile._ZipRepacker() # basic bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), - (zlib.crc32(b'dummy'), 5, 5, 12), + (0x4ff4f23f, 5, 5, 12), ) # return None if compressed size not match @@ -2544,7 +2540,7 @@ def test_scan_data_descriptor_no_sig(self): bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00' self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), True), - (zlib.crc32(b'dummy'), 5, 5, 20), + (0x4ff4f23f, 5, 5, 20), ) # offset @@ -2561,7 +2557,7 @@ def test_scan_data_descriptor_no_sig(self): ) self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 3, len(bytes_), False), - (zlib.crc32(b'dummy'), 5, 5, 12), + (0x4ff4f23f, 5, 5, 12), ) # end_offset @@ -2574,18 +2570,18 @@ def test_scan_data_descriptor_no_sig(self): bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00123' self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_) - 3, False), - (zlib.crc32(b'dummy'), 5, 5, 12), + (0x4ff4f23f, 5, 5, 12), ) # chunk_size bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False, 12), - (zlib.crc32(b'dummy'), 5, 5, 12), + (0x4ff4f23f, 5, 5, 12), ) self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False, 1), - (zlib.crc32(b'dummy'), 5, 5, 12), + (0x4ff4f23f, 5, 5, 12), ) @requires_zlib() @@ -2611,7 +2607,7 @@ def test_scan_data_descriptor_no_sig_by_decompression(self): self.assertEqual( repacker._scan_data_descriptor_no_sig_by_decompression( io.BytesIO(bytes_), 0, len(bytes_), False, method), - (zlib.crc32(b'dummy'), comp_len, 5, 12), + (0x4ff4f23f, comp_len, 5, 12), ) # return None if insufficient data length @@ -2642,7 +2638,7 @@ def test_scan_data_descriptor_no_sig_by_decompression(self): self.assertEqual( repacker._scan_data_descriptor_no_sig_by_decompression( io.BytesIO(bytes_), 0, len(bytes_), True, method), - (zlib.crc32(b'dummy'), comp_len, 5, 20), + (0x4ff4f23f, comp_len, 5, 20), ) # offset @@ -2657,7 +2653,7 @@ def test_scan_data_descriptor_no_sig_by_decompression(self): self.assertEqual( repacker._scan_data_descriptor_no_sig_by_decompression( io.BytesIO(bytes_), 3, len(bytes_), False, method), - (zlib.crc32(b'dummy'), comp_len, 5, 12), + (0x4ff4f23f, comp_len, 5, 12), ) # end_offset @@ -2672,7 +2668,7 @@ def test_scan_data_descriptor_no_sig_by_decompression(self): self.assertEqual( repacker._scan_data_descriptor_no_sig_by_decompression( io.BytesIO(bytes_), 0, len(bytes_) - 2, False, method), - (zlib.crc32(b'dummy'), comp_len, 5, 12), + (0x4ff4f23f, comp_len, 5, 12), ) def test_scan_data_descriptor_no_sig_by_decompression_invalid(self): From 04cddef2f71fd8c51a72673b337b7e5113e51e6f Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 15:57:16 +0800 Subject: [PATCH 49/64] Remove unneeded import --- Lib/test/test_zipfile/test_core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 406167ae47c207..c3dc3a3525431c 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2588,7 +2588,6 @@ def test_scan_data_descriptor_no_sig(self): @requires_bz2() @requires_zstd() def 
test_scan_data_descriptor_no_sig_by_decompression(self): - import zlib repacker = zipfile._ZipRepacker() for method in ( From 797a62cd8f831aac39199a4a207b2d1cf620eaf6 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 1 Jun 2025 16:29:38 +0800 Subject: [PATCH 50/64] Fix and optimize `repack` --- Lib/zipfile/__init__.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index e38cbcbe1dd0d9..c1aa477117e0d8 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1449,7 +1449,7 @@ def repack(self, zfile, removed=None): - Modifies the ZIP file in place. - Updates zfile.start_dir to account for removed data. - Sets zfile._didModify to True. - - Adjusts header_offset and _end_offset of referenced ZipInfo + - Updates header_offset and _end_offset of referenced ZipInfo instances. Parameters: @@ -1507,9 +1507,10 @@ def repack(self, zfile, removed=None): used_entry_size = used_entry_size_list[i] # update the header and move entry data to the new position + old_header_offset = zinfo.header_offset + zinfo.header_offset -= entry_offset + if zinfo in removed_zinfos: - old_header_offset = zinfo.header_offset - zinfo.header_offset -= entry_offset self._copy_bytes( fp, old_header_offset + used_entry_size, @@ -1517,16 +1518,10 @@ def repack(self, zfile, removed=None): entry_size - used_entry_size ) - if zinfo._end_offset is not None: - zinfo._end_offset = zinfo.header_offset - # update entry_offset for subsequent files to follow entry_offset += used_entry_size else: - old_header_offset = zinfo.header_offset - zinfo.header_offset -= entry_offset - if entry_offset > 0: self._copy_bytes(fp, old_header_offset, zinfo.header_offset, used_entry_size) @@ -1550,13 +1545,19 @@ def repack(self, zfile, removed=None): # update entry_offset for subsequent files to follow entry_offset += stale_entry_size - if zinfo._end_offset is not None: - zinfo._end_offset = zinfo.header_offset + entry_size - stale_entry_size - # update state zfile.start_dir -= entry_offset zfile._didModify = True + end_offset = zfile.start_dir + for zinfo in reversed(filelist): + if zinfo in removed_zinfos: + zinfo._end_offset = None + else: + if zinfo._end_offset is not None: + zinfo._end_offset = end_offset + end_offset = zinfo.header_offset + def _calc_initial_entry_offset(self, fp, data_offset): checked_offsets = {} if data_offset > 0: From 3b2f2328b6b8b38e9258347c67afd6f32139afd5 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 14 Jun 2025 10:32:05 +0800 Subject: [PATCH 51/64] Remove unneeded catch type - NotImplementedError is a subclass of RuntimeError. 
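
Since the exception hierarchy already nests NotImplementedError under
RuntimeError, the single `except RuntimeError` clause kept by this change
still covers the case where no decompressor is available for a method.
A minimal illustrative sketch (not taken from the patched code):

    # Catching RuntimeError also catches NotImplementedError,
    # because NotImplementedError is a RuntimeError subclass.
    assert issubclass(NotImplementedError, RuntimeError)

    try:
        raise NotImplementedError('decompressor for this method is unavailable')
    except RuntimeError as exc:
        print(type(exc).__name__)  # prints: NotImplementedError
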
--- Lib/zipfile/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index c1aa477117e0d8..0841641d1bba14 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1741,7 +1741,7 @@ def _scan_data_descriptor_no_sig(self, fp, offset, end_offset, zip64, chunk_size def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, zip64, method): try: decompressor = _get_decompressor(method) - except (NotImplementedError, RuntimeError): + except RuntimeError: return False if decompressor is None: From cb549c95f1d924e66af8525f614265ffa4666fcc Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 14 Jun 2025 11:14:14 +0800 Subject: [PATCH 52/64] Patch more explicitly --- Lib/test/test_zipfile/test_core.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index c3dc3a3525431c..0d8271bb6558cf 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2066,13 +2066,13 @@ def test_repack_data_descriptor_no_sig(self): # calculate the expected results test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: - with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: - with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) @@ -2105,7 +2105,7 @@ def test_repack_data_descriptor_no_sig_strict(self): with self.subTest(remove=ii): # calculate the expected results with open(TESTFN, 'wb') as fh: - with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: for i in ii: @@ -2115,7 +2115,7 @@ def test_repack_data_descriptor_no_sig_strict(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: - with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) @@ -2150,13 +2150,13 @@ def test_repack_data_descriptor_no_sig_strict_by_decompressoin(self): # calculate the expected results test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: - with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: 
- with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) @@ -2187,13 +2187,13 @@ def test_repack_data_descriptor_no_sig_and_zip64(self): # calculate the expected results test_files = [data for j, data in enumerate(self.test_files) if j not in ii] with open(TESTFN, 'wb') as fh: - with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) expected_size = os.path.getsize(TESTFN) # do the removal and check the result with open(TESTFN, 'wb') as fh: - with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig): + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, force_zip64=True) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) @@ -2345,7 +2345,7 @@ def test_repack_removed_bad_removed_zinfos(self): with self.assertRaises(zipfile.BadZipFile): zh.repack(zinfos) - @mock.patch('zipfile._ZipRepacker') + @mock.patch.object(zipfile, '_ZipRepacker') def test_repack_closed(self, m_repack): self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: @@ -2354,7 +2354,7 @@ def test_repack_closed(self, m_repack): zh.repack() m_repack.assert_not_called() - @mock.patch('zipfile._ZipRepacker') + @mock.patch.object(zipfile, '_ZipRepacker') def test_repack_writing(self, m_repack): self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: @@ -2363,7 +2363,7 @@ def test_repack_writing(self, m_repack): zh.repack() m_repack.assert_not_called() - @mock.patch('zipfile._ZipRepacker') + @mock.patch.object(zipfile, '_ZipRepacker') def test_repack_mode_r(self, m_repack): self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'r') as zh: @@ -2371,14 +2371,14 @@ def test_repack_mode_r(self, m_repack): zh.repack() m_repack.assert_not_called() - @mock.patch('zipfile._ZipRepacker') + @mock.patch.object(zipfile, '_ZipRepacker') def test_repack_mode_w(self, m_repack): with zipfile.ZipFile(TESTFN, 'w') as zh: with self.assertRaises(ValueError): zh.repack() m_repack.assert_not_called() - @mock.patch('zipfile._ZipRepacker') + @mock.patch.object(zipfile, '_ZipRepacker') def test_repack_mode_x(self, m_repack): with zipfile.ZipFile(TESTFN, 'x') as zh: with self.assertRaises(ValueError): From 0f50a6f561c8983c7505ca81a42d8840cf6faec4 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 14 Jun 2025 11:33:20 +0800 Subject: [PATCH 53/64] Remove unneeded variables --- Lib/test/test_zipfile/test_core.py | 34 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 0d8271bb6558cf..900f903cbe7cd5 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1444,13 +1444,13 @@ def test_remove_by_zinfo(self): self.assertIsNone(zh.testzip()) def test_remove_by_name_nonexist(self): - zinfos = 
self._prepare_zip_from_test_files(TESTFN, self.test_files) + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with self.assertRaises(KeyError): zh.remove('nonexist.txt') def test_remove_by_zinfo_nonexist(self): - zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: with self.assertRaises(KeyError): zh.remove(zipfile.ZipInfo('nonexist.txt')) @@ -1607,7 +1607,7 @@ def test_remove_closed(self): def test_remove_writing(self): self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: - with zh.open('newfile.txt', 'w') as fh: + with zh.open('newfile.txt', 'w'): with self.assertRaises(ValueError): zh.remove(self.test_files[0][0]) @@ -1704,7 +1704,7 @@ def test_repack_basic(self): expected_size = os.path.getsize(TESTFN) # do the removal and check the result - zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files) + self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: zh.remove(self.test_files[i][0]) @@ -1737,7 +1737,7 @@ def test_repack_bytes_before_first_file(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: fh.write(b'dummy ') - zinfos = self._prepare_zip_from_test_files(fh, self.test_files) + self._prepare_zip_from_test_files(fh, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: zh.remove(self.test_files[i][0]) @@ -1771,7 +1771,7 @@ def test_repack_magic_before_first_file(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: fh.write(b'PK\003\004 ') - zinfos = self._prepare_zip_from_test_files(fh, self.test_files) + self._prepare_zip_from_test_files(fh, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: zh.remove(self.test_files[i][0]) @@ -1817,7 +1817,7 @@ def test_repack_file_entry_before_first_file(self): zh.writestr('file2.txt', b'dummy') zh.writestr('file3.txt', b'dummy') fh.write(b' ') - zinfos = self._prepare_zip_from_test_files(fh, self.test_files) + self._prepare_zip_from_test_files(fh, self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: zh.remove(self.test_files[i][0]) @@ -1970,7 +1970,7 @@ def test_repack_zip64(self): expected_size = os.path.getsize(TESTFN) # do the removal and check the result - zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files, force_zip64=True) + self._prepare_zip_from_test_files(TESTFN, self.test_files, force_zip64=True) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for i in ii: zh.remove(self.test_files[i][0]) @@ -2001,7 +2001,7 @@ def test_repack_data_descriptor(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) + self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): @@ -2036,7 +2036,7 @@ def test_repack_data_descriptor_and_zip64(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, force_zip64=True) + self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, 
force_zip64=True) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): @@ -2073,7 +2073,7 @@ def test_repack_data_descriptor_no_sig(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) + self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): @@ -2116,7 +2116,7 @@ def test_repack_data_descriptor_no_sig_strict(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) + self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): @@ -2157,7 +2157,7 @@ def test_repack_data_descriptor_no_sig_strict_by_decompressoin(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) + self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): @@ -2194,7 +2194,7 @@ def test_repack_data_descriptor_no_sig_and_zip64(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - zinfos = self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, force_zip64=True) + self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, force_zip64=True) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: # make sure data descriptor bit is really set (by making zipfile unseekable) for zi in zh.infolist(): @@ -2270,7 +2270,7 @@ def test_repack_removed_partial(self): with self.subTest(removed=ii): # calculate the expected results test_files = [data for j, data in enumerate(self.test_files) if j not in ii] - expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files) + self._prepare_zip_from_test_files(TESTFN, test_files) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: for zi in zh.infolist().copy(): zh.remove(zi) @@ -2314,7 +2314,7 @@ def test_repack_removed_bytes_between_files(self): # do the removal and check the result with open(TESTFN, 'wb') as fh: with zipfile.ZipFile(fh, 'w', self.compression) as zh: - for i, (file, data) in enumerate(self.test_files): + for file, data in self.test_files: zh.writestr(file, data) fh.write(b' dummy bytes ') zh.start_dir = fh.tell() @@ -2358,7 +2358,7 @@ def test_repack_closed(self, m_repack): def test_repack_writing(self, m_repack): self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: - with zh.open('newfile.txt', 'w') as fh: + with zh.open('newfile.txt', 'w'): with self.assertRaises(ValueError): zh.repack() m_repack.assert_not_called() From c759b639ee8c0d2162965409820ce97b28dbc540 Mon Sep 17 
00:00:00 2001 From: Danny Lin Date: Sat, 14 Jun 2025 12:20:42 +0800 Subject: [PATCH 54/64] Improve dependency check for decompression tests --- Lib/test/test_zipfile/test_core.py | 301 +++++++++++++++-------------- 1 file changed, 157 insertions(+), 144 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 900f903cbe7cd5..74847a54a3eda9 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2584,180 +2584,193 @@ def test_scan_data_descriptor_no_sig(self): (0x4ff4f23f, 5, 5, 12), ) + def test_scan_data_descriptor_no_sig_by_decompression_stored(self): + self._test_scan_data_descriptor_no_sig_by_decompression_invalid(zipfile.ZIP_STORED) + @requires_zlib() + def test_scan_data_descriptor_no_sig_by_decompression_zlib(self): + self._test_scan_data_descriptor_no_sig_by_decompression(zipfile.ZIP_DEFLATED) + @requires_bz2() + def test_scan_data_descriptor_no_sig_by_decompression_bz2(self): + self._test_scan_data_descriptor_no_sig_by_decompression(zipfile.ZIP_BZIP2) + + @requires_lzma() + def test_scan_data_descriptor_no_sig_by_decompression_lzma(self): + self._test_scan_data_descriptor_no_sig_by_decompression_invalid(zipfile.ZIP_LZMA) + @requires_zstd() - def test_scan_data_descriptor_no_sig_by_decompression(self): + def test_scan_data_descriptor_no_sig_by_decompression_zstd(self): + self._test_scan_data_descriptor_no_sig_by_decompression(zipfile.ZIP_ZSTANDARD) + + def test_scan_data_descriptor_no_sig_by_decompression_unknown(self): + method = 1024 # simulate an unknown method + self._test_scan_data_descriptor_no_sig_by_decompression_invalid(method) + + def _test_scan_data_descriptor_no_sig_by_decompression(self, method): repacker = zipfile._ZipRepacker() - for method in ( - zipfile.ZIP_DEFLATED, - zipfile.ZIP_BZIP2, - zipfile.ZIP_ZSTANDARD, - ): - compressor = zipfile._get_compressor(method) - with self.subTest(method=method, compressor=compressor): - comp_bytes = compressor.compress(b'dummy') - comp_bytes += compressor.flush() - comp_len = len(comp_bytes) - - # basic - bytes_ = comp_bytes + b'\x3f\xf2\xf4\x4f' + struct.pack(' Date: Sat, 14 Jun 2025 13:53:40 +0800 Subject: [PATCH 55/64] Refactor and optimize `RepackHelperMixin` --- Lib/test/test_zipfile/test_core.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 74847a54a3eda9..5688e578057d29 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1379,9 +1379,18 @@ def struct_pack_no_dd_sig(fmt, *values): class RepackHelperMixin: """Common helpers for remove and repack.""" - def _prepare_zip_from_test_files(self, zfname, test_files, force_zip64=False): + @classmethod + def _prepare_test_files(cls): + return [ + ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), + ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), + ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), + ] + + @classmethod + def _prepare_zip_from_test_files(cls, zfname, test_files, force_zip64=False): zinfos = [] - with zipfile.ZipFile(zfname, 'w', self.compression) as zh: + with zipfile.ZipFile(zfname, 'w', cls.compression) as zh: for file, data in test_files: with zh.open(file, 'w', force_zip64=force_zip64) as fh: fh.write(data) @@ -1392,11 +1401,7 @@ def _prepare_zip_from_test_files(self, zfname, test_files, force_zip64=False): class 
AbstractRemoveTests(RepackHelperMixin): @classmethod def setUpClass(cls): - cls.test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] + cls.test_files = cls._prepare_test_files() def tearDown(self): unlink(TESTFN) @@ -1683,11 +1688,7 @@ class ZstdRemoveTests(AbstractRemoveTests, unittest.TestCase): class AbstractRepackTests(RepackHelperMixin): @classmethod def setUpClass(cls): - cls.test_files = [ - ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'), - ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'), - ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'), - ] + cls.test_files = cls._prepare_test_files() def tearDown(self): unlink(TESTFN) From ce886161f12a2f799517cfa25b85bb8e14767a75 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 20 Jun 2025 13:24:38 +0800 Subject: [PATCH 56/64] Update NEWS --- .../2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst index f9165d4d280bfe..c22a67f9054065 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst @@ -1 +1 @@ -Add ``ZipFile.remove()`` and ``ZipFile.repack()`` +Add :meth:`remove` and :meth:`repack` to :class:`ZipFile`. From 5f093e5175db32e66a8b2b32313960189230d84e Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 20 Jun 2025 15:07:21 +0800 Subject: [PATCH 57/64] Sync with danny0838/zipremove@1691ca25bf971cf1e45d5ed7d22c512636f20cb8 --- Doc/library/zipfile.rst | 33 +- Lib/test/test_zipfile/test_core.py | 1033 ++++++++++++++++++++-------- Lib/test/test_zipfile64.py | 34 + Lib/zipfile/__init__.py | 65 +- 4 files changed, 827 insertions(+), 338 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 878c92e9dcdecf..072d50267059e5 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -526,9 +526,8 @@ ZipFile Objects If multiple members share the same full path, only one is removed when a path is provided. - This does not physically remove the local file entry from the archive; - the ZIP file size remains unchanged. Call :meth:`ZipFile.repack` afterwards - to reclaim space. + This does not physically remove the local file entry from the archive. + Call :meth:`ZipFile.repack` afterwards to reclaim space. The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``. @@ -542,30 +541,32 @@ ZipFile Objects .. method:: ZipFile.repack(removed=None, *, \ strict_descriptor=False[, chunk_size]) - Rewrites the archive to remove stale local file entries, shrinking the ZIP - file size. + Rewrites the archive to remove stale local file entries, shrinking its file + size. If *removed* is provided, it must be a sequence of :class:`ZipInfo` objects representing removed entries; only their corresponding local file entries will be removed. - If *removed* is not provided, local file entries no longer referenced in the - central directory will be removed. 
The algorithm assumes that local file - entries are stored consecutively: + If *removed* is not provided, the archive is scanned to identify and remove + local file entries that are no longer referenced in the central directory. + The algorithm assumes that local file entries (and the central directory, + which is mostly treated as the "last entry") are stored consecutively: #. Data before the first referenced entry is removed only when it appears to be a sequence of consecutive entries with no extra following bytes; extra - preceeding bytes are preserved. + preceding bytes are preserved. #. Data between referenced entries is removed only when it appears to be a sequence of consecutive entries with no extra preceding bytes; extra following bytes are preserved. - - ``strict_descriptor=True`` can be provided to skip the slower scan for an - unsigned data descriptor (deprecated in the latest ZIP specification and is - only used by legacy tools) when checking for bytes resembling a valid local - file entry. This improves performance, but may cause some stale local file - entries to be preserved, as any entry using an unsigned descriptor cannot - be detected. + #. Entries must not overlap. If any entry's data overlaps with another, a + :exc:`BadZipFile` error is raised and no changes are made. + + When scanning, setting ``strict_descriptor=True`` disables detection of any + entry using an unsigned data descriptor (deprecated in the ZIP specification + since version 6.3.0, released on 2006-09-29, and used only by some legacy + tools). This improves performance, but may cause some stale entries to be + preserved. *chunk_size* may be specified to control the buffer size when moving entry data (default is 1 MiB). diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 5688e578057d29..deb207ed6dc3b4 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1724,6 +1724,25 @@ def test_repack_basic(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) + def test_repack_propagation(self): + """Should call internal API with adequate parameters.""" + self._prepare_zip_from_test_files(TESTFN, self.test_files) + + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + with mock.patch.object(zipfile._ZipRepacker, 'repack') as m_rp, \ + mock.patch.object(zipfile, '_ZipRepacker', wraps=zipfile._ZipRepacker) as m_zr: + zh.repack() + m_zr.assert_called_once_with() + m_rp.assert_called_once_with(zh, None) + + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zi = zh.remove(zh.infolist()[0]) + with mock.patch.object(zipfile._ZipRepacker, 'repack') as m_rp, \ + mock.patch.object(zipfile, '_ZipRepacker', wraps=zipfile._ZipRepacker) as m_zr: + zh.repack([zi], strict_descriptor=True, chunk_size=1024) + m_zr.assert_called_once_with(strict_descriptor=True, chunk_size=1024) + m_rp.assert_called_once_with(zh, [zi]) + def test_repack_bytes_before_first_file(self): """Should preserve random bytes before the first recorded local file entry.""" for ii in ([], [0], [0, 1], [0, 1, 2]): @@ -1961,246 +1980,29 @@ def test_repack_bytes_between_removed_files(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) - def test_repack_zip64(self): - """Should correctly handle file entries with zip64.""" - for ii in ([0], [0, 1], [1], [2]): - with self.subTest(remove=ii): - # calculate the expected results - test_files = [data for j, data in enumerate(self.test_files) if j not in ii] - expected_zinfos = 
self._prepare_zip_from_test_files(TESTFN, test_files, force_zip64=True) - expected_size = os.path.getsize(TESTFN) - - # do the removal and check the result - self._prepare_zip_from_test_files(TESTFN, self.test_files, force_zip64=True) - with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - for i in ii: - zh.remove(self.test_files[i][0]) - zh.repack() - - # check infolist - self.assertEqual( - [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, - ) - - # check file size - self.assertEqual(os.path.getsize(TESTFN), expected_size) - - # make sure the zip file is still valid - with zipfile.ZipFile(TESTFN) as zh: - self.assertIsNone(zh.testzip()) - - def test_repack_data_descriptor(self): - """Should correctly handle file entries using data descriptor.""" - for ii in ([0], [0, 1], [1], [2]): + def test_repack_prepended_bytes(self): + for ii in ([], [0], [0, 1], [1], [2]): with self.subTest(remove=ii): # calculate the expected results test_files = [data for j, data in enumerate(self.test_files) if j not in ii] + fz = io.BytesIO() + self._prepare_zip_from_test_files(fz, test_files) + fz.seek(0) with open(TESTFN, 'wb') as fh: - expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) - expected_size = os.path.getsize(TESTFN) - - # do the removal and check the result - with open(TESTFN, 'wb') as fh: - self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) - with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - # make sure data descriptor bit is really set (by making zipfile unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor not used: {zi.filename}') - - for i in ii: - zh.remove(self.test_files[i][0]) - zh.repack() - - # check infolist - self.assertEqual( - [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, - ) - - # check file size - self.assertEqual(os.path.getsize(TESTFN), expected_size) - - # make sure the zip file is still valid - with zipfile.ZipFile(TESTFN) as zh: - self.assertIsNone(zh.testzip()) - - def test_repack_data_descriptor_and_zip64(self): - """Should correctly handle file entries using data descriptor and zip64.""" - for ii in ([0], [0, 1], [1], [2]): - with self.subTest(remove=ii): - # calculate the expected results - test_files = [data for j, data in enumerate(self.test_files) if j not in ii] - with open(TESTFN, 'wb') as fh: - expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) - expected_size = os.path.getsize(TESTFN) - - # do the removal and check the result - with open(TESTFN, 'wb') as fh: - self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, force_zip64=True) - with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - # make sure data descriptor bit is really set (by making zipfile unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor not used: {zi.filename}') - - for i in ii: - zh.remove(self.test_files[i][0]) - zh.repack() - - # check infolist - self.assertEqual( - [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, - ) - - # check file size - self.assertEqual(os.path.getsize(TESTFN), expected_size) - - # make sure the zip file is still valid - with zipfile.ZipFile(TESTFN) as zh: - self.assertIsNone(zh.testzip()) - - def test_repack_data_descriptor_no_sig(self): - """Should correctly handle file entries using data descriptor without signature.""" - for ii in ([0], [0, 1], [1], [2]): - with self.subTest(remove=ii): - # calculate the 
expected results - test_files = [data for j, data in enumerate(self.test_files) if j not in ii] - with open(TESTFN, 'wb') as fh: - with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files) - expected_size = os.path.getsize(TESTFN) - - # do the removal and check the result - with open(TESTFN, 'wb') as fh: - with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) - with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - # make sure data descriptor bit is really set (by making zipfile unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') - - for i in ii: - zh.remove(self.test_files[i][0]) - zh.repack() - - # check infolist - self.assertEqual( - [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, - ) - - # check file size - self.assertEqual(os.path.getsize(TESTFN), expected_size) - - # make sure the zip file is still valid + fh.write(b'dummy ') + fh.write(fz.read()) with zipfile.ZipFile(TESTFN) as zh: - self.assertIsNone(zh.testzip()) - - def test_repack_data_descriptor_no_sig_strict(self): - """Should skip data descriptor without signature when `strict_descriptor` is set.""" - if self.compression not in (zipfile.ZIP_STORED, zipfile.ZIP_LZMA): - self.skipTest('require unsupported decompression method') - - for ii in ([0], [0, 1], [1], [2]): - with self.subTest(remove=ii): - # calculate the expected results - with open(TESTFN, 'wb') as fh: - with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) - with zipfile.ZipFile(TESTFN, 'a') as zh: - for i in ii: - zh.remove(self.test_files[i][0]) expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] expected_size = os.path.getsize(TESTFN) # do the removal and check the result + fz = io.BytesIO() + self._prepare_zip_from_test_files(fz, self.test_files) + fz.seek(0) with open(TESTFN, 'wb') as fh: - with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) - with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - # make sure data descriptor bit is really set (by making zipfile unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') - - for i in ii: - zh.remove(self.test_files[i][0]) - zh.repack(strict_descriptor=True) - - # check infolist - self.assertEqual( - [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, - ) - - # check file size - self.assertEqual(os.path.getsize(TESTFN), expected_size) - - # make sure the zip file is still valid - with zipfile.ZipFile(TESTFN) as zh: - self.assertIsNone(zh.testzip()) - - def test_repack_data_descriptor_no_sig_strict_by_decompressoin(self): - """Should correctly handle file entries using data descriptor without signature - through decompression.""" - if self.compression in (zipfile.ZIP_STORED, zipfile.ZIP_LZMA): - self.skipTest('require supported decompression method') - - for ii in ([0], [0, 1], [1], [2]): - with self.subTest(remove=ii): - # calculate the expected results - test_files = [data for j, data in enumerate(self.test_files) if j not in ii] - with open(TESTFN, 'wb') as fh: - with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - expected_zinfos 
= self._prepare_zip_from_test_files(Unseekable(fh), test_files) - expected_size = os.path.getsize(TESTFN) - - # do the removal and check the result - with open(TESTFN, 'wb') as fh: - with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - self._prepare_zip_from_test_files(Unseekable(fh), self.test_files) - with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - # make sure data descriptor bit is really set (by making zipfile unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') - - for i in ii: - zh.remove(self.test_files[i][0]) - zh.repack(strict_descriptor=True) - - # check infolist - self.assertEqual( - [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, - ) - - # check file size - self.assertEqual(os.path.getsize(TESTFN), expected_size) - - # make sure the zip file is still valid - with zipfile.ZipFile(TESTFN) as zh: - self.assertIsNone(zh.testzip()) - - def test_repack_data_descriptor_no_sig_and_zip64(self): - """Should correctly handle file entries using data descriptor without signature and zip64.""" - for ii in ([0], [0, 1], [1], [2]): - with self.subTest(remove=ii): - # calculate the expected results - test_files = [data for j, data in enumerate(self.test_files) if j not in ii] - with open(TESTFN, 'wb') as fh: - with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - expected_zinfos = self._prepare_zip_from_test_files(Unseekable(fh), test_files, force_zip64=True) - expected_size = os.path.getsize(TESTFN) - - # do the removal and check the result - with open(TESTFN, 'wb') as fh: - with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig): - self._prepare_zip_from_test_files(Unseekable(fh), self.test_files, force_zip64=True) + fh.write(b'dummy ') + fh.write(fz.read()) with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: - # make sure data descriptor bit is really set (by making zipfile unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') - for i in ii: zh.remove(self.test_files[i][0]) zh.repack() @@ -2222,15 +2024,15 @@ def test_repack_overlapping_blocks(self): for ii in ([0], [1], [2]): with self.subTest(remove=ii): self._prepare_zip_from_test_files(TESTFN, self.test_files) - with open(TESTFN, 'r+b') as fh: - with zipfile.ZipFile(fh, 'a') as zh: - zh.writestr('file.txt', b'dummy') - for i in ii: - zh.infolist()[i].file_size += 50 - zh.infolist()[i].compress_size += 50 + with zipfile.ZipFile(TESTFN, 'a') as zh: + zh._didModify = True + for i in ii: + zi = zh.infolist()[i] + zi.compress_size += 1 + zi.file_size += 1 with zipfile.ZipFile(TESTFN, 'a') as zh: - with self.assertRaises(zipfile.BadZipFile): + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): zh.repack() def test_repack_removed_basic(self): @@ -2336,16 +2138,79 @@ def test_repack_removed_bytes_between_files(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) - def test_repack_removed_bad_removed_zinfos(self): - """Should raise when providing non-removed zinfos.""" + def test_repack_removed_bad_header_offset(self): + """Should raise when provided ZipInfo objects has differing header offset.""" + for ii in ([0], [1], [2]): + with self.subTest(removed=ii): + self._prepare_zip_from_test_files(TESTFN, self.test_files) + with zipfile.ZipFile(TESTFN, 'a') as zh: + zinfos = [zh.remove(self.test_files[i][0]) for i in ii] + for zi in zinfos: + zi.header_offset += 1 + 
with self.assertRaisesRegex(zipfile.BadZipFile, 'Bad magic number for file header'): + zh.repack(zinfos) + + def test_repack_removed_bad_header_offset2(self): + """Should raise when provided ZipInfo objects has differing header offset.""" + for ii in ([1], [2]): + with self.subTest(removed=ii): + self._prepare_zip_from_test_files(TESTFN, self.test_files) + with zipfile.ZipFile(TESTFN, 'a') as zh: + zinfos = [zh.remove(self.test_files[i][0]) for i in ii] + for zi in zinfos: + zi.header_offset -= 1 + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zh.repack(zinfos) + + def test_repack_removed_bad_non_removed(self): + """Should raise when provided ZipInfo objects are not removed.""" for ii in ([0], [1], [2]): with self.subTest(removed=ii): self._prepare_zip_from_test_files(TESTFN, self.test_files) with zipfile.ZipFile(TESTFN, 'a') as zh: zinfos = [zh.getinfo(self.test_files[i][0]) for i in ii] - with self.assertRaises(zipfile.BadZipFile): + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): zh.repack(zinfos) + def test_repack_removed_prepended_bytes(self): + for ii in ([], [0], [0, 1], [1], [2]): + with self.subTest(remove=ii): + # calculate the expected results + test_files = [data for j, data in enumerate(self.test_files) if j not in ii] + fz = io.BytesIO() + self._prepare_zip_from_test_files(fz, test_files) + fz.seek(0) + with open(TESTFN, 'wb') as fh: + fh.write(b'dummy ') + fh.write(fz.read()) + with zipfile.ZipFile(TESTFN) as zh: + expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + expected_size = os.path.getsize(TESTFN) + + # do the removal and check the result + fz = io.BytesIO() + self._prepare_zip_from_test_files(fz, self.test_files) + fz.seek(0) + with open(TESTFN, 'wb') as fh: + fh.write(b'dummy ') + fh.write(fz.read()) + with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh: + zinfos = [zh.remove(self.test_files[i][0]) for i in ii] + zh.repack(zinfos) + + # check infolist + self.assertEqual( + [ComparableZipInfo(zi) for zi in zh.infolist()], + expected_zinfos, + ) + + # check file size + self.assertEqual(os.path.getsize(TESTFN), expected_size) + + # make sure the zip file is still valid + with zipfile.ZipFile(TESTFN) as zh: + self.assertIsNone(zh.testzip()) + @mock.patch.object(zipfile, '_ZipRepacker') def test_repack_closed(self, m_repack): self._prepare_zip_from_test_files(TESTFN, self.test_files) @@ -2405,7 +2270,537 @@ class LzmaRepackTests(AbstractRepackTests, unittest.TestCase): class ZstdRepackTests(AbstractRepackTests, unittest.TestCase): compression = zipfile.ZIP_ZSTANDARD +class OtherRepackTests(unittest.TestCase): + def test_full_overlap_different_names(self): + # see `test_full_overlap_different_names` in built-in test.test_zipfile + data = ( + b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e' + b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00b\xed' + b'\xc0\x81\x08\x00\x00\x00\xc00\xd6\xfbK\\d\x0b`P' + b'K\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2' + b'\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00aPK' + b'\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e' + b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00bPK\x05' + b'\x06\x00\x00\x00\x00\x02\x00\x02\x00^\x00\x00\x00/\x00\x00' + b'\x00\x00\x00' + ) + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + 
zh.repack() + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + zi = zh.remove('a') + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zh.repack([zi]) + + # local entry of 'a' should not be stripped (not found) + fz = io.BytesIO(data) + with zipfile.ZipFile(fz, 'a') as zh: + zh.remove('a') + zh.repack() + + expected = ( + b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e' + b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00b\xed' + b'\xc0\x81\x08\x00\x00\x00\xc00\xd6\xfbK\\d\x0b`P' + b'K\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2' + b'\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00b' + b'PK\x05\x06\x00\x00\x00\x00\x01\x00\x01\x00/\x00\x00\x00/\x00' + b'\x00\x00\x00\x00' + ) + fz.seek(0) + self.assertEqual(fz.read(), expected) + + def test_quoted_overlap(self): + # see `test_quoted_overlap` in built-in test.test_zipfile + data = ( + b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05Y\xfc' + b'8\x044\x00\x00\x00(\x04\x00\x00\x01\x00\x00\x00a\x00' + b'\x1f\x00\xe0\xffPK\x03\x04\x14\x00\x00\x00\x08\x00\xa0l' + b'H\x05\xe2\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00' + b'\x00\x00b\xed\xc0\x81\x08\x00\x00\x00\xc00\xd6\xfbK\\' + b'd\x0b`PK\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0' + b'lH\x05Y\xfc8\x044\x00\x00\x00(\x04\x00\x00\x01' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' + b'\x00aPK\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0l' + b'H\x05\xe2\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$\x00\x00\x00' + b'bPK\x05\x06\x00\x00\x00\x00\x02\x00\x02\x00^\x00\x00' + b'\x00S\x00\x00\x00\x00\x00' + ) + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zh.repack() + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + zi = zh.remove('a') + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zh.repack([zi]) + + # local entry of 'a' should not be stripped (no valid entry) + fz = io.BytesIO(data) + with zipfile.ZipFile(fz, 'a') as zh: + zh.remove('a') + zh.repack() + + expected = ( + b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05Y\xfc' + b'8\x044\x00\x00\x00(\x04\x00\x00\x01\x00\x00\x00a\x00' + b'\x1f\x00\xe0\xffPK\x03\x04\x14\x00\x00\x00\x08\x00\xa0l' + b'H\x05\xe2\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00' + b'\x00\x00b\xed\xc0\x81\x08\x00\x00\x00\xc00\xd6\xfbK\\' + b'd\x0b`PK\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0l' + b'H\x05\xe2\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$\x00\x00\x00' + b'bPK\x05\x06\x00\x00\x00\x00\x01\x00\x01\x00/\x00\x00' + b'\x00S\x00\x00\x00\x00\x00' + ) + fz.seek(0) + self.assertEqual(fz.read(), expected) + + def test_partial_overlap_at_dd(self): + # file 'a' has an unsigned data descriptor (whose information isn't + # consistent with in central directory) that starts at the starting + # position of file 'b' + data = ( + b'PK\x03\x04\x14\x00\x08\x00\x00\x00\x00\x00!\x00\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00acontent' + b'PK\x03\x04\x14\x00\x00\x00\x00\x00\x00\x00!\x00\xa90\xc5\xfe' + b'\x07\x00\x00\x00\x07\x00\x00\x00\x01\x00\x00\x00bcontent' + b'PK\x01\x02\x14\x00\x14\x00\x08\x00\x00\x00\x00\x00!\x00' + b'\xa90\xc5\xfe\x07\x00\x00\x00\x07\x00\x00\x00\x01\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x80\x01\x00\x00\x00\x00a' + 
b'PK\x01\x02\x14\x00\x14\x00\x00\x00\x00\x00\x00\x00!\x00' + b'\xa90\xc5\xfe\x07\x00\x00\x00\x07\x00\x00\x00\x01\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x80\x01&\x00\x00\x00b' + b'PK\x05\x06\x00\x00\x00\x00\x02\x00\x02\x00^\x00\x00\x00L\x00' + b'\x00\x00\x00\x00' + ) + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + zi = zh.getinfo('a') + self.assertEqual(zi.header_offset, 0) + self.assertEqual(zi.compress_size, 7) + self.assertEqual(zi.file_size, 7) + self.assertEqual(zi.flag_bits, 8) + zi = zh.getinfo('b') + self.assertEqual(zi.header_offset, 38) + self.assertEqual(zi.compress_size, 7) + self.assertEqual(zi.file_size, 7) + self.assertEqual(zi.flag_bits, 0) + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zh.repack() + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + zi = zh.remove('a') + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zh.repack([zi]) + + # local entry of 'a' should not be stripped (no valid entry) + fz = io.BytesIO(data) + with zipfile.ZipFile(fz, 'a') as zh: + zh.remove('a') + zh.repack() + + expected = ( + b'PK\x03\x04\x14\x00\x08\x00\x00\x00\x00\x00!\x00\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00acontent' + b'PK\x03\x04\x14\x00\x00\x00\x00\x00\x00\x00!\x00\xa90\xc5\xfe' + b'\x07\x00\x00\x00\x07\x00\x00\x00\x01\x00\x00\x00bcontent' + b'PK\x01\x02\x14\x00\x14\x00\x00\x00\x00\x00\x00\x00!\x00' + b'\xa90\xc5\xfe\x07\x00\x00\x00\x07\x00\x00\x00\x01\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\x80\x01&\x00\x00\x00b' + b'PK\x05\x06\x00\x00\x00\x00\x01\x00\x01\x00/\x00\x00\x00L\x00' + b'\x00\x00\x00\x00' + ) + fz.seek(0) + self.assertEqual(fz.read(), expected) + + def test_overlap_with_central_dir(self): + # see `test_overlap_with_central_dir` in built-in test.test_zipfile + data = ( + b'PK\x01\x02\x14\x03\x14\x00\x00\x00\x08\x00G_|Z' + b'\xe2\x1e8\xbb\x0b\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\xb4\x81\x00\x00\x00\x00aP' + b'K\x05\x06\x00\x00\x00\x00\x01\x00\x01\x00/\x00\x00\x00\x00' + b'\x00\x00\x00\x00\x00' + ) + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + with self.assertRaisesRegex(zipfile.BadZipFile, 'Bad magic number for file header'): + zh.repack() + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + zi = zh.remove('a') + with self.assertRaisesRegex(zipfile.BadZipFile, 'Bad magic number for file header'): + zh.repack([zi]) + + # local entry of 'a' should not be stripped (not found) + fz = io.BytesIO(data) + with zipfile.ZipFile(fz, 'a') as zh: + zh.remove('a') + zh.repack() + + expected = ( + b'PK\x05\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' + b'\x00\x00\x00\x00' + ) + fz.seek(0) + self.assertEqual(fz.read(), expected) + + def test_overlap_with_archive_comment(self): + # see `test_overlap_with_archive_comment` in built-in test.test_zipfile + data = ( + b'PK\x01\x02\x14\x03\x14\x00\x00\x00\x08\x00G_|Z' + b'\xe2\x1e8\xbb\x0b\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00' + b'\x00\x00\x00\x00\x00\x00\x00\x00\xb4\x81E\x00\x00\x00aP' + b'K\x05\x06\x00\x00\x00\x00\x01\x00\x01\x00/\x00\x00\x00\x00' + b'\x00\x00\x00*\x00' + b'PK\x03\x04\x14\x00\x00\x00\x08\x00G_|Z\xe2\x1e' + b'8\xbb\x0b\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00aK' + b'L\x1c\x05\xa3`\x14\x8cx\x00\x00' + ) + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zh.repack() + + with zipfile.ZipFile(io.BytesIO(data), 'a') as zh: + zi = zh.remove('a') + with 
self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'): + zh.repack([zi]) + + # local entry of 'a' should not be stripped (not found) + fz = io.BytesIO(data) + with zipfile.ZipFile(fz, 'a') as zh: + zh.remove('a') + zh.repack() + + expected = ( + b'PK\x05\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' + b'\x00\x00\x00*\x00' + b'PK\x03\x04\x14\x00\x00\x00\x08\x00G_|Z\xe2\x1e' + b'8\xbb\x0b\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00aK' + b'L\x1c\x05\xa3`\x14\x8cx\x00\x00' + ) + fz.seek(0) + self.assertEqual(fz.read(), expected) + class ZipRepackerTests(unittest.TestCase): + def _generate_local_file_entry(self, arcname, raw_bytes, + compression=zipfile.ZIP_STORED, + force_zip64=False, dd=False, dd_sig=True): + fz = io.BytesIO() + f = Unseekable(fz) if dd else fz + cm = (mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig) + if not dd_sig else contextlib.nullcontext()) + with zipfile.ZipFile(f, 'w', compression=compression) as zh: + with cm: + with zh.open(arcname, 'w', force_zip64=force_zip64) as fh: + fh.write(raw_bytes) + fz.seek(0) + return fz.read() + + def test_validate_local_file_entry_stored(self): + self._test_validate_local_file_entry(method=zipfile.ZIP_STORED) + + @requires_zlib() + def test_validate_local_file_entry_zlib(self): + self._test_validate_local_file_entry(method=zipfile.ZIP_DEFLATED) + + @requires_bz2() + def test_validate_local_file_entry_bz2(self): + self._test_validate_local_file_entry(method=zipfile.ZIP_BZIP2) + + @requires_lzma() + def test_validate_local_file_entry_lzma(self): + self._test_validate_local_file_entry(method=zipfile.ZIP_LZMA) + + @requires_zstd() + def test_validate_local_file_entry_zstd(self): + self._test_validate_local_file_entry(method=zipfile.ZIP_ZSTANDARD) + + def _test_validate_local_file_entry(self, method): + repacker = zipfile._ZipRepacker() + + # basic + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, len(bytes_)) + m_sdd.assert_not_called() + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + # offset + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_) + 1) + self.assertEqual(result, len(bytes_)) + m_sdd.assert_not_called() + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + bytes_ = b'pre' + bytes_ + b'post' + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, 
'_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 3, len(bytes_) - 4) + self.assertEqual(result, len(bytes_) - 7) + m_sdd.assert_not_called() + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 3, len(bytes_)) + self.assertEqual(result, len(bytes_) - 7) + m_sdd.assert_not_called() + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + # return None if no match at given offset + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 2, len(bytes_) - 4) + self.assertEqual(result, None) + m_sdd.assert_not_called() + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 4, len(bytes_) - 4) + self.assertEqual(result, None) + m_sdd.assert_not_called() + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + # return None if no sufficient header length + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method) + bytes_ = bytes_[:29] + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, None) + m_sdd.assert_not_called() + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + # data descriptor + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method, dd=True) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, len(bytes_)) + 
m_sdd.assert_called_once_with(fz, 38, len(bytes_), False) + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + # data descriptor (unsigned) + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method, dd=True, dd_sig=False) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, len(bytes_)) + m_sdd.assert_called_once_with(fz, 38, len(bytes_), False) + m_sddnsbd.assert_called_once_with(fz, 38, len(bytes_), False, method) + if repacker._scan_data_descriptor_no_sig_by_decompression(fz, 38, len(bytes_), False, method): + m_sddns.assert_not_called() + else: + m_sddns.assert_called_once_with(fz, 38, len(bytes_), False) + + # return None for data descriptor (unsigned) if `strict_descriptor=True` + repacker = zipfile._ZipRepacker(strict_descriptor=True) + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method, dd=True, dd_sig=False) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, None) + m_sdd.assert_called_once_with(fz, 38, len(bytes_), False) + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + def test_validate_local_file_entry_zip64_stored(self): + self._test_validate_local_file_entry_zip64(method=zipfile.ZIP_STORED) + + @requires_zlib() + def test_validate_local_file_entry_zip64_zlib(self): + self._test_validate_local_file_entry_zip64(method=zipfile.ZIP_DEFLATED) + + @requires_bz2() + def test_validate_local_file_entry_zip64_bz2(self): + self._test_validate_local_file_entry_zip64(method=zipfile.ZIP_BZIP2) + + @requires_lzma() + def test_validate_local_file_entry_zip64_lzma(self): + self._test_validate_local_file_entry_zip64(method=zipfile.ZIP_LZMA) + + @requires_zstd() + def test_validate_local_file_entry_zip64_zstd(self): + self._test_validate_local_file_entry_zip64(method=zipfile.ZIP_ZSTANDARD) + + def _test_validate_local_file_entry_zip64(self, method): + repacker = zipfile._ZipRepacker() + + # zip64 + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method, force_zip64=True) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, len(bytes_)) + m_sdd.assert_not_called() + m_sddnsbd.assert_not_called() + 
m_sddns.assert_not_called() + + # data descriptor + zip64 + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method, force_zip64=True, dd=True) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, len(bytes_)) + m_sdd.assert_called_once_with(fz, 58, len(bytes_), True) + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + # data descriptor (unsigned) + zip64 + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method, force_zip64=True, dd=True, dd_sig=False) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, len(bytes_)) + m_sdd.assert_called_once_with(fz, 58, len(bytes_), True) + m_sddnsbd.assert_called_once_with(fz, 58, len(bytes_), True, method) + if repacker._scan_data_descriptor_no_sig_by_decompression(fz, 58, len(bytes_), True, method): + m_sddns.assert_not_called() + else: + m_sddns.assert_called_once_with(fz, 58, len(bytes_), True) + + # return None for data descriptor (unsigned) if `strict_descriptor=True` + repacker = zipfile._ZipRepacker(strict_descriptor=True) + bytes_ = self._generate_local_file_entry( + 'file.txt', b'dummy', compression=method, force_zip64=True, dd=True, dd_sig=False) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, None) + m_sdd.assert_called_once_with(fz, 58, len(bytes_), True) + m_sddnsbd.assert_not_called() + m_sddns.assert_not_called() + + def test_validate_local_file_entry_encrypted(self): + repacker = zipfile._ZipRepacker() + + bytes_ = ( + b'PK\x03\x04' + b'\x14\x00' + b'\x09\x00' + b'\x08\x00' + b'\xAB\x28' + b'\xD2\x5A' + b'\x00\x00\x00\x00' + b'\x00\x00\x00\x00' + b'\x00\x00\x00\x00' + b'\x08\x00' + b'\x00\x00' + b'file.txt' + b'\x97\xF1\x83\x34\x9D\xC4\x8C\xD3\xED\x79\x8C\xA2\xBB\x49\xFF\x1B\x89' + b'\x3F\xF2\xF4\x4F' + b'\x11\x00\x00\x00' + b'\x05\x00\x00\x00' + ) + fz = io.BytesIO(bytes_) + with mock.patch.object(repacker, '_scan_data_descriptor', + wraps=repacker._scan_data_descriptor) as m_sdd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression', + wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \ + mock.patch.object(repacker, '_scan_data_descriptor_no_sig', + 
wraps=repacker._scan_data_descriptor_no_sig) as m_sddns: + result = repacker._validate_local_file_entry(fz, 0, len(bytes_)) + self.assertEqual(result, len(bytes_)) + m_sdd.assert_called_once_with(fz, 38, len(bytes_), False) + m_sddnsbd.assert_not_called() + m_sddns.assert_called_once_with(fz, 38, len(bytes_), False) + def test_iter_scan_signature(self): bytes_ = b'sig__sig__sig__sig' ln = len(bytes_) @@ -2455,134 +2850,176 @@ def test_iter_scan_signature(self): def test_scan_data_descriptor(self): repacker = zipfile._ZipRepacker() + sig = zipfile._DD_SIGNATURE + raw_bytes = comp_bytes = b'dummy' + raw_len = comp_len = len(raw_bytes) + raw_crc = zipfile.crc32(raw_bytes) + # basic - bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<4L', sig, raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), - (0x4ff4f23f, 5, 5, 16), + (raw_crc, comp_len, raw_len, 16), ) # return None if no signature - bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<3L', raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), None, ) - # return None if not unpackable - bytes_ = b'PK\x07\x08' + # return None if compressed size not match + bytes_ = comp_bytes + struct.pack('<4L', sig, raw_crc, comp_len + 1, raw_len) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), None, ) - # return None if compressed size not match - bytes_ = b'dummPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<4L', sig, raw_crc, comp_len - 1, raw_len) + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), + None, + ) + + bytes_ = b'1' + comp_bytes + struct.pack('<4L', sig, raw_crc, comp_len, raw_len) + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), + None, + ) + + bytes_ = comp_bytes[1:] + struct.pack('<4L', sig, raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), None, ) # zip64 - bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<2L2Q', sig, raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), True), - (0x4ff4f23f, 5, 5, 24), + (raw_crc, comp_len, raw_len, 24), ) # offset - bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<4L', sig, raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 1, len(bytes_), False), None, ) - bytes_ = b'123dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = b'123' + comp_bytes + struct.pack('<4L', sig, raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), None, ) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 3, len(bytes_), False), - (0x4ff4f23f, 5, 5, 16), + (raw_crc, comp_len, raw_len, 16), ) # end_offset - bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<4L', sig, raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_) - 1, False), None, ) 
- bytes_ = b'dummyPK\x07\x08\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00123' + bytes_ = comp_bytes + struct.pack('<4L', sig, raw_crc, comp_len, raw_len) + b'123' self.assertEqual( repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_) - 3, False), - (0x4ff4f23f, 5, 5, 16), + (raw_crc, comp_len, raw_len, 16), + ) + self.assertEqual( + repacker._scan_data_descriptor(io.BytesIO(bytes_), 0, len(bytes_), False), + (raw_crc, comp_len, raw_len, 16), ) def test_scan_data_descriptor_no_sig(self): repacker = zipfile._ZipRepacker() + raw_bytes = comp_bytes = b'dummy' + raw_len = comp_len = len(raw_bytes) + raw_crc = zipfile.crc32(raw_bytes) + # basic - bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<3L', raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), - (0x4ff4f23f, 5, 5, 12), + (raw_crc, comp_len, raw_len, 12), ) # return None if compressed size not match - bytes_ = b'dumm\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<3L', raw_crc, comp_len + 1, raw_len) self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), None, ) - # zip64 - bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00' + bytes_ = comp_bytes + struct.pack('<3L', raw_crc, comp_len - 1, raw_len) self.assertEqual( - repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), True), - (0x4ff4f23f, 5, 5, 20), + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), + None, ) - # offset - bytes_ = b'dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = b'1' + comp_bytes + struct.pack('<3L', raw_crc, comp_len, raw_len) self.assertEqual( - repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 1, len(bytes_), False), + repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), None, ) - bytes_ = b'123dummy\x3f\xf2\xf4\x4f\x05\x00\x00\x00\x05\x00\x00\x00' + bytes_ = comp_bytes[1:] + struct.pack('<3L', raw_crc, comp_len, raw_len) self.assertEqual( repacker._scan_data_descriptor_no_sig(io.BytesIO(bytes_), 0, len(bytes_), False), None, ) + + # zip64 + bytes_ = comp_bytes + struct.pack(' entry_size: raise BadZipFile( f"Overlapped entries: {zinfo.orig_filename!r} ") @@ -1515,7 +1523,7 @@ def repack(self, zfile, removed=None): fp, old_header_offset + used_entry_size, zinfo.header_offset, - entry_size - used_entry_size + entry_size - used_entry_size, ) # update entry_offset for subsequent files to follow @@ -1523,16 +1531,18 @@ def repack(self, zfile, removed=None): else: if entry_offset > 0: - self._copy_bytes(fp, old_header_offset, zinfo.header_offset, used_entry_size) - - if used_entry_size < entry_size: - stale_entry_size = self._validate_local_file_entry_sequence( + self._copy_bytes( fp, - old_header_offset + used_entry_size, - old_header_offset + entry_size, + old_header_offset, + zinfo.header_offset, + used_entry_size, ) - else: - stale_entry_size = 0 + + stale_entry_size = self._validate_local_file_entry_sequence( + fp, + old_header_offset + used_entry_size, + old_header_offset + entry_size, + ) if stale_entry_size > 0: self._copy_bytes( @@ -1564,7 +1574,8 @@ def _calc_initial_entry_offset(self, fp, data_offset): self._debug(3, 'scanning file signatures before:', data_offset) for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset): self._debug(3, 'checking file signature 
at:', pos) - entry_size = self._validate_local_file_entry_sequence(fp, pos, data_offset, checked_offsets) + entry_size = self._validate_local_file_entry_sequence( + fp, pos, data_offset, checked_offsets) if entry_size == data_offset - pos: return entry_size return 0 @@ -1639,8 +1650,9 @@ def _validate_local_file_entry(self, fp, offset, end_offset): if pos > end_offset: return None + # parse zip64 try: - zinfo._decodeExtra(crc32(filename)) # parse zip64 + zinfo._decodeExtra(crc32(filename)) except BadZipFile: return None @@ -1657,14 +1669,14 @@ def _validate_local_file_entry(self, fp, offset, end_offset): zip64 = fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff dd = self._scan_data_descriptor(fp, pos, end_offset, zip64) - if dd is None: - dd = self._scan_data_descriptor_no_sig_by_decompression( - fp, pos, end_offset, zip64, fheader[_FH_COMPRESSION_METHOD]) + if dd is None and not self.strict_descriptor: + if zinfo.flag_bits & _MASK_ENCRYPTED: + dd = False + else: + dd = self._scan_data_descriptor_no_sig_by_decompression( + fp, pos, end_offset, zip64, fheader[_FH_COMPRESSION_METHOD]) if dd is False: - if not self.strict_descriptor: - dd = self._scan_data_descriptor_no_sig(fp, pos, end_offset, zip64) - else: - dd = None + dd = self._scan_data_descriptor_no_sig(fp, pos, end_offset, zip64) if dd is None: return None @@ -1747,14 +1759,10 @@ def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, if decompressor is None: return False - # Current LZMADecompressor is unreliable since it's `.eof` is usually - # not set as expected. - if isinstance(decompressor, LZMADecompressor): - return False - dd_fmt = ' Date: Fri, 20 Jun 2025 15:54:38 +0800 Subject: [PATCH 58/64] Revise NEWS --- .../2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst index c22a67f9054065..204213c74de5df 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-24-11-17-34.gh-issue-51067.yHOgfy.rst @@ -1 +1 @@ -Add :meth:`remove` and :meth:`repack` to :class:`ZipFile`. +Add :meth:`~zipfile.ZipFile.remove` and :meth:`~zipfile.ZipFile.repack` to :class:`~zipfile.ZipFile`. From 4b2176e89076ca2b25d1deef02d49cbd0e31be6b Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sat, 21 Jun 2025 23:07:50 +0800 Subject: [PATCH 59/64] Sync with danny0838/zipremove@1843d87b70e6cb129fb55446eaf4486a87d2af4d --- Doc/library/zipfile.rst | 2 +- Lib/test/test_zipfile/test_core.py | 83 ++++++++++++++++-------------- Lib/zipfile/__init__.py | 25 ++++----- 3 files changed, 54 insertions(+), 56 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 072d50267059e5..2c6e3324a98c12 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -527,7 +527,7 @@ ZipFile Objects a path is provided. This does not physically remove the local file entry from the archive. - Call :meth:`ZipFile.repack` afterwards to reclaim space. + Call :meth:`repack` afterwards to reclaim space. The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``. 
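For context, a minimal usage sketch of the workflow the documentation above describes; the archive and member names are only illustrative and the file is assumed to already exist:

    import zipfile

    with zipfile.ZipFile('archive.zip', 'a') as zh:
        zh.remove('stale.txt')   # drops the central directory entry only
        zh.repack()              # rewrites the archive to reclaim the space
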
diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index deb207ed6dc3b4..a11571cec4c986 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1362,8 +1362,11 @@ class ZstdWriterTests(AbstractWriterTests, unittest.TestCase): compression = zipfile.ZIP_ZSTANDARD -def ComparableZipInfo(zinfo): - return (zinfo.filename, zinfo.header_offset, zinfo.compress_size, zinfo.CRC) +class ComparableZipInfo: + keys = [i for i in zipfile.ZipInfo.__slots__ if not i.startswith('_')] + + def __new__(cls, zinfo): + return {i: getattr(zinfo, i) for i in cls.keys} _struct_pack = struct.pack @@ -1379,6 +1382,8 @@ def struct_pack_no_dd_sig(fmt, *values): class RepackHelperMixin: """Common helpers for remove and repack.""" + maxDiff = 8192 + @classmethod def _prepare_test_files(cls): return [ @@ -1389,14 +1394,11 @@ def _prepare_test_files(cls): @classmethod def _prepare_zip_from_test_files(cls, zfname, test_files, force_zip64=False): - zinfos = [] with zipfile.ZipFile(zfname, 'w', cls.compression) as zh: for file, data in test_files: with zh.open(file, 'w', force_zip64=force_zip64) as fh: fh.write(data) - zinfo = zh.getinfo(file) - zinfos.append(ComparableZipInfo(zinfo)) - return zinfos + return list(zh.infolist()) class AbstractRemoveTests(RepackHelperMixin): @classmethod @@ -1416,7 +1418,7 @@ def test_remove_by_name(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zi for j, zi in enumerate(zinfos) if j != i], + [ComparableZipInfo(zi) for j, zi in enumerate(zinfos) if j != i], ) # check NameToInfo cache @@ -1437,7 +1439,7 @@ def test_remove_by_zinfo(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zi for j, zi in enumerate(zinfos) if j != i], + [ComparableZipInfo(zi) for j, zi in enumerate(zinfos) if j != i], ) # check NameToInfo cache @@ -1478,13 +1480,13 @@ def test_remove_by_name_duplicated(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zinfos[0], zinfos[2]], + [ComparableZipInfo(zi) for zi in [zinfos[0], zinfos[2]]], ) # check NameToInfo cache self.assertEqual( ComparableZipInfo(zh.getinfo('file.txt')), - zinfos[0], + ComparableZipInfo(zinfos[0]), ) # make sure the zip file is still valid @@ -1499,7 +1501,7 @@ def test_remove_by_name_duplicated(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zinfos[2]], + [ComparableZipInfo(zi) for zi in [zinfos[2]]], ) # check NameToInfo cache @@ -1528,13 +1530,13 @@ def test_remove_by_zinfo_duplicated(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zinfos[1], zinfos[2]], + [ComparableZipInfo(zi) for zi in [zinfos[1], zinfos[2]]], ) # check NameToInfo cache self.assertEqual( ComparableZipInfo(zh.getinfo('file.txt')), - zinfos[1], + ComparableZipInfo(zinfos[1]), ) # make sure the zip file is still valid @@ -1548,13 +1550,13 @@ def test_remove_by_zinfo_duplicated(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zinfos[0], zinfos[2]], + [ComparableZipInfo(zi) for zi in [zinfos[0], zinfos[2]]], ) # check NameToInfo cache self.assertEqual( ComparableZipInfo(zh.getinfo('file.txt')), - zinfos[0], + ComparableZipInfo(zinfos[0]), ) # make sure the zip file is still valid @@ -1570,7 +1572,7 @@ def test_remove_by_zinfo_duplicated(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zinfos[2]], + 
[ComparableZipInfo(zi) for zi in [zinfos[2]]], ) # check NameToInfo cache @@ -1591,7 +1593,7 @@ def test_remove_zip64(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zi for j, zi in enumerate(zinfos) if j != i], + [ComparableZipInfo(zi) for j, zi in enumerate(zinfos) if j != i], ) # check NameToInfo cache @@ -1626,14 +1628,14 @@ def test_remove_mode_w(self): with zipfile.ZipFile(TESTFN, 'w') as zh: for file, data in self.test_files: zh.writestr(file, data) - zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + zinfos = list(zh.infolist()) zh.remove(self.test_files[0][0]) # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zinfos[1], zinfos[2]], + [ComparableZipInfo(zi) for zi in [zinfos[1], zinfos[2]]], ) # check NameToInfo cache @@ -1648,14 +1650,14 @@ def test_remove_mode_x(self): with zipfile.ZipFile(TESTFN, 'x') as zh: for file, data in self.test_files: zh.writestr(file, data) - zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + zinfos = list(zh.infolist()) zh.remove(self.test_files[0][0]) # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - [zinfos[1], zinfos[2]], + [ComparableZipInfo(zi) for zi in [zinfos[1], zinfos[2]]], ) # check NameToInfo cache @@ -1714,7 +1716,7 @@ def test_repack_basic(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -1766,7 +1768,7 @@ def test_repack_bytes_before_first_file(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -1800,7 +1802,7 @@ def test_repack_magic_before_first_file(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -1846,7 +1848,7 @@ def test_repack_file_entry_before_first_file(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -1856,6 +1858,7 @@ def test_repack_file_entry_before_first_file(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) + @mock.patch.object(time, 'time', new=lambda: 315504000) # fix time for ZipFile.writestr() def test_repack_bytes_before_removed_files(self): """Should preserve if there are bytes before stale local file entries.""" for ii in ([1], [1, 2], [2]): @@ -1870,7 +1873,7 @@ def test_repack_bytes_before_removed_files(self): zh.writestr(file, data) for i in ii: zh.remove(self.test_files[i][0]) - expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + expected_zinfos = list(zh.infolist()) expected_size = os.path.getsize(TESTFN) # do the removal and check the result @@ -1889,7 +1892,7 @@ def test_repack_bytes_before_removed_files(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -1899,6 +1902,7 @@ def test_repack_bytes_before_removed_files(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) + @mock.patch.object(time, 'time', new=lambda: 315504000) # fix time for ZipFile.writestr() def test_repack_bytes_after_removed_files(self): """Should keep extra bytes if there are 
bytes after stale local file entries.""" for ii in ([1], [1, 2], [2]): @@ -1912,7 +1916,7 @@ def test_repack_bytes_after_removed_files(self): if i == ii[-1]: fh.write(b' dummy bytes ') zh.start_dir = fh.tell() - expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + expected_zinfos = list(zh.infolist()) expected_size = os.path.getsize(TESTFN) # do the removal and check the result @@ -1931,7 +1935,7 @@ def test_repack_bytes_after_removed_files(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -1941,6 +1945,7 @@ def test_repack_bytes_after_removed_files(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) + @mock.patch.object(time, 'time', new=lambda: 315504000) # fix time for ZipFile.writestr() def test_repack_bytes_between_removed_files(self): """Should strip only local file entries before random bytes.""" # calculate the expected results @@ -1951,7 +1956,7 @@ def test_repack_bytes_between_removed_files(self): zh.start_dir = fh.tell() zh.writestr(*self.test_files[2]) zh.remove(self.test_files[2][0]) - expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + expected_zinfos = list(zh.infolist()) expected_size = os.path.getsize(TESTFN) # do the removal and check the result @@ -1970,7 +1975,7 @@ def test_repack_bytes_between_removed_files(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -1992,7 +1997,7 @@ def test_repack_prepended_bytes(self): fh.write(b'dummy ') fh.write(fz.read()) with zipfile.ZipFile(TESTFN) as zh: - expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + expected_zinfos = list(zh.infolist()) expected_size = os.path.getsize(TESTFN) # do the removal and check the result @@ -2010,7 +2015,7 @@ def test_repack_prepended_bytes(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -2055,7 +2060,7 @@ def test_repack_removed_basic(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -2098,20 +2103,20 @@ def test_repack_removed_partial(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) + @mock.patch.object(time, 'time', new=lambda: 315504000) # fix time for ZipFile.writestr() def test_repack_removed_bytes_between_files(self): """Should not remove bytes between local file entries.""" for ii in ([0], [1], [2]): with self.subTest(removed=ii): # calculate the expected results - expected_zinfos = [] with open(TESTFN, 'wb') as fh: with zipfile.ZipFile(fh, 'w', self.compression) as zh: for j, (file, data) in enumerate(self.test_files): if j not in ii: zh.writestr(file, data) - expected_zinfos.append(ComparableZipInfo(zh.getinfo(file))) fh.write(b' dummy bytes ') zh.start_dir = fh.tell() + expected_zinfos = list(zh.infolist()) expected_size = os.path.getsize(TESTFN) # do the removal and check the result @@ -2128,7 +2133,7 @@ def test_repack_removed_bytes_between_files(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size @@ -2184,7 +2189,7 @@ def 
test_repack_removed_prepended_bytes(self): fh.write(b'dummy ') fh.write(fz.read()) with zipfile.ZipFile(TESTFN) as zh: - expected_zinfos = [ComparableZipInfo(zi) for zi in zh.infolist()] + expected_zinfos = list(zh.infolist()) expected_size = os.path.getsize(TESTFN) # do the removal and check the result @@ -2201,7 +2206,7 @@ def test_repack_removed_prepended_bytes(self): # check infolist self.assertEqual( [ComparableZipInfo(zi) for zi in zh.infolist()], - expected_zinfos, + [ComparableZipInfo(zi) for zi in expected_zinfos], ) # check file size diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 2399ceb1d9d864..9c979fff464052 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -1389,10 +1389,9 @@ def repack(self, zfile, removed=None): """ Repack the ZIP file, stripping unreferenced local file entries. - Assumes that local file entries are stored consecutively, with no gaps - or overlaps. - - Behavior: + Assumes that local file entries (and the central directory, which is + mostly treated as the "last entry") are stored consecutively, with no + gaps or overlaps: 1. If any referenced entry overlaps with another, a `BadZipFile` error is raised since safe repacking cannot be guaranteed. @@ -1405,8 +1404,8 @@ def repack(self, zfile, removed=None): be a sequence of consecutive entries with no extra preceding bytes; extra following bytes are preserved. - 4. This is to prevent an unexpected data removal (false positive), - though a false negative may happen in certain rare cases. + This is to prevent an unexpected data removal (false positive), though + a false negative may happen in certain rare cases. Examples: @@ -1456,8 +1455,8 @@ def repack(self, zfile, removed=None): - Modifies the ZIP file in place. - Updates zfile.start_dir to account for removed data. - Sets zfile._didModify to True. - - Updates header_offset and _end_offset of referenced ZipInfo - instances. + - Updates header_offset and clears _end_offset of referenced + ZipInfo instances. Parameters: zfile: A ZipFile object representing the archive to repack. 
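As exercised by the tests earlier in this series, `remove()` returns the removed `ZipInfo`, and that object can be passed to `repack()` so that only the listed entries are validated and stripped; a hedged sketch, with the archive and member names illustrative:

    import zipfile

    with zipfile.ZipFile('archive.zip', 'a') as zh:
        zi = zh.remove('data.bin')   # the removed ZipInfo is returned
        zh.repack([zi])              # strip only this entry's stale local data
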
@@ -1559,14 +1558,8 @@ def repack(self, zfile, removed=None): zfile.start_dir -= entry_offset zfile._didModify = True - end_offset = zfile.start_dir - for zinfo in reversed(filelist): - if zinfo in removed_zinfos: - zinfo._end_offset = None - else: - if zinfo._end_offset is not None: - zinfo._end_offset = end_offset - end_offset = zinfo.header_offset + for zinfo in filelist: + zinfo._end_offset = None def _calc_initial_entry_offset(self, fp, data_offset): checked_offsets = {} From d9824ceaa6e3398c7e297653d5dfa3a403395dca Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 22 Jun 2025 00:00:19 +0800 Subject: [PATCH 60/64] Fix timezone related timestamp issue --- Lib/test/test_zipfile/test_core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index a11571cec4c986..2c9098a2b13368 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -1858,7 +1858,7 @@ def test_repack_file_entry_before_first_file(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) - @mock.patch.object(time, 'time', new=lambda: 315504000) # fix time for ZipFile.writestr() + @mock.patch.object(time, 'time', new=lambda: 315590400) # fix time for ZipFile.writestr() def test_repack_bytes_before_removed_files(self): """Should preserve if there are bytes before stale local file entries.""" for ii in ([1], [1, 2], [2]): @@ -1902,7 +1902,7 @@ def test_repack_bytes_before_removed_files(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) - @mock.patch.object(time, 'time', new=lambda: 315504000) # fix time for ZipFile.writestr() + @mock.patch.object(time, 'time', new=lambda: 315590400) # fix time for ZipFile.writestr() def test_repack_bytes_after_removed_files(self): """Should keep extra bytes if there are bytes after stale local file entries.""" for ii in ([1], [1, 2], [2]): @@ -1945,7 +1945,7 @@ def test_repack_bytes_after_removed_files(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) - @mock.patch.object(time, 'time', new=lambda: 315504000) # fix time for ZipFile.writestr() + @mock.patch.object(time, 'time', new=lambda: 315590400) # fix time for ZipFile.writestr() def test_repack_bytes_between_removed_files(self): """Should strip only local file entries before random bytes.""" # calculate the expected results @@ -2103,7 +2103,7 @@ def test_repack_removed_partial(self): with zipfile.ZipFile(TESTFN) as zh: self.assertIsNone(zh.testzip()) - @mock.patch.object(time, 'time', new=lambda: 315504000) # fix time for ZipFile.writestr() + @mock.patch.object(time, 'time', new=lambda: 315590400) # fix time for ZipFile.writestr() def test_repack_removed_bytes_between_files(self): """Should not remove bytes between local file entries.""" for ii in ([0], [1], [2]): From 85811abea894968cb072a25f3d1dfe0ceb7f8c2a Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Sun, 22 Jun 2025 21:30:48 +0800 Subject: [PATCH 61/64] Simplify tests with data descriptors --- Lib/test/test_zipfile/test_core.py | 11 +++++++---- Lib/test/test_zipfile64.py | 12 ------------ 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index 2c9098a2b13368..d24c04228b49cb 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -2503,11 +2503,14 @@ def _generate_local_file_entry(self, arcname, raw_bytes, fz = io.BytesIO() f = Unseekable(fz) if dd else fz cm = 
(mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig) - if not dd_sig else contextlib.nullcontext()) + if dd and not dd_sig else contextlib.nullcontext()) with zipfile.ZipFile(f, 'w', compression=compression) as zh: - with cm: - with zh.open(arcname, 'w', force_zip64=force_zip64) as fh: - fh.write(raw_bytes) + with cm, zh.open(arcname, 'w', force_zip64=force_zip64) as fh: + fh.write(raw_bytes) + if dd: + zi = zh.infolist()[0] + self.assertTrue(zi.flag_bits & zipfile._MASK_USE_DATA_DESCRIPTOR, + f'data descriptor flag not set: {zi.filename}') fz.seek(0) return fz.read() diff --git a/Lib/test/test_zipfile64.py b/Lib/test/test_zipfile64.py index 98807bf9f00c9d..53bd9fef193740 100644 --- a/Lib/test/test_zipfile64.py +++ b/Lib/test/test_zipfile64.py @@ -192,10 +192,6 @@ def _test_strip_removed_large_file_with_dd(self, f): zh.writestr(file, data) with zipfile.ZipFile(f, 'a') as zh: - # make sure data descriptor bit is really set (by making zip file unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') - zh.remove(file1) zh.repack() self.assertIsNone(zh.testzip()) @@ -227,10 +223,6 @@ def _test_strip_removed_large_file_with_dd_no_sig(self, f): zh.writestr(file, data) with zipfile.ZipFile(f, 'a') as zh: - # make sure data descriptor bit is really set (by making zip file unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') - zh.remove(file1) zh.repack() self.assertIsNone(zh.testzip()) @@ -261,10 +253,6 @@ def _test_strip_removed_large_file_with_dd_no_sig_by_decompression(self, f, meth zh.writestr(file, data) with zipfile.ZipFile(f, 'a') as zh: - # make sure data descriptor bit is really set (by making zip file unseekable) - for zi in zh.infolist(): - self.assertTrue(zi.flag_bits & 8, f'data descriptor flag not set: {zi.filename}') - zh.remove(file1) zh.repack() self.assertIsNone(zh.testzip()) From 748ac631adb86acd05177cc30ff46d4e4e67a372 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Mon, 23 Jun 2025 19:35:24 +0800 Subject: [PATCH 62/64] Sync with danny0838/zipremove@e79042768f3c2541e0226f6bed3a9ff2ee04fac0 --- Lib/test/test_zipfile/test_core.py | 84 ++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index d24c04228b49cb..1c520c4cd1cc74 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -3158,6 +3158,10 @@ def test_trace_compressed_block_end_zlib(self): def test_trace_compressed_block_end_bz2(self): self._test_trace_compressed_block_end(zipfile.ZIP_BZIP2, OSError) + @requires_lzma() + def test_trace_compressed_block_end_lzma(self): + self._test_trace_compressed_block_end(zipfile.ZIP_LZMA, EOFError) + @requires_zstd() def test_trace_compressed_block_end_zstd(self): import compression.zstd @@ -3226,6 +3230,86 @@ def _test_trace_compressed_block_end(self, method, exc_cls): comp_len, ) + def test_calc_local_file_entry_size(self): + repacker = zipfile._ZipRepacker() + + # basic + fz = io.BytesIO() + with zipfile.ZipFile(fz, 'w') as zh: + with zh.open('file.txt', 'w') as fh: + fh.write(b'dummy') + zi = zh.infolist()[-1] + + self.assertEqual( + repacker._calc_local_file_entry_size(fz, zi), + 43, + ) + + # data descriptor + fz = io.BytesIO() + with zipfile.ZipFile(Unseekable(fz), 'w') as zh: + with zh.open('file.txt', 'w') as fh: + fh.write(b'dummy') + zi = zh.infolist()[-1] + + self.assertEqual( + 
repacker._calc_local_file_entry_size(fz, zi), + 59, + ) + + # data descriptor (unsigned) + fz = io.BytesIO() + with zipfile.ZipFile(Unseekable(fz), 'w') as zh: + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig), \ + zh.open('file.txt', 'w') as fh: + fh.write(b'dummy') + zi = zh.infolist()[-1] + + self.assertEqual( + repacker._calc_local_file_entry_size(fz, zi), + 55, + ) + + def test_calc_local_file_entry_size_zip64(self): + repacker = zipfile._ZipRepacker() + + # zip64 + fz = io.BytesIO() + with zipfile.ZipFile(fz, 'w') as zh: + with zh.open('file.txt', 'w', force_zip64=True) as fh: + fh.write(b'dummy') + zi = zh.infolist()[-1] + + self.assertEqual( + repacker._calc_local_file_entry_size(fz, zi), + 63, + ) + + # data descriptor + zip64 + fz = io.BytesIO() + with zipfile.ZipFile(Unseekable(fz), 'w') as zh: + with zh.open('file.txt', 'w', force_zip64=True) as fh: + fh.write(b'dummy') + zi = zh.infolist()[-1] + + self.assertEqual( + repacker._calc_local_file_entry_size(fz, zi), + 87, + ) + + # data descriptor (unsigned) + zip64 + fz = io.BytesIO() + with zipfile.ZipFile(Unseekable(fz), 'w') as zh: + with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig), \ + zh.open('file.txt', 'w', force_zip64=True) as fh: + fh.write(b'dummy') + zi = zh.infolist()[-1] + + self.assertEqual( + repacker._calc_local_file_entry_size(fz, zi), + 83, + ) + def test_copy_bytes(self): repacker = zipfile._ZipRepacker() From 001a8d096386ffb9a16e595b4d655e839e5f465a Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Tue, 24 Jun 2025 19:55:13 +0800 Subject: [PATCH 63/64] Sync with danny0838/zipremove@87bcdb50411a355d24c35f31dcbe4273c0568cf8 --- Lib/test/test_zipfile64.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile64.py b/Lib/test/test_zipfile64.py index 53bd9fef193740..a9ad63fde2d761 100644 --- a/Lib/test/test_zipfile64.py +++ b/Lib/test/test_zipfile64.py @@ -210,8 +210,9 @@ def test_strip_removed_large_file_with_dd_no_sig(self): self.assertLess(peak, self.allowed_memory) def _test_strip_removed_large_file_with_dd_no_sig(self, f): - # Reduce data to 400 MiB for this test, as it's especially slow... - self.datacount = 400*1024**2 // len(self.data) + # Reduce data scale for this test, as it's especially slow... + self.datacount = 30*1024**2 // len(self.data) + self.allowed_memory = 200*1024 file = 'file.txt' file1 = 'largefile.txt' From 3a364ce6992501f44f0a098ec3122858db66929d Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Wed, 25 Jun 2025 11:37:12 +0800 Subject: [PATCH 64/64] Sync with danny0838/zipremove@6a78bd15de87afde510f8a1b6364365c6e17f252 --- Lib/test/test_zipfile64.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_zipfile64.py b/Lib/test/test_zipfile64.py index a9ad63fde2d761..5c4625cc8d68c5 100644 --- a/Lib/test/test_zipfile64.py +++ b/Lib/test/test_zipfile64.py @@ -199,6 +199,10 @@ def _test_strip_removed_large_file_with_dd(self, f): def test_strip_removed_large_file_with_dd_no_sig(self): """Should scan for the data descriptor (without signature) of a removed large file without causing a memory issue.""" + # Reduce data scale for this test, as it's especially slow... + self.datacount = 30*1024**2 // len(self.data) + self.allowed_memory = 200*1024 + # Try the temp file. If we do TESTFN2, then it hogs # gigabytes of disk space for the duration of the test. 
with TemporaryFile() as f: @@ -210,10 +214,6 @@ def test_strip_removed_large_file_with_dd_no_sig(self): self.assertLess(peak, self.allowed_memory) def _test_strip_removed_large_file_with_dd_no_sig(self, f): - # Reduce data scale for this test, as it's especially slow... - self.datacount = 30*1024**2 // len(self.data) - self.allowed_memory = 200*1024 - file = 'file.txt' file1 = 'largefile.txt' data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
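
For reference, the data descriptor records that these large-file tests scan for can be sketched as follows, using the same struct formats as the unit tests above (a signed descriptor carries the PK\x07\x08 magic, an unsigned one only the CRC and sizes; the size fields widen to 8 bytes for zip64):

    import struct
    import zlib
    import zipfile

    payload = b'dummy'
    crc = zlib.crc32(payload)
    n = len(payload)

    # signature (0x08074b50) + CRC + compressed size + uncompressed size
    signed_dd = struct.pack('<4L', zipfile._DD_SIGNATURE, crc, n, n)
    # CRC + sizes only, no signature
    unsigned_dd = struct.pack('<3L', crc, n, n)
    # zip64 variant: 8-byte size fields
    zip64_dd = struct.pack('<2L2Q', zipfile._DD_SIGNATURE, crc, n, n)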