From 246d27475093f41f2b5dec3b90fc3ed4678d40d8 Mon Sep 17 00:00:00 2001 From: Maciej Urbanski Date: Wed, 19 Jun 2024 10:16:06 +0200 Subject: [PATCH] fix sync reuploading files over and over (fix #502) --- b2sdk/_internal/scan/folder.py | 15 ++++++-- changelog.d/502.fixed.md | 2 + test/integration/conftest.py | 7 ++++ test/integration/test_sync.py | 67 ++++++++++++++++++++++++++++++++++ test/unit/fixtures/folder.py | 4 +- test/unit/sync/test_sync.py | 22 +++++++++++ test/unit/v0/test_sync.py | 10 ++--- test/unit/v1/test_sync.py | 10 ++--- 8 files changed, 121 insertions(+), 16 deletions(-) create mode 100644 changelog.d/502.fixed.md create mode 100644 test/integration/test_sync.py diff --git a/b2sdk/_internal/scan/folder.py b/b2sdk/_internal/scan/folder.py index 0be2ec871..e5ac283e0 100644 --- a/b2sdk/_internal/scan/folder.py +++ b/b2sdk/_internal/scan/folder.py @@ -63,7 +63,7 @@ def all_files(self, reporter: ProgressReport | None, policies_manager=DEFAULT_SCAN_MANAGER) -> Iterator[AbstractPath]: """ Return an iterator over all of the files in the folder, in - the order that B2 uses. + the order that B2 uses (lexicographic by object path). It also performs filtering using policies manager. @@ -138,11 +138,18 @@ def all_files(self, reporter: ProgressReport | None, """ Yield all files. + Yield a File object for each of the files anywhere under this folder, in the + order they would appear in B2, unless the path is excluded by policies manager. + :param reporter: a place to report errors :param policies_manager: a policy manager object, default is DEFAULT_SCAN_MANAGER + :return: an iterator over all files in the folder in the order they would appear in B2 """ root_path = Path(self.root) - yield from self._walk_relative_paths(root_path, Path(''), reporter, policies_manager) + + local_paths = self._walk_relative_paths(root_path, Path(''), reporter, policies_manager) + # Crucial to return the "correct" order of the files + yield from sorted(local_paths, key=lambda lp: lp.relative_path) def make_full_path(self, file_name): """ @@ -196,8 +203,8 @@ def _walk_relative_paths( visited_symlinks: set[int] | None = None, ): """ - Yield a File object for each of the files anywhere under this folder, in the - order they would appear in B2, unless the path is excluded by policies manager. + Yield a File object for each of the files anywhere under this folder, + unless the path is excluded by policies manager. :param local_dir: the path to the local directory that we are currently inspecting :param relative_dir_path: the path of this dir relative to the scan point, or Path('') if at scan point diff --git a/changelog.d/502.fixed.md b/changelog.d/502.fixed.md new file mode 100644 index 000000000..cd8dceae4 --- /dev/null +++ b/changelog.d/502.fixed.md @@ -0,0 +1,2 @@ +Fix `LocalFolder` regression (introduced in 2.4.0) which caused `LocalFolder` to not list files by path lexicographical order. +This is also a fix for `synchronizer` re-uploading files on every run in some cases. diff --git a/test/integration/conftest.py b/test/integration/conftest.py index 630748307..aacb348cd 100644 --- a/test/integration/conftest.py +++ b/test/integration/conftest.py @@ -12,6 +12,7 @@ import http import http.client import os +import secrets from test.integration import get_b2_auth_data from test.integration.bucket_cleaner import BucketCleaner from test.integration.helpers import ( @@ -93,3 +94,9 @@ def bucket(b2_api, bucket_name_prefix, bucket_cleaner): ) yield bucket bucket_cleaner.cleanup_bucket(bucket) + + +@pytest.fixture +def b2_subfolder(bucket, request): + subfolder_name = f"{request.node.name}_{secrets.token_urlsafe(4)}" + return f"b2://{bucket.name}/{subfolder_name}" diff --git a/test/integration/test_sync.py b/test/integration/test_sync.py new file mode 100644 index 000000000..7366e4f52 --- /dev/null +++ b/test/integration/test_sync.py @@ -0,0 +1,67 @@ +###################################################################### +# +# File: test/integration/test_sync.py +# +# Copyright 2024 Backblaze Inc. All Rights Reserved. +# +# License https://www.backblaze.com/using_b2_code.html +# +###################################################################### +from __future__ import annotations + +import io +import time + +import pytest + +from b2sdk.v2 import ( + CompareVersionMode, + NewerFileSyncMode, + Synchronizer, + SyncReport, + parse_folder, +) + + +@pytest.fixture +def local_folder_with_files(tmp_path): + folder = tmp_path / "test" + folder.mkdir() + (folder / "a").mkdir() + (folder / "a" / "foo").write_bytes(b"foo") + # space in the name is important as it influences lexicographical sorting used by B2 + (folder / "a b").mkdir() + (folder / "a b" / "bar").write_bytes(b"bar") + return folder + + +def test_sync_folder(b2_api, local_folder_with_files, b2_subfolder): + source_folder = parse_folder(str(local_folder_with_files), b2_api) + dest_folder = parse_folder(b2_subfolder, b2_api) + + synchronizer = Synchronizer( + max_workers=10, + newer_file_mode=NewerFileSyncMode.REPLACE, + compare_version_mode=CompareVersionMode.MODTIME, + compare_threshold=10, # ms + ) + + def sync_and_report(): + buf = io.StringIO() + reporter = SyncReport(buf, no_progress=True) + with reporter: + synchronizer.sync_folders( + source_folder=source_folder, + dest_folder=dest_folder, + now_millis=int(1000 * time.time()), + reporter=reporter, + ) + return reporter + + report = sync_and_report() + assert report.total_transfer_files == 2 + assert report.total_transfer_bytes == 6 + + second_sync_report = sync_and_report() + assert second_sync_report.total_transfer_files == 0 + assert second_sync_report.total_transfer_bytes == 0 diff --git a/test/unit/fixtures/folder.py b/test/unit/fixtures/folder.py index 3815c0a30..d6183c196 100644 --- a/test/unit/fixtures/folder.py +++ b/test/unit/fixtures/folder.py @@ -29,7 +29,7 @@ def __init__(self, test_files): super().__init__('test-bucket', 'folder', mock.MagicMock()) def get_file_versions(self): - yield from iter(self.file_versions) + yield from sorted(self.file_versions, key=lambda x: x.file_name) def _file_versions(self, name, mod_times, size=10): """ @@ -70,7 +70,7 @@ def __init__(self, test_files): self.local_paths = [self._local_path(*test_file) for test_file in test_files] def all_files(self, reporter, policies_manager=DEFAULT_SCAN_MANAGER): - for single_path in self.local_paths: + for single_path in sorted(self.local_paths, key=lambda x: x.relative_path): if single_path.relative_path.endswith('/'): if policies_manager.should_exclude_b2_directory(single_path.relative_path): continue diff --git a/test/unit/sync/test_sync.py b/test/unit/sync/test_sync.py index 00c07529d..1b1d65279 100644 --- a/test/unit/sync/test_sync.py +++ b/test/unit/sync/test_sync.py @@ -969,6 +969,28 @@ def check_path_and_get_size(self): # Order of indices: call index, pick arguments, pick first argument, first element of the first argument. assert isinstance(bucket.mock_calls[0][1][0][0], CopySource) + def test_sync_lexicographical_order(self, synchronizer_factory): + """ + Test sync is successful when dir tree order is different from absolute path lexicographical order + + Regression test for #502 + """ + synchronizer = synchronizer_factory() + + files = [('a/foo', [100], 10), ('a b/bar', [100], 10)] + src = self.folder_factory('local', *files) + dst = self.folder_factory('b2') + + self.assert_folder_sync_actions( + synchronizer, src, dst, [ + 'b2_upload(/dir/a b/bar, folder/a b/bar, 100)', + 'b2_upload(/dir/a/foo, folder/a/foo, 100)' + ] + ) + + dst_with_files = self.folder_factory('b2', *files) + self.assert_folder_sync_actions(synchronizer, src, dst_with_files, []) + class TstEncryptionSettingsProvider(AbstractSyncEncryptionSettingsProvider): def __init__(self, source_encryption_setting, destination_encryption_setting): diff --git a/test/unit/v0/test_sync.py b/test/unit/v0/test_sync.py index 0cc7009f1..273fccdf2 100644 --- a/test/unit/v0/test_sync.py +++ b/test/unit/v0/test_sync.py @@ -66,10 +66,10 @@ class TestFolder(TestSync): NAMES = [ '.dot_file', + 'hello.', os.path.join('hello', 'a', '1'), os.path.join('hello', 'a', '2'), os.path.join('hello', 'b'), - 'hello.', 'hello0', os.path.join('inner', 'a.bin'), os.path.join('inner', 'a.txt'), @@ -106,10 +106,10 @@ def assert_filtered_files(self, scan_results, expected_scan_results): def test_exclusions(self): expected_list = [ '.dot_file', + 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', - 'hello.', 'hello0', 'inner/a.txt', 'inner/b.txt', @@ -129,10 +129,10 @@ def test_exclude_all(self): def test_exclusions_inclusions(self): expected_list = [ '.dot_file', + 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', - 'hello.', 'hello0', 'inner/a.bin', 'inner/a.txt', @@ -151,8 +151,8 @@ def test_exclusions_inclusions(self): def test_exclude_matches_prefix(self): expected_list = [ '.dot_file', - 'hello/b', 'hello.', + 'hello/b', 'hello0', 'inner/b.bin', 'inner/b.txt', @@ -204,10 +204,10 @@ def test_exclude_directory_trailing_slash_does_not_match(self): def test_exclusion_with_exact_match(self): expected_list = [ '.dot_file', + 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', - 'hello.', 'inner/a.bin', 'inner/a.txt', 'inner/b.bin', diff --git a/test/unit/v1/test_sync.py b/test/unit/v1/test_sync.py index 9192de0fb..444f6b3f6 100644 --- a/test/unit/v1/test_sync.py +++ b/test/unit/v1/test_sync.py @@ -69,10 +69,10 @@ class TestFolder(TestSync): NAMES = [ '.dot_file', + 'hello.', os.path.join('hello', 'a', '1'), os.path.join('hello', 'a', '2'), os.path.join('hello', 'b'), - 'hello.', 'hello0', os.path.join('inner', 'a.bin'), os.path.join('inner', 'a.txt'), @@ -109,10 +109,10 @@ def assert_filtered_files(self, scan_results, expected_scan_results): def test_exclusions(self): expected_list = [ '.dot_file', + 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', - 'hello.', 'hello0', 'inner/a.txt', 'inner/b.txt', @@ -132,10 +132,10 @@ def test_exclude_all(self): def test_exclusions_inclusions(self): expected_list = [ '.dot_file', + 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', - 'hello.', 'hello0', 'inner/a.bin', 'inner/a.txt', @@ -154,8 +154,8 @@ def test_exclusions_inclusions(self): def test_exclude_matches_prefix(self): expected_list = [ '.dot_file', - 'hello/b', 'hello.', + 'hello/b', 'hello0', 'inner/b.bin', 'inner/b.txt', @@ -207,10 +207,10 @@ def test_exclude_directory_trailing_slash_does_not_match(self): def test_exclusion_with_exact_match(self): expected_list = [ '.dot_file', + 'hello.', 'hello/a/1', 'hello/a/2', 'hello/b', - 'hello.', 'inner/a.bin', 'inner/a.txt', 'inner/b.bin',