From e9c1c408b562b19b5f5847c733aeb91ef6d0b8bc Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 7 Jan 2021 00:57:14 +0100 Subject: [PATCH 1/4] remove empty shadowed_segments lists, fixes #5275 also: - add test for removed empty shadowed_segments list - add some comments - add repo_dump test debug tool note: this is the backport of #5614 to 1.1-maint - it needed some adaptions to fit into 1.1-maint: 1.1-maint always compacts when committing and does not persist shadow_index, so the tests are a bit different --- src/borg/repository.py | 7 +++++ src/borg/testsuite/repository.py | 45 +++++++++++++++++++++++++++++--- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 297a3eb1f..fc1efe14e 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -774,6 +774,8 @@ class Repository: try: self.shadow_index[key].remove(segment) except (KeyError, ValueError): + # do not remove entry with empty shadowed_segments list here, + # it is needed for shadowed_put_exists code (see below)! pass elif tag == TAG_DELETE and not in_index: # If the shadow index doesn't contain this key, then we can't say if there's a shadowed older tag, @@ -825,6 +827,11 @@ class Repository: new_segment, size = self.io.write_delete(key) self.compact[new_segment] += size segments.setdefault(new_segment, 0) + else: + # we did not keep the delete tag for key (see if-branch) + if not self.shadow_index[key]: + # shadowed segments list is empty -> remove it + del self.shadow_index[key] assert segments[segment] == 0, 'Corrupted segment reference count - corrupted index or hints' unused.append(segment) pi.show() diff --git a/src/borg/testsuite/repository.py b/src/borg/testsuite/repository.py index be203800e..11225b2f5 100644 --- a/src/borg/testsuite/repository.py +++ b/src/borg/testsuite/repository.py @@ -14,7 +14,7 @@ from ..helpers import IntegrityError from ..helpers import msgpack from ..locking import Lock, LockFailed from ..remote import RemoteRepository, InvalidRPCMethod, PathNotAllowed, ConnectionClosedWithHint, handle_remote_line -from ..repository import Repository, LoggedIO, MAGIC, MAX_DATA_SIZE, TAG_DELETE +from ..repository import Repository, LoggedIO, MAGIC, MAX_DATA_SIZE, TAG_DELETE, TAG_PUT, TAG_COMMIT from . import BaseTestCase from .hashindex import H @@ -54,6 +54,16 @@ class RepositoryTestCaseBase(BaseTestCase): self.repository.put(H(2), b'boo') self.repository.delete(H(3)) + def repo_dump(self, label=None): + label = label + ': ' if label is not None else '' + H_trans = {H(i): i for i in range(10)} + H_trans[None] = -1 # key == None appears in commits + tag_trans = {TAG_PUT: 'put', TAG_DELETE: 'del', TAG_COMMIT: 'comm'} + for segment, fn in self.repository.io.segment_iterator(): + for tag, key, offset, size in self.repository.io.iter_objects(segment): + print("%s%s H(%d) -> %s[%d..+%d]" % (label, tag_trans[tag], H_trans[key], fn, offset, size)) + print() + class RepositoryTestCase(RepositoryTestCaseBase): @@ -313,8 +323,10 @@ class RepositoryCommitTestCase(RepositoryTestCaseBase): self.repository.put(H(1), b'1') self.repository.put(H(2), b'2') self.repository.commit() + self.repo_dump('p1 p2 cc') self.repository.delete(H(1)) self.repository.commit() + self.repo_dump('d1 cc') last_segment = self.repository.io.get_latest_segment() - 1 num_deletes = 0 for tag, key, offset, size in self.repository.io.iter_objects(last_segment): @@ -325,11 +337,16 @@ class RepositoryCommitTestCase(RepositoryTestCaseBase): assert last_segment in self.repository.compact self.repository.put(H(3), b'3') self.repository.commit() + self.repo_dump('p3 cc') assert last_segment not in self.repository.compact assert not self.repository.io.segment_exists(last_segment) for segment, _ in self.repository.io.segment_iterator(): for tag, key, offset, size in self.repository.io.iter_objects(segment): assert tag != TAG_DELETE + assert key != H(1) + # after compaction, there should be no empty shadowed_segments lists left over. + # we have no put or del any more for H(1), so we lost knowledge about H(1). + assert H(1) not in self.repository.shadow_index def test_shadowed_entries_are_preserved(self): get_latest_segment = self.repository.io.get_latest_segment @@ -370,16 +387,19 @@ class RepositoryCommitTestCase(RepositoryTestCaseBase): self.repository.delete(H(1)) assert self.repository.shadow_index[H(1)] == [0] self.repository.commit() + self.repo_dump('p1 d1 cc') # note how an empty list means that nothing is shadowed for sure - assert self.repository.shadow_index[H(1)] == [] + assert self.repository.shadow_index[H(1)] == [] # because the delete is considered unstable self.repository.put(H(1), b'1') self.repository.delete(H(1)) + self.repo_dump('p1 d1') # 0 put/delete; 1 commit; 2 compacted; 3 commit; 4 put/delete assert self.repository.shadow_index[H(1)] == [4] self.repository.rollback() + self.repo_dump('r') self.repository.put(H(2), b'1') # After the rollback segment 4 shouldn't be considered anymore - assert self.repository.shadow_index[H(1)] == [] + assert self.repository.shadow_index[H(1)] == [] # because the delete is considered unstable class RepositoryAppendOnlyTestCase(RepositoryTestCaseBase): @@ -790,6 +810,25 @@ class RepositoryCheckTestCase(RepositoryTestCaseBase): self.assert_equal(self.repository.get(H(0)), b'data2') +class RepositoryHintsTestCase(RepositoryTestCaseBase): + + def test_hints_behaviour(self): + self.repository.put(H(0), b'data') + self.assert_equal(self.repository.shadow_index, {}) + self.assert_true(len(self.repository.compact) == 0) + self.repository.delete(H(0)) + self.repository.commit() + # this is to make the previous delete "stable", see delete_is_not_stable: + self.repository.put(H(1), b'data') + self.repository.commit() + # in 1.1-maint, commit() always implicitly compacts. + # nothing to compact any more! no info left about stuff that does not exist any more: + self.assert_not_in(H(0), self.repository.shadow_index) + # segment 0 was compacted away, no info about it left: + self.assert_not_in(0, self.repository.compact) + self.assert_not_in(0, self.repository.segments) + + class RemoteRepositoryTestCase(RepositoryTestCase): repository = None # type: RemoteRepository From a8cf5582b075fe89833680befb0d5a7ef8496785 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 18 Jan 2021 21:03:22 +0100 Subject: [PATCH 2/4] deduplicate code of put and delete, no functional change --- src/borg/repository.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index fc1efe14e..0fa2f05d9 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1142,13 +1142,9 @@ class Repository: except KeyError: pass else: - self.segments[segment] -= 1 - size = self.io.read(segment, offset, id, read_data=False) - self.storage_quota_use -= size - self.compact[segment] += size - segment, size = self.io.write_delete(id) - self.compact[segment] += size - self.segments.setdefault(segment, 0) + # note: doing a delete first will do some bookkeeping, + # like updating the shadow_index, quota, ... + self._delete(id, segment, offset) segment, offset = self.io.write_put(id, data) self.storage_quota_use += len(data) + self.io.put_header_fmt.size self.segments.setdefault(segment, 0) @@ -1172,6 +1168,10 @@ class Repository: except KeyError: raise self.ObjectNotFound(id, self.path) from None self.shadow_index.setdefault(id, []).append(segment) + self._delete(id, segment, offset) + + def _delete(self, id, segment, offset): + # common code used by put and delete self.segments[segment] -= 1 size = self.io.read(segment, offset, id, read_data=False) self.storage_quota_use -= size From df11a67f960cc0c43edb0bc138fc17b1aed3b0bb Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 18 Jan 2021 21:26:01 +0100 Subject: [PATCH 3/4] fix updating shadow_index also in put The shadow_index should be in same state after both of these sequences (let's assume that A is not in repo yet for simplicity, but it does not matter): a) explicit delete: put(A), delete(A), put(A), resulting in: PUT A, DEL A, PUT A repo contents b) implicit delete: put(A), put(A), resulting in: PUT A, DEL A, PUT A repo contents --- src/borg/repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 0fa2f05d9..388b3e3ca 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1167,11 +1167,11 @@ class Repository: segment, offset = self.index.pop(id) except KeyError: raise self.ObjectNotFound(id, self.path) from None - self.shadow_index.setdefault(id, []).append(segment) self._delete(id, segment, offset) def _delete(self, id, segment, offset): # common code used by put and delete + self.shadow_index.setdefault(id, []).append(segment) self.segments[segment] -= 1 size = self.io.read(segment, offset, id, read_data=False) self.storage_quota_use -= size From 36503c4e37094acddd570b6e79fbcaa612f7b376 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 4 Feb 2021 02:29:43 +0100 Subject: [PATCH 4/4] revert incorrect fix for put updating shadow_index, fixes #5661 A) the compaction code needs the shadow index only for this case: segment A: PUT x, segment B: DEL x, with A < B (DEL shadows the PUT). B) for the following case, we have no shadowing DEL (or rather: it does not matter, because there is a PUT right after the DEL) and x is in the repo index, thus the shadow_index is not needed for the special case in the compaction code: segment A: PUT x, segment B: DEL x PUT x see also PR #5636. reverts f079a83fed91c1221951303f758187987be0ac47 and clarifies the code by more comments. we keep the code deduplication of 5f32b5666af961f5256221660a7356d467f66605 and just add a update_shadow_index param to make it not look like there was something accidentally forgotten, which was the whole reason for the reverted "fix". --- src/borg/repository.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 388b3e3ca..690a8f959 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1142,9 +1142,11 @@ class Repository: except KeyError: pass else: - # note: doing a delete first will do some bookkeeping, - # like updating the shadow_index, quota, ... - self._delete(id, segment, offset) + # note: doing a delete first will do some bookkeeping. + # we do not want to update the shadow_index here, because + # we know already that we will PUT to this id, so it will + # be in the repo index (and we won't need it in the shadow_index). + self._delete(id, segment, offset, update_shadow_index=False) segment, offset = self.io.write_put(id, data) self.storage_quota_use += len(data) + self.io.put_header_fmt.size self.segments.setdefault(segment, 0) @@ -1167,11 +1169,16 @@ class Repository: segment, offset = self.index.pop(id) except KeyError: raise self.ObjectNotFound(id, self.path) from None - self._delete(id, segment, offset) + # if we get here, there is an object with this id in the repo, + # we write a DEL here that shadows the respective PUT. + # after the delete, the object is not in the repo index any more, + # for the compaction code, we need to update the shadow_index in this case. + self._delete(id, segment, offset, update_shadow_index=True) - def _delete(self, id, segment, offset): + def _delete(self, id, segment, offset, *, update_shadow_index): # common code used by put and delete - self.shadow_index.setdefault(id, []).append(segment) + if update_shadow_index: + self.shadow_index.setdefault(id, []).append(segment) self.segments[segment] -= 1 size = self.io.read(segment, offset, id, read_data=False) self.storage_quota_use -= size