repository.scan: use same end_segment within same scan

This is achieved by storing end_segment in the scan state, which is now passed between scan calls instead of the marker.
2026-02-20 00:10:35 -05:00 · 2022-09-19 21:14:25 +02:00 · 2022-09-19 21:14:25 +02:00 · c4e54ca44e
commit c4e54ca44e
parent c0e674ce61
5 changed files with 19 additions and 21 deletions
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -1747,9 +1747,9 @@ class ArchiveChecker:
        pi = ProgressIndicatorPercent(
            total=chunks_count_index, msg="Verifying data %6.2f%%", step=0.01, msgid="check.verify_data"
        )
-        marker = None
+        state = None
        while True:
-            chunk_ids, marker = self.repository.scan(limit=100, marker=marker)
+            chunk_ids, state = self.repository.scan(limit=100, state=state)
            if not chunk_ids:
                break
            chunks_count_segments += len(chunk_ids)
--- a/src/borg/archiver/debug_cmd.py
+++ b/src/borg/archiver/debug_cmd.py
@@ -152,12 +152,10 @@ class DebugMixIn:
            cdata = repository.get(ids[0])
            key = key_factory(repository, cdata)
            repo_objs = RepoObj(key)
-            marker = None
+            state = None
            i = 0
            while True:
-                ids, marker = repository.scan(
-                    limit=LIST_SCAN_LIMIT, marker=marker
-                )  # must use on-disk order scanning here
+                ids, state = repository.scan(limit=LIST_SCAN_LIMIT, state=state)  # must use on-disk order scanning here
                if not ids:
                    break
                for id in ids:
@@ -203,12 +201,12 @@ class DebugMixIn:
        key = key_factory(repository, cdata)
        repo_objs = RepoObj(key)

-        marker = None
+        state = None
        last_data = b""
        last_id = None
        i = 0
        while True:
-            ids, marker = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker)  # must use on-disk order scanning here
+            ids, state = repository.scan(limit=LIST_SCAN_LIMIT, state=state)  # must use on-disk order scanning here
            if not ids:
                break
            for id in ids:
--- a/src/borg/remote.py
+++ b/src/borg/remote.py
@@ -989,8 +989,8 @@ This problem will go away as soon as the server has been upgraded to 1.0.7+.
    def list(self, limit=None, marker=None, mask=0, value=0):
        """actual remoting is done via self.call in the @api decorator"""

-    @api(since=parse_version("1.1.0b3"))
-    def scan(self, limit=None, marker=None):
+    @api(since=parse_version("2.0.0b2"))
+    def scan(self, limit=None, state=None):
        """actual remoting is done via self.call in the @api decorator"""

    @api(since=parse_version("2.0.0b2"))
--- a/src/borg/repository.py
+++ b/src/borg/repository.py
@@ -1207,15 +1207,15 @@ class Repository:
            self.index = self.open_index(self.get_transaction_id())
        return [id_ for id_, _ in islice(self.index.iteritems(marker=marker, mask=mask, value=value), limit)]

-    def scan(self, limit=None, marker=None):
+    def scan(self, limit=None, state=None):
        """
-        list <limit> IDs starting from after <marker> - in on-disk order, so that a client
+        list (the next) <limit> chunk IDs from the repository - in on-disk order, so that a client
        fetching data in this order does linear reads and reuses stuff from disk cache.

-        marker can either be None (default, meaning "start from the beginning") or the object
-        returned from a previous scan call (meaning "continue scanning where we stopped previously").
+        state can either be None (initially, when starting to scan) or the object
+        returned from a previous scan call (meaning "continue scanning").

-        returns: list of chunk ids, marker
+        returns: list of chunk ids, state

        We rely on repository.check() has run already (either now or some time before) and that:

@@ -1230,11 +1230,11 @@ class Repository:
        if not self.index:
            self.index = self.open_index(transaction_id)
        # smallest valid seg is <uint32> 0, smallest valid offs is <uint32> 8
-        start_segment, start_offset = marker if marker is not None else (0, 0)
+        start_segment, start_offset, end_segment = state if state is not None else (0, 0, transaction_id)
        ids, segment, offset = [], 0, 0
        # we only scan up to end_segment == transaction_id to only scan **committed** chunks,
        # avoiding scanning into newly written chunks.
-        for segment, filename in self.io.segment_iterator(start_segment, transaction_id):
+        for segment, filename in self.io.segment_iterator(start_segment, end_segment):
            obj_iterator = self.io.iter_objects(segment, start_offset, read_data=False)
            while True:
                try:
@@ -1255,8 +1255,8 @@ class Repository:
                        # we have found an existing and current object
                        ids.append(id)
                        if len(ids) == limit:
-                            return ids, (segment, offset)
-        return ids, (segment, offset)
+                            return ids, (segment, offset, end_segment)
+        return ids, (segment, offset, end_segment)

    def flags(self, id, mask=0xFFFFFFFF, value=None):
        """
--- a/src/borg/testsuite/repository.py
+++ b/src/borg/testsuite/repository.py
@@ -191,10 +191,10 @@ class RepositoryTestCase(RepositoryTestCaseBase):
        self.repository.commit(compact=False)
        all, _ = self.repository.scan()
        assert len(all) == 100
-        first_half, marker = self.repository.scan(limit=50)
+        first_half, state = self.repository.scan(limit=50)
        assert len(first_half) == 50
        assert first_half == all[:50]
-        second_half, _ = self.repository.scan(marker=marker)
+        second_half, _ = self.repository.scan(state=state)
        assert len(second_half) == 50
        assert second_half == all[50:]
        # check result order == on-disk order (which is hash order)