repository: remove N=1 fallback from get(), update _chunks eagerly in put()

get() raises ObjectNotFound (or returns None when raise_missing is False) if the
chunk index has no entry for the id or the entry's pack_id is still UNKNOWN_BYTES32;
put() marks the id in _chunks immediately so the index is live after each write.
This commit is contained in:
Mrityunjay Raj 2026-06-10 23:01:29 +05:30
parent 54adb9b7f6
commit 1df2065f85
2 changed files with 30 additions and 9 deletions

View file

@ -415,9 +415,8 @@ class Repository:
"""Set the ChunkIndex get() uses to resolve pack locations.
The caller retains ownership; Repository holds a borrowed reference.
Pass None to reset to an empty index.
"""
self._chunks = chunks if chunks is not None else ChunkIndex()
self._chunks = chunks
def flush(self):
"""Flush any buffered pack writer chunks."""
@ -435,7 +434,6 @@ class Repository:
if self.store_opened:
self.store.close()
self.store_opened = False
self._chunks = None
self.opened = False
def info(self):
@ -612,11 +610,12 @@ class Repository:
def get(self, id, read_data=True, raise_missing=True):
self._lock_refresh()
pack_id = id # N=1 fallback: pack_id == chunk_id
obj_offset, obj_size = 0, None
entry = self._chunks.get(id)
if entry is not None and entry.pack_id != UNKNOWN_BYTES32: # UNKNOWN: buffered, not yet flushed
pack_id, obj_offset, obj_size = entry.pack_id, entry.obj_offset, entry.obj_size
if entry is None or entry.pack_id == UNKNOWN_BYTES32:
if raise_missing:
raise self.ObjectNotFound(id, str(self._location))
return None
pack_id, obj_offset, obj_size = entry.pack_id, entry.obj_offset, entry.obj_size
id_hex = bin_to_hex(id)
key = "packs/" + bin_to_hex(pack_id)
try:
@ -672,7 +671,11 @@ class Repository:
data_size = len(data)
if data_size > MAX_DATA_SIZE:
raise IntegrityError(f"More than allowed put data [{data_size} > {MAX_DATA_SIZE}]")
return self._pack_writer.add(id, data)
pack_results = self._pack_writer.add(id, data)
self._chunks.add(id, 0) # mark seen; uncompressed size filled in by cache layer
if pack_results:
self._chunks.update_pack_info(pack_results)
return pack_results
def delete(self, id, wait=True):
"""delete a repo object

View file

@ -77,15 +77,21 @@ def pdchunk(chunk):
def test_basic_operations(repo_fixtures, request):
chunks = ChunkIndex()
with get_repository_from_fixture(repo_fixtures, request) as repository:
for x in range(100):
repository.put(H(x), fchunk(b"SOMEDATA"))
pack_results = repository.put(H(x), fchunk(b"SOMEDATA"))
if pack_results:
for chunk_id, *_ in pack_results:
chunks.add(chunk_id, 0)
chunks.update_pack_info(pack_results)
key50 = H(50)
assert pdchunk(repository.get(key50)) == b"SOMEDATA"
repository.delete(key50)
with pytest.raises(Repository.ObjectNotFound):
repository.get(key50)
with reopen(repository) as repository:
repository.set_chunk_index(chunks)
with pytest.raises(Repository.ObjectNotFound):
repository.get(key50)
for x in range(100):
@ -256,6 +262,18 @@ def test_get_uses_chunk_index_location(tmp_path):
assert repository.get(id2) == chunk2
def test_put_marks_id_in_chunk_index(tmp_path):
    """Check that put() registers the chunk id in the repository's chunk index at once.

    After a single put() the index must hold an entry for the id, that entry
    must carry the pack location, and its size must still be 0.
    """
    repo_path = str(tmp_path / "repo")
    with Repository(repo_path, exclusive=True, create=True) as repository:
        chunk_id = H(1)
        repository.put(chunk_id, fchunk(b"ZEROS"))
        indexed = repository._chunks.get(chunk_id)
        # add() inside put() marks the id as seen, so the lookup must succeed.
        assert indexed is not None
        # N=1 packs: pack_id == chunk_id, applied by update_pack_info in put().
        assert indexed.pack_id == chunk_id
        # The uncompressed size is filled in later by the cache layer.
        assert indexed.size == 0
def test_pack_writer_final_partial_pack_uses_sha256():
# When max_count > 1, a final flush with only 1 piece must still use SHA256,
# not the N=1 pack_id == chunk_id hack.