mirror of
https://github.com/borgbackup/borg.git
synced 2026-06-11 01:41:57 -04:00
Merge pull request #8803 from ThomasWaldmann/transfer-rechunk
transfer with re-chunking
This commit is contained in:
commit
a764abd7b0
3 changed files with 219 additions and 71 deletions
|
|
@ -1,13 +1,15 @@
|
|||
import argparse
|
||||
|
||||
from ._common import with_repository, with_other_repository, Highlander
|
||||
from ..archive import Archive
|
||||
from ..archive import Archive, cached_hash, DownloadPipeline
|
||||
from ..chunker import get_chunker
|
||||
from ..compress import CompressionSpec
|
||||
from ..constants import * # NOQA
|
||||
from ..crypto.key import uses_same_id_hash, uses_same_chunker_secret
|
||||
from ..helpers import Error
|
||||
from ..helpers import location_validator, Location, archivename_validator, comment_validator
|
||||
from ..helpers import format_file_size, bin_to_hex
|
||||
from ..helpers import ChunkerParams, ChunkIteratorFileWrapper
|
||||
from ..manifest import Manifest
|
||||
from ..legacyrepository import LegacyRepository
|
||||
from ..repository import Repository
|
||||
|
|
@ -17,6 +19,103 @@ from ..logger import create_logger
|
|||
logger = create_logger()
|
||||
|
||||
|
||||
def transfer_chunks(
|
||||
upgrader, other_repository, other_manifest, other_chunks, archive, cache, recompress, dry_run, chunker_params=None
|
||||
):
|
||||
"""
|
||||
Transfer chunks from another repository to the current repository.
|
||||
|
||||
If chunker_params is provided, the chunks will be re-chunked using the specified parameters.
|
||||
"""
|
||||
transfer = 0
|
||||
present = 0
|
||||
chunks = []
|
||||
|
||||
# Determine if re-chunking is needed
|
||||
rechunkify = chunker_params is not None
|
||||
|
||||
if rechunkify:
|
||||
# Similar to ArchiveRecreater.iter_chunks
|
||||
pipeline = DownloadPipeline(other_manifest.repository, other_manifest.repo_objs)
|
||||
chunk_iterator = pipeline.fetch_many(other_chunks, ro_type=ROBJ_FILE_STREAM)
|
||||
file = ChunkIteratorFileWrapper(chunk_iterator)
|
||||
|
||||
# Create a chunker with the specified parameters
|
||||
chunker = get_chunker(*chunker_params, seed=archive.key.chunk_seed, sparse=False)
|
||||
for chunk in chunker.chunkify(file):
|
||||
if not dry_run:
|
||||
chunk_id, data = cached_hash(chunk, archive.key.id_hash)
|
||||
size = len(data)
|
||||
# Check if the chunk is already in the repository
|
||||
chunk_present = cache.seen_chunk(chunk_id, size)
|
||||
if chunk_present:
|
||||
chunk_entry = cache.reuse_chunk(chunk_id, size, archive.stats)
|
||||
present += size
|
||||
else:
|
||||
# Add the new chunk to the repository
|
||||
chunk_entry = cache.add_chunk(
|
||||
chunk_id, {}, data, stats=archive.stats, wait=False, ro_type=ROBJ_FILE_STREAM
|
||||
)
|
||||
cache.repository.async_response(wait=False)
|
||||
transfer += size
|
||||
chunks.append(chunk_entry)
|
||||
else:
|
||||
# In dry-run mode, just estimate the size
|
||||
size = len(chunk.data) if chunk.data is not None else chunk.size
|
||||
transfer += size
|
||||
else:
|
||||
# Original implementation without re-chunking
|
||||
for chunk_id, size in other_chunks:
|
||||
chunk_present = cache.seen_chunk(chunk_id, size)
|
||||
if not chunk_present: # target repo does not yet have this chunk
|
||||
if not dry_run:
|
||||
try:
|
||||
cdata = other_repository.get(chunk_id)
|
||||
except (Repository.ObjectNotFound, LegacyRepository.ObjectNotFound):
|
||||
# missing correct chunk in other_repository (source) will result in
|
||||
# a missing chunk in repository (destination).
|
||||
# we do NOT want to transfer all-zero replacement chunks from borg1 repos.
|
||||
pass
|
||||
else:
|
||||
if recompress == "never":
|
||||
# keep compressed payload same, verify via assert_id (that will
|
||||
# decompress, but avoid needing to compress it again):
|
||||
meta, data = other_manifest.repo_objs.parse(
|
||||
chunk_id, cdata, decompress=True, want_compressed=True, ro_type=ROBJ_FILE_STREAM
|
||||
)
|
||||
meta, data = upgrader.upgrade_compressed_chunk(meta, data)
|
||||
chunk_entry = cache.add_chunk(
|
||||
chunk_id,
|
||||
meta,
|
||||
data,
|
||||
stats=archive.stats,
|
||||
wait=False,
|
||||
compress=False,
|
||||
size=size,
|
||||
ctype=meta["ctype"],
|
||||
clevel=meta["clevel"],
|
||||
ro_type=ROBJ_FILE_STREAM,
|
||||
)
|
||||
elif recompress == "always":
|
||||
# always decompress and re-compress file data chunks
|
||||
meta, data = other_manifest.repo_objs.parse(chunk_id, cdata, ro_type=ROBJ_FILE_STREAM)
|
||||
chunk_entry = cache.add_chunk(
|
||||
chunk_id, meta, data, stats=archive.stats, wait=False, ro_type=ROBJ_FILE_STREAM
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"unsupported recompress mode: {recompress}")
|
||||
cache.repository.async_response(wait=False)
|
||||
chunks.append(chunk_entry)
|
||||
transfer += size
|
||||
else:
|
||||
if not dry_run:
|
||||
chunk_entry = cache.reuse_chunk(chunk_id, size, archive.stats)
|
||||
chunks.append(chunk_entry)
|
||||
present += size
|
||||
|
||||
return chunks, transfer, present
|
||||
|
||||
|
||||
class TransferMixIn:
|
||||
@with_other_repository(manifest=True, compatibility=(Manifest.Operation.READ,))
|
||||
@with_repository(manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,))
|
||||
|
|
@ -76,7 +175,7 @@ class TransferMixIn:
|
|||
if UpgraderCls is not upgrade_mod.UpgraderFrom12To20 and other_manifest.repository.version == 1:
|
||||
raise Error("To transfer from a borg 1.x repo, you need to use: --upgrader=From12To20")
|
||||
|
||||
upgrader = UpgraderCls(cache=cache)
|
||||
upgrader = UpgraderCls(cache=cache, args=args)
|
||||
|
||||
for archive_info in archive_infos:
|
||||
name, id, ts = archive_info.name, archive_info.id, archive_info.ts
|
||||
|
|
@ -120,68 +219,22 @@ class TransferMixIn:
|
|||
else:
|
||||
other_chunks = None
|
||||
if other_chunks is not None:
|
||||
chunks = []
|
||||
for chunk_id, size in other_chunks:
|
||||
chunk_present = cache.seen_chunk(chunk_id, size)
|
||||
if not chunk_present: # target repo does not yet have this chunk
|
||||
if not dry_run:
|
||||
try:
|
||||
cdata = other_repository.get(chunk_id)
|
||||
except (Repository.ObjectNotFound, LegacyRepository.ObjectNotFound):
|
||||
# missing correct chunk in other_repository (source) will result in
|
||||
# a missing chunk in repository (destination).
|
||||
# we do NOT want to transfer all-zero replacement chunks from borg1 repos.
|
||||
pass
|
||||
else:
|
||||
if args.recompress == "never":
|
||||
# keep compressed payload same, verify via assert_id (that will
|
||||
# decompress, but avoid needing to compress it again):
|
||||
meta, data = other_manifest.repo_objs.parse(
|
||||
chunk_id,
|
||||
cdata,
|
||||
decompress=True,
|
||||
want_compressed=True,
|
||||
ro_type=ROBJ_FILE_STREAM,
|
||||
)
|
||||
meta, data = upgrader.upgrade_compressed_chunk(meta, data)
|
||||
chunk_entry = cache.add_chunk(
|
||||
chunk_id,
|
||||
meta,
|
||||
data,
|
||||
stats=archive.stats,
|
||||
wait=False,
|
||||
compress=False,
|
||||
size=size,
|
||||
ctype=meta["ctype"],
|
||||
clevel=meta["clevel"],
|
||||
ro_type=ROBJ_FILE_STREAM,
|
||||
)
|
||||
elif args.recompress == "always":
|
||||
# always decompress and re-compress file data chunks
|
||||
meta, data = other_manifest.repo_objs.parse(
|
||||
chunk_id, cdata, ro_type=ROBJ_FILE_STREAM
|
||||
)
|
||||
chunk_entry = cache.add_chunk(
|
||||
chunk_id,
|
||||
meta,
|
||||
data,
|
||||
stats=archive.stats,
|
||||
wait=False,
|
||||
ro_type=ROBJ_FILE_STREAM,
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"unsupported recompress mode: {args.recompress}")
|
||||
cache.repository.async_response(wait=False)
|
||||
chunks.append(chunk_entry)
|
||||
transfer_size += size
|
||||
else:
|
||||
if not dry_run:
|
||||
chunk_entry = cache.reuse_chunk(chunk_id, size, archive.stats)
|
||||
chunks.append(chunk_entry)
|
||||
present_size += size
|
||||
chunks, transfer, present = transfer_chunks(
|
||||
upgrader,
|
||||
other_repository,
|
||||
other_manifest,
|
||||
other_chunks,
|
||||
archive,
|
||||
cache,
|
||||
args.recompress,
|
||||
dry_run,
|
||||
args.chunker_params,
|
||||
)
|
||||
if not dry_run:
|
||||
item.chunks = chunks
|
||||
archive.stats.nfiles += 1
|
||||
transfer_size += transfer
|
||||
present_size += present
|
||||
if not dry_run:
|
||||
item = upgrader.upgrade_item(item=item)
|
||||
archive.add_item(item, show_progress=args.progress)
|
||||
|
|
@ -213,6 +266,7 @@ class TransferMixIn:
|
|||
This command transfers archives from one repository to another repository.
|
||||
Optionally, it can also upgrade the transferred data.
|
||||
Optionally, it can also recompress the transferred data.
|
||||
Optionally, it can also re-chunk the transferred data using different chunker parameters.
|
||||
|
||||
It is easiest (and fastest) to give ``--compression=COMPRESSION --recompress=never`` using
|
||||
the same COMPRESSION mode as in the SRC_REPO - borg will use that COMPRESSION for metadata (in
|
||||
|
|
@ -258,6 +312,10 @@ class TransferMixIn:
|
|||
borg --repo=DST_REPO transfer --other-repo=SRC_REPO --from-borg1 \\
|
||||
--compress=zstd,3 --recompress=always
|
||||
|
||||
# to re-chunk using different chunker parameters:
|
||||
borg --repo=DST_REPO transfer --other-repo=SRC_REPO \\
|
||||
--chunker-params=buzhash,19,23,21,4095
|
||||
|
||||
|
||||
"""
|
||||
)
|
||||
|
|
@ -321,5 +379,16 @@ class TransferMixIn:
|
|||
"If no MODE is given, `always` will be used. "
|
||||
'Not passing --recompress is equivalent to "--recompress never".',
|
||||
)
|
||||
subparser.add_argument(
|
||||
"--chunker-params",
|
||||
metavar="PARAMS",
|
||||
dest="chunker_params",
|
||||
type=ChunkerParams,
|
||||
default=None,
|
||||
action=Highlander,
|
||||
help="rechunk using given chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, "
|
||||
"HASH_MASK_BITS, HASH_WINDOW_SIZE) or `default` to use the chunker defaults. "
|
||||
"default: do not rechunk",
|
||||
)
|
||||
|
||||
define_archive_filters_group(subparser)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import stat
|
||||
import tarfile
|
||||
|
|
@ -8,12 +10,13 @@ from contextlib import contextmanager
|
|||
import pytest
|
||||
|
||||
from ...constants import * # NOQA
|
||||
from ...helpers import open_item
|
||||
from ...helpers.time import parse_timestamp
|
||||
from ...helpers.parseformat import parse_file_size
|
||||
from ...helpers.parseformat import parse_file_size, ChunkerParams
|
||||
from ..platform_test import is_win32
|
||||
from . import cmd, create_test_files, RK_ENCRYPTION, open_archive, generate_archiver_tests
|
||||
from . import cmd, create_regular_file, create_test_files, RK_ENCRYPTION, open_archive, generate_archiver_tests
|
||||
|
||||
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary") # NOQA
|
||||
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote") # NOQA
|
||||
|
||||
|
||||
def test_transfer_upgrade(archivers, request, monkeypatch):
|
||||
|
|
@ -285,9 +288,11 @@ def setup_repos(archiver, mp):
|
|||
when the context manager is exited, archiver will work with REPO2 (so the transfer can be run).
|
||||
"""
|
||||
original_location = archiver.repository_location
|
||||
original_path = archiver.repository_path
|
||||
|
||||
mp.setenv("BORG_PASSPHRASE", "pw1")
|
||||
archiver.repository_location = original_location + "1"
|
||||
archiver.repository_path = original_path + "1"
|
||||
cmd(archiver, "repo-create", RK_ENCRYPTION)
|
||||
|
||||
other_repo1 = f"--other-repo={original_location}1"
|
||||
|
|
@ -296,6 +301,7 @@ def setup_repos(archiver, mp):
|
|||
mp.setenv("BORG_PASSPHRASE", "pw2")
|
||||
mp.setenv("BORG_OTHER_PASSPHRASE", "pw1")
|
||||
archiver.repository_location = original_location + "2"
|
||||
archiver.repository_path = original_path + "2"
|
||||
cmd(archiver, "repo-create", RK_ENCRYPTION, other_repo1)
|
||||
|
||||
|
||||
|
|
@ -400,3 +406,66 @@ def test_transfer_recompress(archivers, request, monkeypatch, recompress_mode):
|
|||
# We allow a small percentage difference to account for metadata changes.
|
||||
size_diff_percent = abs(source_size - dest_size) / source_size * 100
|
||||
assert size_diff_percent < 5, f"dest_size ({dest_size}) should be similar as source_size ({source_size})."
|
||||
|
||||
|
||||
def test_transfer_rechunk(archivers, request, monkeypatch):
|
||||
"""Test transfer with re-chunking"""
|
||||
archiver = request.getfixturevalue(archivers)
|
||||
|
||||
BLKSIZE = 4096
|
||||
source_chunker_params = "buzhash,19,23,21,4095" # default buzhash chunks
|
||||
dest_chunker_params = f"fixed,{BLKSIZE}" # fixed chunk size
|
||||
|
||||
with setup_repos(archiver, monkeypatch) as other_repo1:
|
||||
contents_1 = random.randbytes(1 * BLKSIZE)
|
||||
contents_255 = random.randbytes(255 * BLKSIZE)
|
||||
contents_1024 = random.randbytes(1024 * BLKSIZE)
|
||||
create_regular_file(archiver.input_path, "file_1", contents=contents_1)
|
||||
create_regular_file(archiver.input_path, "file_256", contents=contents_255 + contents_1)
|
||||
create_regular_file(archiver.input_path, "file_1280", contents=contents_1024 + contents_255 + contents_1)
|
||||
|
||||
cmd(archiver, "create", f"--chunker-params={source_chunker_params}", "archive", "input")
|
||||
|
||||
# Get metadata from source archive
|
||||
source_info_json = cmd(archiver, "info", "--json", "archive")
|
||||
source_info = json.loads(source_info_json)
|
||||
source_archive = source_info["archives"][0]
|
||||
source_chunker_params_info = source_archive["chunker_params"]
|
||||
|
||||
# Calculate SHA256 hashes of file contents from source archive
|
||||
source_archive_obj, source_repo = open_archive(archiver.repository_path, "archive")
|
||||
with source_repo:
|
||||
source_file_hashes = {}
|
||||
for item in source_archive_obj.iter_items():
|
||||
if hasattr(item, "chunks"): # Only process regular files with chunks
|
||||
f = open_item(source_archive_obj, item)
|
||||
content = f.read(10 * 1024 * 1024) # Read up to 10 MB
|
||||
source_file_hashes[item.path] = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Transfer with rechunking
|
||||
cmd(archiver, "transfer", other_repo1, f"--chunker-params={dest_chunker_params}")
|
||||
|
||||
# Get metadata from destination archive
|
||||
dest_info_json = cmd(archiver, "info", "--json", "archive")
|
||||
dest_info = json.loads(dest_info_json)
|
||||
dest_archive = dest_info["archives"][0]
|
||||
dest_chunker_params_info = dest_archive["chunker_params"]
|
||||
|
||||
# chunker params in metadata must reflect the chunker params given on the CLI
|
||||
assert tuple(source_chunker_params_info) == ChunkerParams(source_chunker_params)
|
||||
assert tuple(dest_chunker_params_info) == ChunkerParams(dest_chunker_params)
|
||||
|
||||
# Compare file hashes between source and destination archives, also check expected chunk counts.
|
||||
dest_archive_obj, dest_repo = open_archive(archiver.repository_path, "archive")
|
||||
with dest_repo:
|
||||
for item in dest_archive_obj.iter_items():
|
||||
if hasattr(item, "chunks"): # Only process regular files with chunks
|
||||
# Verify expected chunk count for each file
|
||||
expected_chunk_count = {"input/file_1": 1, "input/file_256": 256, "input/file_1280": 1280}[item.path]
|
||||
assert len(item.chunks) == expected_chunk_count
|
||||
f = open_item(dest_archive_obj, item)
|
||||
content = f.read(10 * 1024 * 1024) # Read up to 10 MB
|
||||
dest_hash = hashlib.sha256(content).hexdigest()
|
||||
# Verify that the file hash is identical to the source
|
||||
assert item.path in source_file_hashes, f"File {item.path} not found in source archive"
|
||||
assert dest_hash == source_file_hashes[item.path], f"Content hash mismatch for {item.path}"
|
||||
|
|
|
|||
|
|
@ -10,8 +10,8 @@ logger = create_logger(__name__)
|
|||
|
||||
|
||||
class UpgraderNoOp:
|
||||
def __init__(self, *, cache):
|
||||
pass
|
||||
def __init__(self, *, cache, args):
|
||||
self.args = args
|
||||
|
||||
def new_archive(self, *, archive):
|
||||
pass
|
||||
|
|
@ -37,14 +37,19 @@ class UpgraderNoOp:
|
|||
):
|
||||
if hasattr(metadata, attr):
|
||||
new_metadata[attr] = getattr(metadata, attr)
|
||||
rechunking = self.args.chunker_params is not None
|
||||
if rechunking:
|
||||
# if we are rechunking while transferring, we take the new chunker_params.
|
||||
new_metadata["chunker_params"] = self.args.chunker_params
|
||||
return new_metadata
|
||||
|
||||
|
||||
class UpgraderFrom12To20:
|
||||
borg1_header_fmt = Struct(">I")
|
||||
|
||||
def __init__(self, *, cache):
|
||||
def __init__(self, *, cache, args):
|
||||
self.cache = cache
|
||||
self.args = args
|
||||
|
||||
def new_archive(self, *, archive):
|
||||
self.archive = archive
|
||||
|
|
@ -144,10 +149,15 @@ class UpgraderFrom12To20:
|
|||
for attr in ("hostname", "username", "comment", "chunker_params"):
|
||||
if hasattr(metadata, attr):
|
||||
new_metadata[attr] = getattr(metadata, attr)
|
||||
if chunker_params := new_metadata.get("chunker_params"):
|
||||
if len(chunker_params) == 4 and isinstance(chunker_params[0], int):
|
||||
# this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
|
||||
new_metadata["chunker_params"] = (CH_BUZHASH,) + chunker_params
|
||||
rechunking = self.args.chunker_params is not None
|
||||
if rechunking:
|
||||
# if we are rechunking while transferring, we take the new chunker_params.
|
||||
new_metadata["chunker_params"] = self.args.chunker_params
|
||||
else:
|
||||
if chunker_params := new_metadata.get("chunker_params"):
|
||||
if len(chunker_params) == 4 and isinstance(chunker_params[0], int):
|
||||
# this is a borg < 1.2 chunker_params tuple, no chunker algo specified, but we only had buzhash:
|
||||
new_metadata["chunker_params"] = (CH_BUZHASH,) + chunker_params
|
||||
# old borg used UTC timestamps, but did not have the explicit tz offset in them.
|
||||
for attr in ("time", "time_end"):
|
||||
if hasattr(metadata, attr):
|
||||
|
|
|
|||
Loading…
Reference in a new issue