diff --git a/changelog/unreleased/issue-5453 b/changelog/unreleased/issue-5453 new file mode 100644 index 000000000..686474597 --- /dev/null +++ b/changelog/unreleased/issue-5453 @@ -0,0 +1,12 @@ +Enhancement: `restic copy` can now spool packfiles across multiple snapshots + +Previously, `restic copy` saved all newly created packfiles whenever it finished a snapshot, +even when the actual packfile size was quite small. This applied particularly to +incremental backups, where there were only small changes between individual backups. + +When using the new option `--batch`, `restic copy` now creates one large request list +which contains all blobs from all snapshots to be copied and then executes the +copy operation. + +https://github.com/restic/restic/issues/5175 +https://github.com/restic/restic/pull/5464 diff --git a/cmd/restic/cmd_copy.go b/cmd/restic/cmd_copy.go index 498d6f75d..f9a75108b 100644 --- a/cmd/restic/cmd_copy.go +++ b/cmd/restic/cmd_copy.go @@ -3,6 +3,7 @@ package main import ( "context" "fmt" + "slices" "github.com/restic/restic/internal/data" "github.com/restic/restic/internal/debug" @@ -63,13 +64,52 @@ Exit status is 12 if the password is incorrect. type CopyOptions struct { global.SecondaryRepoOptions data.SnapshotFilter + batch bool } func (opts *CopyOptions) AddFlags(f *pflag.FlagSet) { + f.BoolVar(&opts.batch, "batch", false, "batch all snapshots to be copied into one step to optimize use of packfiles") opts.SecondaryRepoOptions.AddFlags(f, "destination", "to copy snapshots from") initMultiSnapshotFilter(f, &opts.SnapshotFilter, true) } +// collectAllSnapshots: select all snapshot trees to be copied +func collectAllSnapshots(ctx context.Context, opts CopyOptions, + srcSnapshotLister restic.Lister, srcRepo restic.Repository, + dstSnapshotByOriginal map[restic.ID][]*data.Snapshot, args []string, printer progress.Printer) (selectedSnapshots []*data.Snapshot) { + + selectedSnapshots = make([]*data.Snapshot, 0, 10) + for sn := range FindFilteredSnapshots(ctx, srcSnapshotLister, srcRepo, &opts.SnapshotFilter, args, printer) { + // check whether the destination has a snapshot with the same persistent ID which has similar snapshot fields + srcOriginal := *sn.ID() + if sn.Original != nil { + srcOriginal = *sn.Original + } + + if originalSns, ok := dstSnapshotByOriginal[srcOriginal]; ok { + isCopy := false + for _, originalSn := range originalSns { + if similarSnapshots(originalSn, sn) { + printer.V("\n%v\n", sn) + printer.V("skipping source snapshot %s, was already copied to snapshot %s\n", sn.ID().Str(), originalSn.ID().Str()) + isCopy = true + break + } + } + if isCopy { + continue + } + } + selectedSnapshots = append(selectedSnapshots, sn) + } + + slices.SortStableFunc(selectedSnapshots, func(a, b *data.Snapshot) int { + return a.Time.Compare(b.Time) + }) + + return selectedSnapshots +} + func runCopy(ctx context.Context, opts CopyOptions, gopts global.Options, args []string, term ui.Terminal) error { printer := ui.NewProgressPrinter(false, gopts.Verbosity, term) secondaryGopts, isFromRepo, err := opts.SecondaryRepoOptions.FillGlobalOpts(ctx, gopts, "destination") @@ -124,10 +164,11 @@ func runCopy(ctx context.Context, opts CopyOptions, gopts global.Options, args [ return ctx.Err() } + selectedSnapshots := collectAllSnapshots(ctx, opts, srcSnapshotLister, srcRepo, dstSnapshotByOriginal, args, printer) + // remember already processed trees across all snapshots visitedTrees := restic.NewIDSet() - - for sn := range FindFilteredSnapshots(ctx, srcSnapshotLister, srcRepo, 
&opts.SnapshotFilter, args, printer) { + for _, sn := range selectedSnapshots { // check whether the destination has a snapshot with the same persistent ID which has similar snapshot fields srcOriginal := *sn.ID() if sn.Original != nil { @@ -148,25 +189,12 @@ func runCopy(ctx context.Context, opts CopyOptions, gopts global.Options, args [ continue } } - printer.P("\n%v", sn) - printer.P(" copy started, this may take a while...") - if err := copyTree(ctx, srcRepo, dstRepo, visitedTrees, *sn.Tree, printer); err != nil { - return err - } - debug.Log("tree copied") - - // save snapshot - sn.Parent = nil // Parent does not have relevance in the new repo. - // Use Original as a persistent snapshot ID - if sn.Original == nil { - sn.Original = sn.ID() - } - newID, err := data.SaveSnapshot(ctx, dstRepo, sn) - if err != nil { - return err - } - printer.P("snapshot %s saved", newID.Str()) } + + if err := copyTreeBatched(ctx, srcRepo, dstRepo, visitedTrees, selectedSnapshots, opts, printer); err != nil { + return err + } + return ctx.Err() } @@ -190,7 +218,7 @@ func similarSnapshots(sna *data.Snapshot, snb *data.Snapshot) bool { } func copyTree(ctx context.Context, srcRepo restic.Repository, dstRepo restic.Repository, - visitedTrees restic.IDSet, rootTreeID restic.ID, printer progress.Printer) error { + visitedTrees restic.IDSet, rootTreeID restic.ID, printer progress.Printer, uploader restic.BlobSaver, seenBlobs restic.IDSet) error { wg, wgCtx := errgroup.WithContext(ctx) @@ -204,11 +232,15 @@ func copyTree(ctx context.Context, srcRepo restic.Repository, dstRepo restic.Rep packList := restic.NewIDSet() enqueue := func(h restic.BlobHandle) { + if seenBlobs.Has(h.ID) { + return + } pb := srcRepo.LookupBlob(h.Type, h.ID) copyBlobs.Insert(h) for _, p := range pb { packList.Insert(p.PackID) } + seenBlobs.Insert(h.ID) } wg.Go(func() error { @@ -244,7 +276,9 @@ func copyTree(ctx context.Context, srcRepo restic.Repository, dstRepo restic.Rep copyStats(srcRepo, copyBlobs, packList, printer) bar := printer.NewCounter("packs copied") - err = repository.Repack(ctx, srcRepo, dstRepo, packList, copyBlobs, bar, printer.P) + bar.SetMax(uint64(len(packList))) + err = repository.CopyBlobs(ctx, srcRepo, dstRepo, uploader, packList, copyBlobs, bar, printer.P) + bar.Done() if err != nil { return errors.Fatalf("%s", err) } @@ -268,3 +302,81 @@ func copyStats(srcRepo restic.Repository, copyBlobs restic.BlobSet, packList res printer.V(" copy %d blobs with disk size %s in %d packfiles\n", countBlobs, ui.FormatBytes(uint64(sizeBlobs)), len(packList)) } + +func copySaveSnapshot(ctx context.Context, sn *data.Snapshot, dstRepo restic.Repository, printer progress.Printer) error { + sn.Parent = nil // Parent does not have relevance in the new repo. + // Use Original as a persistent snapshot ID + if sn.Original == nil { + sn.Original = sn.ID() + } + newID, err := data.SaveSnapshot(ctx, dstRepo, sn) + if err != nil { + return err + } + printer.P("snapshot %s saved", newID.Str()) + return nil +} + +// copyTreeBatched: copy multiple snapshot trees in one go, using calls to +// repository.CopyBlobs() for all selected snapshot trees, thereby packing the packfiles optimally. +// Without batching, each snapshot creates at least one tree packfile and one data packfile. 
+func copyTreeBatched(ctx context.Context, srcRepo restic.Repository, dstRepo restic.Repository, + visitedTrees restic.IDSet, selectedSnapshots []*data.Snapshot, opts CopyOptions, + printer progress.Printer) error { + + // seenBlobs is only needed for about 1 in 10000 blobs; for the other 99.99% the check + // dstRepo.LookupBlobSize() is sufficient + seenBlobs := restic.NewIDSet() + // depending on opts.batch, the blob uploader is started either once for + // each snapshot to be copied or once for all snapshots + + if opts.batch { + // call WithBlobUploader() once and then loop over all selectedSnapshots + err := dstRepo.WithBlobUploader(ctx, func(ctx context.Context, uploader restic.BlobSaver) error { + for _, sn := range selectedSnapshots { + printer.P("\n%v", sn) + printer.P(" copy started, this may take a while...") + err := copyTree(ctx, srcRepo, dstRepo, visitedTrees, *sn.Tree, printer, uploader, seenBlobs) + if err != nil { + return err + } + debug.Log("tree copied") + } + + // save all the snapshots + for _, sn := range selectedSnapshots { + err := copySaveSnapshot(ctx, sn, dstRepo, printer) + if err != nil { + return err + } + } + return nil + }) + + return err + } + + // without --batch, loop over selectedSnapshots and call WithBlobUploader() + // inside the loop + for _, sn := range selectedSnapshots { + printer.P("\n%v", sn) + printer.P(" copy started, this may take a while...") + err := dstRepo.WithBlobUploader(ctx, func(ctx context.Context, uploader restic.BlobSaver) error { + if err := copyTree(ctx, srcRepo, dstRepo, visitedTrees, *sn.Tree, printer, uploader, seenBlobs); err != nil { + return err + } + debug.Log("tree copied") + return nil + }) + if err != nil { + return err + } + + err = copySaveSnapshot(ctx, sn, dstRepo, printer) + if err != nil { + return err + } + } + + return nil +} diff --git a/cmd/restic/cmd_copy_integration_test.go b/cmd/restic/cmd_copy_integration_test.go index c35e960ff..41b8355da 100644 --- a/cmd/restic/cmd_copy_integration_test.go +++ b/cmd/restic/cmd_copy_integration_test.go @@ -6,8 +6,11 @@ import ( "path/filepath" "testing" + "github.com/restic/restic/internal/data" "github.com/restic/restic/internal/global" + "github.com/restic/restic/internal/restic" rtest "github.com/restic/restic/internal/test" + "github.com/restic/restic/internal/ui" ) func testRunCopy(t testing.TB, srcGopts global.Options, dstGopts global.Options) { @@ -28,6 +31,25 @@ func testRunCopy(t testing.TB, srcGopts global.Options, dstGopts global.Options) })) } +func testRunCopyBatched(t testing.TB, srcGopts global.Options, dstGopts global.Options) { + gopts := srcGopts + gopts.Repo = dstGopts.Repo + gopts.Password = dstGopts.Password + gopts.InsecureNoPassword = dstGopts.InsecureNoPassword + copyOpts := CopyOptions{ + SecondaryRepoOptions: global.SecondaryRepoOptions{ + Repo: srcGopts.Repo, + Password: srcGopts.Password, + InsecureNoPassword: srcGopts.InsecureNoPassword, + }, + batch: true, + } + + rtest.OK(t, withTermStatus(t, gopts, func(ctx context.Context, gopts global.Options) error { + return runCopy(context.TODO(), copyOpts, gopts, nil, gopts.Term) + })) +} + func TestCopy(t *testing.T) { env, cleanup := withTestEnvironment(t) defer cleanup() @@ -85,6 +107,178 @@ func TestCopy(t *testing.T) { rtest.Assert(t, len(origRestores) == 0, "found not copied snapshots") } +// packInfo describes a packfile: the blob type it contains, its size and its number of blobs +type packInfo struct { + Type string + size int64 + numberBlobs int +} + +// testGetUsedBlobs: call data.FindUsedBlobs for all snapshots in the repository +func 
testGetUsedBlobs(t *testing.T, repo restic.Repository) (usedBlobs restic.BlobSet) { + selectedTrees := make([]restic.ID, 0, 3) + usedBlobs = restic.NewBlobSet() + + snapshotLister, err := restic.MemorizeList(context.TODO(), repo, restic.SnapshotFile) + rtest.OK(t, err) + rtest.OK(t, repo.LoadIndex(context.TODO(), nil)) + + // gather all snapshots + nullFilter := &data.SnapshotFilter{} + err = nullFilter.FindAll(context.TODO(), snapshotLister, repo, nil, func(_ string, sn *data.Snapshot, err error) error { + rtest.OK(t, err) + selectedTrees = append(selectedTrees, *sn.Tree) + return nil + }) + rtest.OK(t, err) + + rtest.OK(t, data.FindUsedBlobs(context.TODO(), repo, selectedTrees, usedBlobs, nil)) + + return usedBlobs +} + +// getPackfileInfo: get packfiles, their length, type and number of blobs in packfile +func getPackfileInfo(t *testing.T, repo restic.Repository) (packfiles map[restic.ID]packInfo) { + packfiles = make(map[restic.ID]packInfo) + + rtest.OK(t, repo.List(context.TODO(), restic.PackFile, func(id restic.ID, size int64) error { + blobs, _, err := repo.ListPack(context.TODO(), id, size) + rtest.OK(t, err) + rtest.Assert(t, len(blobs) > 0, "a packfile should contain at least one blob") + + Type := "" + if len(blobs) > 0 { + Type = blobs[0].Type.String() + } + + packfiles[id] = packInfo{ + Type: Type, + size: size, + numberBlobs: len(blobs), + } + + return nil + })) + + return packfiles +} + +// get various counts from the packfiles in the repository +func getCounts(t *testing.T, repo restic.Repository) (int, int, int) { + countTreePacks := 0 + countDataPacks := 0 + countBlobs := 0 + for _, item := range getPackfileInfo(t, repo) { + switch item.Type { + case "tree": + countTreePacks++ + case "data": + countDataPacks++ + } + countBlobs += item.numberBlobs + } + + return countTreePacks, countDataPacks, countBlobs +} + +func TestCopyBatched(t *testing.T) { + env, cleanup := withTestEnvironment(t) + defer cleanup() + env3, cleanup3 := withTestEnvironment(t) + defer cleanup3() + + testSetupBackupData(t, env) + opts := BackupOptions{} + testRunBackup(t, "", []string{filepath.Join(env.testdata, "0", "0", "9")}, opts, env.gopts) + testRunBackup(t, "", []string{filepath.Join(env.testdata, "0", "0", "9", "2")}, opts, env.gopts) + testRunBackup(t, "", []string{filepath.Join(env.testdata, "0", "0", "9", "3")}, opts, env.gopts) + testRunCheck(t, env.gopts) + + // batch copy + testRunInit(t, env3.gopts) + testRunCopyBatched(t, env.gopts, env3.gopts) + + // check integrity of the copy + testRunCheck(t, env3.gopts) + + snapshotIDs := testListSnapshots(t, env.gopts, 3) + copiedSnapshotIDs := testListSnapshots(t, env3.gopts, 3) + + // check that the copied snapshots have the same tree contents as the old ones (= identical tree hash) + origRestores := make(map[string]struct{}) + for i, snapshotID := range snapshotIDs { + restoredir := filepath.Join(env.base, fmt.Sprintf("restore%d", i)) + origRestores[restoredir] = struct{}{} + testRunRestore(t, env.gopts, restoredir, snapshotID.String()) + } + + for i, snapshotID := range copiedSnapshotIDs { + restoredir := filepath.Join(env3.base, fmt.Sprintf("restore%d", i)) + testRunRestore(t, env3.gopts, restoredir, snapshotID.String()) + foundMatch := false + for cmpdir := range origRestores { + diff := directoriesContentsDiff(t, restoredir, cmpdir) + if diff == "" { + delete(origRestores, cmpdir) + foundMatch = true + } + } + + rtest.Assert(t, foundMatch, "found no counterpart for snapshot %v", snapshotID) + } + + rtest.Assert(t, len(origRestores) == 0, 
"found not copied snapshots") + + // get access to the repositories + var repo1 restic.Repository + var unlock1 func() + var err error + rtest.OK(t, withTermStatus(t, env.gopts, func(ctx context.Context, gopts global.Options) error { + printer := ui.NewProgressPrinter(gopts.JSON, gopts.Verbosity, gopts.Term) + _, repo1, unlock1, err = openWithReadLock(ctx, gopts, false, printer) + rtest.OK(t, err) + defer unlock1() + return err + })) + + var repo3 restic.Repository + var unlock3 func() + rtest.OK(t, withTermStatus(t, env3.gopts, func(ctx context.Context, gopts global.Options) error { + printer := ui.NewProgressPrinter(gopts.JSON, gopts.Verbosity, gopts.Term) + _, repo3, unlock3, err = openWithReadLock(ctx, gopts, false, printer) + rtest.OK(t, err) + defer unlock3() + return err + })) + + usedBlobs1 := testGetUsedBlobs(t, repo1) + usedBlobs3 := testGetUsedBlobs(t, repo3) + rtest.Assert(t, len(usedBlobs1) == len(usedBlobs3), + "used blob length must be identical in both repositories, but is not: (normal) %d <=> (batched) %d", + len(usedBlobs1), len(usedBlobs3)) + + // compare usedBlobs1 <=> usedBlobs3 + good := true + for bh := range usedBlobs1 { + if !usedBlobs3.Has(bh) { + good = false + break + } + } + rtest.Assert(t, good, "all blobs in both repositories should be equal but they are not") + + _, _, countBlobs1 := getCounts(t, repo1) + countTreePacks3, countDataPacks3, countBlobs3 := getCounts(t, repo3) + + rtest.Assert(t, countBlobs1 == countBlobs3, + "expected 1 blob count in boths repos to be equal, but got %d and %d blobs", + countBlobs1, countBlobs3) + + rtest.Assert(t, countTreePacks3 == 1 && countDataPacks3 == 1, + "expected 1 data packfile and 1 tree packfile, but got %d trees and %d data packfiles", + countTreePacks3, countDataPacks3) +} + func TestCopyIncremental(t *testing.T) { env, cleanup := withTestEnvironment(t) defer cleanup() diff --git a/doc/045_working_with_repos.rst b/doc/045_working_with_repos.rst index 75a7e79f1..10f898444 100644 --- a/doc/045_working_with_repos.rst +++ b/doc/045_working_with_repos.rst @@ -216,6 +216,14 @@ example from a local to a remote repository, you can use the ``copy`` command: snapshot 4e5d5487 of [/home/user/work] at 2020-05-01 22:44:07.012113 +0200 CEST by user@kasimir skipping snapshot 4e5d5487, was already copied to snapshot 50eb62b7 +In case you want to copy a repository which contains many backups with little changes +between ``restic backup`` runs, you can use the option ``--batch`` to make full use of +the ``--pack-size`` option. Newly created packfiles are saved when the ``copy`` +operation for one snapshot finishes. The option ``--batch`` disregards these snapshot boundaries +and creates optimally filled packfiles. You can always always achieve the same effect +by running ``restic prune`` after a ``restic copy`` operation, but this involves the extra +``prune`` step. + The example command copies all snapshots from the source repository ``/srv/restic-repo`` to the destination repository ``/srv/restic-repo-copy``. Snapshots which have previously been copied between repositories will @@ -353,7 +361,7 @@ modifying the repository. Instead restic will only print the actions it would perform. .. note:: The ``rewrite`` command verifies that it does not modify snapshots in - unexpected ways and fails with an ``cannot encode tree at "[...]" without losing information`` + unexpected ways and fails with an ``cannot encode tree at "[...]" without loosing information`` error otherwise. 
This can occur when rewriting a snapshot created by a newer version of restic or some third-party implementation. diff --git a/internal/backend/local/local_windows.go b/internal/backend/local/local_windows.go index fa21d8240..b3677b0ef 100644 --- a/internal/backend/local/local_windows.go +++ b/internal/backend/local/local_windows.go @@ -24,7 +24,7 @@ func removeFile(f string) error { // as Windows won't let you delete a read-only file err := os.Chmod(f, 0666) if err != nil && !os.IsPermission(err) { - return errors.WithStack(err) + return errors.WithStack(err) } return os.Remove(f) diff --git a/internal/repository/repack.go b/internal/repository/repack.go index 730325afd..17f4fbf3f 100644 --- a/internal/repository/repack.go +++ b/internal/repository/repack.go @@ -54,6 +54,26 @@ func Repack( }) } +/* the following code is a terrible hack, but there is currently no other way + of calling the functionality in repack() without a lot of code duplication. + + Repack() is still called from `restic prune` via plan.Execute() inside prune.go +*/ +// CopyBlobs is a wrapper around repack(). The parameter 'uploader' is passed through +// from WithBlobUploader() to CopyBlobs() via cmd/restic/cmd_copy.copyTree(). +func CopyBlobs( + ctx context.Context, + repo restic.Repository, + dstRepo restic.Repository, + uploader restic.BlobSaver, + packs restic.IDSet, + keepBlobs repackBlobSet, + p *progress.Counter, + logf LogFunc, +) error { + return repack(ctx, repo, dstRepo, uploader, packs, keepBlobs, p, logf) +} + func repack( ctx context.Context, repo restic.Repository,
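
To make the motivation behind `--batch` concrete, the following is a minimal, self-contained Go sketch. It is illustrative only and is not restic code: the 16 MiB pack target, the blob size, and the snapshot counts are assumed values, and the toy uploader only models when packfiles get written out, not how restic actually builds them.

package main

import "fmt"

// packUploader models an uploader that buffers blobs into packfiles and
// writes out a packfile whenever the buffer reaches targetPackSize.
// flush() forces out whatever is still buffered, which is what happens when
// an uploader is shut down at a snapshot boundary.
type packUploader struct {
	targetPackSize int
	buffered       int
	packSizes      []int
}

func (u *packUploader) add(blobSize int) {
	u.buffered += blobSize
	if u.buffered >= u.targetPackSize {
		u.packSizes = append(u.packSizes, u.buffered)
		u.buffered = 0
	}
}

func (u *packUploader) flush() {
	if u.buffered > 0 {
		u.packSizes = append(u.packSizes, u.buffered)
		u.buffered = 0
	}
}

func main() {
	const targetPackSize = 16 * 1024 * 1024 // assumed pack target size
	const blobSize = 64 * 1024              // assumed 64 KiB of new data per blob
	const blobsPerSnapshot = 8              // assumed small incremental snapshots
	const snapshots = 100

	// Per-snapshot uploader: flushed after every snapshot, as without --batch.
	perSnapshot := &packUploader{targetPackSize: targetPackSize}
	for s := 0; s < snapshots; s++ {
		for b := 0; b < blobsPerSnapshot; b++ {
			perSnapshot.add(blobSize)
		}
		perSnapshot.flush() // snapshot boundary forces out a small packfile
	}

	// Batched uploader: one uploader spans all snapshots, as with --batch.
	batched := &packUploader{targetPackSize: targetPackSize}
	for s := 0; s < snapshots; s++ {
		for b := 0; b < blobsPerSnapshot; b++ {
			batched.add(blobSize)
		}
	}
	batched.flush()

	fmt.Printf("per-snapshot: %d packfiles\n", len(perSnapshot.packSizes))
	fmt.Printf("batched:      %d packfiles\n", len(batched.packSizes))
}

With these assumed numbers, the per-snapshot variant ends up with 100 packfiles of roughly 512 KiB each, while the batched variant produces 4 packfiles, three of them at the 16 MiB target. The real uploader additionally keeps tree and data blobs in separate packfiles, which is why an unbatched copy tends to produce at least two packfiles per snapshot.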