package main import ( "context" "crypto/sha256" "encoding/json" "fmt" "os" "path/filepath" "github.com/restic/restic/internal/restic" "github.com/spf13/cobra" ) var cmdStats = &cobra.Command{ Use: "stats", Short: "Scan the repository and show basic statistics", Long: ` The "stats" command walks one or all snapshots in a repository and accumulates statistics about the data stored therein. It reports on the number of unique files and their sizes, according to one of the counting modes as given by a flag. `, DisableAutoGenTag: true, RunE: func(cmd *cobra.Command, args []string) error { return runStats(globalOptions, args) }, } var countModeFlag []string func init() { cmdRoot.AddCommand(cmdStats) f := cmdStats.Flags() f.BoolVar(&countModeRestoreSize, "count-restore-size", false, "count the size of files that would be restored (default)") f.BoolVar(&countModeUniqueFilesByContent, "count-files-by-contents", false, "count files as unique by their contents") f.BoolVar(&countModeBlobsPerFile, "count-blobs-per-file", false, "count sizes of blobs by filename") f.BoolVar(&countModeRawData, "count-raw-data", false, "count unique blob sizes irrespective of files referencing them") f.StringVar(&snapshotByHost, "host", "", "filter latest snapshot by this hostname") } func runStats(gopts GlobalOptions, args []string) error { err := verifyStatsInput(gopts, args) if err != nil { return err } ctx, cancel := context.WithCancel(gopts.ctx) defer cancel() repo, err := OpenRepository(gopts) if err != nil { return err } if err = repo.LoadIndex(ctx); err != nil { return err } if !gopts.NoLock { lock, err := lockRepo(repo) defer unlockRepo(lock) if err != nil { return err } } // create a container for the stats (and other needed state) stats := &statsContainer{ uniqueFiles: make(map[fileID]struct{}), idSet: make(restic.IDSet), fileBlobs: make(map[string]restic.IDSet), blobs: restic.NewBlobSet(), blobsSeen: restic.NewBlobSet(), } if snapshotIDString != "" { // scan just a single snapshot var sID restic.ID if snapshotIDString == "latest" { sID, err = restic.FindLatestSnapshot(ctx, repo, []string{}, []restic.TagList{}, snapshotByHost) if err != nil { Exitf(1, "latest snapshot for criteria not found: %v", err) } } else { sID, err = restic.FindSnapshot(repo, snapshotIDString) if err != nil { return err } } snapshot, err := restic.LoadSnapshot(ctx, repo, sID) if err != nil { return err } err = statsWalkSnapshot(ctx, snapshot, repo, stats) } else { // iterate every snapshot in the repo err = repo.List(ctx, restic.SnapshotFile, func(snapshotID restic.ID, size int64) error { snapshot, err := restic.LoadSnapshot(ctx, repo, snapshotID) if err != nil { return fmt.Errorf("Error loading snapshot %s: %v", snapshotID.Str(), err) } return statsWalkSnapshot(ctx, snapshot, repo, stats) }) } if err != nil { return err } if countModeRawData { // the blob handles have been collected, but not yet counted for blobHandle := range stats.blobs { blobSize, found := repo.LookupBlobSize(blobHandle.ID, blobHandle.Type) if !found { return fmt.Errorf("blob %v not found", blobHandle) } stats.TotalSize += uint64(blobSize) stats.TotalBlobCount++ } } if gopts.JSON { err = json.NewEncoder(os.Stdout).Encode(stats) if err != nil { return fmt.Errorf("encoding output: %v", err) } return nil } if stats.TotalBlobCount > 0 { Printf(" Total Blob Count: %d\n", stats.TotalBlobCount) } if stats.TotalFileCount > 0 { Printf(" Total File Count: %d\n", stats.TotalFileCount) } Printf(" Total Size: %-5s\n", formatBytes(stats.TotalSize)) return nil } func statsWalkSnapshot(ctx context.Context, snapshot *restic.Snapshot, repo restic.Repository, stats *statsContainer) error { if snapshot.Tree == nil { return fmt.Errorf("snapshot %s has nil tree", snapshot.ID().Str()) } if countModeRawData { // count just the sizes of unique blobs; we don't need to walk the tree // ourselves in this case, since a nifty function does it for us return restic.FindUsedBlobs(ctx, repo, *snapshot.Tree, stats.blobs, stats.blobsSeen) } err := statsWalkTree(ctx, repo, *snapshot.Tree, stats, string(filepath.Separator)) if err != nil { return fmt.Errorf("walking tree %s: %v", *snapshot.Tree, err) } return nil } func statsWalkTree(ctx context.Context, repo restic.Repository, treeID restic.ID, stats *statsContainer, fpath string) error { // don't visit a tree we've already walked if stats.idSet.Has(treeID) { return nil } stats.idSet.Insert(treeID) tree, err := repo.LoadTree(ctx, treeID) if err != nil { return fmt.Errorf("loading tree: %v", err) } for _, node := range tree.Nodes { if countModeUniqueFilesByContent || countModeBlobsPerFile { // only count this file if we haven't visited it before fid := makeFileIDByContents(node) if _, ok := stats.uniqueFiles[fid]; !ok { // mark the file as visited stats.uniqueFiles[fid] = struct{}{} if countModeUniqueFilesByContent { // simply count the size of each unique file (unique by contents only) stats.TotalSize += node.Size stats.TotalFileCount++ } if countModeBlobsPerFile { // count the size of each unique blob reference, which is // by unique file (unique by contents and file path) for _, blobID := range node.Content { // ensure we have this file (by path) in our map; in this // mode, a file is unique by both contents and path nodePath := filepath.Join(fpath, node.Name) if _, ok := stats.fileBlobs[nodePath]; !ok { stats.fileBlobs[nodePath] = restic.NewIDSet() stats.TotalFileCount++ } if _, ok := stats.fileBlobs[nodePath][blobID]; !ok { // is always a data blob since we're accessing it via a file's Content array blobSize, found := repo.LookupBlobSize(blobID, restic.DataBlob) if !found { return fmt.Errorf("blob %s not found for tree %s", blobID, treeID) } // count the blob's size, then add this blob by this // file (path) so we don't double-count it stats.TotalSize += uint64(blobSize) stats.fileBlobs[nodePath].Insert(blobID) // this mode also counts total unique blob _references_ per file stats.TotalBlobCount++ } } } } } if countModeRestoreSize { // as this is a file in the snapshot, we can simply count its // size without worrying about uniqueness, since duplicate files // will still be restored stats.TotalSize += node.Size stats.TotalFileCount++ } // visit subtrees (i.e. directory contents) if node.Subtree != nil { err = statsWalkTree(ctx, repo, *node.Subtree, stats, filepath.Join(fpath, node.Name)) if err != nil { return err } } } return nil } // makeFileIDByContents returns a hash of the blob IDs of the // node's Content in sequence. func makeFileIDByContents(node *restic.Node) fileID { var bb []byte for _, c := range node.Content { bb = append(bb, []byte(c[:])...) } return sha256.Sum256(bb) } func verifyStatsInput(gopts GlobalOptions, args []string) error { // ensure only one counting mode was specified, for clarity var countModes int if countModeRestoreSize { countModes++ } if countModeUniqueFilesByContent { countModes++ } if countModeBlobsPerFile { countModes++ } if countModeRawData { countModes++ } if countModes > 1 { return fmt.Errorf("only one counting mode may be used") } // set a default count mode if none were specified if countModes == 0 { countModeRestoreSize = true } // ensure one or none snapshots were specified if len(args) > 1 { return fmt.Errorf("only one snapshot may be specified") } // set the snapshot to scan, if one was specified if len(args) == 1 { snapshotIDString = args[0] } return nil } // statsContainer holds information during a walk of a repository // to collect information about it, as well as state needed // for a successful and efficient walk. type statsContainer struct { TotalSize uint64 `json:"total_size"` TotalFileCount uint64 `json:"total_file_count"` TotalBlobCount uint64 `json:"total_blob_count,omitempty"` // idSet marks visited trees, to avoid repeated walks idSet restic.IDSet // uniqueFiles marks visited files according to their // contents (hashed sequence of content blob IDs) uniqueFiles map[fileID]struct{} // fileBlobs maps a file name (path) to the set of // blobs that have been seen as a part of the file fileBlobs map[string]restic.IDSet // blobs and blobsSeen are used to count indiviudal // unique blobs, independent of references to files blobs, blobsSeen restic.BlobSet } // fileID is a 256-bit hash that distinguishes unique files. type fileID [32]byte var ( countModeRestoreSize bool countModeUniqueFilesByContent bool countModeBlobsPerFile bool countModeRawData bool // the snapshot to scan, as given by the user snapshotIDString string // snapshotByHost is the host to filter latest // snapshot by, if given by user snapshotByHost string )