diff --git a/borg/archive.py b/borg/archive.py index 5ece57240..ad22d3d17 100644 --- a/borg/archive.py +++ b/borg/archive.py @@ -26,10 +26,10 @@ import msgpack ITEMS_BUFFER = 1024 * 1024 -CHUNK_MIN_EXP = 10 # 2**10 == 1kiB +CHUNK_MIN_EXP = 19 # 2**19 == 512kiB CHUNK_MAX_EXP = 23 # 2**23 == 8MiB HASH_WINDOW_SIZE = 0xfff # 4095B -HASH_MASK_BITS = 16 # results in ~64kiB chunks statistically +HASH_MASK_BITS = 21 # results in ~2MiB chunks statistically # defaults, use --chunker-params to override CHUNKER_PARAMS = (CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) diff --git a/docs/internals.rst b/docs/internals.rst index 246ffca9f..21ae24452 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -210,9 +210,9 @@ producing chunks of 2^HASH_MASK_BITS Bytes on average. ``borg create --chunker-params CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE`` can be used to tune the chunker parameters, the default is: -- CHUNK_MIN_EXP = 10 (minimum chunk size = 2^10 B = 1 kiB) +- CHUNK_MIN_EXP = 19 (minimum chunk size = 2^19 B = 512 kiB) - CHUNK_MAX_EXP = 23 (maximum chunk size = 2^23 B = 8 MiB) -- HASH_MASK_BITS = 16 (statistical medium chunk size ~= 2^16 B = 64 kiB) +- HASH_MASK_BITS = 21 (statistical medium chunk size ~= 2^21 B = 2 MiB) - HASH_WINDOW_SIZE = 4095 [B] (`0xFFF`) The buzhash table is altered by XORing it with a seed randomly generated once @@ -313,13 +313,13 @@ If a remote repository is used the repo index will be allocated on the remote si E.g. backing up a total count of 1 Mi (IEC binary prefix e.g. 2^20) files with a total size of 1TiB. -a) with create ``--chunker-params 10,23,16,4095`` (default): +a) with ``create --chunker-params 10,23,16,4095`` (custom, like borg < 1.0 or attic): mem_usage = 2.8GiB -b) with create ``--chunker-params 10,23,20,4095`` (custom): +b) with ``create --chunker-params 19,23,21,4095`` (default): - mem_usage = 0.4GiB + mem_usage = 0.31GiB .. 
note:: There is also the ``--no-files-cache`` option to switch off the files cache. You'll save some memory, but it will need to read / chunk all the files as diff --git a/docs/misc/create_chunker-params.txt b/docs/misc/create_chunker-params.txt index 73cac6a3b..3e322b660 100644 --- a/docs/misc/create_chunker-params.txt +++ b/docs/misc/create_chunker-params.txt @@ -6,7 +6,7 @@ About borg create --chunker-params CHUNK_MIN_EXP and CHUNK_MAX_EXP give the exponent N of the 2^N minimum and maximum chunk size. Required: CHUNK_MIN_EXP < CHUNK_MAX_EXP. -Defaults: 10 (2^10 == 1KiB) minimum, 23 (2^23 == 8MiB) maximum. +Defaults: 19 (2^19 == 512KiB) minimum, 23 (2^23 == 8MiB) maximum. HASH_MASK_BITS is the number of least-significant bits of the rolling hash that need to be zero to trigger a chunk cut. @@ -14,7 +14,7 @@ Recommended: CHUNK_MIN_EXP + X <= HASH_MASK_BITS <= CHUNK_MAX_EXP - X, X >= 2 (this allows the rolling hash some freedom to make its cut at a place determined by the windows contents rather than the min/max. chunk size). -Default: 16 (statistically, chunks will be about 2^16 == 64kiB in size) +Default: 21 (statistically, chunks will be about 2^21 == 2MiB in size) HASH_WINDOW_SIZE: the size of the window used for the rolling hash computation. 
Default: 4095B diff --git a/docs/usage.rst b/docs/usage.rst index 4abd44b26..d6aad6a87 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -249,8 +249,10 @@ Examples NAME="root-`date +%Y-%m-%d`" $ borg create -C zlib,6 /mnt/backup::$NAME / --do-not-cross-mountpoints - # Backup huge files with little chunk management overhead - $ borg create --chunker-params 19,23,21,4095 /mnt/backup::VMs /srv/VMs + # Make a big effort in fine-grained deduplication (big chunk management + # overhead, needs a lot of RAM and disk space, see formula in internals + # docs - same parameters as borg < 1.0 or attic): + $ borg create --chunker-params 10,23,16,4095 /mnt/backup::small /smallstuff # Backup a raw device (must not be active/in use/mounted at that time) $ dd if=/dev/sda bs=10M | borg create /mnt/backup::my-sda - @@ -506,15 +508,15 @@ resource usage (RAM and disk space) as the amount of resources needed is (also) determined by the total amount of chunks in the repository (see `Indexes / Caches memory usage` for details). -``--chunker-params=10,23,16,4095 (default)`` results in a fine-grained deduplication -and creates a big amount of chunks and thus uses a lot of resources to manage them. -This is good for relatively small data volumes and if the machine has a good -amount of free RAM and disk space. +``--chunker-params=10,23,16,4095`` results in a fine-grained deduplication +and creates a large number of chunks and thus uses a lot of resources to manage +them. This is good for relatively small data volumes and if the machine has a +good amount of free RAM and disk space. -``--chunker-params=19,23,21,4095`` results in a coarse-grained deduplication and -creates a much smaller amount of chunks and thus uses less resources. -This is good for relatively big data volumes and if the machine has a relatively -low amount of free RAM and disk space. 
+``--chunker-params=19,23,21,4095`` (default) results in a coarse-grained +deduplication and creates a much smaller number of chunks and thus uses fewer +resources. This is good for relatively big data volumes and if the machine has +a relatively low amount of free RAM and disk space. If you already have made some archives in a repository and you then change chunker params, this of course impacts deduplication as the chunks will be