From 27de1b0a438d0b63299ee6b9aa973d07d5922021 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 1 Aug 2015 15:07:54 +0200 Subject: [PATCH 01/13] add a wrapper around liblz4 --- .gitignore | 1 + borg/compress.pyx | 67 +++++++++++++++++++++++++++++++++++++++++++ docs/global.rst.inc | 1 + docs/installation.rst | 14 +++++++-- setup.py | 8 +++++- 5 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 borg/compress.pyx diff --git a/.gitignore b/.gitignore index f3564a429..f6b10cf78 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ env .tox hashindex.c chunker.c +compress.c crypto.c platform_darwin.c platform_freebsd.c diff --git a/borg/compress.pyx b/borg/compress.pyx new file mode 100644 index 000000000..5bd5fdfcb --- /dev/null +++ b/borg/compress.pyx @@ -0,0 +1,67 @@ +""" +A thin liblz4 wrapper for raw LZ4 compression / decompression. + +Features: + - lz4 is super fast + - wrapper releases CPython's GIL to support multithreaded code + - helper buffer only allocated once at instance creation and then reused + +But beware: + - this is not very generic, you MUST know the maximum uncompressed input + data size you will feed into the compressor / get from the decompressor! + - you must not do method calls to the same LZ4 instance from different + threads at the same time - create one LZ4 instance per thread! + - compress returns raw compressed data without adding any frame metadata + (like checksums, magics, length of data, etc.) 
+ - decompress expects such raw compressed data as input +""" + +from libc.stdlib cimport malloc, free + + +cdef extern from "lz4.h": + int LZ4_compressBound(int inputSize) + int LZ4_compress(const char* source, char* dest, int inputSize) nogil + int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil + + +cdef class LZ4: + cdef char *buffer # helper buffer for (de)compression output + cdef int bufsize # size of this buffer + cdef int max_isize # maximum compressor input size safe for this bufsize + + def __cinit__(self, int max_isize): + self.max_isize = max_isize + # compute worst case bufsize for not compressible data: + self.bufsize = LZ4_compressBound(max_isize) + self.buffer = malloc(self.bufsize) + if not self.buffer: + raise MemoryError + + def __dealloc__(self): + free(self.buffer) + + def compress(self, idata): + cdef int isize = len(idata) + if isize > self.max_isize: + raise Exception('lz4 buffer might be too small, increase max_isize!') + cdef int osize + cdef char *source = idata + cdef char *dest = self.buffer + with nogil: + osize = LZ4_compress(source, dest, isize) + if not osize: + raise Exception('lz4 compress failed') + return dest[:osize] + + def decompress(self, idata): + cdef int isize = len(idata) + cdef int osize = self.bufsize + cdef char *source = idata # <-- does not work for memoryview idata, wants bytes + cdef char *dest = self.buffer + with nogil: + osize = LZ4_decompress_safe(source, dest, isize, osize) + if osize < 0: + # malformed input data, buffer too small, ... + raise Exception('lz4 decompress failed') + return dest[:osize] diff --git a/docs/global.rst.inc b/docs/global.rst.inc index c0629a143..c8c490498 100644 --- a/docs/global.rst.inc +++ b/docs/global.rst.inc @@ -13,6 +13,7 @@ .. _PBKDF2: https://en.wikipedia.org/wiki/PBKDF2 .. _ACL: https://en.wikipedia.org/wiki/Access_control_list .. _libacl: http://savannah.nongnu.org/projects/acl/ +.. _liblz4: https://github.com/Cyan4973/lz4 .. 
_OpenSSL: https://www.openssl.org/ .. _Python: http://www.python.org/ .. _Buzhash: https://en.wikipedia.org/wiki/Buzhash diff --git a/docs/installation.rst b/docs/installation.rst index 90bd33f84..5a027b2c6 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -9,6 +9,7 @@ Installation * Python_ >= 3.2 * OpenSSL_ >= 1.0.0 * libacl_ +* liblz4_ * some python dependencies, see install_requires in setup.py General notes @@ -59,6 +60,9 @@ Some of the steps detailled below might be useful also for non-git installs. # ACL support Headers + Library apt-get install libacl1-dev libacl1 + # lz4 super fast compression support Headers + Library + apt-get install liblz4-dev liblz4-1 + # if you do not have gcc / make / etc. yet apt-get install build-essential @@ -107,13 +111,16 @@ Some of the steps detailled below might be useful also for non-git installs. # ACL support Headers + Library sudo dnf install libacl-devel libacl - + + # lz4 super fast compression support Headers + Library + sudo dnf install lz4 + # optional: lowlevel FUSE py binding - to mount backup archives sudo dnf install python3-llfuse fuse - + # optional: for unit testing sudo dnf install fakeroot - + # get |project_name| from github, install it git clone |git_url| @@ -148,6 +155,7 @@ You'll need at least (use the cygwin installer to fetch/install these): gcc-core git libopenssl + liblz4_1 liblz4-devel # from cygwinports.org make openssh openssl-devel diff --git a/setup.py b/setup.py index edd75dc1a..87de52b71 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ if sys.version_info < min_python: from setuptools import setup, Extension +compress_source = 'borg/compress.pyx' crypto_source = 'borg/crypto.pyx' chunker_source = 'borg/chunker.pyx' hashindex_source = 'borg/hashindex.pyx' @@ -38,6 +39,7 @@ try: def make_distribution(self): self.filelist.extend([ + 'borg/compress.c', 'borg/crypto.c', 'borg/chunker.c', 'borg/_chunker.c', 'borg/hashindex.c', 'borg/_hashindex.c', @@ -52,6 +54,7 @@ except 
ImportError: def __init__(self, *args, **kwargs): raise Exception('Cython is required to run sdist') + compress_source = compress_source.replace('.pyx', '.c') crypto_source = crypto_source.replace('.pyx', '.c') chunker_source = chunker_source.replace('.pyx', '.c') hashindex_source = hashindex_source.replace('.pyx', '.c') @@ -59,7 +62,9 @@ except ImportError: platform_freebsd_source = platform_freebsd_source.replace('.pyx', '.c') platform_darwin_source = platform_darwin_source.replace('.pyx', '.c') from distutils.command.build_ext import build_ext - if not all(os.path.exists(path) for path in [crypto_source, chunker_source, hashindex_source, platform_linux_source, platform_freebsd_source]): + if not all(os.path.exists(path) for path in [ + compress_source, crypto_source, chunker_source, hashindex_source, + platform_linux_source, platform_freebsd_source]): raise ImportError('The GIT version of Borg needs Cython. Install Cython or use a released version') @@ -89,6 +94,7 @@ cmdclass = versioneer.get_cmdclass() cmdclass.update({'build_ext': build_ext, 'sdist': Sdist}) ext_modules = [ + Extension('borg.compress', [compress_source], libraries=['lz4']), Extension('borg.crypto', [crypto_source], libraries=['crypto'], include_dirs=include_dirs, library_dirs=library_dirs), Extension('borg.chunker', [chunker_source]), Extension('borg.hashindex', [hashindex_source]) From 746984c33b6349b051e84c58b667469ffcd903a3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Aug 2015 01:21:41 +0200 Subject: [PATCH 02/13] compress: add tests, zlib and null compression, ID header and autodetection --- borg/compress.pyx | 163 ++++++++++++++++++++++++++++--------- borg/testsuite/compress.py | 81 ++++++++++++++++++ 2 files changed, 207 insertions(+), 37 deletions(-) create mode 100644 borg/testsuite/compress.py diff --git a/borg/compress.pyx b/borg/compress.pyx index 5bd5fdfcb..1ff00305f 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -1,63 +1,91 @@ -""" -A thin liblz4 wrapper 
for raw LZ4 compression / decompression. - -Features: - - lz4 is super fast - - wrapper releases CPython's GIL to support multithreaded code - - helper buffer only allocated once at instance creation and then reused - -But beware: - - this is not very generic, you MUST know the maximum uncompressed input - data size you will feed into the compressor / get from the decompressor! - - you must not do method calls to the same LZ4 instance from different - threads at the same time - create one LZ4 instance per thread! - - compress returns raw compressed data without adding any frame metadata - (like checksums, magics, length of data, etc.) - - decompress expects such raw compressed data as input -""" +import zlib from libc.stdlib cimport malloc, free cdef extern from "lz4.h": - int LZ4_compressBound(int inputSize) - int LZ4_compress(const char* source, char* dest, int inputSize) nogil + int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil -cdef class LZ4: +cdef class CompressorBase: + """ + base class for all (de)compression classes, + also handles compression format auto detection and + adding/stripping the ID header (which enable auto detection). + """ + ID = b'\xFF\xFF' # reserved and not used + # overwrite with a unique 2-bytes bytestring in child classes + name = 'baseclass' + + @classmethod + def detect(cls, data): + return data.startswith(cls.ID) + + def __init__(self, **kwargs): + pass + + def compress(self, data): + # add ID bytes + return self.ID + data + + def decompress(self, data): + # strip ID bytes + return data[2:] + + +class CNULL(CompressorBase): + """ + null compression, just pass through data + """ + ID = b'\x00\x00' + name = 'null' + # base class does all we need + + +cdef class LZ4(CompressorBase): + """ + raw LZ4 compression / decompression (liblz4). 
+ + Features: + - lz4 is super fast + - wrapper releases CPython's GIL to support multithreaded code + - buffer given by caller, avoiding frequent reallocation and buffer duplication + - uses safe lz4 methods that never go beyond the end of the output buffer + + But beware: + - this is not very generic, the given buffer MUST be large enough to + handle all compression or decompression output (or it will fail). + - you must not do method calls to the same LZ4 instance from different + threads at the same time - create one LZ4 instance per thread! + """ + ID = b'\x01\x00' + name = 'lz4' + cdef char *buffer # helper buffer for (de)compression output cdef int bufsize # size of this buffer - cdef int max_isize # maximum compressor input size safe for this bufsize - def __cinit__(self, int max_isize): - self.max_isize = max_isize - # compute worst case bufsize for not compressible data: - self.bufsize = LZ4_compressBound(max_isize) - self.buffer = malloc(self.bufsize) - if not self.buffer: - raise MemoryError - - def __dealloc__(self): - free(self.buffer) + def __cinit__(self, **kwargs): + buffer = kwargs['buffer'] + self.buffer = buffer + self.bufsize = len(buffer) def compress(self, idata): cdef int isize = len(idata) - if isize > self.max_isize: - raise Exception('lz4 buffer might be too small, increase max_isize!') - cdef int osize + cdef int osize = self.bufsize cdef char *source = idata cdef char *dest = self.buffer with nogil: - osize = LZ4_compress(source, dest, isize) + osize = LZ4_compress_limitedOutput(source, dest, isize, osize) if not osize: raise Exception('lz4 compress failed') - return dest[:osize] + return super().compress(dest[:osize]) def decompress(self, idata): + idata = super().decompress(idata) cdef int isize = len(idata) cdef int osize = self.bufsize - cdef char *source = idata # <-- does not work for memoryview idata, wants bytes + cdef char *source = idata cdef char *dest = self.buffer with nogil: osize = LZ4_decompress_safe(source, dest, isize, 
osize) @@ -65,3 +93,64 @@ cdef class LZ4: # malformed input data, buffer too small, ... raise Exception('lz4 decompress failed') return dest[:osize] + + +class ZLIB(CompressorBase): + """ + zlib compression / decompression (python stdlib) + """ + ID = b'\x08\x00' # not used here, see detect() + # avoid all 0x.8.. IDs elsewhere! + name = 'zlib' + + @classmethod + def detect(cls, data): + # matches misc. patterns 0x.8.. used by zlib + cmf, flg = data[:2] + is_deflate = cmf & 0x0f == 8 + check_ok = (cmf * 256 + flg) % 31 == 0 + return check_ok and is_deflate + + def __init__(self, level=6, **kwargs): + super().__init__(**kwargs) + self.level = level + + def compress(self, data): + # note: for compatibility no super call, do not add ID bytes + return zlib.compress(data, self.level) + + def decompress(self, data): + # note: for compatibility no super call, do not strip ID bytes + return zlib.decompress(data) + + +COMPRESSOR_TABLE = { + CNULL.name: CNULL, + LZ4.name: LZ4, + ZLIB.name: ZLIB, +} +COMPRESSOR_LIST = [LZ4, CNULL, ZLIB, ] # check fast stuff first + +def get_compressor(name, **kwargs): + cls = COMPRESSOR_TABLE[name] + return cls(**kwargs) + + +class Compressor: + """ + compresses using a compressor with given name and parameters + decompresses everything we can handle (autodetect) + """ + def __init__(self, name='zlib', **kwargs): + self.params = kwargs + self.compressor = get_compressor(name, **self.params) + + def compress(self, data): + return self.compressor.compress(data) + + def decompress(self, data): + for cls in COMPRESSOR_LIST: + if cls.detect(data): + return cls(**self.params).decompress(data) + else: + raise ValueError('No decompressor for this data found: %r.', data[:2]) diff --git a/borg/testsuite/compress.py b/borg/testsuite/compress.py new file mode 100644 index 000000000..441214e7b --- /dev/null +++ b/borg/testsuite/compress.py @@ -0,0 +1,81 @@ +import zlib + +import pytest + +from ..compress import get_compressor, Compressor, CNULL, ZLIB, LZ4 
+ + +buffer = bytes(2**16) +data = b'fooooooooobaaaaaaaar' +params = dict(name='zlib', level=6, buffer=buffer) + + +def test_get_compressor(): + c = get_compressor(name='null') + assert isinstance(c, CNULL) + c = get_compressor(name='lz4', buffer=buffer) + assert isinstance(c, LZ4) + c = get_compressor(name='zlib') + assert isinstance(c, ZLIB) + with pytest.raises(KeyError): + get_compressor(name='foobar') + + +def test_cnull(): + c = get_compressor(name='null') + cdata = c.compress(data) + assert len(cdata) > len(data) + assert data in cdata # it's not compressed and just in there 1:1 + assert data == c.decompress(cdata) + assert data == Compressor(**params).decompress(cdata) # autodetect + + +def test_lz4(): + c = get_compressor(name='lz4', buffer=buffer) + cdata = c.compress(data) + assert len(cdata) < len(data) + assert data == c.decompress(cdata) + assert data == Compressor(**params).decompress(cdata) # autodetect + + +def test_zlib(): + c = get_compressor(name='zlib') + cdata = c.compress(data) + assert len(cdata) < len(data) + assert data == c.decompress(cdata) + assert data == Compressor(**params).decompress(cdata) # autodetect + + +def test_autodetect_invalid(): + with pytest.raises(ValueError): + Compressor(**params).decompress(b'\xff\xfftotalcrap') + with pytest.raises(ValueError): + Compressor(**params).decompress(b'\x08\x00notreallyzlib') + + +def test_zlib_compat(): + # for compatibility reasons, we do not add an extra header for zlib, + # nor do we expect one when decompressing / autodetecting + for level in range(10): + c = get_compressor(name='zlib', level=level) + cdata1 = c.compress(data) + cdata2 = zlib.compress(data, level) + assert cdata1 == cdata2 + data2 = c.decompress(cdata2) + assert data == data2 + data2 = Compressor(**params).decompress(cdata2) + assert data == data2 + + +def test_compressor(): + for params in [ + dict(name='null', buffer=buffer), + dict(name='lz4', buffer=buffer), + dict(name='zlib', level=0, buffer=buffer), + 
dict(name='zlib', level=6, buffer=buffer), + dict(name='zlib', level=9, buffer=buffer), + ]: + c = Compressor(**params) + assert data == c.decompress(c.compress(data)) + + From 899776620209f3707ee9c640ebec93e224a114bb Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Aug 2015 18:10:30 +0200 Subject: [PATCH 03/13] integrate compress code, new compression spec parser for commandline New null and lz4 compression. Giving -C 0 now uses null compression, not zlib level 0 any more (null has almost zero overhead while zlib-level0 still had to package everything into zlib frames). Giving -C 10 uses new lz4 compression, super fast compression and even faster decompression. See borg create --help (and --compression argument). fix some issues, clean up, optimize: CNULL: always return bytes LZ4: deal with getting memoryviews Compressor: give bytes to detect(), avoid memoryviews for lz4, always use same COMPR_BUFFER, avoid memory management costs. check --chunker-params CHUNK_MAX_EXP upper limit --- borg/archiver.py | 20 +++++++++++++++----- borg/compress.pyx | 27 +++++++++++++++++++++------ borg/helpers.py | 40 ++++++++++++++++++++++++++++++++++++++++ borg/key.py | 12 ++++++------ 4 files changed, 82 insertions(+), 17 deletions(-) diff --git a/borg/archiver.py b/borg/archiver.py index 84e568e73..032313dbf 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -14,6 +14,7 @@ import traceback from . 
import __version__ from .archive import Archive, ArchiveChecker, CHUNKER_PARAMS +from .compress import Compressor, COMPR_BUFFER from .repository import Repository from .cache import Cache from .key import key_creator @@ -21,7 +22,7 @@ from .helpers import Error, location_validator, format_time, format_file_size, \ format_file_mode, ExcludePattern, exclude_path, adjust_patterns, to_localtime, timestamp, \ get_cache_dir, get_keys_dir, format_timedelta, prune_within, prune_split, \ Manifest, remove_surrogates, update_excludes, format_archive, check_extension_modules, Statistics, \ - is_cachedir, bigint_to_int, ChunkerParams + is_cachedir, bigint_to_int, ChunkerParams, CompressionSpec from .remote import RepositoryServer, RemoteRepository @@ -101,7 +102,9 @@ Type "Yes I am sure" if you understand this and want to continue.\n""") t0 = datetime.now() repository = self.open_repository(args.archive, exclusive=True) manifest, key = Manifest.load(repository) - key.compression_level = args.compression + compr_args = dict(buffer=COMPR_BUFFER) + compr_args.update(args.compression) + key.compressor = Compressor(**compr_args) cache = Cache(repository, key, manifest, do_files=args.cache_files) archive = Archive(repository, key, manifest, args.archive.archive, cache=cache, create=True, checkpoint_interval=args.checkpoint_interval, @@ -634,9 +637,16 @@ Type "Yes I am sure" if you understand this and want to continue.\n""") metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE', help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS) subparser.add_argument('-C', '--compression', dest='compression', - type=int, default=0, metavar='N', - help='select compression algorithm and level. 0..9 is supported and means zlib ' - 'level 0 (no compression, fast, default) .. 
zlib level 9 (high compression, slow).') + type=CompressionSpec, default=dict(name='null'), metavar='COMPRESSION', + help='select compression algorithm and level, by giving a number: ' + '0 == no compression [default], ' + '1..9 == zlib level 1..9, ' + '10 == lz4. ' + 'Alternatively, you can also give a name and optionally additional args: ' + 'null == no compression, ' + 'zlib == zlib (default level 6), ' + 'zlib,0 .. zlib,9 == zlib (with level 0..9), ' + 'lz4 == lz4.') subparser.add_argument('archive', metavar='ARCHIVE', type=location_validator(archive=True), help='archive to create') diff --git a/borg/compress.pyx b/borg/compress.pyx index 1ff00305f..03815b3a5 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -1,8 +1,5 @@ import zlib -from libc.stdlib cimport malloc, free - - cdef extern from "lz4.h": int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize) nogil @@ -40,7 +37,15 @@ class CNULL(CompressorBase): """ ID = b'\x00\x00' name = 'null' - # base class does all we need + + def compress(self, data): + return super().compress(data) + + def decompress(self, data): + data = super().decompress(data) + if not isinstance(data, bytes): + data = bytes(data) + return data cdef class LZ4(CompressorBase): @@ -71,6 +76,8 @@ cdef class LZ4(CompressorBase): self.bufsize = len(buffer) def compress(self, idata): + if not isinstance(idata, bytes): + idata = bytes(idata) # code below does not work with memoryview cdef int isize = len(idata) cdef int osize = self.bufsize cdef char *source = idata @@ -82,6 +89,8 @@ cdef class LZ4(CompressorBase): return super().compress(dest[:osize]) def decompress(self, idata): + if not isinstance(idata, bytes): + idata = bytes(idata) # code below does not work with memoryview idata = super().decompress(idata) cdef int isize = len(idata) cdef int osize = self.bufsize @@ -141,7 +150,7 @@ class 
Compressor: compresses using a compressor with given name and parameters decompresses everything we can handle (autodetect) """ - def __init__(self, name='zlib', **kwargs): + def __init__(self, name='null', **kwargs): self.params = kwargs self.compressor = get_compressor(name, **self.params) @@ -149,8 +158,14 @@ class Compressor: return self.compressor.compress(data) def decompress(self, data): + hdr = bytes(data[:2]) # detect() does not work with memoryview for cls in COMPRESSOR_LIST: - if cls.detect(data): + if cls.detect(hdr): return cls(**self.params).decompress(data) else: raise ValueError('No decompressor for this data found: %r.', data[:2]) + + +# a buffer used for (de)compression result, which can be slightly bigger +# than the chunk buffer in the worst (incompressible data) case, add 10%: +COMPR_BUFFER = bytes(int(1.1 * 2 ** 23)) # CHUNK_MAX_EXP == 23 diff --git a/borg/helpers.py b/borg/helpers.py index d20532723..69a6db0db 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -278,9 +278,49 @@ def timestamp(s): def ChunkerParams(s): window_size, chunk_mask, chunk_min, chunk_max = s.split(',') + if int(chunk_max) > 23: + # do not go beyond 2**23 (8MB) chunk size now, + # COMPR_BUFFER can only cope with up to this size + raise ValueError return int(window_size), int(chunk_mask), int(chunk_min), int(chunk_max) +def CompressionSpec(s): + values = s.split(',') + count = len(values) + if count < 1: + raise ValueError + compression = values[0] + try: + compression = int(compression) + if count > 1: + raise ValueError + # it is just --compression N + if compression == 0: + return dict(name='null') + if 1 <= compression <= 9: + return dict(name='zlib', level=compression) + if compression == 10: + return dict(name='lz4') + raise ValueError + except ValueError: + # --compression algo[,...] 
+ name = compression + if name in ('null', 'lz4', ): + return dict(name=name) + if name == 'zlib': + if count < 2: + level = 6 # default compression level in py stdlib + elif count == 2: + level = int(values[1]) + if not 0 <= level <= 9: + raise ValueError + else: + raise ValueError + return dict(name='zlib', level=level) + raise ValueError + + def is_cachedir(path): """Determines whether the specified path is a cache directory (and therefore should potentially be excluded from the backup) according to diff --git a/borg/key.py b/borg/key.py index fabdae5b3..fcf083586 100644 --- a/borg/key.py +++ b/borg/key.py @@ -6,9 +6,9 @@ import msgpack import textwrap import hmac from hashlib import sha256 -import zlib from .crypto import pbkdf2_sha256, get_random_bytes, AES, bytes_to_long, long_to_bytes, bytes_to_int, num_aes_blocks +from .compress import Compressor, COMPR_BUFFER from .helpers import IntegrityError, get_keys_dir, Error PREFIX = b'\0' * 8 @@ -68,7 +68,7 @@ class KeyBase: self.TYPE_STR = bytes([self.TYPE]) self.repository = repository self.target = None # key location file path / repo obj - self.compression_level = 0 + self.compressor = Compressor('null', buffer=COMPR_BUFFER) def id_hash(self, data): """Return HMAC hash using the "id" HMAC key @@ -99,12 +99,12 @@ class PlaintextKey(KeyBase): return sha256(data).digest() def encrypt(self, data): - return b''.join([self.TYPE_STR, zlib.compress(data, self.compression_level)]) + return b''.join([self.TYPE_STR, self.compressor.compress(data)]) def decrypt(self, id, data): if data[0] != self.TYPE: raise IntegrityError('Invalid encryption envelope') - data = zlib.decompress(memoryview(data)[1:]) + data = self.compressor.decompress(memoryview(data)[1:]) if id and sha256(data).digest() != id: raise IntegrityError('Chunk id verification failed') return data @@ -131,7 +131,7 @@ class AESKeyBase(KeyBase): return HMAC(self.id_key, data, sha256).digest() def encrypt(self, data): - data = zlib.compress(data, 
self.compression_level) + data = self.compressor.compress(data) self.enc_cipher.reset() data = b''.join((self.enc_cipher.iv[8:], self.enc_cipher.encrypt(data))) hmac = HMAC(self.enc_hmac_key, data, sha256).digest() @@ -144,7 +144,7 @@ class AESKeyBase(KeyBase): if memoryview(HMAC(self.enc_hmac_key, memoryview(data)[33:], sha256).digest()) != hmac: raise IntegrityError('Encryption envelope checksum mismatch') self.dec_cipher.reset(iv=PREFIX + data[33:41]) - data = zlib.decompress(self.dec_cipher.decrypt(data[41:])) # should use memoryview + data = self.compressor.decompress(self.dec_cipher.decrypt(data[41:])) if id and HMAC(self.id_key, data, sha256).digest() != id: raise IntegrityError('Chunk id verification failed') return data From a15daf3b80b0f1687daebba6c062db9b65c7b202 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Aug 2015 18:21:28 +0200 Subject: [PATCH 04/13] add liblz4-dev to travis installation packages --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 87d3afb02..5497cd096 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ python: - "3.4" # command to install dependencies install: - - "sudo apt-get install -y libacl1-dev" + - "sudo apt-get install -y libacl1-dev liblz4-dev" - "pip install --use-mirrors Cython" - "pip install -e ." # command to run tests From 946507aeaf9de0c9dbd3d1af4f025e23d1cda28b Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 2 Aug 2015 22:24:02 +0200 Subject: [PATCH 05/13] fix travis to install liblz4-dev from ppa it is not available in ubuntu 12.04 by default. 
--- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5497cd096..8d910c0fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,10 @@ python: - "3.4" # command to install dependencies install: - - "sudo apt-get install -y libacl1-dev liblz4-dev" + - "sudo add-apt-repository -y ppa:gezakovacs/lz4" + - "sudo apt-get update" + - "sudo apt-get install -y liblz4-dev" + - "sudo apt-get install -y libacl1-dev" - "pip install --use-mirrors Cython" - "pip install -e ." # command to run tests From 4c0012bddfc91e45167f65293369814511695de9 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 3 Aug 2015 00:31:33 +0200 Subject: [PATCH 06/13] add lzma compression needs python 3.3+, on 3.2 it won't be available. --- borg/archiver.py | 6 ++++-- borg/compress.pyx | 30 +++++++++++++++++++++++++++++- borg/helpers.py | 6 ++++-- borg/testsuite/compress.py | 27 ++++++++++++++++++++++++--- 4 files changed, 61 insertions(+), 8 deletions(-) diff --git a/borg/archiver.py b/borg/archiver.py index 032313dbf..fb6db8a19 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -641,12 +641,14 @@ Type "Yes I am sure" if you understand this and want to continue.\n""") help='select compression algorithm and level, by giving a number: ' '0 == no compression [default], ' '1..9 == zlib level 1..9, ' - '10 == lz4. ' + '10 == lz4, ' + '20-29 == lzma level 0..9.' 'Alternatively, you can also give a name and optionally additional args: ' 'null == no compression, ' 'zlib == zlib (default level 6), ' 'zlib,0 .. zlib,9 == zlib (with level 0..9), ' - 'lz4 == lz4.') + 'lz4 == lz4, ' + 'lzma,0 .. 
lzma,9 == lzma (with level 0..9).') subparser.add_argument('archive', metavar='ARCHIVE', type=location_validator(archive=True), help='archive to create') diff --git a/borg/compress.pyx b/borg/compress.pyx index 03815b3a5..c1bdeff82 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -1,4 +1,8 @@ import zlib +try: + import lzma +except ImportError: + lzma = None cdef extern from "lz4.h": int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil @@ -104,6 +108,29 @@ cdef class LZ4(CompressorBase): return dest[:osize] +class LZMA(CompressorBase): + """ + lzma compression / decompression (python 3.3+ stdlib) + """ + ID = b'\x02\x00' + name = 'lzma' + + def __init__(self, level=6, **kwargs): + super().__init__(**kwargs) + self.level = level + if lzma is None: + raise ValueError('No lzma support found.') + + def compress(self, data): + # we do not need integrity checks in lzma, we do that already + data = lzma.compress(data, preset=self.level, check=lzma.CHECK_NONE) + return super().compress(data) + + def decompress(self, data): + data = super().decompress(data) + return lzma.decompress(data) + + class ZLIB(CompressorBase): """ zlib compression / decompression (python stdlib) @@ -137,8 +164,9 @@ COMPRESSOR_TABLE = { CNULL.name: CNULL, LZ4.name: LZ4, ZLIB.name: ZLIB, + LZMA.name: LZMA, } -COMPRESSOR_LIST = [LZ4, CNULL, ZLIB, ] # check fast stuff first +COMPRESSOR_LIST = [LZ4, CNULL, ZLIB, LZMA, ] # check fast stuff first def get_compressor(name, **kwargs): cls = COMPRESSOR_TABLE[name] diff --git a/borg/helpers.py b/borg/helpers.py index 69a6db0db..020c263e7 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -302,13 +302,15 @@ def CompressionSpec(s): return dict(name='zlib', level=compression) if compression == 10: return dict(name='lz4') + if 20 <= compression <= 29: + return dict(name='lzma', level=compression-20) raise ValueError except ValueError: # --compression algo[,...] 
name = compression if name in ('null', 'lz4', ): return dict(name=name) - if name == 'zlib': + if name in ('zlib', 'lzma', ): if count < 2: level = 6 # default compression level in py stdlib elif count == 2: @@ -317,7 +319,7 @@ def CompressionSpec(s): raise ValueError else: raise ValueError - return dict(name='zlib', level=level) + return dict(name=name, level=level) raise ValueError diff --git a/borg/testsuite/compress.py b/borg/testsuite/compress.py index 441214e7b..6d7319c1b 100644 --- a/borg/testsuite/compress.py +++ b/borg/testsuite/compress.py @@ -1,4 +1,8 @@ import zlib +try: + import lzma +except ImportError: + lzma = None import pytest @@ -6,7 +10,7 @@ from ..compress import get_compressor, Compressor, CNULL, ZLIB, LZ4 buffer = bytes(2**16) -data = b'fooooooooobaaaaaaaar' +data = b'fooooooooobaaaaaaaar' * 10 params = dict(name='zlib', level=6, buffer=buffer) @@ -46,6 +50,16 @@ def test_zlib(): assert data == Compressor(**params).decompress(cdata) # autodetect +def test_lzma(): + if lzma is None: + pytest.skip("No lzma support found.") + c = get_compressor(name='lzma') + cdata = c.compress(data) + assert len(cdata) < len(data) + assert data == c.decompress(cdata) + assert data == Compressor(**params).decompress(cdata) # autodetect + + def test_autodetect_invalid(): with pytest.raises(ValueError): Compressor(**params).decompress(b'\xff\xfftotalcrap') @@ -68,13 +82,20 @@ def test_zlib_compat(): def test_compressor(): - for params in [ + params_list = [ dict(name='null', buffer=buffer), dict(name='lz4', buffer=buffer), dict(name='zlib', level=0, buffer=buffer), dict(name='zlib', level=6, buffer=buffer), dict(name='zlib', level=9, buffer=buffer), - ]: + ] + if lzma: + params_list += [ + dict(name='lzma', level=0, buffer=buffer), + dict(name='lzma', level=6, buffer=buffer), + dict(name='lzma', level=9, buffer=buffer), + ] + for params in params_list: c = Compressor(**params) assert data == c.decompress(c.compress(data)) From 
abe29583f2f6f0d93313be66747251cd9b983175 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 10 Aug 2015 00:03:19 +0200 Subject: [PATCH 07/13] install lz4 from brew --- .travis/install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis/install.sh b/.travis/install.sh index e25ab1288..27eb668db 100755 --- a/.travis/install.sh +++ b/.travis/install.sh @@ -14,6 +14,7 @@ if [[ "$(uname -s)" == 'Darwin' ]]; then eval "$(pyenv init -)" fi + brew install lz4 brew outdated pyenv || brew upgrade pyenv case "${TOXENV}" in From 8b1d46caa403488dd7e6f7ef166349bb2d6c4b8d Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 10 Aug 2015 20:36:21 +0200 Subject: [PATCH 08/13] docs: more about compression --- docs/internals.rst | 25 ++++++++++++++++++++----- docs/quickstart.rst | 23 +++++++++++++++++++++++ docs/usage.rst | 16 ++++++++++++++++ 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/docs/internals.rst b/docs/internals.rst index 6dfc8ba9b..0ea68098b 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -382,10 +382,25 @@ representation of the repository id. Compression ----------- -|project_name| currently always pipes all data through a zlib compressor which -supports compression levels 0 (no compression, fast) to 9 (high compression, slow). +|project_name| supports the following compression methods: + +- none (no compression, pass through data 1:1) +- lz4 (low compression, but super fast) +- zlib (level 1-9, level 1 is low, level 9 is high compression) +- lzma (level 0-9, level 0 is low, level 9 is high compression) + +Speed: none > lz4 > zlib > lzma +Compression: lzma > zlib > lz4 > none + +The overall speed of course also depends on the speed of your target storage. +If that is slow, using a higher compression level might yield better overall +performance. You need to experiment a bit. Maybe just watch your CPU load, if +that is relatively low, increase compression until 1 core is 70-100% loaded.
+ +Be careful, higher zlib and especially lzma compression levels might take a +lot of resources (CPU and memory). + +Compression is applied after deduplication, thus using different compression +methods in one repo does not influence deduplication. See ``borg create --help`` about how to specify the compression level and its default. - -Note: zlib level 0 creates a little bit more output data than it gets as input, -due to zlib protocol overhead. diff --git a/docs/quickstart.rst b/docs/quickstart.rst index fcb223503..9abe4fb6a 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -89,6 +89,29 @@ certain number of old archives:: # and 6 monthly archives. borg prune -v $REPOSITORY --keep-daily=7 --keep-weekly=4 --keep-monthly=6 +.. _backup_compression: + +Backup compression +------------------ + +Default is no compression, but we support different methods with high speed +or high compression: + +If you have a quick repo storage and you want a little compression: + + $ borg create --compression lz4 /mnt/backup::repo ~ + +If you have a medium fast repo storage and you want a bit more compression (N=0..9): + + $ borg create --compression zlib,N /mnt/backup::repo ~ + +If you have a very slow repo storage and you want high compression (N=0..9): + + $ borg create --compression lzma,N /mnt/backup::repo ~ + +You'll need to experiment a bit to find the best compression for your use case. +Keep an eye on CPU load and throughput. + .. _encrypted_repos: Repository encryption diff --git a/docs/usage.rst b/docs/usage.rst index fcbee5fef..a68d67c3f 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -76,8 +76,12 @@ Resource Usage |project_name| might use a lot of resources depending on the size of the data set it is dealing with. CPU: it won't go beyond 100% of 1 core as the code is currently single-threaded. + Especially higher zlib and lzma compression uses significant amounts of CPU + cycles.
Memory (RAM): the chunks index and the files index are read into memory for performance reasons. + compression, esp. lzma compression with high levels might need substantial amounts + of memory. Temporary files: reading data and metadata from a FUSE mounted repository will consume about the same space as the deduplicated chunks used to represent them in the repository. @@ -175,6 +179,18 @@ Examples # Backup a raw device (must not be active/in use/mounted at that time) $ dd if=/dev/sda bs=10M | borg create /mnt/backup::my-sda - + # No compression (default) + $ borg create /mnt/backup::repo ~ + + # Super fast, low compression + $ borg create --compression lz4 /mnt/backup::repo ~ + + # Less fast, higher compression (N = 0..9) + $ borg create --compression zlib,N /mnt/backup::repo ~ + + # Even slower, even higher compression (N = 0..9) + $ borg create --compression lzma,N /mnt/backup::repo ~ + .. include:: usage/extract.rst.inc From 1724241d0ca676f3de6bad38dda33c0be1296818 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Mon, 10 Aug 2015 20:45:15 +0200 Subject: [PATCH 09/13] README: mention lzma and lz4 compression --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 22320d3fe..3d27de85c 100644 --- a/README.rst +++ b/README.rst @@ -51,7 +51,7 @@ Main features authenticity is verified using HMAC-SHA256. **Compression** - All data can be compressed by zlib, level 0-9. + All data can be compressed by lz4, zlib or lzma. **Off-site backups** Borg can store data on any remote host accessible over SSH. 
If Borg is From b16dc03e365d3eb0e47a608449255cd6a812928c Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Wed, 12 Aug 2015 02:27:41 +0200 Subject: [PATCH 10/13] tests for CompressionSpec --- borg/testsuite/helpers.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/borg/testsuite/helpers.py b/borg/testsuite/helpers.py index 26b422b0c..bb2400a94 100644 --- a/borg/testsuite/helpers.py +++ b/borg/testsuite/helpers.py @@ -2,11 +2,12 @@ import hashlib from time import mktime, strptime from datetime import datetime, timezone, timedelta +import pytest import msgpack from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, ExcludePattern, make_path_safe, \ prune_within, prune_split, \ - StableDict, int_to_bigint, bigint_to_int, parse_timestamp + StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec from . import BaseTestCase @@ -104,6 +105,35 @@ class PatternTestCase(BaseTestCase): ['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg']) +def test_compression_specs(): + with pytest.raises(ValueError): + CompressionSpec('') + assert CompressionSpec('0') == dict(name='null') + assert CompressionSpec('1') == dict(name='zlib', level=1) + assert CompressionSpec('9') == dict(name='zlib', level=9) + assert CompressionSpec('10') == dict(name='lz4') + with pytest.raises(ValueError): + CompressionSpec('11') + assert CompressionSpec('20') == dict(name='lzma', level=0) + assert CompressionSpec('29') == dict(name='lzma', level=9) + with pytest.raises(ValueError): + CompressionSpec('30') + assert CompressionSpec('null') == dict(name='null') + assert CompressionSpec('lz4') == dict(name='lz4') + assert CompressionSpec('zlib') == dict(name='zlib', level=6) + assert CompressionSpec('zlib,0') == dict(name='zlib', level=0) + assert CompressionSpec('zlib,9') == dict(name='zlib', level=9) + with pytest.raises(ValueError): + CompressionSpec('zlib,9,invalid') + assert 
CompressionSpec('lzma') == dict(name='lzma', level=6) + assert CompressionSpec('lzma,0') == dict(name='lzma', level=0) + assert CompressionSpec('lzma,9') == dict(name='lzma', level=9) + with pytest.raises(ValueError): + CompressionSpec('lzma,9,invalid') + with pytest.raises(ValueError): + CompressionSpec('invalid') + + class MakePathSafeTestCase(BaseTestCase): def test(self): From a6b6712d6a312f8e839212ea246a896391c90abc Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 14 Aug 2015 23:00:04 +0200 Subject: [PATCH 11/13] deprecate the numeric --compression argument, rename null compression to none, update CHANGES --- CHANGES.rst | 32 ++++++++++++++++++++++++++++++++ borg/archiver.py | 14 +++++--------- borg/compress.pyx | 10 +++++----- borg/helpers.py | 12 +++--------- borg/key.py | 2 +- borg/testsuite/compress.py | 10 +++++----- borg/testsuite/helpers.py | 11 +++-------- 7 files changed, 54 insertions(+), 37 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index b333ba48a..7245371d4 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,38 @@ Borg Changelog ============== +Compression branch +------------------ + +Compatibility notes: + +- the new compression code is very compatible: as long as you stay with zlib + compression, older borg releases will still be able to read data from a + repo/archive made with the new code (note: this is not the case for the + default "none" compression, use "zlib,0" if you want a "no compression" mode + that can be read by older borg). Also the new code is able to read repos and + archives made with older borg versions (for all zlib levels 0..9). + +Deprecations: + +- --compression N (with N being a number, as in 0.24) is deprecated. + We keep the --compression 0..9 for now to not break scripts, but it is + deprecated and will be removed later, so better fix your scripts now: + --compression 0 (as in 0.24) is the same as --compression zlib,0 (now). 
+ BUT: if you do not want compression, you rather want --compression none + (which is the default). + --compression 1 (in 0.24) is the same as --compression zlib,1 (now) + --compression 9 (in 0.24) is the same as --compression zlib,9 (now) + +New features: + +- create --compression none (default, means: do not compress, just pass through + data "as is". this is more efficient than zlib level 0 as used in borg 0.24) +- create --compression lz4 (super-fast, but not very high compression) + Please note that borgbackup needs lz4 library as additional requirement. +- create --compression zlib,N (slower, higher compression, default for N is 6) +- create --compression lzma,N (slowest, highest compression, default N is 6) + Version 0.24.0 -------------- diff --git a/borg/archiver.py b/borg/archiver.py index 768dc5361..1f0dc1d39 100644 --- a/borg/archiver.py +++ b/borg/archiver.py @@ -668,17 +668,13 @@ Type "Yes I am sure" if you understand this and want to continue.\n""") metavar='CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,HASH_WINDOW_SIZE', help='specify the chunker parameters. default: %d,%d,%d,%d' % CHUNKER_PARAMS) subparser.add_argument('-C', '--compression', dest='compression', - type=CompressionSpec, default=dict(name='null'), metavar='COMPRESSION', - help='select compression algorithm and level, by giving a number: ' - '0 == no compression [default], ' - '1..9 == zlib level 1..9, ' - '10 == lz4, ' - '20-29 == lzma level 0..9.' - 'Alternatively, you can also give a name and optionally additional args: ' - 'null == no compression, ' + type=CompressionSpec, default=dict(name='none'), metavar='COMPRESSION', + help='select compression algorithm (and level): ' + 'none == no compression (default), ' + 'lz4 == lz4, ' 'zlib == zlib (default level 6), ' 'zlib,0 .. zlib,9 == zlib (with level 0..9), ' - 'lz4 == lz4, ' + 'lzma == lzma (default level 6), ' 'lzma,0 .. 
lzma,9 == lzma (with level 0..9).') subparser.add_argument('archive', metavar='ARCHIVE', type=location_validator(archive=True), diff --git a/borg/compress.pyx b/borg/compress.pyx index c1bdeff82..2285b55d8 100644 --- a/borg/compress.pyx +++ b/borg/compress.pyx @@ -35,12 +35,12 @@ cdef class CompressorBase: return data[2:] -class CNULL(CompressorBase): +class CNONE(CompressorBase): """ - null compression, just pass through data + none - no compression, just pass through data """ ID = b'\x00\x00' - name = 'null' + name = 'none' def compress(self, data): return super().compress(data) @@ -161,12 +161,12 @@ class ZLIB(CompressorBase): COMPRESSOR_TABLE = { - CNULL.name: CNULL, + CNONE.name: CNONE, LZ4.name: LZ4, ZLIB.name: ZLIB, LZMA.name: LZMA, } -COMPRESSOR_LIST = [LZ4, CNULL, ZLIB, LZMA, ] # check fast stuff first +COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ] # check fast stuff first def get_compressor(name, **kwargs): cls = COMPRESSOR_TABLE[name] diff --git a/borg/helpers.py b/borg/helpers.py index 020c263e7..8643166f6 100644 --- a/borg/helpers.py +++ b/borg/helpers.py @@ -295,20 +295,14 @@ def CompressionSpec(s): compression = int(compression) if count > 1: raise ValueError - # it is just --compression N - if compression == 0: - return dict(name='null') - if 1 <= compression <= 9: + # DEPRECATED: it is just --compression N + if 0 <= compression <= 9: return dict(name='zlib', level=compression) - if compression == 10: - return dict(name='lz4') - if 20 <= compression <= 29: - return dict(name='lzma', level=compression-20) raise ValueError except ValueError: # --compression algo[,...] 
name = compression - if name in ('null', 'lz4', ): + if name in ('none', 'lz4', ): return dict(name=name) if name in ('zlib', 'lzma', ): if count < 2: diff --git a/borg/key.py b/borg/key.py index fcf083586..7067a4454 100644 --- a/borg/key.py +++ b/borg/key.py @@ -68,7 +68,7 @@ class KeyBase: self.TYPE_STR = bytes([self.TYPE]) self.repository = repository self.target = None # key location file path / repo obj - self.compressor = Compressor('null', buffer=COMPR_BUFFER) + self.compressor = Compressor('none', buffer=COMPR_BUFFER) def id_hash(self, data): """Return HMAC hash using the "id" HMAC key diff --git a/borg/testsuite/compress.py b/borg/testsuite/compress.py index 6d7319c1b..8019925b2 100644 --- a/borg/testsuite/compress.py +++ b/borg/testsuite/compress.py @@ -6,7 +6,7 @@ except ImportError: import pytest -from ..compress import get_compressor, Compressor, CNULL, ZLIB, LZ4 +from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4 buffer = bytes(2**16) @@ -15,8 +15,8 @@ params = dict(name='zlib', level=6, buffer=buffer) def test_get_compressor(): - c = get_compressor(name='null') - assert isinstance(c, CNULL) + c = get_compressor(name='none') + assert isinstance(c, CNONE) c = get_compressor(name='lz4', buffer=buffer) assert isinstance(c, LZ4) c = get_compressor(name='zlib') @@ -26,7 +26,7 @@ def test_get_compressor(): def test_cnull(): - c = get_compressor(name='null') + c = get_compressor(name='none') cdata = c.compress(data) assert len(cdata) > len(data) assert data in cdata # it's not compressed and just in there 1:1 @@ -83,7 +83,7 @@ def test_zlib_compat(): def test_compressor(): params_list = [ - dict(name='null', buffer=buffer), + dict(name='none', buffer=buffer), dict(name='lz4', buffer=buffer), dict(name='zlib', level=0, buffer=buffer), dict(name='zlib', level=6, buffer=buffer), diff --git a/borg/testsuite/helpers.py b/borg/testsuite/helpers.py index bb2400a94..76bafb5b7 100644 --- a/borg/testsuite/helpers.py +++ b/borg/testsuite/helpers.py @@ 
-108,17 +108,12 @@ class PatternTestCase(BaseTestCase): def test_compression_specs(): with pytest.raises(ValueError): CompressionSpec('') - assert CompressionSpec('0') == dict(name='null') + assert CompressionSpec('0') == dict(name='zlib', level=0) assert CompressionSpec('1') == dict(name='zlib', level=1) assert CompressionSpec('9') == dict(name='zlib', level=9) - assert CompressionSpec('10') == dict(name='lz4') with pytest.raises(ValueError): - CompressionSpec('11') - assert CompressionSpec('20') == dict(name='lzma', level=0) - assert CompressionSpec('29') == dict(name='lzma', level=9) - with pytest.raises(ValueError): - CompressionSpec('30') - assert CompressionSpec('null') == dict(name='null') + CompressionSpec('10') + assert CompressionSpec('none') == dict(name='none') assert CompressionSpec('lz4') == dict(name='lz4') assert CompressionSpec('zlib') == dict(name='zlib', level=6) assert CompressionSpec('zlib,0') == dict(name='zlib', level=0) From 1d16e7a37c74aa965772b0867f0277d2aca08388 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 15 Aug 2015 15:45:15 +0200 Subject: [PATCH 12/13] compression: update / refine docs --- README.rst | 3 ++- docs/internals.rst | 18 ++++++++++++++---- docs/quickstart.rst | 6 ++++-- docs/support.rst | 3 +++ docs/usage.rst | 3 +-- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 3d27de85c..8180fd2ab 100644 --- a/README.rst +++ b/README.rst @@ -51,7 +51,8 @@ Main features authenticity is verified using HMAC-SHA256. **Compression** - All data can be compressed by lz4, zlib or lzma. + All data can be compressed by lz4 (super fast, low compression), zlib + (medium speed and compression) or lzma (low speed, high compression). **Off-site backups** Borg can store data on any remote host accessible over SSH. 
If Borg is diff --git a/docs/internals.rst b/docs/internals.rst index 0ea68098b..845dff131 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -386,19 +386,29 @@ Compression - none (no compression, pass through data 1:1) - lz4 (low compression, but super fast) -- zlib (level 1-9, level 1 is low, level 9 is high compression) -- lzma (level 0-9, level 0 is low, level 9 is high compression. +- zlib (level 0-9, level 0 is no compression [but still adding zlib overhead], + level 1 is low, level 9 is high compression) +- lzma (level 0-9, level 0 is low, level 9 is high compression). Speed: none > lz4 > zlib > lzma Compression: lzma > zlib > lz4 > none +Be careful, higher zlib and especially lzma compression levels might take a +lot of resources (CPU and memory). + The overall speed of course also depends on the speed of your target storage. If that is slow, using a higher compression level might yield better overall performance. You need to experiment a bit. Maybe just watch your CPU load, if that is relatively low, increase compression until 1 core is 70-100% loaded. -Be careful, higher zlib and especially lzma compression levels might take a -lot of resources (CPU and memory). +Even if your target storage is rather fast, you might see interesting effects: +while doing no compression at all (none) is an operation that takes no time, it +likely will need to store more data to the storage compared to using lz4. +The time needed to transfer and store the additional data might be much more +than if you had used lz4 (which is super fast, but still might compress your +data about 2:1). This is assuming your data is compressible (if you back up +already compressed data, trying to compress them at backup time is usually +pointless). Compression is applied after deduplication, thus using different compression methods in one repo does not influence deduplication.
diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 9abe4fb6a..4b78fefbb 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -101,11 +101,13 @@ If you have a quick repo storage and you want a little compression: $ borg create --compression lz4 /mnt/backup::repo ~ -If you have a medium fast repo storage and you want a bit more compression (N=0..9): +If you have a medium fast repo storage and you want a bit more compression (N=0..9, +0 means no compression, 9 means high compression): $ borg create --compression zlib,N /mnt/backup::repo ~ -If you have a very slow repo storage and you want high compression (N=0..9): +If you have a very slow repo storage and you want high compression (N=0..9, 0 means +low compression, 9 means high compression): $ borg create --compression lzma,N /mnt/backup::repo ~ diff --git a/docs/support.rst b/docs/support.rst index 5e953f202..f53c01285 100644 --- a/docs/support.rst +++ b/docs/support.rst @@ -4,6 +4,9 @@ Support ======= +Please first read the docs and the FAQ section in the docs, a lot of stuff is +documented / explained there. + Issue Tracker ------------- diff --git a/docs/usage.rst b/docs/usage.rst index a68d67c3f..c4e2fa80f 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -76,8 +76,7 @@ Resource Usage |project_name| might use a lot of resources depending on the size of the data set it is dealing with. CPU: it won't go beyond 100% of 1 core as the code is currently single-threaded. - Especially higher zlib and lzma compression uses significant amounts of CPU - cycles. + Especially higher zlib and lzma compression levels use significant amounts of CPU cycles. Memory (RAM): the chunks index and the files index are read into memory for performance reasons. compression, esp. 
lzma compression with high levels might need substantial amounts From e1de3dce7b7981ec15d58a40eaf17532fa346125 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 15 Aug 2015 15:49:11 +0200 Subject: [PATCH 13/13] integrate compression branch changes into change history for 0.25 --- CHANGES.rst | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index d33f28ddb..439ee4c37 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,9 @@ Borg Changelog ============== -Compression branch ------------------- + +Version 0.25.0 (not released yet) +--------------------------------- Compatibility notes: @@ -24,6 +25,7 @@ Deprecations: --compression 1 (in 0.24) is the same as --compression zlib,1 (now) --compression 9 (in 0.24) is the same as --compression zlib,9 (now) + New features: - create --compression none (default, means: do not compress, just pass through @@ -32,21 +34,6 @@ New features: Please note that borgbackup needs lz4 library as additional requirement. - create --compression zlib,N (slower, higher compression, default for N is 6) - create --compression lzma,N (slowest, highest compression, default N is 6) - - -Version 0.25.0 (not released yet) ---------------------------------- - -Incompatible changes (compared to 0.24): - -- none yet - -Deprecations: - -- none yet - -New features: - - honor the nodump flag (UF_NODUMP) and do not backup such items Bug fixes: