diff --git a/src/borg/archive.py b/src/borg/archive.py index 5ed60570a..995e0e4fd 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -21,11 +21,11 @@ logger = create_logger() from . import xattr from .cache import ChunkListEntry from .chunker import Chunker -from .compress import Compressor +from .compress import Compressor, CompressionSpec from .constants import * # NOQA from .hashindex import ChunkIndex, ChunkIndexEntry from .helpers import Manifest -from .helpers import Chunk, ChunkIteratorFileWrapper, open_item +from .helpers import ChunkIteratorFileWrapper, open_item from .helpers import Error, IntegrityError, set_ec from .helpers import uid2user, user2uid, gid2group, group2gid from .helpers import parse_timestamp, to_localtime @@ -36,7 +36,6 @@ from .helpers import bin_to_hex from .helpers import safe_ns from .helpers import ellipsis_truncate, ProgressIndicatorPercent, log_multi from .helpers import PathPrefixPattern, FnmatchPattern -from .helpers import CompressionDecider1, CompressionDecider2, CompressionSpec from .item import Item, ArchiveItem from .key import key_factory from .platform import acl_get, acl_set, set_flags, get_flags, swidth @@ -196,7 +195,7 @@ class DownloadPipeline: otherwise preloaded chunks will accumulate in RemoteRepository and create a memory leak. """ unpacker = msgpack.Unpacker(use_list=False) - for _, data in self.fetch_many(ids): + for data in self.fetch_many(ids): unpacker.feed(data) items = [Item(internal_dict=item) for item in unpacker] for item in items: @@ -238,7 +237,9 @@ class ChunkBuffer: if self.buffer.tell() == 0: return self.buffer.seek(0) - chunks = list(Chunk(bytes(s)) for s in self.chunker.chunkify(self.buffer)) + # The chunker returns a memoryview to its internal buffer, + # thus a copy is needed before resuming the chunker iterator. + chunks = list(bytes(s) for s in self.chunker.chunkify(self.buffer)) self.buffer.seek(0) self.buffer.truncate(0) # Leave the last partial chunk in the buffer unless flush is True @@ -246,7 +247,7 @@ class ChunkBuffer: for chunk in chunks[:end]: self.chunks.append(self.write_chunk(chunk)) if end == -1: - self.buffer.write(chunks[-1].data) + self.buffer.write(chunks[-1]) def is_full(self): return self.buffer.tell() > self.BUFFER_SIZE @@ -260,7 +261,7 @@ class CacheChunkBuffer(ChunkBuffer): self.stats = stats def write_chunk(self, chunk): - id_, _, _ = self.cache.add_chunk(self.key.id_hash(chunk.data), chunk, self.stats, wait=False) + id_, _, _ = self.cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats, wait=False) self.cache.repository.async_response(wait=False) return id_ @@ -278,7 +279,7 @@ class Archive: def __init__(self, repository, key, manifest, name, cache=None, create=False, checkpoint_interval=300, numeric_owner=False, noatime=False, noctime=False, progress=False, - chunker_params=CHUNKER_PARAMS, start=None, start_monotonic=None, end=None, compression=None, compression_files=None, + chunker_params=CHUNKER_PARAMS, start=None, start_monotonic=None, end=None, consider_part_files=False, log_json=False): self.cwd = os.getcwd() self.key = key @@ -307,12 +308,8 @@ class Archive: self.pipeline = DownloadPipeline(self.repository, self.key) self.create = create if self.create: - self.file_compression_logger = create_logger('borg.debug.file-compression') self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats) self.chunker = Chunker(self.key.chunk_seed, *chunker_params) - self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'), - compression_files or []) - key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none')) if name in manifest.archives: raise self.AlreadyExists(name) self.last_checkpoint = time.monotonic() @@ -330,7 +327,7 @@ class Archive: self.zeros = None def _load_meta(self, id): - _, data = self.key.decrypt(id, self.repository.get(id)) + data = self.key.decrypt(id, self.repository.get(id)) metadata = ArchiveItem(internal_dict=msgpack.unpackb(data, unicode_errors='surrogateescape')) if metadata.version != 1: raise Exception('Unknown archive metadata version') @@ -469,7 +466,7 @@ Utilization of max. archive size: {csize_max:.0%} metadata = ArchiveItem(metadata) data = self.key.pack_and_authenticate_metadata(metadata.as_dict(), context=b'archive') self.id = self.key.id_hash(data) - self.cache.add_chunk(self.id, Chunk(data), self.stats) + self.cache.add_chunk(self.id, data, self.stats) while self.repository.async_response(wait=True) is not None: pass self.manifest.archives[name] = (self.id, metadata.time) @@ -495,7 +492,7 @@ Utilization of max. archive size: {csize_max:.0%} add(self.id) for id, chunk in zip(self.metadata.items, self.repository.get_many(self.metadata.items)): add(id) - _, data = self.key.decrypt(id, chunk) + data = self.key.decrypt(id, chunk) unpacker.feed(data) for item in unpacker: chunks = item.get(b'chunks') @@ -525,7 +522,7 @@ Utilization of max. archive size: {csize_max:.0%} if dry_run or stdout: if 'chunks' in item: item_chunks_size = 0 - for _, data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True): + for data in self.pipeline.fetch_many([c.id for c in item.chunks], is_preloaded=True): if pi: pi.show(increase=len(data), info=[remove_surrogates(item.path)]) if stdout: @@ -589,7 +586,7 @@ Utilization of max. archive size: {csize_max:.0%} self.zeros = b'\0' * (1 << self.chunker_params[1]) with fd: ids = [c.id for c in item.chunks] - for _, data in self.pipeline.fetch_many(ids, is_preloaded=True): + for data in self.pipeline.fetch_many(ids, is_preloaded=True): if pi: pi.show(increase=len(data), info=[remove_surrogates(item.path)]) with backup_io('write'): @@ -717,7 +714,7 @@ Utilization of max. archive size: {csize_max:.0%} setattr(metadata, key, value) data = msgpack.packb(metadata.as_dict(), unicode_errors='surrogateescape') new_id = self.key.id_hash(data) - self.cache.add_chunk(new_id, Chunk(data), self.stats) + self.cache.add_chunk(new_id, data, self.stats) self.manifest.archives[self.name] = (new_id, metadata.time) self.cache.chunk_decref(self.id, self.stats) self.id = new_id @@ -764,7 +761,7 @@ Utilization of max. archive size: {csize_max:.0%} for (i, (items_id, data)) in enumerate(zip(items_ids, self.repository.get_many(items_ids))): if progress: pi.show(i) - _, data = self.key.decrypt(items_id, data) + data = self.key.decrypt(items_id, data) unpacker.feed(data) chunk_decref(items_id, stats) try: @@ -879,10 +876,10 @@ Utilization of max. archive size: {csize_max:.0%} self.write_checkpoint() return length, number - def chunk_file(self, item, cache, stats, chunk_iter, chunk_processor=None, **chunk_kw): + def chunk_file(self, item, cache, stats, chunk_iter, chunk_processor=None): if not chunk_processor: def chunk_processor(data): - chunk_entry = cache.add_chunk(self.key.id_hash(data), Chunk(data, **chunk_kw), stats, wait=False) + chunk_entry = cache.add_chunk(self.key.id_hash(data), data, stats, wait=False) self.cache.repository.async_response(wait=False) return chunk_entry @@ -971,12 +968,10 @@ Utilization of max. archive size: {csize_max:.0%} if chunks is not None: item.chunks = chunks else: - compress = self.compression_decider1.decide(path) - self.file_compression_logger.debug('%s -> compression %s', path, compress.name) with backup_io('open'): fh = Archive._open_rb(path) with os.fdopen(fh, 'rb') as fd: - self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh)), compress=compress) + self.chunk_file(item, cache, self.stats, backup_io_iter(self.chunker.chunkify(fd, fh))) if not is_special_file: # we must not memorize special files, because the contents of e.g. a # block or char device will change without its mtime/size/inode changing. @@ -1212,9 +1207,9 @@ class ArchiveChecker: chunk_ids = list(reversed(chunk_ids_revd)) chunk_data_iter = self.repository.get_many(chunk_ids) else: + _chunk_id = None if chunk_id == Manifest.MANIFEST_ID else chunk_id try: - _chunk_id = None if chunk_id == Manifest.MANIFEST_ID else chunk_id - _, data = self.key.decrypt(_chunk_id, encrypted_data) + self.key.decrypt(_chunk_id, encrypted_data) except IntegrityError as integrity_error: self.error_found = True errors += 1 @@ -1284,7 +1279,7 @@ class ArchiveChecker: for chunk_id, _ in self.chunks.iteritems(): cdata = self.repository.get(chunk_id) try: - _, data = self.key.decrypt(chunk_id, cdata) + data = self.key.decrypt(chunk_id, cdata) except IntegrityError as exc: logger.error('Skipping corrupted chunk: %s', exc) self.error_found = True @@ -1329,9 +1324,9 @@ class ArchiveChecker: self.possibly_superseded.add(id_) def add_callback(chunk): - id_ = self.key.id_hash(chunk.data) + id_ = self.key.id_hash(chunk) cdata = self.key.encrypt(chunk) - add_reference(id_, len(chunk.data), len(cdata), cdata) + add_reference(id_, len(chunk), len(cdata), cdata) return id_ def add_reference(id_, size, csize, cdata=None): @@ -1352,7 +1347,7 @@ class ArchiveChecker: def replacement_chunk(size): data = bytes(size) chunk_id = self.key.id_hash(data) - cdata = self.key.encrypt(Chunk(data)) + cdata = self.key.encrypt(data) csize = len(cdata) return chunk_id, size, csize, cdata @@ -1461,7 +1456,7 @@ class ArchiveChecker: if state > 0: unpacker.resync() for chunk_id, cdata in zip(items, repository.get_many(items)): - _, data = self.key.decrypt(chunk_id, cdata) + data = self.key.decrypt(chunk_id, cdata) unpacker.feed(data) try: for item in unpacker: @@ -1511,7 +1506,7 @@ class ArchiveChecker: continue mark_as_possibly_superseded(archive_id) cdata = self.repository.get(archive_id) - _, data = self.key.decrypt(archive_id, cdata) + data = self.key.decrypt(archive_id, cdata) archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) if archive.version != 1: raise Exception('Unknown archive metadata version') @@ -1528,7 +1523,7 @@ class ArchiveChecker: archive.items = items_buffer.chunks data = msgpack.packb(archive.as_dict(), unicode_errors='surrogateescape') new_archive_id = self.key.id_hash(data) - cdata = self.key.encrypt(Chunk(data)) + cdata = self.key.encrypt(data) add_reference(new_archive_id, len(data), len(cdata), cdata) self.manifest.archives[info.name] = (new_archive_id, info.ts) @@ -1562,7 +1557,7 @@ class ArchiveRecreater: def __init__(self, repository, manifest, key, cache, matcher, exclude_caches=False, exclude_if_present=None, keep_exclude_tags=False, - chunker_params=None, compression=None, compression_files=None, always_recompress=False, + chunker_params=None, compression=None, always_recompress=False, dry_run=False, stats=False, progress=False, file_status_printer=None, checkpoint_interval=1800): self.repository = repository @@ -1583,9 +1578,6 @@ class ArchiveRecreater: self.always_recompress = always_recompress self.compression = compression or CompressionSpec('none') self.seen_chunks = set() - self.compression_decider1 = CompressionDecider1(compression or CompressionSpec('none'), - compression_files or []) - key.compression_decider2 = CompressionDecider2(compression or CompressionSpec('none')) self.dry_run = dry_run self.stats = stats @@ -1654,24 +1646,21 @@ class ArchiveRecreater: self.cache.chunk_incref(chunk_id, target.stats) return item.chunks chunk_iterator = self.iter_chunks(archive, target, list(item.chunks)) - compress = self.compression_decider1.decide(item.path) - chunk_processor = partial(self.chunk_processor, target, compress) + chunk_processor = partial(self.chunk_processor, target) target.chunk_file(item, self.cache, target.stats, chunk_iterator, chunk_processor) - def chunk_processor(self, target, compress, data): + def chunk_processor(self, target, data): chunk_id = self.key.id_hash(data) if chunk_id in self.seen_chunks: return self.cache.chunk_incref(chunk_id, target.stats) - chunk = Chunk(data, compress=compress) - compression_spec, chunk = self.key.compression_decider2.decide(chunk) overwrite = self.recompress if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks: # Check if this chunk is already compressed the way we want it old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False) - if Compressor.detect(old_chunk.data).name == compression_spec.name: + if Compressor.detect(old_chunk).name == self.key.compressor.decide(data).name: # Stored chunk has the same compression we wanted overwrite = False - chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite, wait=False) + chunk_entry = self.cache.add_chunk(chunk_id, data, target.stats, overwrite=overwrite, wait=False) self.cache.repository.async_response(wait=False) self.seen_chunks.add(chunk_entry.id) return chunk_entry @@ -1685,7 +1674,7 @@ class ArchiveRecreater: yield from target.chunker.chunkify(file) else: for chunk in chunk_iterator: - yield chunk.data + yield chunk def save(self, archive, target, comment=None, replace_original=True): if self.dry_run: @@ -1756,7 +1745,7 @@ class ArchiveRecreater: def create_target_archive(self, name): target = Archive(self.repository, self.key, self.manifest, name, create=True, progress=self.progress, chunker_params=self.chunker_params, cache=self.cache, - checkpoint_interval=self.checkpoint_interval, compression=self.compression) + checkpoint_interval=self.checkpoint_interval) return target def open_archive(self, name, **kwargs): diff --git a/src/borg/archiver.py b/src/borg/archiver.py index e241002eb..7dccd55a6 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -34,10 +34,11 @@ from .archive import Archive, ArchiveChecker, ArchiveRecreater, Statistics, is_s from .archive import BackupOSError, backup_io from .cache import Cache from .constants import * # NOQA +from .compress import CompressionSpec from .crc32 import crc32 from .helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR from .helpers import Error, NoManifestError, set_ec -from .helpers import location_validator, archivename_validator, ChunkerParams, CompressionSpec, ComprSpec +from .helpers import location_validator, archivename_validator, ChunkerParams from .helpers import PrefixSpec, SortBySpec, HUMAN_SORT_KEYS from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter from .helpers import format_time, format_timedelta, format_file_size, format_archive @@ -107,6 +108,15 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True, excl with repository: if manifest or cache: kwargs['manifest'], kwargs['key'] = Manifest.load(repository) + # do_recreate uses args.compression is None as in band signalling for "don't recompress", + # note that it does not look at key.compressor. In this case the default compressor applies + # to new chunks. + # + # We can't use a check like `'compression' in args` (an argparse.Namespace speciality), + # since the compression attribute is set. So we need to see whether it's set to something + # true-ish, like a CompressionSpec instance. + if getattr(args, 'compression', False): + kwargs['key'].compressor = args.compression.compressor if cache: with Cache(repository, kwargs['key'], kwargs['manifest'], do_files=getattr(args, 'cache_files', False), @@ -167,14 +177,14 @@ class Archiver: a = next(chunks1, end) if a is end: return not blen - bi and next(chunks2, end) is end - a = memoryview(a.data) + a = memoryview(a) alen = len(a) ai = 0 if not blen - bi: b = next(chunks2, end) if b is end: return not alen - ai and next(chunks1, end) is end - b = memoryview(b.data) + b = memoryview(b) blen = len(b) bi = 0 slicelen = min(alen - ai, blen - bi) @@ -471,7 +481,6 @@ class Archiver: numeric_owner=args.numeric_owner, noatime=args.noatime, noctime=args.noctime, progress=args.progress, chunker_params=args.chunker_params, start=t0, start_monotonic=t0_monotonic, - compression=args.compression, compression_files=args.compression_files, log_json=args.log_json) create_inner(archive, cache) else: @@ -1325,8 +1334,7 @@ class Archiver: recreater = ArchiveRecreater(repository, manifest, key, cache, matcher, exclude_caches=args.exclude_caches, exclude_if_present=args.exclude_if_present, keep_exclude_tags=args.keep_exclude_tags, chunker_params=args.chunker_params, - compression=args.compression, compression_files=args.compression_files, - always_recompress=args.always_recompress, + compression=args.compression, always_recompress=args.always_recompress, progress=args.progress, stats=args.stats, file_status_printer=self.print_file_status, checkpoint_interval=args.checkpoint_interval, @@ -1387,7 +1395,7 @@ class Archiver: archive = Archive(repository, key, manifest, args.location.archive, consider_part_files=args.consider_part_files) for i, item_id in enumerate(archive.metadata.items): - _, data = key.decrypt(item_id, repository.get(item_id)) + data = key.decrypt(item_id, repository.get(item_id)) filename = '%06d_%s.items' % (i, bin_to_hex(item_id)) print('Dumping', filename) with open(filename, 'wb') as fd: @@ -1417,7 +1425,7 @@ class Archiver: fd.write(do_indent(prepare_dump_dict(archive_meta_orig))) fd.write(',\n') - _, data = key.decrypt(archive_meta_orig[b'id'], repository.get(archive_meta_orig[b'id'])) + data = key.decrypt(archive_meta_orig[b'id'], repository.get(archive_meta_orig[b'id'])) archive_org_dict = msgpack.unpackb(data, object_hook=StableDict, unicode_errors='surrogateescape') fd.write(' "_meta":\n') @@ -1428,7 +1436,7 @@ class Archiver: unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict) first = True for item_id in archive_org_dict[b'items']: - _, data = key.decrypt(item_id, repository.get(item_id)) + data = key.decrypt(item_id, repository.get(item_id)) unpacker.feed(data) for item in unpacker: item = prepare_dump_dict(item) @@ -1452,7 +1460,7 @@ class Archiver: def do_debug_dump_manifest(self, args, repository, manifest, key): """dump decoded repository manifest""" - _, data = key.decrypt(None, repository.get(manifest.MANIFEST_ID)) + data = key.decrypt(None, repository.get(manifest.MANIFEST_ID)) meta = prepare_dump_dict(msgpack.fallback.unpackb(data, object_hook=StableDict, unicode_errors='surrogateescape')) @@ -1476,7 +1484,7 @@ class Archiver: for id in result: cdata = repository.get(id) give_id = id if id != Manifest.MANIFEST_ID else None - _, data = key.decrypt(give_id, cdata) + data = key.decrypt(give_id, cdata) filename = '%06d_%s.obj' % (i, bin_to_hex(id)) print('Dumping', filename) with open(filename, 'wb') as fd: @@ -1789,43 +1797,13 @@ class Archiver: For compressible data, it uses the given C[,L] compression - with C[,L] being any valid compression specifier. - The decision about which compression to use is done by borg like this: - - 1. find a compression specifier (per file): - match the path/filename against all patterns in all --compression-from - files (if any). If a pattern matches, use the compression spec given for - that pattern. If no pattern matches (and also if you do not give any - --compression-from option), default to the compression spec given by - --compression. See docs/misc/compression.conf for an example config. - - 2. if the found compression spec is not "auto", the decision is taken: - use the found compression spec. - - 3. if the found compression spec is "auto", test compressibility of each - chunk using lz4. - If it is compressible, use the C,[L] compression spec given within the - "auto" specifier. If it is not compressible, use no compression. - Examples:: borg create --compression lz4 REPO::ARCHIVE data borg create --compression zlib REPO::ARCHIVE data borg create --compression zlib,1 REPO::ARCHIVE data borg create --compression auto,lzma,6 REPO::ARCHIVE data - borg create --compression-from compression.conf --compression auto,lzma ... - - compression.conf has entries like:: - - # example config file for --compression-from option - # - # Format of non-comment / non-empty lines: - # : - # compression-spec is same format as for --compression option - # path/filename pattern is same format as for --exclude option - none:*.gz - none:*.zip - none:*.mp3 - none:*.ogg + borg create --compression auto,lzma ... General remarks: @@ -2411,14 +2389,9 @@ class Archiver: help='specify the chunker parameters (CHUNK_MIN_EXP, CHUNK_MAX_EXP, ' 'HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %d,%d,%d,%d' % CHUNKER_PARAMS) archive_group.add_argument('-C', '--compression', dest='compression', - type=CompressionSpec, default=ComprSpec(name='lz4', spec=None), metavar='COMPRESSION', + type=CompressionSpec, default=CompressionSpec('lz4'), metavar='COMPRESSION', help='select compression algorithm, see the output of the ' '"borg help compression" command for details.') - archive_group.add_argument('--compression-from', dest='compression_files', - type=argparse.FileType('r'), action='append', - metavar='COMPRESSIONCONFIG', - help='read compression patterns from COMPRESSIONCONFIG, see the output of the ' - '"borg help compression" command for details.') subparser.add_argument('location', metavar='ARCHIVE', type=location_validator(archive=True), @@ -2954,7 +2927,7 @@ class Archiver: resulting archive will only contain files from these PATHs. Note that all paths in an archive are relative, therefore absolute patterns/paths - will *not* match (--exclude, --exclude-from, --compression-from, PATHs). + will *not* match (--exclude, --exclude-from, PATHs). --compression: all chunks seen will be stored using the given method. Due to how Borg stores compressed size information this might display @@ -3049,11 +3022,6 @@ class Archiver: archive_group.add_argument('--always-recompress', dest='always_recompress', action='store_true', help='always recompress chunks, don\'t skip chunks already compressed with the same ' 'algorithm.') - archive_group.add_argument('--compression-from', dest='compression_files', - type=argparse.FileType('r'), action='append', - metavar='COMPRESSIONCONFIG', - help='read compression patterns from COMPRESSIONCONFIG, see the output of the ' - '"borg help compression" command for details.') archive_group.add_argument('--chunker-params', dest='chunker_params', type=ChunkerParams, default=CHUNKER_PARAMS, metavar='PARAMS', diff --git a/src/borg/cache.py b/src/borg/cache.py index b3d7e12f2..e1138e7a0 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -424,14 +424,14 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}""" def fetch_and_build_idx(archive_id, repository, key, chunk_idx): cdata = repository.get(archive_id) - _, data = key.decrypt(archive_id, cdata) + data = key.decrypt(archive_id, cdata) chunk_idx.add(archive_id, 1, len(data), len(cdata)) archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) if archive.version != 1: raise Exception('Unknown archive metadata version') unpacker = msgpack.Unpacker() for item_id, chunk in zip(archive.items, repository.get_many(archive.items)): - _, data = key.decrypt(item_id, chunk) + data = key.decrypt(item_id, chunk) chunk_idx.add(item_id, 1, len(data), len(chunk)) unpacker.feed(data) for item in unpacker: @@ -527,7 +527,7 @@ Chunk index: {0.total_unique_chunks:20d} {0.total_chunks:20d}""" def add_chunk(self, id, chunk, stats, overwrite=False, wait=True): if not self.txn_active: self.begin_txn() - size = len(chunk.data) + size = len(chunk) refcount = self.seen_chunk(id, size) if refcount and not overwrite: return self.chunk_incref(id, stats) diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx index 786e19fd1..c226d494a 100644 --- a/src/borg/compress.pyx +++ b/src/borg/compress.pyx @@ -1,4 +1,22 @@ +""" +borg.compress +============= + +Compression is applied to chunks after ID hashing (so the ID is a direct function of the +plain chunk, compression is irrelevant to it), and of course before encryption. + +The "auto" mode (e.g. --compression auto,lzma,4) is implemented as a meta Compressor, +meaning that Auto acts like a Compressor, but defers actual work to others (namely +LZ4 as a heuristic whether compression is worth it, and the specified Compressor +for the actual compression). + +Decompression is normally handled through Compressor.decompress which will detect +which compressor has been used to compress the data and dispatch to the correct +decompressor. +""" + import zlib + try: import lzma except ImportError: @@ -6,7 +24,7 @@ except ImportError: from .helpers import Buffer, DecompressionError -API_VERSION = '1.1_02' +API_VERSION = '1.1_03' cdef extern from "lz4.h": int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil @@ -34,11 +52,34 @@ cdef class CompressorBase: def __init__(self, **kwargs): pass + def decide(self, data): + """ + Return which compressor will perform the actual compression for *data*. + + This exists for a very specific case: If borg recreate is instructed to recompress + using Auto compression it needs to determine the _actual_ target compression of a chunk + in order to detect whether it should be recompressed. + + For all Compressors that are not Auto this always returns *self*. + """ + return self + def compress(self, data): + """ + Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor, + which is needed so that the correct decompressor can be used for decompression. + """ # add ID bytes return self.ID + data def decompress(self, data): + """ + Decompress *data* (bytes) and return bytes result. The leading Compressor ID + bytes need to be present. + + Only handles input generated by _this_ Compressor - for a general purpose + decompression method see *Compressor.decompress*. + """ # strip ID bytes return data[2:] @@ -179,12 +220,64 @@ class ZLIB(CompressorBase): raise DecompressionError(str(e)) from None +class Auto(CompressorBase): + """ + Meta-Compressor that decides which compression to use based on LZ4's ratio. + + As a meta-Compressor the actual compression is deferred to other Compressors, + therefore this Compressor has no ID, no detect() and no decompress(). + """ + + ID = None + name = 'auto' + + def __init__(self, compressor): + super().__init__() + self.compressor = compressor + self.lz4 = get_compressor('lz4') + self.none = get_compressor('none') + + def _decide(self, data): + """ + Decides what to do with *data*. Returns (compressor, lz4_data). + + *lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None. + """ + lz4_data = self.lz4.compress(data) + ratio = len(lz4_data) / len(data) + if ratio < 0.97: + return self.compressor, None + elif ratio < 1: + return self.lz4, lz4_data + else: + return self.none, None + + def decide(self, data): + return self._decide(data)[0] + + def compress(self, data): + compressor, lz4_data = self._decide(data) + if lz4_data is None: + return compressor.compress(data) + else: + return lz4_data + + def decompress(self, data): + raise NotImplementedError + + def detect(cls, data): + raise NotImplementedError + + +# Maps valid compressor names to their class COMPRESSOR_TABLE = { CNONE.name: CNONE, LZ4.name: LZ4, ZLIB.name: ZLIB, LZMA.name: LZMA, + Auto.name: Auto, } +# List of possible compression types. Does not include Auto, since it is a meta-Compressor. COMPRESSOR_LIST = [LZ4, CNONE, ZLIB, LZMA, ] # check fast stuff first def get_compressor(name, **kwargs): @@ -216,3 +309,42 @@ class Compressor: return cls else: raise ValueError('No decompressor for this data found: %r.', data[:2]) + + +class CompressionSpec: + def __init__(self, s): + values = s.split(',') + count = len(values) + if count < 1: + raise ValueError + # --compression algo[,level] + self.name = values[0] + if self.name in ('none', 'lz4', ): + return + elif self.name in ('zlib', 'lzma', ): + if count < 2: + level = 6 # default compression level in py stdlib + elif count == 2: + level = int(values[1]) + if not 0 <= level <= 9: + raise ValueError + else: + raise ValueError + self.level = level + elif self.name == 'auto': + if 2 <= count <= 3: + compression = ','.join(values[1:]) + else: + raise ValueError + self.inner = CompressionSpec(compression) + else: + raise ValueError + + @property + def compressor(self): + if self.name in ('none', 'lz4', ): + return get_compressor(self.name) + elif self.name in ('zlib', 'lzma', ): + return get_compressor(self.name, level=self.level) + elif self.name == 'auto': + return get_compressor(self.name, compressor=self.inner.compressor) diff --git a/src/borg/fuse.py b/src/borg/fuse.py index 33c6b3897..fc19e6e04 100644 --- a/src/borg/fuse.py +++ b/src/borg/fuse.py @@ -144,7 +144,7 @@ class FuseOperations(llfuse.Operations): self.file_versions = {} # for versions mode: original path -> version unpacker = msgpack.Unpacker() for key, chunk in zip(archive.metadata.items, self.repository.get_many(archive.metadata.items)): - _, data = self.key.decrypt(key, chunk) + data = self.key.decrypt(key, chunk) unpacker.feed(data) for item in unpacker: item = Item(internal_dict=item) @@ -340,7 +340,7 @@ class FuseOperations(llfuse.Operations): # evict fully read chunk from cache del self.data_cache[id] else: - _, data = self.key.decrypt(id, self.repository.get(id)) + data = self.key.decrypt(id, self.repository.get(id)) if offset + n < len(data): # chunk was only partially read, cache it self.data_cache[id] = data diff --git a/src/borg/helpers.py b/src/borg/helpers.py index 2e343e4e7..902132432 100644 --- a/src/borg/helpers.py +++ b/src/borg/helpers.py @@ -44,13 +44,6 @@ from . import hashindex from . import shellpattern from .constants import * # NOQA -# meta dict, data bytes -_Chunk = namedtuple('_Chunk', 'meta data') - - -def Chunk(data, **meta): - return _Chunk(meta, data) - ''' The global exit_code variable is used so that modules other than archiver can increase the program exit code if a @@ -123,7 +116,7 @@ def check_extension_modules(): raise ExtensionModuleError if chunker.API_VERSION != '1.1_01': raise ExtensionModuleError - if compress.API_VERSION != '1.1_02': + if compress.API_VERSION != '1.1_03': raise ExtensionModuleError if crypto.API_VERSION != '1.1_01': raise ExtensionModuleError @@ -247,7 +240,7 @@ class Manifest: if not key: key = key_factory(repository, cdata) manifest = cls(key, repository) - data = key.decrypt(None, cdata).data + data = key.decrypt(None, cdata) manifest_dict, manifest.tam_verified = key.unpack_and_verify_manifest(data, force_tam_not_required=force_tam_not_required) m = ManifestItem(internal_dict=manifest_dict) manifest.id = key.id_hash(data) @@ -292,7 +285,7 @@ class Manifest: self.tam_verified = True data = self.key.pack_and_authenticate_metadata(manifest.as_dict()) self.id = self.key.id_hash(data) - self.repository.put(self.MANIFEST_ID, self.key.encrypt(Chunk(data, compression={'name': 'none'}))) + self.repository.put(self.MANIFEST_ID, self.key.encrypt(data)) def prune_within(archives, within): @@ -726,37 +719,6 @@ def ChunkerParams(s): return int(chunk_min), int(chunk_max), int(chunk_mask), int(window_size) -ComprSpec = namedtuple('ComprSpec', ('name', 'spec')) - - -def CompressionSpec(s): - values = s.split(',') - count = len(values) - if count < 1: - raise ValueError - # --compression algo[,level] - name = values[0] - if name in ('none', 'lz4', ): - return ComprSpec(name=name, spec=None) - if name in ('zlib', 'lzma', ): - if count < 2: - level = 6 # default compression level in py stdlib - elif count == 2: - level = int(values[1]) - if not 0 <= level <= 9: - raise ValueError - else: - raise ValueError - return ComprSpec(name=name, spec=level) - if name == 'auto': - if 2 <= count <= 3: - compression = ','.join(values[1:]) - else: - raise ValueError - return ComprSpec(name=name, spec=CompressionSpec(compression)) - raise ValueError - - def dir_is_cachedir(path): """Determines whether the specified path is a cache directory (and therefore should potentially be excluded from the backup) according to @@ -1940,7 +1902,7 @@ class ItemFormatter(BaseFormatter): if 'chunks' not in item: return "" hash = hashlib.new(hash_function) - for _, data in self.archive.pipeline.fetch_many([c.id for c in item.chunks]): + for data in self.archive.pipeline.fetch_many([c.id for c in item.chunks]): hash.update(data) return hash.hexdigest() @@ -1965,7 +1927,7 @@ class ChunkIteratorFileWrapper: if not remaining: try: chunk = next(self.chunk_iterator) - self.chunk = memoryview(chunk.data) + self.chunk = memoryview(chunk) except StopIteration: self.exhausted = True return 0 # EOF @@ -2127,73 +2089,6 @@ def clean_lines(lines, lstrip=None, rstrip=None, remove_empty=True, remove_comme yield line -class CompressionDecider1: - def __init__(self, compression, compression_files): - """ - Initialize a CompressionDecider instance (and read config files, if needed). - - :param compression: default CompressionSpec (e.g. from --compression option) - :param compression_files: list of compression config files (e.g. from --compression-from) or - a list of other line iterators - """ - self.compression = compression - if not compression_files: - self.matcher = None - else: - self.matcher = PatternMatcher(fallback=compression) - for file in compression_files: - try: - for line in clean_lines(file): - try: - compr_spec, fn_pattern = line.split(':', 1) - except: - continue - self.matcher.add([parse_pattern(fn_pattern)], CompressionSpec(compr_spec)) - finally: - if hasattr(file, 'close'): - file.close() - - def decide(self, path): - if self.matcher is not None: - return self.matcher.match(path) - return self.compression - - -class CompressionDecider2: - logger = create_logger('borg.debug.file-compression') - - def __init__(self, compression): - self.compression = compression - - def decide(self, chunk): - # nothing fancy here yet: we either use what the metadata says or the default - # later, we can decide based on the chunk data also. - # if we compress the data here to decide, we can even update the chunk data - # and modify the metadata as desired. - compr_spec = chunk.meta.get('compress', self.compression) - if compr_spec.name == 'auto': - # we did not decide yet, use heuristic: - compr_spec, chunk = self.heuristic_lz4(compr_spec, chunk) - return compr_spec, chunk - - def heuristic_lz4(self, compr_args, chunk): - from .compress import get_compressor - meta, data = chunk - lz4 = get_compressor('lz4') - cdata = lz4.compress(data) - data_len = len(data) - cdata_len = len(cdata) - if cdata_len < 0.97 * data_len: - compr_spec = compr_args.spec - else: - # uncompressible - we could have a special "uncompressible compressor" - # that marks such data as uncompressible via compression-type metadata. - compr_spec = CompressionSpec('none') - self.logger.debug("len(data) == %d, len(lz4(data)) == %d, ratio == %.3f, choosing %s", data_len, cdata_len, cdata_len/data_len, compr_spec) - meta['compress'] = compr_spec - return compr_spec, Chunk(data, **meta) - - class ErrorIgnoringTextIOWrapper(io.TextIOWrapper): def read(self, n): if not self.closed: diff --git a/src/borg/key.py b/src/borg/key.py index 3d3cfc53e..6cefba248 100644 --- a/src/borg/key.py +++ b/src/borg/key.py @@ -13,14 +13,13 @@ from .logger import create_logger logger = create_logger() from .constants import * # NOQA -from .compress import Compressor, get_compressor +from .compress import Compressor from .crypto import AES, bytes_to_long, bytes_to_int, num_aes_blocks, hmac_sha256, blake2b_256, hkdf_hmac_sha512 -from .helpers import Chunk, StableDict +from .helpers import StableDict from .helpers import Error, IntegrityError from .helpers import yes from .helpers import get_keys_dir, get_security_dir from .helpers import bin_to_hex -from .helpers import CompressionDecider2, CompressionSpec from .item import Key, EncryptedKey from .platform import SaveFile from .nonces import NonceManager @@ -143,21 +142,16 @@ class KeyBase: self.TYPE_STR = bytes([self.TYPE]) self.repository = repository self.target = None # key location file path / repo obj - self.compression_decider2 = CompressionDecider2(CompressionSpec('none')) - self.compressor = Compressor('none') # for decompression + # Some commands write new chunks (e.g. rename) but don't take a --compression argument. This duplicates + # the default used by those commands who do take a --compression argument. + self.compressor = Compressor('lz4') + self.decompress = self.compressor.decompress self.tam_required = True def id_hash(self, data): """Return HMAC hash using the "id" HMAC key """ - def compress(self, chunk): - compr_args, chunk = self.compression_decider2.decide(chunk) - compressor = Compressor(name=compr_args.name, level=compr_args.spec) - meta, data = chunk - data = compressor.compress(data) - return Chunk(data, **meta) - def encrypt(self, chunk): pass @@ -258,8 +252,8 @@ class PlaintextKey(KeyBase): return sha256(data).digest() def encrypt(self, chunk): - chunk = self.compress(chunk) - return b''.join([self.TYPE_STR, chunk.data]) + data = self.compressor.compress(chunk) + return b''.join([self.TYPE_STR, data]) def decrypt(self, id, data, decompress=True): if data[0] != self.TYPE: @@ -267,10 +261,10 @@ class PlaintextKey(KeyBase): raise IntegrityError('Chunk %s: Invalid encryption envelope' % id_str) payload = memoryview(data)[1:] if not decompress: - return Chunk(payload) - data = self.compressor.decompress(payload) + return payload + data = self.decompress(payload) self.assert_id(id, data) - return Chunk(data) + return data def _tam_key(self, salt, context): return salt + context @@ -336,10 +330,10 @@ class AESKeyBase(KeyBase): MAC = hmac_sha256 def encrypt(self, chunk): - chunk = self.compress(chunk) - self.nonce_manager.ensure_reservation(num_aes_blocks(len(chunk.data))) + data = self.compressor.compress(chunk) + self.nonce_manager.ensure_reservation(num_aes_blocks(len(data))) self.enc_cipher.reset() - data = b''.join((self.enc_cipher.iv[8:], self.enc_cipher.encrypt(chunk.data))) + data = b''.join((self.enc_cipher.iv[8:], self.enc_cipher.encrypt(data))) assert (self.MAC is blake2b_256 and len(self.enc_hmac_key) == 128 or self.MAC is hmac_sha256 and len(self.enc_hmac_key) == 32) hmac = self.MAC(self.enc_hmac_key, data) @@ -361,10 +355,10 @@ class AESKeyBase(KeyBase): self.dec_cipher.reset(iv=PREFIX + data[33:41]) payload = self.dec_cipher.decrypt(data_view[41:]) if not decompress: - return Chunk(payload) - data = self.compressor.decompress(payload) + return payload + data = self.decompress(payload) self.assert_id(id, data) - return Chunk(data) + return data def extract_nonce(self, payload): if not (payload[0] == self.TYPE or @@ -748,18 +742,18 @@ class AuthenticatedKey(ID_BLAKE2b_256, RepoKey): STORAGE = KeyBlobStorage.REPO def encrypt(self, chunk): - chunk = self.compress(chunk) - return b''.join([self.TYPE_STR, chunk.data]) + data = self.compressor.compress(chunk) + return b''.join([self.TYPE_STR, data]) def decrypt(self, id, data, decompress=True): if data[0] != self.TYPE: raise IntegrityError('Chunk %s: Invalid envelope' % bin_to_hex(id)) payload = memoryview(data)[1:] if not decompress: - return Chunk(payload) - data = self.compressor.decompress(payload) + return payload + data = self.decompress(payload) self.assert_id(id, data) - return Chunk(data) + return data AVAILABLE_KEY_TYPES = ( diff --git a/src/borg/testsuite/archive.py b/src/borg/testsuite/archive.py index dc172d438..3d82a4ceb 100644 --- a/src/borg/testsuite/archive.py +++ b/src/borg/testsuite/archive.py @@ -72,8 +72,8 @@ class MockCache: self.repository = self.MockRepo() def add_chunk(self, id, chunk, stats=None, wait=True): - self.objects[id] = chunk.data - return id, len(chunk.data), len(chunk.data) + self.objects[id] = chunk + return id, len(chunk), len(chunk) class ArchiveTimestampTestCase(BaseTestCase): diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 874987263..287dfe2c2 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -34,7 +34,7 @@ from ..cache import Cache from ..constants import * # NOQA from ..crypto import bytes_to_long, num_aes_blocks from ..helpers import PatternMatcher, parse_pattern, Location, get_security_dir -from ..helpers import Chunk, Manifest +from ..helpers import Manifest from ..helpers import EXIT_SUCCESS, EXIT_WARNING, EXIT_ERROR from ..helpers import bin_to_hex from ..item import Item @@ -2449,7 +2449,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase): 'version': 1, }) archive_id = key.id_hash(archive) - repository.put(archive_id, key.encrypt(Chunk(archive))) + repository.put(archive_id, key.encrypt(archive)) repository.commit() self.cmd('check', self.repository_location, exit_code=1) self.cmd('check', '--repair', self.repository_location, exit_code=0) @@ -2537,12 +2537,12 @@ class ManifestAuthenticationTest(ArchiverTestCaseBase): def spoof_manifest(self, repository): with repository: _, key = Manifest.load(repository) - repository.put(Manifest.MANIFEST_ID, key.encrypt(Chunk(msgpack.packb({ + repository.put(Manifest.MANIFEST_ID, key.encrypt(msgpack.packb({ 'version': 1, 'archives': {}, 'config': {}, 'timestamp': (datetime.utcnow() + timedelta(days=1)).isoformat(), - })))) + }))) repository.commit() def test_fresh_init_tam_required(self): @@ -2550,11 +2550,11 @@ class ManifestAuthenticationTest(ArchiverTestCaseBase): repository = Repository(self.repository_path, exclusive=True) with repository: manifest, key = Manifest.load(repository) - repository.put(Manifest.MANIFEST_ID, key.encrypt(Chunk(msgpack.packb({ + repository.put(Manifest.MANIFEST_ID, key.encrypt(msgpack.packb({ 'version': 1, 'archives': {}, 'timestamp': (datetime.utcnow() + timedelta(days=1)).isoformat(), - })))) + }))) repository.commit() with pytest.raises(TAMRequiredError): @@ -2570,9 +2570,9 @@ class ManifestAuthenticationTest(ArchiverTestCaseBase): key.tam_required = False key.change_passphrase(key._passphrase) - manifest = msgpack.unpackb(key.decrypt(None, repository.get(Manifest.MANIFEST_ID)).data) + manifest = msgpack.unpackb(key.decrypt(None, repository.get(Manifest.MANIFEST_ID))) del manifest[b'tam'] - repository.put(Manifest.MANIFEST_ID, key.encrypt(Chunk(msgpack.packb(manifest)))) + repository.put(Manifest.MANIFEST_ID, key.encrypt(msgpack.packb(manifest))) repository.commit() output = self.cmd('list', '--debug', self.repository_location) assert 'archive1234' in output @@ -2844,8 +2844,8 @@ def test_get_args(): def test_compare_chunk_contents(): def ccc(a, b): - chunks_a = [Chunk(data) for data in a] - chunks_b = [Chunk(data) for data in b] + chunks_a = [data for data in a] + chunks_b = [data for data in b] compare1 = Archiver.compare_chunk_contents(iter(chunks_a), iter(chunks_b)) compare2 = Archiver.compare_chunk_contents(iter(chunks_b), iter(chunks_a)) assert compare1 == compare2 diff --git a/src/borg/testsuite/compress.py b/src/borg/testsuite/compress.py index ff9d42713..ee6da55a1 100644 --- a/src/borg/testsuite/compress.py +++ b/src/borg/testsuite/compress.py @@ -7,7 +7,7 @@ except ImportError: import pytest -from ..compress import get_compressor, Compressor, CNONE, ZLIB, LZ4 +from ..compress import get_compressor, Compressor, CompressionSpec, CNONE, ZLIB, LZ4, LZMA, Auto buffer = bytes(2**16) @@ -107,3 +107,48 @@ def test_compressor(): for params in params_list: c = Compressor(**params) assert data == c.decompress(c.compress(data)) + + +def test_auto(): + compressor = CompressionSpec('auto,zlib,9').compressor + + compressed = compressor.compress(bytes(500)) + assert Compressor.detect(compressed) == ZLIB + + compressed = compressor.compress(b'\x00\xb8\xa3\xa2-O\xe1i\xb6\x12\x03\xc21\xf3\x8a\xf78\\\x01\xa5b\x07\x95\xbeE\xf8\xa3\x9ahm\xb1~') + assert Compressor.detect(compressed) == CNONE + + +def test_compression_specs(): + with pytest.raises(ValueError): + CompressionSpec('') + + assert isinstance(CompressionSpec('none').compressor, CNONE) + assert isinstance(CompressionSpec('lz4').compressor, LZ4) + + zlib = CompressionSpec('zlib').compressor + assert isinstance(zlib, ZLIB) + assert zlib.level == 6 + zlib = CompressionSpec('zlib,0').compressor + assert isinstance(zlib, ZLIB) + assert zlib.level == 0 + zlib = CompressionSpec('zlib,9').compressor + assert isinstance(zlib, ZLIB) + assert zlib.level == 9 + with pytest.raises(ValueError): + CompressionSpec('zlib,9,invalid') + + lzma = CompressionSpec('lzma').compressor + assert isinstance(lzma, LZMA) + assert lzma.level == 6 + lzma = CompressionSpec('lzma,0').compressor + assert isinstance(lzma, LZMA) + assert lzma.level == 0 + lzma = CompressionSpec('lzma,9').compressor + assert isinstance(lzma, LZMA) + assert lzma.level == 9 + + with pytest.raises(ValueError): + CompressionSpec('lzma,9,invalid') + with pytest.raises(ValueError): + CompressionSpec('invalid') diff --git a/src/borg/testsuite/helpers.py b/src/borg/testsuite/helpers.py index 19c5e9c51..075967c4a 100644 --- a/src/borg/testsuite/helpers.py +++ b/src/borg/testsuite/helpers.py @@ -21,10 +21,9 @@ from ..helpers import get_cache_dir, get_keys_dir, get_security_dir from ..helpers import is_slow_msgpack from ..helpers import yes, TRUISH, FALSISH, DEFAULTISH from ..helpers import StableDict, int_to_bigint, bigint_to_int, bin_to_hex -from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams, Chunk +from ..helpers import parse_timestamp, ChunkIteratorFileWrapper, ChunkerParams from ..helpers import ProgressIndicatorPercent, ProgressIndicatorEndless from ..helpers import load_exclude_file, load_pattern_file -from ..helpers import CompressionSpec, ComprSpec, CompressionDecider1, CompressionDecider2 from ..helpers import parse_pattern, PatternMatcher from ..helpers import PathFullPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, RegexPattern from ..helpers import swidth_slice @@ -698,25 +697,6 @@ def test_pattern_matcher(): assert PatternMatcher(fallback="hey!").fallback == "hey!" -def test_compression_specs(): - with pytest.raises(ValueError): - CompressionSpec('') - assert CompressionSpec('none') == ComprSpec(name='none', spec=None) - assert CompressionSpec('lz4') == ComprSpec(name='lz4', spec=None) - assert CompressionSpec('zlib') == ComprSpec(name='zlib', spec=6) - assert CompressionSpec('zlib,0') == ComprSpec(name='zlib', spec=0) - assert CompressionSpec('zlib,9') == ComprSpec(name='zlib', spec=9) - with pytest.raises(ValueError): - CompressionSpec('zlib,9,invalid') - assert CompressionSpec('lzma') == ComprSpec(name='lzma', spec=6) - assert CompressionSpec('lzma,0') == ComprSpec(name='lzma', spec=0) - assert CompressionSpec('lzma,9') == ComprSpec(name='lzma', spec=9) - with pytest.raises(ValueError): - CompressionSpec('lzma,9,invalid') - with pytest.raises(ValueError): - CompressionSpec('invalid') - - def test_chunkerparams(): assert ChunkerParams('19,23,21,4095') == (19, 23, 21, 4095) assert ChunkerParams('10,23,16,4095') == (10, 23, 16, 4095) @@ -1178,7 +1158,7 @@ def test_partial_format(): def test_chunk_file_wrapper(): - cfw = ChunkIteratorFileWrapper(iter([Chunk(b'abc'), Chunk(b'def')])) + cfw = ChunkIteratorFileWrapper(iter([b'abc', b'def'])) assert cfw.read(2) == b'ab' assert cfw.read(50) == b'cdef' assert cfw.exhausted @@ -1220,38 +1200,6 @@ data2 assert list(clean_lines(conf, remove_comments=False)) == ['#comment', 'data1 #data1', 'data2', 'data3', ] -def test_compression_decider1(): - default = CompressionSpec('zlib') - conf = """ -# use super-fast lz4 compression on huge VM files in this path: -lz4:/srv/vm_disks - -# jpeg or zip files do not compress: -none:*.jpeg -none:*.zip -""".splitlines() - - cd = CompressionDecider1(default, []) # no conf, always use default - assert cd.decide('/srv/vm_disks/linux').name == 'zlib' - assert cd.decide('test.zip').name == 'zlib' - assert cd.decide('test').name == 'zlib' - - cd = CompressionDecider1(default, [conf, ]) - assert cd.decide('/srv/vm_disks/linux').name == 'lz4' - assert cd.decide('test.zip').name == 'none' - assert cd.decide('test').name == 'zlib' # no match in conf, use default - - -def test_compression_decider2(): - default = CompressionSpec('zlib') - - cd = CompressionDecider2(default) - compr_spec, chunk = cd.decide(Chunk(None)) - assert compr_spec.name == 'zlib' - compr_spec, chunk = cd.decide(Chunk(None, compress=CompressionSpec('lzma'))) - assert compr_spec.name == 'lzma' - - def test_format_line(): data = dict(foo='bar baz') assert format_line('', data) == '' diff --git a/src/borg/testsuite/key.py b/src/borg/testsuite/key.py index dd3448c27..e92f9f0cd 100644 --- a/src/borg/testsuite/key.py +++ b/src/borg/testsuite/key.py @@ -9,7 +9,7 @@ import msgpack from ..crypto import bytes_to_long, num_aes_blocks from ..helpers import Location -from ..helpers import Chunk, StableDict +from ..helpers import StableDict from ..helpers import IntegrityError from ..helpers import get_security_dir from ..key import PlaintextKey, PassphraseKey, KeyfileKey, RepoKey, Blake2KeyfileKey, Blake2RepoKey, AuthenticatedKey @@ -104,17 +104,17 @@ class TestKey: def test_plaintext(self): key = PlaintextKey.create(None, None) - chunk = Chunk(b'foo') - assert hexlify(key.id_hash(chunk.data)) == b'2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae' - assert chunk == key.decrypt(key.id_hash(chunk.data), key.encrypt(chunk)) + chunk = b'foo' + assert hexlify(key.id_hash(chunk)) == b'2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae' + assert chunk == key.decrypt(key.id_hash(chunk), key.encrypt(chunk)) def test_keyfile(self, monkeypatch, keys_dir): monkeypatch.setenv('BORG_PASSPHRASE', 'test') key = KeyfileKey.create(self.MockRepository(), self.MockArgs()) assert bytes_to_long(key.enc_cipher.iv, 8) == 0 - manifest = key.encrypt(Chunk(b'ABC')) + manifest = key.encrypt(b'ABC') assert key.extract_nonce(manifest) == 0 - manifest2 = key.encrypt(Chunk(b'ABC')) + manifest2 = key.encrypt(b'ABC') assert manifest != manifest2 assert key.decrypt(None, manifest) == key.decrypt(None, manifest2) assert key.extract_nonce(manifest2) == 1 @@ -124,8 +124,8 @@ class TestKey: # Key data sanity check assert len({key2.id_key, key2.enc_key, key2.enc_hmac_key}) == 3 assert key2.chunk_seed != 0 - chunk = Chunk(b'foo') - assert chunk == key2.decrypt(key.id_hash(chunk.data), key.encrypt(chunk)) + chunk = b'foo' + assert chunk == key2.decrypt(key.id_hash(chunk), key.encrypt(chunk)) def test_keyfile_nonce_rollback_protection(self, monkeypatch, keys_dir): monkeypatch.setenv('BORG_PASSPHRASE', 'test') @@ -133,9 +133,9 @@ class TestKey: with open(os.path.join(get_security_dir(repository.id_str), 'nonce'), "w") as fd: fd.write("0000000000002000") key = KeyfileKey.create(repository, self.MockArgs()) - data = key.encrypt(Chunk(b'ABC')) + data = key.encrypt(b'ABC') assert key.extract_nonce(data) == 0x2000 - assert key.decrypt(None, data).data == b'ABC' + assert key.decrypt(None, data) == b'ABC' def test_keyfile_kfenv(self, tmpdir, monkeypatch): keyfile = tmpdir.join('keyfile') @@ -144,8 +144,8 @@ class TestKey: assert not keyfile.exists() key = KeyfileKey.create(self.MockRepository(), self.MockArgs()) assert keyfile.exists() - chunk = Chunk(b'ABC') - chunk_id = key.id_hash(chunk.data) + chunk = b'ABC' + chunk_id = key.id_hash(chunk) chunk_cdata = key.encrypt(chunk) key = KeyfileKey.detect(self.MockRepository(), chunk_cdata) assert chunk == key.decrypt(chunk_id, chunk_cdata) @@ -158,7 +158,7 @@ class TestKey: fd.write(self.keyfile2_key_file) monkeypatch.setenv('BORG_PASSPHRASE', 'passphrase') key = KeyfileKey.detect(self.MockRepository(), self.keyfile2_cdata) - assert key.decrypt(self.keyfile2_id, self.keyfile2_cdata).data == b'payload' + assert key.decrypt(self.keyfile2_id, self.keyfile2_cdata) == b'payload' def test_keyfile2_kfenv(self, tmpdir, monkeypatch): keyfile = tmpdir.join('keyfile') @@ -167,14 +167,14 @@ class TestKey: monkeypatch.setenv('BORG_KEY_FILE', str(keyfile)) monkeypatch.setenv('BORG_PASSPHRASE', 'passphrase') key = KeyfileKey.detect(self.MockRepository(), self.keyfile2_cdata) - assert key.decrypt(self.keyfile2_id, self.keyfile2_cdata).data == b'payload' + assert key.decrypt(self.keyfile2_id, self.keyfile2_cdata) == b'payload' def test_keyfile_blake2(self, monkeypatch, keys_dir): with keys_dir.join('keyfile').open('w') as fd: fd.write(self.keyfile_blake2_key_file) monkeypatch.setenv('BORG_PASSPHRASE', 'passphrase') key = Blake2KeyfileKey.detect(self.MockRepository(), self.keyfile_blake2_cdata) - assert key.decrypt(self.keyfile_blake2_id, self.keyfile_blake2_cdata).data == b'payload' + assert key.decrypt(self.keyfile_blake2_id, self.keyfile_blake2_cdata) == b'payload' def test_passphrase(self, keys_dir, monkeypatch): monkeypatch.setenv('BORG_PASSPHRASE', 'test') @@ -184,9 +184,9 @@ class TestKey: assert hexlify(key.enc_hmac_key) == b'b885a05d329a086627412a6142aaeb9f6c54ab7950f996dd65587251f6bc0901' assert hexlify(key.enc_key) == b'2ff3654c6daf7381dbbe718d2b20b4f1ea1e34caa6cc65f6bb3ac376b93fed2a' assert key.chunk_seed == -775740477 - manifest = key.encrypt(Chunk(b'ABC')) + manifest = key.encrypt(b'ABC') assert key.extract_nonce(manifest) == 0 - manifest2 = key.encrypt(Chunk(b'ABC')) + manifest2 = key.encrypt(b'ABC') assert manifest != manifest2 assert key.decrypt(None, manifest) == key.decrypt(None, manifest2) assert key.extract_nonce(manifest2) == 1 @@ -197,9 +197,9 @@ class TestKey: assert key.enc_hmac_key == key2.enc_hmac_key assert key.enc_key == key2.enc_key assert key.chunk_seed == key2.chunk_seed - chunk = Chunk(b'foo') - assert hexlify(key.id_hash(chunk.data)) == b'818217cf07d37efad3860766dcdf1d21e401650fed2d76ed1d797d3aae925990' - assert chunk == key2.decrypt(key2.id_hash(chunk.data), key.encrypt(chunk)) + chunk = b'foo' + assert hexlify(key.id_hash(chunk)) == b'818217cf07d37efad3860766dcdf1d21e401650fed2d76ed1d797d3aae925990' + assert chunk == key2.decrypt(key2.id_hash(chunk), key.encrypt(chunk)) def _corrupt_byte(self, key, data, offset): data = bytearray(data) @@ -224,7 +224,7 @@ class TestKey: key.decrypt(id, data) def test_decrypt_decompress(self, key): - plaintext = Chunk(b'123456789') + plaintext = b'123456789' encrypted = key.encrypt(plaintext) assert key.decrypt(None, encrypted, decompress=False) != plaintext assert key.decrypt(None, encrypted) == plaintext @@ -244,10 +244,11 @@ class TestKey: def test_authenticated_encrypt(self, monkeypatch): monkeypatch.setenv('BORG_PASSPHRASE', 'test') key = AuthenticatedKey.create(self.MockRepository(), self.MockArgs()) - plaintext = Chunk(b'123456789') + plaintext = b'123456789' authenticated = key.encrypt(plaintext) - # 0x06 is the key TYPE, 0x0000 identifies CNONE compression - assert authenticated == b'\x06\x00\x00' + plaintext.data + # 0x06 is the key TYPE, 0x0100 identifies LZ4 compression, 0x90 is part of LZ4 and means that an uncompressed + # block of length nine follows (the plaintext). + assert authenticated == b'\x06\x01\x00\x90' + plaintext class TestPassphrase: