Unify ComprSpec and CompressionSpec; don't instanciate Compressors right away

This commit is contained in:
Marian Beermann 2017-04-01 00:12:16 +02:00
parent d1826cca92
commit 0847c3f9a5
5 changed files with 92 additions and 45 deletions

View file

@ -1660,14 +1660,14 @@ class ArchiveRecreater:
chunk_id = self.key.id_hash(data)
if chunk_id in self.seen_chunks:
return self.cache.chunk_incref(chunk_id, target.stats)
chunk = Chunk(data, compressor=compressor)
overwrite = self.recompress
if self.recompress and not self.always_recompress and chunk_id in self.cache.chunks:
# Check if this chunk is already compressed the way we want it
old_chunk = self.key.decrypt(None, self.repository.get(chunk_id), decompress=False)
if Compressor.detect(old_chunk.data).name == compressor.name:
if Compressor.detect(old_chunk.data).name == compressor.decide(data).name:
# Stored chunk has the same compression we wanted
overwrite = False
chunk = Chunk(data, compressor=compressor)
chunk_entry = self.cache.add_chunk(chunk_id, chunk, target.stats, overwrite=overwrite, wait=False)
self.cache.repository.async_response(wait=False)
self.seen_chunks.add(chunk_entry.id)

View file

@ -108,7 +108,14 @@ def with_repository(fake=False, invert_fake=False, create=False, lock=True, excl
with repository:
if manifest or cache:
kwargs['manifest'], kwargs['key'] = Manifest.load(repository)
if args.__dict__.get('compression'):
# do_recreate uses args.compression is None as in band signalling for "don't recompress",
# note that it does not look at key.compressor. In this case the default compressor applies
# to new chunks.
#
# We can't use a check like `'compression' in args` (an argparse.Namespace speciality),
# since the compression attribute is set. So we need to see whether it's set to something
# true-ish, like a CompressionSpec instance.
if getattr(args, 'compression', False):
kwargs['key'].compressor = args.compression.compressor
if cache:
with Cache(repository, kwargs['key'], kwargs['manifest'],

View file

@ -28,17 +28,15 @@ decompressor.
"""
import zlib
from collections import namedtuple
try:
import lzma
except ImportError:
lzma = None
from .logger import create_logger
from .helpers import Buffer, DecompressionError
API_VERSION = '1.1_02'
API_VERSION = '1.1_03'
cdef extern from "lz4.h":
int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) nogil
@ -66,11 +64,34 @@ cdef class CompressorBase:
def __init__(self, **kwargs):
pass
def decide(self, data):
"""
Return which compressor will perform the actual compression for *data*.
This exists for a very specific case: If borg recreate is instructed to recompress
using Auto compression it needs to determine the _actual_ target compression of a chunk
in order to detect whether it should be recompressed.
For all Compressors that are not Auto this always returns *self*.
"""
return self
def compress(self, data):
"""
Compress *data* (bytes) and return bytes result. Prepend the ID bytes of this compressor,
which is needed so that the correct decompressor can be used for decompression.
"""
# add ID bytes
return self.ID + data
def decompress(self, data):
"""
Decompress *data* (bytes) and return bytes result. The leading Compressor ID
bytes need to be present.
Only handles input generated by _this_ Compressor - for a general purpose
decompression method see *Compressor.decompress*.
"""
# strip ID bytes
return data[2:]
@ -222,22 +243,36 @@ class Auto(CompressorBase):
ID = None
name = 'auto'
logger = create_logger('borg.debug.file-compression')
def __init__(self, compressor):
super().__init__()
self.compressor = compressor
self.lz4 = get_compressor('lz4')
self.none = get_compressor('none')
def compress(self, data):
def _decide(self, data):
"""
Decides what to do with *data*. Returns (compressor, lz4_data).
*lz4_data* is the LZ4 result if *compressor* is LZ4 as well, otherwise it is None.
"""
lz4_data = self.lz4.compress(data)
if len(lz4_data) < 0.97 * len(data):
return self.compressor.compress(data)
elif len(lz4_data) < len(data):
return lz4_data
ratio = len(lz4_data) / len(data)
if ratio < 0.97:
return self.compressor, None
elif ratio < 1:
return self.lz4, lz4_data
else:
return self.none.compress(data)
return self.none, None
def decide(self, data):
return self._decide(data)[0]
def compress(self, data):
compressor, lz4_data = self._decide(data)
if lz4_data is None:
return compressor.compress(data)
else:
return lz4_data
def decompress(self, data):
raise NotImplementedError
@ -288,35 +323,40 @@ class Compressor:
raise ValueError('No decompressor for this data found: %r.', data[:2])
ComprSpec = namedtuple('ComprSpec', ('name', 'spec', 'compressor'))
def CompressionSpec(s):
values = s.split(',')
count = len(values)
if count < 1:
raise ValueError
# --compression algo[,level]
name = values[0]
if name == 'none':
return ComprSpec(name=name, spec=None, compressor=CNONE())
elif name == 'lz4':
return ComprSpec(name=name, spec=None, compressor=LZ4())
if name in ('zlib', 'lzma', ):
if count < 2:
level = 6 # default compression level in py stdlib
elif count == 2:
level = int(values[1])
if not 0 <= level <= 9:
class CompressionSpec:
def __init__(self, s):
values = s.split(',')
count = len(values)
if count < 1:
raise ValueError
# --compression algo[,level]
self.name = values[0]
if self.name in ('none', 'lz4', ):
return
elif self.name in ('zlib', 'lzma', ):
if count < 2:
level = 6 # default compression level in py stdlib
elif count == 2:
level = int(values[1])
if not 0 <= level <= 9:
raise ValueError
else:
raise ValueError
self.level = level
elif self.name == 'auto':
if 2 <= count <= 3:
compression = ','.join(values[1:])
else:
raise ValueError
self.inner = CompressionSpec(compression)
else:
raise ValueError
return ComprSpec(name=name, spec=level, compressor=get_compressor(name, level=level))
if name == 'auto':
if 2 <= count <= 3:
compression = ','.join(values[1:])
else:
raise ValueError
inner = CompressionSpec(compression)
return ComprSpec(name=name, spec=inner, compressor=Auto(inner.compressor))
raise ValueError
@property
def compressor(self):
if self.name in ('none', 'lz4', ):
return get_compressor(self.name)
elif self.name in ('zlib', 'lzma', ):
return get_compressor(self.name, level=self.level)
elif self.name == 'auto':
return get_compressor(self.name, compressor=self.inner.compressor)

View file

@ -123,7 +123,7 @@ def check_extension_modules():
raise ExtensionModuleError
if chunker.API_VERSION != '1.1_01':
raise ExtensionModuleError
if compress.API_VERSION != '1.1_02':
if compress.API_VERSION != '1.1_03':
raise ExtensionModuleError
if crypto.API_VERSION != '1.1_01':
raise ExtensionModuleError

View file

@ -7,7 +7,7 @@ except ImportError:
import pytest
from ..compress import get_compressor, Compressor, CompressionSpec, ComprSpec, CNONE, ZLIB, LZ4, LZMA, Auto
from ..compress import get_compressor, Compressor, CompressionSpec, CNONE, ZLIB, LZ4, LZMA, Auto
buffer = bytes(2**16)