mirror of
https://github.com/borgbackup/borg.git
synced 2026-05-28 04:03:21 -04:00
Chunkifier improvements, including tin-foil-hat-compliant seeding.
This commit is contained in:
parent
dafd75f4e6
commit
330315ba0d
4 changed files with 34 additions and 143 deletions
|
|
@ -1,6 +1,9 @@
|
|||
#include <Python.h>
|
||||
#include <structmember.h>
|
||||
|
||||
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
|
||||
#define ABS(X) ((X) < 0 ? (-(X)) : (X))
|
||||
|
||||
static unsigned long int
|
||||
checksum(const unsigned char *data, int len, unsigned long int sum)
|
||||
{
|
||||
|
|
@ -28,7 +31,7 @@ roll_checksum(unsigned long int sum, unsigned char remove, unsigned char add, in
|
|||
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
int chunk_size, window_size, i, last, eof, done, buf_size, data_len, initial;
|
||||
int chunk_size, window_size, i, last, eof, done, buf_size, data_len, seed;
|
||||
PyObject *chunks, *fd;
|
||||
unsigned long int sum;
|
||||
unsigned char *data, add, remove;
|
||||
|
|
@ -43,8 +46,7 @@ ChunkifyIter_iter(PyObject *self)
|
|||
c->eof = 0;
|
||||
c->i = 0;
|
||||
c->sum = 0;
|
||||
c->last = -1;
|
||||
c->initial = c->window_size;
|
||||
c->last = 0;
|
||||
Py_INCREF(self);
|
||||
return self;
|
||||
}
|
||||
|
|
@ -62,6 +64,8 @@ static PyObject*
|
|||
ChunkifyIter_iternext(PyObject *self)
|
||||
{
|
||||
ChunkifyIter *c = (ChunkifyIter *)self;
|
||||
int initial = c->window_size;
|
||||
|
||||
if(c->done)
|
||||
{
|
||||
PyErr_SetNone(PyExc_StopIteration);
|
||||
|
|
@ -72,6 +76,7 @@ ChunkifyIter_iternext(PyObject *self)
|
|||
if(c->i == c->buf_size)
|
||||
{
|
||||
int diff = c->last + 1 - c->window_size;
|
||||
assert(diff >= 0);
|
||||
memmove(c->data, c->data + diff, c->buf_size - diff);
|
||||
c->i -= diff;
|
||||
c->last -= diff;
|
||||
|
|
@ -90,18 +95,20 @@ ChunkifyIter_iternext(PyObject *self)
|
|||
}
|
||||
if(c->i == c->data_len)
|
||||
{
|
||||
if(c->last < c->i - 1) {
|
||||
if(c->last < c->i) {
|
||||
c->done = 1;
|
||||
return PyString_FromStringAndSize((char *)(c->data + c->last + 1),
|
||||
c->data_len - c->last - 1);
|
||||
return PyString_FromStringAndSize((char *)(c->data + c->last),
|
||||
c->data_len - c->last);
|
||||
}
|
||||
PyErr_SetNone(PyExc_StopIteration);
|
||||
return NULL;
|
||||
}
|
||||
if(c->initial)
|
||||
if(initial)
|
||||
{
|
||||
c->initial--;
|
||||
c->sum = checksum(c->data + c->i, 1, c->sum);
|
||||
int bytes = MIN(initial, c->data_len - c->i);
|
||||
initial -= bytes;
|
||||
c->sum = checksum(c->data + c->i, bytes, 0);
|
||||
c->i += bytes;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
@ -109,20 +116,20 @@ ChunkifyIter_iternext(PyObject *self)
|
|||
c->data[c->i - c->window_size],
|
||||
c->data[c->i],
|
||||
c->window_size);
|
||||
c->i++;
|
||||
}
|
||||
c->i++;
|
||||
if(c->i == c->buf_size && c->last == -1)
|
||||
if((c->sum % c->chunk_size) == c->seed)
|
||||
{
|
||||
int old_last = c->last;
|
||||
c->last = c->i - 1;
|
||||
return PyString_FromStringAndSize((char *)(c->data + old_last + 1),
|
||||
c->last = c->i;
|
||||
return PyString_FromStringAndSize((char *)(c->data + old_last),
|
||||
c->last - old_last);
|
||||
}
|
||||
else if((c->sum % c->chunk_size) == 0)
|
||||
if(c->i == c->buf_size && c->last <= c->window_size)
|
||||
{
|
||||
int old_last = c->last;
|
||||
c->last = c->i - 1;
|
||||
return PyString_FromStringAndSize((char *)(c->data + old_last + 1),
|
||||
c->last = c->i;
|
||||
return PyString_FromStringAndSize((char *)(c->data + old_last),
|
||||
c->last - old_last);
|
||||
}
|
||||
}
|
||||
|
|
@ -167,10 +174,10 @@ static PyObject *
|
|||
chunkify(PyObject *self, PyObject *args)
|
||||
{
|
||||
PyObject *fd;
|
||||
long int chunk_size, window_size;
|
||||
int chunk_size, window_size, seed;
|
||||
ChunkifyIter *c;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "Oii", &fd, &chunk_size, &window_size))
|
||||
if (!PyArg_ParseTuple(args, "Oiii", &fd, &chunk_size, &window_size, &seed))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
|
@ -184,6 +191,7 @@ chunkify(PyObject *self, PyObject *args)
|
|||
c->fd = fd;
|
||||
c->chunk_size = chunk_size;
|
||||
c->window_size = window_size;
|
||||
c->seed = seed % chunk_size;
|
||||
Py_INCREF(fd);
|
||||
return (PyObject *)c;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,10 +8,11 @@ import sys
|
|||
from xattr import xattr, XATTR_NOFOLLOW
|
||||
|
||||
from . import NS_ARCHIVE_METADATA, NS_ARCHIVE_ITEMS, NS_ARCHIVE_CHUNKS, NS_CHUNK
|
||||
from .chunkifier import chunkify
|
||||
from ._speedups import chunkify
|
||||
from .helpers import uid2user, user2uid, gid2group, group2gid, IntegrityError
|
||||
|
||||
CHUNK_SIZE = 55001
|
||||
CHUNK_SIZE = 64 * 1024
|
||||
WINDOW_SIZE = 4096
|
||||
|
||||
have_lchmod = hasattr(os, 'lchmod')
|
||||
linux = sys.platform == 'linux2'
|
||||
|
|
@ -253,7 +254,8 @@ class Archive(object):
|
|||
size = 0
|
||||
ids = []
|
||||
chunks = []
|
||||
for chunk in chunkify(fd, CHUNK_SIZE, 30):
|
||||
for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE,
|
||||
self.keychain.get_chunkify_seed()):
|
||||
id = self.keychain.id_hash(chunk)
|
||||
ids.append(id)
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -1,122 +0,0 @@
|
|||
def checksum(data, sum=0):
    """Simple but fast checksum that can be updated at either end.

    The 32-bit result packs two 16-bit running sums: the low half is the sum
    of ``byte + 1`` over ``data`` and the high half is the sum of all
    intermediate low-half values.

    ``data`` may be a string (each item is converted with ``ord``) or any
    iterable of integer byte values such as a ``bytearray``.  ``sum`` is a
    previously returned checksum to continue from, which makes the checksum
    appendable.

    >>> checksum('FOOBAR')
    102367679
    >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
    True
    >>> checksum(bytearray(b'FOOBAR'))
    102367679
    """
    s1 = sum & 0xffff
    s2 = sum >> 16
    for c in data:
        # Items are already integers when iterating a bytearray (or bytes on
        # Python 3); only one-character strings need ord().
        value = c if isinstance(c, int) else ord(c)
        s1 += value + 1
        s2 += s1
    return ((s2 & 0xffff) << 16) + (s1 & 0xffff)
|
||||
|
||||
|
||||
def roll_checksum(sum, remove, add, len):
    """Slide a ``checksum`` window forward by one byte.

    ``sum`` is the checksum of the current window of ``len`` bytes,
    ``remove`` is the byte leaving the window at the front and ``add`` is the
    byte entering it at the back.  Both bytes may be given either as
    one-character strings or as integer byte values.  Returns the checksum of
    the shifted window.

    >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
    True
    """
    s1 = sum & 0xffff
    s2 = sum >> 16
    # Accept integer byte values as well as one-character strings.
    if not isinstance(add, int):
        add = ord(add)
    if not isinstance(remove, int):
        remove = ord(remove)
    # Low half: drop the old byte, add the new one (the "+ 1" terms cancel).
    s1 -= remove - add
    # High half: the removed byte contributed (remove + 1) once per window
    # position; the new low half is added once.
    s2 -= len * (remove + 1) - s1
    return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
|
||||
|
||||
|
||||
class ChunkifyIter(object):
    """Iterator that cuts the data read from ``fd`` into chunks.

    A rolling checksum over the last ``window_size`` bytes is maintained while
    scanning; a chunk ends whenever that checksum is a multiple of
    ``chunk_size``.  Data is read into a buffer of ``10 * chunk_size`` bytes;
    when the buffer fills up without a usable boundary a chunk is cut at the
    end of the buffer so that scanning can always make progress.

    ``fd`` only needs a ``read(size)`` method.  Iteration state is (re)set by
    ``__iter__``, so ``iter()`` must be called before ``next()``.
    """

    def __init__(self, fd, chunk_size, window_size):
        self.fd = fd
        self.chunk_size = chunk_size
        self.window_size = window_size
        self.buf_size = self.chunk_size * 10

    def __iter__(self):
        self.data = ''
        self.done = False
        self.i = 0              # index of the next byte to scan
        self.sum = 0            # rolling checksum of the current window
        self.last = -1          # index of the last byte already emitted
        self.initial = self.window_size  # bytes still needed to fill the window
        return self

    def _emit(self):
        """Return the chunk ending just before ``self.i`` and record the cut."""
        start = self.last + 1
        self.last = self.i - 1
        return self.data[start:self.last + 1]

    def next(self):
        """Return the next chunk; raise StopIteration when the input is exhausted."""
        if self.done:
            raise StopIteration
        while True:
            if self.i == self.buf_size:
                # Buffer is full: discard everything that has been emitted
                # except the trailing window needed by the rolling checksum.
                diff = self.last + 1 - self.window_size
                if diff <= 0:
                    # Only reachable when the window does not fit into the
                    # buffer; nothing could be discarded and the following
                    # zero-byte read would be mistaken for end of input.
                    raise ValueError('window_size must be smaller than the '
                                     'read buffer (10 * chunk_size)')
                self.data = self.data[diff:]
                self.last -= diff
                self.i -= diff
            if self.i == len(self.data):
                self.data += self.fd.read(self.buf_size - len(self.data))
            if self.i == len(self.data):
                # Nothing more to read: flush whatever has not been emitted.
                if self.last < self.i - 1:
                    self.done = True
                    return self.data[self.last + 1:]
                raise StopIteration
            if self.initial:
                # Still filling the first window: extend the checksum.
                self.initial -= 1
                self.sum = checksum(self.data[self.i], self.sum)
            else:
                self.sum = roll_checksum(self.sum,
                                         self.data[self.i - self.window_size],
                                         self.data[self.i],
                                         self.window_size)
            self.i += 1
            if self.i == self.buf_size and self.last < self.window_size:
                # The buffer filled up and the last cut is too close to its
                # start for compaction to free any space (this includes "no
                # cut yet", last == -1, and the state right after a
                # compaction, last == window_size - 1).  Force a cut here.
                return self._emit()
            elif self.sum % self.chunk_size == 0:
                return self._emit()

    # Python 3 spelling of the iterator protocol.
    __next__ = next
|
||||
|
||||
|
||||
def chunkify(fd, chunk_size, chunks):
    """Return an iterator over the chunks of the data read from ``fd``.

    This is a thin factory around ``ChunkifyIter``: ``chunk_size`` is passed
    through unchanged and ``chunks`` is handed over as the iterator's
    ``window_size`` argument.

    >>> list(chunkify(StringIO.StringIO(''), 5, 3))
    []
    >>> list(chunkify(StringIO.StringIO('A'), 5, 3))
    ['A']
    >>> list(chunkify(StringIO.StringIO('AB'), 5, 3))
    ['AB']
    >>> list(chunkify(StringIO.StringIO('1B'), 5, 3))
    ['1', 'B']
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMNOPQ'), 5, 3))
    ['ABCD', 'EFGHI', 'JKLMN', 'OPQ']
    >>> list(chunkify(StringIO.StringIO('1ABCDEFGHIJKLMNOPQ'), 5, 3))
    ['1', 'ABCD', 'EFGHI', 'JKLMN', 'OPQ']
    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQ'), 5, 3))
    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQ']
    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
    """
    chunk_iter = ChunkifyIter(fd, chunk_size, chunks)
    return chunk_iter
|
||||
|
||||
# Prefer the C implementations when the extension module is available; the
# pure-Python chunkify stays reachable as py_chunkify.
try:
    import _speedups
    checksum = _speedups.checksum
    roll_checksum = _speedups.roll_checksum
    py_chunkify = chunkify
    chunkify = _speedups.chunkify
except ImportError:
    import sys
    # Diagnostics go to stderr so they never mix with regular program output.
    sys.stderr.write('Failed to load _speedups module, things will be slow\n')


if __name__ == '__main__':
    import doctest
    # The doctests above refer to StringIO, so it must be in module scope.
    import StringIO
    doctest.testmod()
|
||||
|
|
@ -30,6 +30,9 @@ class Keychain(object):
|
|||
if path:
|
||||
self.open(path)
|
||||
|
||||
def get_chunkify_seed(self):
    """Return the first four bytes of ``self.aes_id`` converted with ``bytes_to_long``."""
    seed_bytes = self.aes_id[:4]
    return bytes_to_long(seed_bytes)
|
||||
|
||||
def open(self, path):
|
||||
print 'Opening keychain "%s"' % path
|
||||
with open(path, 'rb') as fd:
|
||||
|
|
|
|||
Loading…
Reference in a new issue