Chunkifier improvements, including tin-foil-hat-compliant seeding.

This commit is contained in:
Jonas Borgström 2010-11-04 21:19:01 +01:00
parent dafd75f4e6
commit 330315ba0d
4 changed files with 34 additions and 143 deletions

View file

@ -1,6 +1,9 @@
#include <Python.h>
#include <structmember.h>
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define ABS(X) ((X) < 0 ? (-(X)) : (X))
static unsigned long int
checksum(const unsigned char *data, int len, unsigned long int sum)
{
@ -28,7 +31,7 @@ roll_checksum(unsigned long int sum, unsigned char remove, unsigned char add, in
typedef struct {
PyObject_HEAD
int chunk_size, window_size, i, last, eof, done, buf_size, data_len, initial;
int chunk_size, window_size, i, last, eof, done, buf_size, data_len, seed;
PyObject *chunks, *fd;
unsigned long int sum;
unsigned char *data, add, remove;
@ -43,8 +46,7 @@ ChunkifyIter_iter(PyObject *self)
c->eof = 0;
c->i = 0;
c->sum = 0;
c->last = -1;
c->initial = c->window_size;
c->last = 0;
Py_INCREF(self);
return self;
}
@ -62,6 +64,8 @@ static PyObject*
ChunkifyIter_iternext(PyObject *self)
{
ChunkifyIter *c = (ChunkifyIter *)self;
int initial = c->window_size;
if(c->done)
{
PyErr_SetNone(PyExc_StopIteration);
@ -72,6 +76,7 @@ ChunkifyIter_iternext(PyObject *self)
if(c->i == c->buf_size)
{
int diff = c->last + 1 - c->window_size;
assert(diff >= 0);
memmove(c->data, c->data + diff, c->buf_size - diff);
c->i -= diff;
c->last -= diff;
@ -90,18 +95,20 @@ ChunkifyIter_iternext(PyObject *self)
}
if(c->i == c->data_len)
{
if(c->last < c->i - 1) {
if(c->last < c->i) {
c->done = 1;
return PyString_FromStringAndSize((char *)(c->data + c->last + 1),
c->data_len - c->last - 1);
return PyString_FromStringAndSize((char *)(c->data + c->last),
c->data_len - c->last);
}
PyErr_SetNone(PyExc_StopIteration);
return NULL;
}
if(c->initial)
if(initial)
{
c->initial--;
c->sum = checksum(c->data + c->i, 1, c->sum);
int bytes = MIN(initial, c->data_len - c->i);
initial -= bytes;
c->sum = checksum(c->data + c->i, bytes, 0);
c->i += bytes;
}
else
{
@ -109,20 +116,20 @@ ChunkifyIter_iternext(PyObject *self)
c->data[c->i - c->window_size],
c->data[c->i],
c->window_size);
c->i++;
}
c->i++;
if(c->i == c->buf_size && c->last == -1)
if((c->sum % c->chunk_size) == c->seed)
{
int old_last = c->last;
c->last = c->i - 1;
return PyString_FromStringAndSize((char *)(c->data + old_last + 1),
c->last = c->i;
return PyString_FromStringAndSize((char *)(c->data + old_last),
c->last - old_last);
}
else if((c->sum % c->chunk_size) == 0)
if(c->i == c->buf_size && c->last <= c->window_size)
{
int old_last = c->last;
c->last = c->i - 1;
return PyString_FromStringAndSize((char *)(c->data + old_last + 1),
c->last = c->i;
return PyString_FromStringAndSize((char *)(c->data + old_last),
c->last - old_last);
}
}
@ -167,10 +174,10 @@ static PyObject *
chunkify(PyObject *self, PyObject *args)
{
PyObject *fd;
long int chunk_size, window_size;
int chunk_size, window_size, seed;
ChunkifyIter *c;
if (!PyArg_ParseTuple(args, "Oii", &fd, &chunk_size, &window_size))
if (!PyArg_ParseTuple(args, "Oiii", &fd, &chunk_size, &window_size, &seed))
{
return NULL;
}
@ -184,6 +191,7 @@ chunkify(PyObject *self, PyObject *args)
c->fd = fd;
c->chunk_size = chunk_size;
c->window_size = window_size;
c->seed = seed % chunk_size;
Py_INCREF(fd);
return (PyObject *)c;
}

View file

@ -8,10 +8,11 @@ import sys
from xattr import xattr, XATTR_NOFOLLOW
from . import NS_ARCHIVE_METADATA, NS_ARCHIVE_ITEMS, NS_ARCHIVE_CHUNKS, NS_CHUNK
from .chunkifier import chunkify
from ._speedups import chunkify
from .helpers import uid2user, user2uid, gid2group, group2gid, IntegrityError
CHUNK_SIZE = 55001
CHUNK_SIZE = 64 * 1024
WINDOW_SIZE = 4096
have_lchmod = hasattr(os, 'lchmod')
linux = sys.platform == 'linux2'
@ -253,7 +254,8 @@ class Archive(object):
size = 0
ids = []
chunks = []
for chunk in chunkify(fd, CHUNK_SIZE, 30):
for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE,
self.keychain.get_chunkify_seed()):
id = self.keychain.id_hash(chunk)
ids.append(id)
try:

View file

@ -1,122 +0,0 @@
def checksum(data, sum=0):
    """Simple but fast checksum that can be updated at either end.

    The result packs two 16-bit accumulators into one integer: the low half
    (s1) is the running total of ``byte + 1`` over *data*, the high half (s2)
    is the running total of the successive s1 values.

    *data* may be a str (each character is converted with ``ord``) or any
    iterable of integer byte values such as a ``bytearray``.  *sum* is a
    previously returned checksum to continue from; it keeps its historical
    name even though it shadows the builtin.

    >>> checksum('FOOBAR')
    102367679
    >>> checksum('FOOBAR') == checksum('BAR', checksum('FOO'))
    True
    >>> checksum(bytearray(b'FOOBAR'))
    102367679
    """
    s1 = sum & 0xffff
    s2 = sum >> 16
    for c in data:
        # Elements of bytearray (and Python 3 bytes) are already ints;
        # only one-character strings need ord().
        s1 += (c if isinstance(c, int) else ord(c)) + 1
        s2 += s1
    return ((s2 & 0xffff) << 16) + (s1 & 0xffff)
def roll_checksum(sum, remove, add, len):
    """Slide a checksum window one position forward.

    Given *sum*, the checksum of a window of *len* bytes, return the checksum
    of the window obtained by dropping *remove* from its front and appending
    *add* to its end.  *remove* and *add* may each be a one-character string
    or an integer byte value.  *sum* and *len* keep their historical names
    even though they shadow builtins.

    >>> roll_checksum(checksum('XFOOBA'), 'X', 'R', 6) == checksum('FOOBAR')
    True
    """
    s1 = sum & 0xffff
    s2 = sum >> 16
    # Accept ints as well as one-character strings so that indexing a
    # bytearray (or Python 3 bytes) can feed this function directly.
    if not isinstance(add, int):
        add = ord(add)
    if not isinstance(remove, int):
        remove = ord(remove)
    s1 -= remove - add
    # The removed byte contributed (remove + 1) to each of the len partial
    # sums; the new byte contributes through the updated s1 exactly once.
    s2 -= len * (remove + 1) - s1
    return (s1 & 0xffff) + ((s2 & 0xffff) << 16)
class ChunkifyIter(object):
    """Iterator that cuts the data read from *fd* into variable-sized chunks.

    A chunk ends after any byte at which the running checksum is divisible
    by *chunk_size*.  The checksum is accumulated with ``checksum`` over the
    first *window_size* bytes of the stream and maintained with
    ``roll_checksum`` over the trailing *window_size* bytes thereafter.

    At most ``chunk_size * 10`` bytes are buffered.  When the buffer fills
    up without leaving room to discard old data, a chunk boundary is forced
    so that scanning can continue.
    """

    def __init__(self, fd, chunk_size, window_size):
        self.fd = fd
        self.chunk_size = chunk_size
        self.window_size = window_size
        self.buf_size = self.chunk_size * 10

    def __iter__(self):
        # (Re)initialise all scanning state; iteration cannot start before
        # this has been called.
        self.data = ''
        self.done = False
        self.i = 0          # index of the next byte to examine
        self.sum = 0        # current checksum value
        self.last = -1      # index of the final byte of the previous chunk
        self.initial = self.window_size  # bytes left before the window is full
        return self

    def next(self):
        """Return the next chunk, raising StopIteration at end of input."""
        if self.done:
            raise StopIteration
        while True:
            if self.i == self.buf_size:
                # Buffer exhausted: drop everything except the bytes of the
                # previous chunk that the rolling window still needs.
                diff = self.last + 1 - self.window_size
                if diff <= 0:
                    # Given the forced boundary below, this only happens
                    # when the window does not fit inside the buffer.
                    raise ValueError(
                        'window_size must be smaller than the buffer size')
                self.data = self.data[diff:]
                self.last -= diff
                self.i -= diff
            if self.i == len(self.data):
                self.data += self.fd.read(self.buf_size - len(self.data))
            if self.i == len(self.data):
                # The read produced nothing: end of input.  Flush whatever
                # follows the last boundary as the final chunk.
                if self.last < self.i - 1:
                    self.done = True
                    return self.data[self.last + 1:]
                raise StopIteration
            if self.initial:
                self.initial -= 1
                self.sum = checksum(self.data[self.i], self.sum)
            else:
                self.sum = roll_checksum(self.sum,
                                         self.data[self.i - self.window_size],
                                         self.data[self.i],
                                         self.window_size)
            self.i += 1
            # Force a boundary when the buffer is full and the previous
            # boundary lies too close to the start for the buffer to be
            # shifted on the next pass; otherwise the refill would request
            # zero bytes and be mistaken for end of input.
            buffer_stuck = (self.i == self.buf_size and
                            self.last < self.window_size)
            if buffer_stuck or self.sum % self.chunk_size == 0:
                old_last = self.last
                self.last = self.i - 1
                return self.data[old_last + 1:self.last + 1]

    # Python 3 spelling of the iterator protocol.
    __next__ = next
def chunkify(fd, chunk_size, chunks):
    """Return an iterator over the chunks of the data read from *fd*.

    The third argument, *chunks*, is handed to ChunkifyIter as its
    window size.

    >>> list(chunkify(StringIO.StringIO(''), 5, 3))
    []
    >>> list(chunkify(StringIO.StringIO('A'), 5, 3))
    ['A']
    >>> list(chunkify(StringIO.StringIO('AB'), 5, 3))
    ['AB']
    >>> list(chunkify(StringIO.StringIO('1B'), 5, 3))
    ['1', 'B']
    >>> list(chunkify(StringIO.StringIO('ABCDEFGHIJKLMNOPQ'), 5, 3))
    ['ABCD', 'EFGHI', 'JKLMN', 'OPQ']
    >>> list(chunkify(StringIO.StringIO('1ABCDEFGHIJKLMNOPQ'), 5, 3))
    ['1', 'ABCD', 'EFGHI', 'JKLMN', 'OPQ']
    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQ'), 5, 3))
    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQ']
    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
    >>> list(chunkify(StringIO.StringIO('12ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 5, 3))
    ['1', '2A', 'BCD', 'EFGHI', 'JKLMN', 'OPQRS', 'TUVWX', 'YZ']
    """
    chunk_iter = ChunkifyIter(fd, chunk_size, chunks)
    return chunk_iter
# Prefer the implementations from the _speedups module when it can be
# imported; the pure-Python chunkify stays reachable as py_chunkify.
try:
    import _speedups
except ImportError:
    print('Failed to load _speedups module, things will be slow')
else:
    checksum = _speedups.checksum
    roll_checksum = _speedups.roll_checksum
    py_chunkify = chunkify
    chunkify = _speedups.chunkify

if __name__ == '__main__':
    import doctest
    # The doctests above refer to StringIO.StringIO, so the module has to be
    # present in this namespace before they run.
    import StringIO
    doctest.testmod()

View file

@ -30,6 +30,9 @@ class Keychain(object):
if path:
self.open(path)
def get_chunkify_seed(self):
return bytes_to_long(self.aes_id[:4])
def open(self, path):
print 'Opening keychain "%s"' % path
with open(path, 'rb') as fd: