diff --git a/src/borg/_chunker.c b/src/borg/_chunker.c deleted file mode 100644 index e5d9b54f8..000000000 --- a/src/borg/_chunker.c +++ /dev/null @@ -1,318 +0,0 @@ -#include -#include -#if !defined(_MSC_VER) -# include -#endif - -/* Cyclic polynomial / buzhash - -https://en.wikipedia.org/wiki/Rolling_hash - -http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor) - -http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide) - -Some properties of buzhash / of this implementation: - -(1) the hash is designed for inputs <= 32 bytes, but the chunker uses it on a 4095 byte window; - any repeating bytes at distance 32 within those 4095 bytes can cause cancellation within - the hash function, e.g. in "X X", the last X would cancel out the influence - of the first X on the hash value. - -(2) the hash table is supposed to have (according to the BUZ) exactly a 50% distribution of - 0/1 bit values per position, but the hard coded table below doesn't fit that property. - -(3) if you would use a window size divisible by 64, the seed would cancel itself out completely. - this is why we use a window size of 4095 bytes. - -Another quirk is that, even with the 4095 byte window, XORing the entire table by a constant -is equivalent to XORing the hash output with a different constant. but since the seed is stored -encrypted, i think it still serves its purpose. -*/ - -static uint32_t table_base[] = -{ - 0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4, - 0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436, - 0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7, - 0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a, - 0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613, - 0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0, - 0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205, - 0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7, - 0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035, - 0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d, - 0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355, - 0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f, - 0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424, - 0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86, - 0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f, - 0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf, - 0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9, - 0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186, - 0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0, - 0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35, - 0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd, - 0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4, - 0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f, - 0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec, - 0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889, - 0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4, - 0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512, - 0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009, - 0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2, - 0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c, - 0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9, - 0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b -}; - -#define BARREL_SHIFT(v, shift) ( ((v) << (shift)) | ((v) >> ((32 - (shift)) & 0x1f)) ) - -size_t pagemask; - -static uint32_t * -buzhash_init_table(uint32_t seed) -{ - int i; - uint32_t *table = malloc(1024); - for(i = 0; i < 256; i++) - { - table[i] = table_base[i] ^ seed; - } - return table; -} - -static uint32_t -buzhash(const unsigned char *data, size_t len, const uint32_t *h) -{ - uint32_t i; - uint32_t sum = 0, imod; - for(i = len - 1; i > 0; i--) - { - imod = i & 0x1f; - sum ^= BARREL_SHIFT(h[*data], imod); - data++; - } - return sum ^ h[*data]; -} - -static uint32_t -buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t *h) -{ - uint32_t lenmod = len & 0x1f; /* Note: replace by constant to get small speedup */ - return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add]; -} - -typedef struct { - uint32_t chunk_mask; - uint32_t *table; - uint8_t *data; - PyObject *fd; - int fh; - int done, eof; - size_t min_size, buf_size, window_size, remaining, position, last; - off_t bytes_read, bytes_yielded; -} Chunker; - -static Chunker * -chunker_init(size_t window_size, uint32_t chunk_mask, size_t min_size, size_t max_size, uint32_t seed) -{ - Chunker *c = calloc(sizeof(Chunker), 1); - c->window_size = window_size; - c->chunk_mask = chunk_mask; - c->min_size = min_size; - c->table = buzhash_init_table(seed); - c->buf_size = max_size; - c->data = malloc(c->buf_size); - c->fh = -1; - return c; -} - -static void -chunker_set_fd(Chunker *c, PyObject *fd, int fh) -{ - Py_XDECREF(c->fd); - c->fd = fd; - Py_INCREF(fd); - c->fh = fh; - c->done = 0; - c->remaining = 0; - c->bytes_read = 0; - c->bytes_yielded = 0; - c->position = 0; - c->last = 0; - c->eof = 0; -} - -static void -chunker_free(Chunker *c) -{ - Py_XDECREF(c->fd); - free(c->table); - free(c->data); - free(c); -} - -static int -chunker_fill(Chunker *c) -{ - ssize_t n; - PyObject *data; - PyThreadState *thread_state; - - memmove(c->data, c->data + c->last, c->position + c->remaining - c->last); - c->position -= c->last; - c->last = 0; - n = c->buf_size - c->position - c->remaining; - if(c->eof || n == 0) { - return 1; - } - if(c->fh >= 0) { - thread_state = PyEval_SaveThread(); - - #if ( ( _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L ) && defined(POSIX_FADV_DONTNEED) ) - off_t offset = c->bytes_read; - #endif - - // if we have a os-level file descriptor, use os-level API - n = read(c->fh, c->data + c->position + c->remaining, n); - if(n > 0) { - c->remaining += n; - c->bytes_read += n; - } - else - if(n == 0) { - c->eof = 1; - } - else { - PyEval_RestoreThread(thread_state); - // some error happened - PyErr_SetFromErrno(PyExc_OSError); - return 0; - } - #if ( ( _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L ) && defined(POSIX_FADV_DONTNEED) ) - off_t length = c->bytes_read - offset; - - // Only do it once per run. - if (pagemask == 0) - pagemask = getpagesize() - 1; - - // We tell the OS that we do not need the data that we just have read any - // more (that it maybe has in the cache). This avoids that we spoil the - // complete cache with data that we only read once and (due to cache - // size limit) kick out data from the cache that might be still useful - // for the OS or other processes. - // We rollback the initial offset back to the start of the page, - // to avoid it not being truncated as a partial page request. - int overshoot; - if (length > 0) { - // All Linux kernels (at least up to and including 4.6(.0)) have a bug where - // they truncate last partial page of POSIX_FADV_DONTNEED request, so we need - // to page-align it ourselves. We'll need the rest of this page on the next - // read (assuming this was not EOF). - overshoot = (offset + length) & pagemask; - } else { - // For length == 0 we set overshoot 0, so the below - // length - overshoot is 0, which means till end of file for - // fadvise. This will cancel the final page and is not part - // of the above workaround. - overshoot = 0; - } - - posix_fadvise(c->fh, offset & ~pagemask, length - overshoot, POSIX_FADV_DONTNEED); - #endif - - PyEval_RestoreThread(thread_state); - } - else { - // no os-level file descriptor, use Python file object API - data = PyObject_CallMethod(c->fd, "read", "i", n); - if(!data) { - return 0; - } - n = PyBytes_Size(data); - if(PyErr_Occurred()) { - // we wanted bytes(), but got something else - return 0; - } - if(n) { - memcpy(c->data + c->position + c->remaining, PyBytes_AsString(data), n); - c->remaining += n; - c->bytes_read += n; - } - else { - c->eof = 1; - } - Py_DECREF(data); - } - return 1; -} - -static PyObject * -chunker_process(Chunker *c) -{ - uint32_t sum, chunk_mask = c->chunk_mask; - size_t n, old_last, min_size = c->min_size, window_size = c->window_size; - - if(c->done) { - if(c->bytes_read == c->bytes_yielded) - PyErr_SetNone(PyExc_StopIteration); - else - PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch"); - return NULL; - } - while(c->remaining < min_size + window_size + 1 && !c->eof) { /* see assert in Chunker init */ - if(!chunker_fill(c)) { - return NULL; - } - } - /* here we either are at eof ... */ - if(c->eof) { - c->done = 1; - if(c->remaining) { - c->bytes_yielded += c->remaining; - return PyMemoryView_FromMemory((char *)(c->data + c->position), c->remaining, PyBUF_READ); - } - else { - if(c->bytes_read == c->bytes_yielded) - PyErr_SetNone(PyExc_StopIteration); - else - PyErr_SetString(PyExc_Exception, "chunkifier byte count mismatch"); - return NULL; - } - } - /* ... or we have at least min_size + window_size + 1 bytes remaining. - * We do not want to "cut" a chunk smaller than min_size and the hash - * window starts at the potential cutting place. - */ - c->position += min_size; - c->remaining -= min_size; - sum = buzhash(c->data + c->position, window_size, c->table); - while(c->remaining > c->window_size && (sum & chunk_mask)) { - uint8_t *p = c->data + c->position; - uint8_t *stop_at = p + c->remaining - window_size; - size_t did_bytes; - while (p < stop_at && (sum & chunk_mask)) { - sum = buzhash_update(sum, p[0], p[window_size], - window_size, c->table); - p++; - } - did_bytes = p - (c->data + c->position); - c->position += did_bytes; - c->remaining -= did_bytes; - if(c->remaining <= window_size) { - if(!chunker_fill(c)) { - return NULL; - } - } - } - if(c->remaining <= window_size) { - c->position += c->remaining; - c->remaining = 0; - } - old_last = c->last; - c->last = c->position; - n = c->last - old_last; - c->bytes_yielded += n; - return PyMemoryView_FromMemory((char *)(c->data + old_last), n, PyBUF_READ); -} diff --git a/src/borg/chunker.pyx b/src/borg/chunker.pyx index d69976afd..fde9d7bfa 100644 --- a/src/borg/chunker.pyx +++ b/src/borg/chunker.pyx @@ -1,27 +1,22 @@ +# cython: language_level=3 +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + API_VERSION = '1.2_01' -import errno import os +import errno import time from collections import namedtuple +from cpython.bytes cimport PyBytes_AsString +from libc.stdint cimport uint8_t, uint32_t +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, memmove +from posix.unistd cimport read from .constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros -from libc.stdlib cimport free - -cdef extern from "_chunker.c": - ctypedef int uint32_t - ctypedef struct _Chunker "Chunker": - pass - _Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed) - void chunker_set_fd(_Chunker *chunker, object f, int fd) - void chunker_free(_Chunker *chunker) - object chunker_process(_Chunker *chunker) - uint32_t *buzhash_init_table(uint32_t seed) - uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h) - uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h) - - # this will be True if Python's seek implementation supports data/holes seeking. # this does not imply that it will actually work on the filesystem, # because the FS also needs to support this. @@ -274,6 +269,96 @@ class ChunkerFixed: return +# Cyclic polynomial / buzhash +# +# https://en.wikipedia.org/wiki/Rolling_hash +# +# http://www.serve.net/buz/Notes.1st.year/HTML/C6/rand.012.html (by "BUZ", the inventor) +# +# http://www.dcs.gla.ac.uk/~hamer/cakes-talk.pdf (see buzhash slide) +# +# Some properties of buzhash / of this implementation: +# +# (1) the hash is designed for inputs <= 32 bytes, but the chunker uses it on a 4095 byte window; +# any repeating bytes at distance 32 within those 4095 bytes can cause cancellation within +# the hash function, e.g. in "X X", the last X would cancel out the influence +# of the first X on the hash value. +# +# (2) the hash table is supposed to have (according to the BUZ) exactly a 50% distribution of +# 0/1 bit values per position, but the hard coded table below doesn't fit that property. +# +# (3) if you would use a window size divisible by 64, the seed would cancel itself out completely. +# this is why we use a window size of 4095 bytes. +# +# Another quirk is that, even with the 4095 byte window, XORing the entire table by a constant +# is equivalent to XORing the hash output with a different constant. but since the seed is stored +# encrypted, i think it still serves its purpose. + +cdef uint32_t table_base[256] +table_base = [ + 0xe7f831ec, 0xf4026465, 0xafb50cae, 0x6d553c7a, 0xd639efe3, 0x19a7b895, 0x9aba5b21, 0x5417d6d4, + 0x35fd2b84, 0xd1f6a159, 0x3f8e323f, 0xb419551c, 0xf444cebf, 0x21dc3b80, 0xde8d1e36, 0x84a32436, + 0xbeb35a9d, 0xa36f24aa, 0xa4e60186, 0x98d18ffe, 0x3f042f9e, 0xdb228bcd, 0x096474b7, 0x5c20c2f7, + 0xf9eec872, 0xe8625275, 0xb9d38f80, 0xd48eb716, 0x22a950b4, 0x3cbaaeaa, 0xc37cddd3, 0x8fea6f6a, + 0x1d55d526, 0x7fd6d3b3, 0xdaa072ee, 0x4345ac40, 0xa077c642, 0x8f2bd45b, 0x28509110, 0x55557613, + 0xffc17311, 0xd961ffef, 0xe532c287, 0xaab95937, 0x46d38365, 0xb065c703, 0xf2d91d0f, 0x92cd4bb0, + 0x4007c712, 0xf35509dd, 0x505b2f69, 0x557ead81, 0x310f4563, 0xbddc5be8, 0x9760f38c, 0x701e0205, + 0x00157244, 0x14912826, 0xdc4ca32b, 0x67b196de, 0x5db292e8, 0x8c1b406b, 0x01f34075, 0xfa2520f7, + 0x73bc37ab, 0x1e18bc30, 0xfe2c6cb3, 0x20c522d0, 0x5639e3db, 0x942bda35, 0x899af9d1, 0xced44035, + 0x98cc025b, 0x255f5771, 0x70fefa24, 0xe928fa4d, 0x2c030405, 0xb9325590, 0x20cb63bd, 0xa166305d, + 0x80e52c0a, 0xa8fafe2f, 0x1ad13f7d, 0xcfaf3685, 0x6c83a199, 0x7d26718a, 0xde5dfcd9, 0x79cf7355, + 0x8979d7fb, 0xebf8c55e, 0xebe408e4, 0xcd2affba, 0xe483be6e, 0xe239d6de, 0x5dc1e9e0, 0x0473931f, + 0x851b097c, 0xac5db249, 0x09c0f9f2, 0xd8d2f134, 0xe6f38e41, 0xb1c71bf1, 0x52b6e4db, 0x07224424, + 0x6cf73e85, 0x4f25d89c, 0x782a7d74, 0x10a68dcd, 0x3a868189, 0xd570d2dc, 0x69630745, 0x9542ed86, + 0x331cd6b2, 0xa84b5b28, 0x07879c9d, 0x38372f64, 0x7185db11, 0x25ba7c83, 0x01061523, 0xe6792f9f, + 0xe5df07d1, 0x4321b47f, 0x7d2469d8, 0x1a3a4f90, 0x48be29a3, 0x669071af, 0x8ec8dd31, 0x0810bfbf, + 0x813a06b4, 0x68538345, 0x65865ddc, 0x43a71b8e, 0x78619a56, 0x5a34451d, 0x5bdaa3ed, 0x71edc7e9, + 0x17ac9a20, 0x78d10bfa, 0x6c1e7f35, 0xd51839d9, 0x240cbc51, 0x33513cc1, 0xd2b4f795, 0xccaa8186, + 0x0babe682, 0xa33cf164, 0x18c643ea, 0xc1ca105f, 0x9959147a, 0x6d3d94de, 0x0b654fbe, 0xed902ca0, + 0x7d835cb5, 0x99ba1509, 0x6445c922, 0x495e76c2, 0xf07194bc, 0xa1631d7e, 0x677076a5, 0x89fffe35, + 0x1a49bcf3, 0x8e6c948a, 0x0144c917, 0x8d93aea1, 0x16f87ddf, 0xc8f25d49, 0x1fb11297, 0x27e750cd, + 0x2f422da1, 0xdee89a77, 0x1534c643, 0x457b7b8b, 0xaf172f7a, 0x6b9b09d6, 0x33573f7f, 0xf14e15c4, + 0x526467d5, 0xaf488241, 0x87c3ee0d, 0x33be490c, 0x95aa6e52, 0x43ec242e, 0xd77de99b, 0xd018334f, + 0x5b78d407, 0x498eb66b, 0xb1279fa8, 0xb38b0ea6, 0x90718376, 0xe325dee2, 0x8e2f2cba, 0xcaa5bdec, + 0x9d652c56, 0xad68f5cb, 0xa77591af, 0x88e37ee8, 0xf8faa221, 0xfcbbbe47, 0x4f407786, 0xaf393889, + 0xf444a1d9, 0x15ae1a2f, 0x40aa7097, 0x6f9486ac, 0x29d232a3, 0xe47609e9, 0xe8b631ff, 0xba8565f4, + 0x11288749, 0x46c9a838, 0xeb1b7cd8, 0xf516bbb1, 0xfb74fda0, 0x010996e6, 0x4c994653, 0x1d889512, + 0x53dcd9a3, 0xdd074697, 0x1e78e17c, 0x637c98bf, 0x930bb219, 0xcf7f75b0, 0xcb9355fb, 0x9e623009, + 0xe466d82c, 0x28f968d3, 0xfeb385d9, 0x238e026c, 0xb8ed0560, 0x0c6a027a, 0x3d6fec4b, 0xbb4b2ec2, + 0xe715031c, 0xeded011d, 0xcdc4d3b9, 0xc456fc96, 0xdd0eea20, 0xb3df8ec9, 0x12351993, 0xd9cbb01c, + 0x603147a2, 0xcf37d17d, 0xf7fcd9dc, 0xd8556fa3, 0x104c8131, 0x13152774, 0xb4715811, 0x6a72c2c9, + 0xc5ae37bb, 0xa76ce12a, 0x8150d8f3, 0x2ec29218, 0xa35f0984, 0x48c0647e, 0x0b5ff98c, 0x71893f7b +] + +# Barrel shift macro from C +cdef inline uint32_t BARREL_SHIFT(uint32_t v, uint32_t shift): + return ((v << shift) | (v >> ((32 - shift) & 0x1f))) + + +cdef uint32_t* buzhash_init_table(uint32_t seed): + """Initialize the buzhash table with the given seed.""" + cdef int i + cdef uint32_t* table = malloc(1024) # 256 * sizeof(uint32_t) + for i in range(256): + table[i] = table_base[i] ^ seed + return table + +cdef uint32_t _buzhash(const unsigned char* data, size_t len, const uint32_t* h): + """Calculate the buzhash of the given data.""" + cdef uint32_t i + cdef uint32_t sum = 0, imod + for i in range(len - 1, 0, -1): + imod = i & 0x1f + sum ^= BARREL_SHIFT(h[data[0]], imod) + data += 1 + return sum ^ h[data[0]] + +cdef uint32_t _buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, const uint32_t* h): + """Update the buzhash with a new byte.""" + cdef uint32_t lenmod = len & 0x1f # Note: replace by constant to get small speedup + return BARREL_SHIFT(sum, 1) ^ BARREL_SHIFT(h[remove], lenmod) ^ h[add] + + cdef class Chunker: """ Content-Defined Chunker, variable chunk sizes. @@ -286,7 +371,14 @@ cdef class Chunker: Additionally it obeys some more criteria, like a minimum and maximum chunk size. It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks. """ - cdef _Chunker *chunker + cdef uint32_t chunk_mask + cdef uint32_t* table + cdef uint8_t* data + cdef object _fd # Python object for file descriptor + cdef int fh + cdef int done, eof + cdef size_t min_size, buf_size, window_size, remaining, position, last + cdef long long bytes_read, bytes_yielded # off_t in C, using long long for compatibility cdef readonly float chunking_time def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size): @@ -295,10 +387,141 @@ cdef class Chunker: assert max_size <= len(zeros) # see chunker_process, first while loop condition, first term must be able to get True: assert hash_window_size + min_size + 1 <= max_size, "too small max_size" - hash_mask = (1 << hash_mask_bits) - 1 - self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff) + + self.window_size = hash_window_size + self.chunk_mask = (1 << hash_mask_bits) - 1 + self.min_size = min_size + self.table = buzhash_init_table(seed & 0xffffffff) + self.buf_size = max_size + self.data = malloc(self.buf_size) + self.fh = -1 + self.done = 0 + self.eof = 0 + self.remaining = 0 + self.position = 0 + self.last = 0 + self.bytes_read = 0 + self.bytes_yielded = 0 + self._fd = None self.chunking_time = 0.0 + def __dealloc__(self): + """Free the chunker's resources.""" + if self.table != NULL: + free(self.table) + self.table = NULL + if self.data != NULL: + free(self.data) + self.data = NULL + + cdef int fill(self) except 0: + """Fill the chunker's buffer with more data.""" + cdef ssize_t n + cdef object data_py + + # Move remaining data to the beginning of the buffer + memmove(self.data, self.data + self.last, self.position + self.remaining - self.last) + self.position -= self.last + self.last = 0 + n = self.buf_size - self.position - self.remaining + + if self.eof or n == 0: + return 1 + + if self.fh >= 0: + # Use OS-level file descriptor + with nogil: + n = read(self.fh, self.data + self.position + self.remaining, n) + + if n > 0: + self.remaining += n + self.bytes_read += n + elif n == 0: + self.eof = 1 + else: + # Error occurred + raise OSError(errno.errno, os.strerror(errno.errno)) + + else: + # Use Python file object + data_py = self._fd.read(n) + n = len(data_py) + + if n: + # Copy data from Python bytes to our buffer + memcpy(self.data + self.position + self.remaining, PyBytes_AsString(data_py), n) + self.remaining += n + self.bytes_read += n + else: + self.eof = 1 + + return 1 + + cdef object process(self) except *: + """Process the chunker's buffer and return the next chunk.""" + cdef uint32_t sum, chunk_mask = self.chunk_mask + cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size + cdef uint8_t* p + cdef uint8_t* stop_at + cdef size_t did_bytes + + if self.done: + if self.bytes_read == self.bytes_yielded: + raise StopIteration + else: + raise Exception("chunkifier byte count mismatch") + + while self.remaining < min_size + window_size + 1 and not self.eof: # see assert in Chunker init + if not self.fill(): + return None + + # Here we either are at eof... + if self.eof: + self.done = 1 + if self.remaining: + self.bytes_yielded += self.remaining + # Return a memory view of the remaining data + return memoryview((self.data + self.position)[:self.remaining]) + else: + if self.bytes_read == self.bytes_yielded: + raise StopIteration + else: + raise Exception("chunkifier byte count mismatch") + + # ... or we have at least min_size + window_size + 1 bytes remaining. + # We do not want to "cut" a chunk smaller than min_size and the hash + # window starts at the potential cutting place. + self.position += min_size + self.remaining -= min_size + sum = _buzhash(self.data + self.position, window_size, self.table) + + while self.remaining > self.window_size and (sum & chunk_mask): + p = self.data + self.position + stop_at = p + self.remaining - window_size + + while p < stop_at and (sum & chunk_mask): + sum = _buzhash_update(sum, p[0], p[window_size], window_size, self.table) + p += 1 + + did_bytes = p - (self.data + self.position) + self.position += did_bytes + self.remaining -= did_bytes + + if self.remaining <= window_size: + if not self.fill(): + return None + + if self.remaining <= window_size: + self.position += self.remaining + self.remaining = 0 + + old_last = self.last + self.last = self.position + n = self.last - old_last + self.bytes_yielded += n + + # Return a memory view of the chunk + return memoryview((self.data + old_last)[:n]) def chunkify(self, fd, fh=-1): """ @@ -308,19 +531,23 @@ cdef class Chunker: :param fh: OS-level file handle (if available), defaults to -1 which means not to use OS-level fd. """ - chunker_set_fd(self.chunker, fd, fh) + self._fd = fd + self.fh = fh + self.done = 0 + self.remaining = 0 + self.bytes_read = 0 + self.bytes_yielded = 0 + self.position = 0 + self.last = 0 + self.eof = 0 return self - def __dealloc__(self): - if self.chunker: - chunker_free(self.chunker) - def __iter__(self): return self def __next__(self): started_chunking = time.monotonic() - data = chunker_process(self.chunker) + data = self.process() got = len(data) # we do not have SEEK_DATA/SEEK_HOLE support in chunker_process C code, # but we can just check if data was all-zero (and either came from a hole @@ -334,6 +561,23 @@ cdef class Chunker: return Chunk(data, size=got, allocation=allocation) +def buzhash(data, unsigned long seed): + cdef uint32_t *table + cdef uint32_t sum + table = buzhash_init_table(seed & 0xffffffff) + sum = _buzhash( data, len(data), table) + free(table) + return sum + + +def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed): + cdef uint32_t *table + table = buzhash_init_table(seed & 0xffffffff) + sum = _buzhash_update(sum, remove, add, len, table) + free(table) + return sum + + def get_chunker(algo, *params, **kw): if algo == 'buzhash': seed = kw['seed'] @@ -344,20 +588,3 @@ def get_chunker(algo, *params, **kw): if algo == 'fail': return ChunkerFailing(*params) raise TypeError('unsupported chunker algo %r' % algo) - - -def buzhash(data, unsigned long seed): - cdef uint32_t *table - cdef uint32_t sum - table = buzhash_init_table(seed & 0xffffffff) - sum = c_buzhash( data, len(data), table) - free(table) - return sum - - -def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed): - cdef uint32_t *table - table = buzhash_init_table(seed & 0xffffffff) - sum = c_buzhash_update(sum, remove, add, len, table) - free(table) - return sum