Memory usage improvements

This commit is contained in:
Jonas Borgström 2010-11-23 14:46:53 +01:00
parent 84b4b08309
commit 198b3f90fc
4 changed files with 20 additions and 42 deletions

View file

@ -27,8 +27,6 @@ class Archive(object):
self.keychain = keychain
self.store = store
self.items = []
self.chunks = []
self.chunk_idx = {}
self.hard_links = {}
if name:
self.load(self.keychain.id_hash(name))
@ -53,11 +51,10 @@ class Archive(object):
assert items['version'] == 1
assert self.metadata['items_hash'] == items_hash
self.items = items['items']
for i, chunk in enumerate(self.chunks):
self.chunk_idx[i] = chunk[0]
def save(self, name):
def save(self, name, cache):
self.id = self.keychain.id_hash(name)
self.chunks = [(id, size) for (id, (count, size)) in cache.chunk_counts.iteritems() if count > 1000000]
chunks = {'version': 1, 'chunks': self.chunks}
data, chunks_hash = self.keychain.encrypt_create(msgpack.packb(chunks))
self.store.put(NS_ARCHIVE_CHUNKS, self.id, data)
@ -124,12 +121,11 @@ class Archive(object):
os.link(source, path)
else:
with open(path, 'wb') as fd:
for chunk in item['chunks']:
id = self.chunk_idx[chunk]
for id in item['chunks']:
try:
data, hash = self.keychain.decrypt(self.store.get(NS_CHUNK, id))
if self.keychain.id_hash(data) != id:
raise IntegrityError('chunk id did not match')
raise IntegrityError('chunk hash did not match')
fd.write(data)
except ValueError:
raise Exception('Invalid chunk checksum')
@ -161,8 +157,7 @@ class Archive(object):
os.utime(path, (item['atime'], item['mtime']))
def verify_file(self, item):
for chunk in item['chunks']:
id = self.chunk_idx[chunk]
for id in item['chunks']:
try:
data, hash = self.keychain.decrypt(self.store.get(NS_CHUNK, id))
if self.keychain.id_hash(data) != id:
@ -239,45 +234,22 @@ class Archive(object):
ids = None
break
else:
chunks = [self.process_chunk2(id, cache) for id in ids]
for id in ids:
cache.chunk_incref(id)
# Only chunkify the file if needed
if ids is None:
fd = open(path, 'rb')
with open(path, 'rb') as fd:
size = 0
ids = []
chunks = []
for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE,
self.keychain.get_chunkify_seed()):
id = self.keychain.id_hash(chunk)
ids.append(id)
try:
chunks.append(self.chunk_idx[id])
except KeyError:
chunks.append(self.process_chunk(id, chunk, cache))
ids.append(cache.add_chunk(self.keychain.id_hash(chunk), chunk))
size += len(chunk)
cache.memorize_file_chunks(path_hash, st, ids)
item = {'path': safe_path, 'chunks': chunks, 'size': size}
item = {'path': safe_path, 'chunks': ids, 'size': size}
item.update(self.stat_attrs(st, path))
self.items.append(item)
def process_chunk2(self, id, cache):
try:
return self.chunk_idx[id]
except KeyError:
idx = len(self.chunks)
id, size = cache.chunk_incref(id)
self.chunks.append((id, size))
self.chunk_idx[id] = idx
return idx
def process_chunk(self, id, data, cache):
idx = len(self.chunks)
id, size = cache.add_chunk(id, data)
self.chunks.append((id, size))
self.chunk_idx[id] = idx
return idx
@staticmethod
def list_archives(store, keychain):
for id in list(store.list(NS_ARCHIVE_METADATA)):

View file

@ -75,7 +75,7 @@ class Archiver(object):
pass
for path in args.paths:
self._process(archive, cache, args.patterns, unicode(path))
archive.save(args.archive.archive)
archive.save(args.archive.archive, cache)
cache.save()
return self.exit_code

View file

@ -60,6 +60,10 @@ class Cache(object):
yield key, (value[0] + 1,) + value[1:]
def save(self):
for id, (count, size) in self.chunk_counts.iteritems():
if count > 1000000:
self.chunk_counts[id] = count - 1000000, size
cache = {'version': 1,
'tid': self.store.tid,
'chunk_counts': self.chunk_counts,
@ -78,16 +82,17 @@ class Cache(object):
data, hash = self.keychain.encrypt_read(data)
csize = len(data)
self.store.put(NS_CHUNK, id, data)
self.chunk_counts[id] = (1, csize)
return id, csize
self.chunk_counts[id] = (1000001, csize)
return id
def seen_chunk(self, id):
return self.chunk_counts.get(id, (0, 0))[0]
def chunk_incref(self, id):
count, size = self.chunk_counts[id]
self.chunk_counts[id] = (count + 1, size)
return id, size
if count < 1000000:
self.chunk_counts[id] = (count + 1000001, size)
return id
def chunk_decref(self, id):
count, size = self.chunk_counts[id]

View file

@ -97,6 +97,7 @@ class Test(unittest.TestCase):
os.symlink('somewhere', os.path.join(self.input_path, 'link1'))
os.mkfifo(os.path.join(self.input_path, 'fifo1'))
self.darc('create', self.store_path + '::test', 'input')
self.darc('create', self.store_path + '::test.2', 'input')
self.darc('extract', self.store_path + '::test', 'output')
self.diff_dirs('input', 'output/input')