From 198b3f90fc782c790349a87f21395194c2a8b4aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20Borgstr=C3=B6m?= Date: Tue, 23 Nov 2010 14:46:53 +0100 Subject: [PATCH] Memory usage improvements --- darc/archive.py | 46 +++++++++------------------------------------- darc/archiver.py | 2 +- darc/cache.py | 13 +++++++++---- darc/test.py | 1 + 4 files changed, 20 insertions(+), 42 deletions(-) diff --git a/darc/archive.py b/darc/archive.py index ffb7261e3..6718f5f19 100644 --- a/darc/archive.py +++ b/darc/archive.py @@ -27,8 +27,6 @@ class Archive(object): self.keychain = keychain self.store = store self.items = [] - self.chunks = [] - self.chunk_idx = {} self.hard_links = {} if name: self.load(self.keychain.id_hash(name)) @@ -53,11 +51,10 @@ class Archive(object): assert items['version'] == 1 assert self.metadata['items_hash'] == items_hash self.items = items['items'] - for i, chunk in enumerate(self.chunks): - self.chunk_idx[i] = chunk[0] - def save(self, name): + def save(self, name, cache): self.id = self.keychain.id_hash(name) + self.chunks = [(id, size) for (id, (count, size)) in cache.chunk_counts.iteritems() if count > 1000000] chunks = {'version': 1, 'chunks': self.chunks} data, chunks_hash = self.keychain.encrypt_create(msgpack.packb(chunks)) self.store.put(NS_ARCHIVE_CHUNKS, self.id, data) @@ -124,12 +121,11 @@ class Archive(object): os.link(source, path) else: with open(path, 'wb') as fd: - for chunk in item['chunks']: - id = self.chunk_idx[chunk] + for id in item['chunks']: try: data, hash = self.keychain.decrypt(self.store.get(NS_CHUNK, id)) if self.keychain.id_hash(data) != id: - raise IntegrityError('chunk id did not match') + raise IntegrityError('chunk hash did not match') fd.write(data) except ValueError: raise Exception('Invalid chunk checksum') @@ -161,8 +157,7 @@ class Archive(object): os.utime(path, (item['atime'], item['mtime'])) def verify_file(self, item): - for chunk in item['chunks']: - id = self.chunk_idx[chunk] + for id in item['chunks']: 
try: data, hash = self.keychain.decrypt(self.store.get(NS_CHUNK, id)) if self.keychain.id_hash(data) != id: @@ -239,45 +234,22 @@ class Archive(object): ids = None break else: - chunks = [self.process_chunk2(id, cache) for id in ids] + for id in ids: + cache.chunk_incref(id) # Only chunkify the file if needed if ids is None: - fd = open(path, 'rb') with open(path, 'rb') as fd: size = 0 ids = [] - chunks = [] for chunk in chunkify(fd, CHUNK_SIZE, WINDOW_SIZE, self.keychain.get_chunkify_seed()): - id = self.keychain.id_hash(chunk) - ids.append(id) - try: - chunks.append(self.chunk_idx[id]) - except KeyError: - chunks.append(self.process_chunk(id, chunk, cache)) + ids.append(cache.add_chunk(self.keychain.id_hash(chunk), chunk)) size += len(chunk) cache.memorize_file_chunks(path_hash, st, ids) - item = {'path': safe_path, 'chunks': chunks, 'size': size} + item = {'path': safe_path, 'chunks': ids, 'size': size} item.update(self.stat_attrs(st, path)) self.items.append(item) - def process_chunk2(self, id, cache): - try: - return self.chunk_idx[id] - except KeyError: - idx = len(self.chunks) - id, size = cache.chunk_incref(id) - self.chunks.append((id, size)) - self.chunk_idx[id] = idx - return idx - - def process_chunk(self, id, data, cache): - idx = len(self.chunks) - id, size = cache.add_chunk(id, data) - self.chunks.append((id, size)) - self.chunk_idx[id] = idx - return idx - @staticmethod def list_archives(store, keychain): for id in list(store.list(NS_ARCHIVE_METADATA)): diff --git a/darc/archiver.py b/darc/archiver.py index d3c0f4487..665d5789a 100644 --- a/darc/archiver.py +++ b/darc/archiver.py @@ -75,7 +75,7 @@ class Archiver(object): pass for path in args.paths: self._process(archive, cache, args.patterns, unicode(path)) - archive.save(args.archive.archive) + archive.save(args.archive.archive, cache) cache.save() return self.exit_code diff --git a/darc/cache.py b/darc/cache.py index 5bb6633c0..438c4ca3e 100644 --- a/darc/cache.py +++ b/darc/cache.py @@ -60,6 
+60,10 @@ class Cache(object): yield key, (value[0] + 1,) + value[1:] def save(self): + for id, (count, size) in self.chunk_counts.iteritems(): + if count > 1000000: + self.chunk_counts[id] = count - 1000000, size + cache = {'version': 1, 'tid': self.store.tid, 'chunk_counts': self.chunk_counts, @@ -78,16 +82,17 @@ class Cache(object): data, hash = self.keychain.encrypt_read(data) csize = len(data) self.store.put(NS_CHUNK, id, data) - self.chunk_counts[id] = (1, csize) - return id, csize + self.chunk_counts[id] = (1000001, csize) + return id def seen_chunk(self, id): return self.chunk_counts.get(id, (0, 0))[0] def chunk_incref(self, id): count, size = self.chunk_counts[id] - self.chunk_counts[id] = (count + 1, size) - return id, size + if count < 1000000: + self.chunk_counts[id] = (count + 1000001, size) + return id def chunk_decref(self, id): count, size = self.chunk_counts[id] diff --git a/darc/test.py b/darc/test.py index b9bdea66f..dc6e813ce 100644 --- a/darc/test.py +++ b/darc/test.py @@ -97,6 +97,7 @@ class Test(unittest.TestCase): os.symlink('somewhere', os.path.join(self.input_path, 'link1')) os.mkfifo(os.path.join(self.input_path, 'fifo1')) self.darc('create', self.store_path + '::test', 'input') + self.darc('create', self.store_path + '::test.2', 'input') self.darc('extract', self.store_path + '::test', 'output') self.diff_dirs('input', 'output/input')