From 2766693706729bf656d15f8cab272a5b78304381 Mon Sep 17 00:00:00 2001 From: Marian Beermann Date: Thu, 15 Jun 2017 23:50:17 +0200 Subject: [PATCH] fuse: update comments --- src/borg/fuse.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/borg/fuse.py b/src/borg/fuse.py index b2db06813..8492ee3f5 100644 --- a/src/borg/fuse.py +++ b/src/borg/fuse.py @@ -43,8 +43,9 @@ class ItemCache: and retrieves items from these inode numbers. """ - # Approximately ~230000 items (depends on the average number of items per metadata chunk) - # Since growing a bytearray has to copy it, growing it will converge to o(n), however, + # 2 MiB are approximately ~230000 items (depends on the average number of items per metadata chunk). + # + # Since growing a bytearray has to copy it, growing it will converge to O(n^2), however, # this is not yet relevant due to the swiftness of copying memory. If it becomes an issue, # use an anonymous mmap and just resize that (or, if on 64 bit, make it so big you never need # to resize it in the first place; that's free). @@ -58,32 +59,35 @@ class ItemCache: # self.meta, the "meta-array" is a densely packed array of metadata about where items can be found. # It is indexed by the inode number minus self.offset. (This is in a way eerily similar to how the first # unices did this). - # The meta-array contains chunk IDs and item entries (described in inode_for_current_item). + # The meta-array contains chunk IDs and item entries (described in iter_archive_items). # The chunk IDs are referenced by item entries through relative offsets, # which are bounded by the metadata chunk size. self.meta = bytearray() # The current write offset in self.meta self.write_offset = 0 - # Offset added to meta-indices, resulting in an inode, - # or substracted from inodes, resulting in a meta-indices. 
+ # Offset added to meta-indices, resulting in inodes, + # or subtracted from inodes, resulting in meta-indices. + # XXX: Merge FuseOperations.items and ItemCache to avoid + # this implicit limitation / hack (on the number of synthetic inodes, degenerate + # cases can inflate their number far beyond the number of archives). self.offset = 1000000 # A temporary file that contains direct items, i.e. items directly cached in this layer. # These are items that span more than one chunk and thus cannot be efficiently cached # by the object cache (self.decrypted_repository), which would require variable-length structures; - # possible but not worth the effort, see inode_for_current_item. + # possible but not worth the effort, see iter_archive_items. self.fd = tempfile.TemporaryFile(prefix='borg-tmp') # A small LRU cache for chunks requested by ItemCache.get() from the object cache, # this significantly speeds up directory traversal and similar operations which # tend to re-read the same chunks over and over. # The capacity is kept low because increasing it does not provide any significant advantage, - # but makes LRUCache's square behaviour noticeable as well as consuming some memory. + # but makes LRUCache's quadratic behaviour noticeable and consumes more memory. self.chunks = LRUCache(capacity=10, dispose=lambda _: None) # Instrumentation - # Count of indirect items, i.e. data is cached in the object cache, in this cache + # Count of indirect items, i.e. data is cached in the object cache, not directly in this cache self.indirect_items = 0 # Count of direct items, i.e. 
data is in self.fd self.direct_items = 0 @@ -92,16 +96,11 @@ class ItemCache: offset = inode - self.offset if offset < 0: raise ValueError('ItemCache.get() called with an invalid inode number') - if self.meta[offset] == ord(b'S'): - fd_offset = int.from_bytes(self.meta[offset + 1:offset + 9], 'little') - self.fd.seek(fd_offset, io.SEEK_SET) - return Item(internal_dict=next(msgpack.Unpacker(self.fd, read_size=1024))) - else: + if self.meta[offset] == ord(b'I'): _, chunk_id_relative_offset, chunk_offset = self.indirect_entry_struct.unpack_from(self.meta, offset) chunk_id_offset = offset - chunk_id_relative_offset # bytearray slices are bytearrays as well, explicitly convert to bytes() chunk_id = bytes(self.meta[chunk_id_offset:chunk_id_offset + 32]) - chunk_offset = int.from_bytes(self.meta[offset + 5:offset + 9], 'little') chunk = self.chunks.get(chunk_id) if not chunk: csize, chunk = next(self.decrypted_repository.get_many([chunk_id])) @@ -110,12 +109,21 @@ class ItemCache: unpacker = msgpack.Unpacker() unpacker.feed(data) return Item(internal_dict=next(unpacker)) + elif self.meta[offset] == ord(b'S'): + fd_offset = int.from_bytes(self.meta[offset + 1:offset + 9], 'little') + self.fd.seek(fd_offset, io.SEEK_SET) + return Item(internal_dict=next(msgpack.Unpacker(self.fd, read_size=1024))) + else: + raise ValueError('Invalid entry type in self.meta') def iter_archive_items(self, archive_item_ids): unpacker = msgpack.Unpacker() + # Current offset in the metadata stream, which consists of all metadata chunks glued together stream_offset = 0 + # Offset of the current chunk in the metadata stream chunk_begin = 0 + # Length of the chunk preceding the current chunk last_chunk_length = 0 msgpacked_bytes = b'' @@ -124,6 +132,7 @@ class ItemCache: pack_indirect_into = self.indirect_entry_struct.pack_into def write_bytes(append_msgpacked_bytes): + # XXX: Future versions of msgpack include an Unpacker.tell() method that provides this for free. 
nonlocal msgpacked_bytes nonlocal stream_offset msgpacked_bytes += append_msgpacked_bytes @@ -150,9 +159,9 @@ class ItemCache: # Need more data, feed the next chunk break - current_item_length = len(msgpacked_bytes) - current_spans_chunks = stream_offset - current_item_length <= chunk_begin current_item = msgpacked_bytes + current_item_length = len(current_item) + current_spans_chunks = stream_offset - current_item_length < chunk_begin msgpacked_bytes = b'' if write_offset + 9 >= len(meta): @@ -178,15 +187,13 @@ class ItemCache: pos = self.fd.seek(0, io.SEEK_END) self.fd.write(current_item) meta[write_offset:write_offset + 9] = b'S' + pos.to_bytes(8, 'little') - write_offset += 9 self.direct_items += 1 - inode = write_offset - 9 + self.offset else: item_offset = stream_offset - current_item_length - chunk_begin pack_indirect_into(meta, write_offset, b'I', write_offset - current_id_offset, item_offset) - write_offset += 9 self.indirect_items += 1 - inode = write_offset - 9 + self.offset + inode = write_offset + self.offset + write_offset += 9 yield inode, Item(internal_dict=item)