From c9f35a16e9bf9e7073c486553177cef79ff1cb06 Mon Sep 17 00:00:00 2001 From: nain <126972030+F49FF806@users.noreply.github.com> Date: Thu, 25 May 2023 11:25:10 -0400 Subject: [PATCH] unify scanning and listing of segment dirs / segment files and apply good practices + os.scandir instead of os.listdir Improved speed and added flexibility with attributes (name,path,is_dir(),is_file()) + use is_dir / is_file to make sure we're reading only dirs / files respectively + Filtering to particular start, end index range built in + Have seperate generators to avoid unnecessary less_than / greater_than checking for every dir/file when not required Resolves #7597 --- src/borg/repository.py | 46 ++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 61905b394..203596c85 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1319,28 +1319,52 @@ class LoggedIO: safe_fadvise(fd.fileno(), 0, 0, 'DONTNEED') fd.close() + def get_segment_dirs(self, start_index=None, end_index=None): + """Returns generator yeilding required segment dirs as `os.DirEntry` objects. Start and end are inclusive. + """ + data_dir = os.path.join(self.path, 'data') + if start_index is None and end_index is None: + segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit()) + elif start_index is not None and end_index is None: + segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and start_index <= int(f.name)) + elif start_index is None and end_index is not None: + segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and int(f.name) <= end_index) + elif start_index is not None and end_index is not None: + segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and start_index <= int(f.name) <= end_index) + return segment_dirs + + def get_segment_files(self, segment_dir, start_index=None, end_index=None): + """Returns generator yeilding required segment files in segment_dir as `os.DirEntry` objects. Start and end are inclusive. + """ + if start_index is None and end_index is None: + segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit()) + elif start_index is not None and end_index is None: + segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and start_index <= int(f.name)) + elif start_index is None and end_index is not None: + segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and int(f.name) <= end_index) + elif start_index is not None and end_index is not None: + segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and start_index <= int(f.name) <= end_index) + return segment_files + def segment_iterator(self, segment=None, reverse=False): if segment is None: segment = 0 if not reverse else 2 ** 32 - 1 - data_path = os.path.join(self.path, 'data') start_segment_dir = segment // self.segments_per_dir - dirs = os.listdir(data_path) if not reverse: - dirs = [dir for dir in dirs if dir.isdigit() and int(dir) >= start_segment_dir] + dirs = self.get_segment_dirs(start_index=start_segment_dir) else: - dirs = [dir for dir in dirs if dir.isdigit() and int(dir) <= start_segment_dir] - dirs = sorted(dirs, key=int, reverse=reverse) + dirs = self.get_segment_dirs(end_index=start_segment_dir) + dirs = sorted(dirs, key=lambda dir: int(dir.name), reverse=reverse) for dir in dirs: - filenames = os.listdir(os.path.join(data_path, dir)) if not reverse: - filenames = [filename for filename in filenames if filename.isdigit() and int(filename) >= segment] + files = self.get_segment_files(dir, start_index=segment) else: - filenames = [filename for filename in filenames if filename.isdigit() and int(filename) <= segment] - filenames = sorted(filenames, key=int, reverse=reverse) - for filename in filenames: + files = self.get_segment_files(dir, end_index=segment) + files = sorted(files, key=lambda file: int(file.name), reverse=reverse) + for file in files: # Note: Do not filter out logically deleted segments (see "File system interaction" above), # since this is used by cleanup and txn state detection as well. - yield int(filename), os.path.join(data_path, dir, filename) + yield int(file.name), file.path def get_latest_segment(self): for segment, filename in self.segment_iterator(reverse=True):