X-Git-Url: http://git.vrable.net/?a=blobdiff_plain;ds=sidebyside;f=python%2Fcumulus%2F__init__.py;h=323a7c75b4679ca6293ff9a6ee67d7aa6806e217;hb=da1d95d3242ee9d596e60b8d5bfcf9e5bedcd80f;hp=ef353257cfcae6d00114c69a2ce09ea1243255c6;hpb=5949214bc01b2c762adfb724d1e63b7e130c91f4;p=cumulus.git diff --git a/python/cumulus/__init__.py b/python/cumulus/__init__.py index ef35325..323a7c7 100644 --- a/python/cumulus/__init__.py +++ b/python/cumulus/__init__.py @@ -26,19 +26,31 @@ various parts of a Cumulus archive: - reading and maintaining the local object database """ -from __future__ import division +from __future__ import division, print_function, unicode_literals + +import codecs import hashlib import itertools import os import re import sqlite3 +import subprocess +import sys import tarfile import tempfile -import thread +try: + import _thread +except ImportError: + import thread as _thread import cumulus.store import cumulus.store.file +if sys.version < "3": + StringTypes = (str, unicode) +else: + StringTypes = (str,) + # The largest supported snapshot format that can be understood. FORMAT_VERSION = (0, 11) # Cumulus Snapshot v0.11 @@ -58,6 +70,12 @@ SEGMENT_FILTERS = [ ("", None), ] +def to_lines(data): + """Decode binary data from a file into a sequence of lines. + + Newline markers are retained.""" + return list(codecs.iterdecode(data.splitlines(True), "utf-8")) + def uri_decode(s): """Decode a URI-encoded (%xx escapes) string.""" def hex_decode(m): return chr(int(m.group(1), 16)) @@ -219,7 +237,7 @@ class SearchPath(object): except cumulus.store.NotFoundError: pass if not success: - raise cumulus.store.NotFoundError(basename) + raise cumulus.store.NotFoundError(backend) def _build_segments_searchpath(prefix): for (extension, filter) in SEGMENT_FILTERS: @@ -231,6 +249,9 @@ SEARCH_PATHS = { [SearchPathEntry("meta", ".sha1sums"), SearchPathEntry("checksums", ".sha1sums"), SearchPathEntry("", ".sha1sums")]), + "meta": SearchPath( + r"^snapshot-(.*)\.meta(\.\S+)?$", + _build_segments_searchpath("meta")), "segments": SearchPath( (r"^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})" r"\.tar(\.\S+)?$"), @@ -261,11 +282,8 @@ class BackendWrapper(object): store may either be a Store object or URL. """ - if type(backend) in (str, unicode): - if backend.find(":") >= 0: - self._backend = cumulus.store.open(backend) - else: - self._backend = cumulus.store.file.FileStore(backend) + if type(backend) in StringTypes: + self._backend = cumulus.store.open(backend) else: self._backend = backend @@ -289,6 +307,15 @@ class BackendWrapper(object): return ((x[1].group(1), x[0]) for x in SEARCH_PATHS[filetype].list(self._backend)) + def prefetch_generic(self): + """Calls scan on directories to prefetch file metadata.""" + directories = set() + for typeinfo in SEARCH_PATHS.values(): + directories.update(typeinfo.directories()) + for d in directories: + print("Prefetch", d) + self._backend.scan(d) + class CumulusStore: def __init__(self, backend): if isinstance(backend, BackendWrapper): @@ -316,7 +343,7 @@ class CumulusStore: if m: return ("zero", None, None, (0, int(m.group(1)), False)) - m = re.match(r"^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[(((\d+)\+)?(\d+)|=(\d+))\])?$", refstr) + m = re.match(r"^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[(=?(\d+)|(\d+)\+(\d+))\])?$", refstr) if not m: return segment = m.group(1) @@ -328,12 +355,9 @@ class CumulusStore: checksum = checksum.lstrip("(").rstrip(")") if slice is not None: - if m.group(9) is not None: + if m.group(6) is not None: # Size-assertion slice - slice = (0, int(m.group(9)), True) - elif m.group(6) is None: - # Abbreviated slice - slice = (0, int(m.group(8)), False) + slice = (0, int(m.group(6)), True) else: slice = (int(m.group(7)), int(m.group(8)), False) @@ -347,13 +371,15 @@ class CumulusStore: def load_snapshot(self, snapshot): snapshot_file = self.backend.open_snapshot(snapshot)[0] - return snapshot_file.read().splitlines(True) + return to_lines(snapshot_file.read()) @staticmethod def filter_data(filehandle, filter_cmd): if filter_cmd is None: return filehandle - (input, output) = os.popen2(filter_cmd) + p = subprocess.Popen(filter_cmd, shell=True, stdin=subprocess.PIPE, + stdout=subprocess.PIPE, close_fds=True) + input, output = p.stdin, p.stdout def copy_thread(src, dst): BLOCK_SIZE = 4096 while True: @@ -362,7 +388,8 @@ class CumulusStore: dst.write(block) src.close() dst.close() - thread.start_new_thread(copy_thread, (filehandle, input)) + p.wait() + _thread.start_new_thread(copy_thread, (filehandle, input)) return output def get_segment(self, segment): @@ -421,12 +448,18 @@ class CumulusStore: if slice is not None: (start, length, exact) = slice + # Note: The following assertion check may need to be commented out + # to restore from pre-v0.8 snapshots, as the syntax for + # size-assertion slices has changed. if exact and len(data) != length: raise ValueError data = data[start:start+length] if len(data) != length: raise IndexError return data + def prefetch(self): + self.backend.prefetch_generic() + def parse(lines, terminate=None): """Generic parser for RFC822-style "Key: Value" data streams. @@ -466,7 +499,7 @@ def parse(lines, terminate=None): def parse_full(lines): try: - return parse(lines).next() + return next(parse(lines)) except StopIteration: return {} @@ -491,7 +524,7 @@ def read_metadata(object_store, root): def follow_ref(refstr): if len(stack) >= MAX_RECURSION_DEPTH: raise OverflowError - lines = object_store.get(refstr).splitlines(True) + lines = to_lines(object_store.get(refstr)) lines.reverse() stack.append(lines) @@ -721,7 +754,7 @@ class LocalDatabase: can_delete = True if can_delete and not first: - print "Delete snapshot %d (%s)" % (id, name) + print("Delete snapshot %d (%s)" % (id, name)) cur.execute("delete from snapshots where snapshotid = ?", (id,)) first = False @@ -927,11 +960,11 @@ class LocalDatabase: target_size = max(2 * segment_size_estimate, total_bytes / target_buckets) - print "segment_size:", segment_size_estimate - print "distribution:", distribution - print "total_bytes:", total_bytes - print "target_buckets:", target_buckets - print "min, target size:", min_size, target_size + print("segment_size:", segment_size_estimate) + print("distribution:", distribution) + print("total_bytes:", total_bytes) + print("target_buckets:", target_buckets) + print("min, target size:", min_size, target_size) # Chosen cutoffs. Each bucket consists of objects with age greater # than one cutoff value, but not greater than the next largest cutoff. @@ -961,7 +994,7 @@ class LocalDatabase: cutoffs.append(-1) cutoffs.append(-1) - print "cutoffs:", cutoffs + print("cutoffs:", cutoffs) # Update the database to assign each object to the appropriate bucket. cutoffs.reverse()