Start a proof-of-concept FUSE interface to old snapshots.
authorMichael Vrable <mvrable@cs.ucsd.edu>
Tue, 12 Aug 2008 22:14:35 +0000 (15:14 -0700)
committerMichael Vrable <mvrable@turin.ucsd.edu>
Tue, 12 Aug 2008 22:14:35 +0000 (15:14 -0700)
Begin work on a FUSE interface to cumulus, allowing old snapshots to
displayed as a mounted filesystem.  Though only partly-implemented, already
it is possible to read the directory structure and stat information for
files.  File contents cannot yet be extracted.

To implement this efficiently, random access to cumulus metadata was
implemented through the use of a binary search.  Some optimization is still
needed, and some caching should probably still be added.

.gitignore
contrib/cumulus-fuse [new file with mode: 0755]
python/cumulus/metadata.py [new file with mode: 0644]
python/cumulus/store/file.py [new file with mode: 0644]

index d0ae7ac..23672b5 100644 (file)
@@ -1,5 +1,5 @@
 *.o
 *.pyc
-cumulus
-Makefile.dep
-version
+/cumulus
+/Makefile.dep
+/version
diff --git a/contrib/cumulus-fuse b/contrib/cumulus-fuse
new file mode 100755 (executable)
index 0000000..8849335
--- /dev/null
@@ -0,0 +1,199 @@
+#!/usr/bin/python
+#
+# FUSE interface to Cumulus, allowing snapshots to be mounted as a virtual
+# filesystem.
+#
+# Copyright (C) 2006-2008  The Regents of the University of California
+# Written by Michael Vrable <mvrable@cs.ucsd.edu>
+#
+# This program can be distributed under the terms of the GNU GPL, either
+# version 2 of the License, or (at your option) any later version.  See the
+# file COPYING.
+
+import itertools, os, stat, errno
+import fuse
+from fuse import Fuse
+import cumulus
+import cumulus.metadata
+
+fuse.fuse_python_api = (0, 2)
+
+# TODO: Figure out FUSE option parsing
+lowlevel = cumulus.LowlevelDataStore('/backups/lbs/turin')
+store = cumulus.ObjectStore(lowlevel)
+
+def _printable(ptr):
+    if ptr is None: return None
+    return tuple(x[1] for x in ptr)
+
+def parse_path(path):
+    """Strip leading slashe from path, and split apart into components."""
+    if not path.startswith('/'):
+        return None
+    if path == '/':
+        return []
+    else:
+        return path[1:].split('/')
+
+def load_metadata(path):
+    if type(path) != type([]):
+        path = parse_path(path)
+
+    if path is None or len(path) < 2:
+        return None
+
+    snapshot = cumulus.parse_full(store.load_snapshot(path[0]))
+    metadata = cumulus.metadata.Metadata(store, snapshot['Root'])
+    ptr = metadata.search(lambda x: cmp(x, path[1:]))
+    item = metadata._read(ptr)
+    if metadata._get_path(item) != path[1:]:
+        return None
+    return cumulus.MetadataItem(item, store)
+
+class MyStat(fuse.Stat):
+    def __init__(self):
+        self.st_mode = 0
+        self.st_ino = 0
+        self.st_dev = 0
+        self.st_nlink = 0
+        self.st_uid = 0
+        self.st_gid = 0
+        self.st_size = 0
+        self.st_atime = 0
+        self.st_mtime = 0
+        self.st_ctime = 0
+
+class CumulusFS(Fuse):
+    def getattr(self, path):
+        st = MyStat()
+        path = parse_path(path)
+
+        if path is None: return -errno.ENOENT
+        if path == []:
+            # Root directory
+            st.st_mode = stat.S_IFDIR | 0755
+            st.st_nlink = 2
+            return st
+
+        snapshot = cumulus.parse_full(store.load_snapshot(path[0]))
+        if len(path) == 1:
+            # Snapshot directory
+            st.st_mode = stat.S_IFDIR | 0755
+            st.st_nlink = 2
+        else:
+            # File contained within a snapshot
+            m = load_metadata(path)
+            if m is None:
+                return -errno.ENOENT
+
+            st.st_nlink = 1
+            st.st_uid = m.items.user[0]
+            st.st_gid = m.items.group[0]
+            st.st_mtime = m.items.mtime
+            st.st_ctime = m.items.ctime
+            st.st_atime = m.items.mtime
+            if m.items.type == 'd':
+                st.st_mode = stat.S_IFDIR | m.items.mode
+                st.st_nlink = 2
+            elif m.items.type == 'l':
+                st.st_mode = stat.S_IFLNK | m.items.mode
+            else:
+                st.st_mode = stat.S_IFREG | m.items.mode
+                st.st_size = m.items.size
+
+        return st
+
+    def _cumulus_readdir(self, metadata, path):
+        # Find pointer to base directory in metadata
+        ptr1 = metadata.search(lambda x: cmp(x, path))
+
+        # Find pointer to end of directory contents
+        def endcmp(p1):
+            def _cmp(p2):
+                if len(p2) > len(p1): p2 = p2[0:len(p1)]
+                if p2 > p1:
+                    return 1
+                else:
+                    return -1
+            return _cmp
+        ptr2 = metadata.search(endcmp(path), ptr1)
+
+        # Scan through looking for top-level files and directories.  Skip over
+        # data for files in subdirectories.
+        while metadata._cmp(ptr1, ptr2) < 0:
+            item = metadata._read(ptr1)
+            m = cumulus.MetadataItem(item, store)
+            if m.items.name == '.':
+                itempath = []
+            else:
+                itempath = m.items.name.split('/')
+            assert itempath[0:len(path)] == path
+
+            if len(itempath) == len(path):
+                ptr1 = metadata._advance(ptr1)
+                continue
+
+            if len(itempath) > len(path) + 1:
+                ptr1 = metadata.search(endcmp(itempath[0:len(path)+1]),
+                                       ptr1, ptr2)
+                continue
+
+            yield itempath[len(path)]
+            ptr1 = metadata._advance(ptr1)
+
+    def readdir(self, path, offset):
+        if path == '/':
+            for r in itertools.chain(('.', '..'), lowlevel.list_snapshots()):
+                yield fuse.Direntry(r)
+        else:
+            path = parse_path(path)
+            if path is None:
+                return
+            snapshot = cumulus.parse_full(store.load_snapshot(path[0]))
+            metadata = cumulus.metadata.Metadata(store, snapshot['Root'])
+            for r in itertools.chain(('.', '..'),
+                                     self._cumulus_readdir(metadata, path[1:])):
+                yield fuse.Direntry(r)
+
+    def readlink(self, path):
+        m = load_metadata(path)
+        if m is None:
+            return -errno.ENOENT
+        else:
+            return m.items.target
+
+    def open(self, path, flags):
+        m = load_metadata(path)
+        if m is None:
+            return -errno.ENOENT
+        accmode = os.O_RDONLY | os.O_WRONLY | os.O_RDWR
+        if (flags & accmode) != os.O_RDONLY:
+            return -errno.EACCES
+
+    def read(self, path, size, offset):
+        m = load_metadata(path)
+        if m is None:
+            return -errno.ENOENT
+
+        return '\0' * size
+
+def main():
+    usage="""
+cumulus-fuse: Mount cumulus snapshots as a filesystem
+
+""" + Fuse.fusage
+    server = CumulusFS(version="%prog " + fuse.__version__,
+                       usage=usage,
+                       dash_s_do='setsingle')
+
+    server.parser.add_option(mountopt="root", metavar="PATH", default='/',
+                             help="read snapshots from PATH [default: %default]")
+
+    server.parse(errex=1)
+    print server.fuse_args
+    print server.fuse_args.assemble()
+    server.main()
+    store.cleanup()
+
+if __name__ == '__main__':
+    main()
diff --git a/python/cumulus/metadata.py b/python/cumulus/metadata.py
new file mode 100644 (file)
index 0000000..e5d5507
--- /dev/null
@@ -0,0 +1,182 @@
+"""Advanced metadata iterator for Cumulus snapshots.
+
+Allow fast random access to Cumulus metadata logs.  This requires that the
+underlying metadata log have been generated by a depth-first-search traversal
+of the filesystem, in sorted order.
+"""
+
+import cumulus
+
+class Metadata:
+    def __init__(self, object_store, root):
+        self.store = object_store
+        self.root = root
+
+    def _load(self, ref):
+        """Return the contents of the given object, as a list of text lines."""
+
+        data = self.store.get(ref)
+        return data.splitlines()
+
+    def _read(self, ptr):
+        """Parse the metadata item at the given address and return it."""
+
+        if ptr is None or ptr == []: return {}
+
+        (ref, n) = ptr[-1]
+        lines = self._load(ref)[n:]
+
+        try:
+            return cumulus.parse(lines, lambda l: len(l) == 0).next()
+        except StopIteration:
+            return {}
+
+    def _advance(self, ptr):
+        """Advance the specified metadata pointer to the next metadata item."""
+
+        if ptr == None:
+            return None
+
+        advanced = False
+        ptr = list(ptr)
+        if ptr == []:
+            ptr.append((self.root, 0))
+            advanced = True
+
+        while True:
+            last = ptr.pop(-1)
+            lines = self._load(last[0])
+
+            # Reached the end of the current object?  Advance pointer by one
+            # line in the containing object (or return None if reached the very
+            # end of input).
+            if last[1] >= len(lines):
+                if ptr == []: return None
+                last = ptr.pop(-1)
+                ptr.append((last[0], last[1] + 1))
+                advanced = True
+                continue
+
+            # Reached a line with an indirect reference?  Recurse.
+            line = lines[last[1]]
+            if line.startswith('@'):
+                ptr.append(last)
+                ptr.append((line[1:], 0))
+                advanced = True
+                continue
+
+            # Skip over blank lines.
+            if line == "":
+                ptr.append((last[0], last[1] + 1))
+                advanced = True
+                continue
+
+            # Skip over the remainder of a metadata stanza, if we started in
+            # the middle of one.
+            if not advanced:
+                ptr.append((last[0], last[1] + 1))
+                continue
+
+            # Done!  Found a non-blank line which is not an indirect reference.
+            ptr.append(last)
+            return ptr
+
+    def _cmp(self, ptr1, ptr2):
+        if ptr1 is None and ptr2 is None: return 0
+        if ptr1 is None: return 1
+        if ptr2 is None: return -1
+        return cmp(ptr1, ptr2)
+
+    def _get_path(self, metadata):
+        if metadata is None or 'name' not in metadata:
+            return None
+        path = metadata['name']
+        path = cumulus.MetadataItem.decode_str(path)
+        if path == '.': return []
+        return path.split('/')
+
+    def _midpoint(self, ptr1, ptr2):
+        """Return a pointer to some metadata item in the range [ptr1, ptr2)."""
+
+        if ptr1 == []: ptr1 = self._advance([])
+        if ptr1 is None: return None
+        if self._cmp(ptr1, ptr2) > 0: return None
+
+        #print "ptr1:", ptr1
+        #print "ptr2:", ptr2
+
+        level = 0
+        while True:
+            if level >= len(ptr1): return ptr1
+
+            if ptr2 is not None and level < len(ptr2) and ptr1[level] == ptr2[level]:
+                level += 1
+                continue
+
+            midpoint = ptr1[0:level]
+            lastref = ptr1[level][0]
+            limit = len(self._load(lastref))
+            if ptr2 is not None and level < len(ptr2) \
+                    and ptr1[level][0] == ptr2[level][0]:
+                limit = ptr2[level][1]
+            midpoint.append((lastref, (ptr1[level][1] + limit) // 2))
+
+            if self._cmp(midpoint, ptr1) < 0:
+                #print "    ...descend"
+                level += 1
+                continue
+
+            #print "Guess midpoint:", midpoint
+            midpoint = self._advance(midpoint)
+            #print "    ...advanced to", midpoint
+
+            if self._cmp(midpoint, ptr2) >= 0:
+                #print "    ...overshoot, trying again"
+                level += 1
+                continue
+
+            return midpoint
+
+    def search(self, searchfunc, ptr1=[], ptr2=None):
+        """Perform a binary search for name.
+
+        The search is restricted to the interval [ptr1, ptr2).  Return either a
+        pointer to the item for name, or if it doesn't exist the pointer to the
+        item which follows where it would have been (assuming the original
+        search interval included that location)."""
+
+        def _printable(ptr):
+            if ptr is None: return None
+            return tuple(x[1] for x in ptr)
+
+        while True:
+            #print _printable(ptr1), "...", _printable(ptr2)
+            if self._cmp(ptr1, ptr2) >= 0:
+                #print "    X", _printable(ptr1)
+                return ptr1
+
+            mid = self._midpoint(ptr1, ptr2)
+            midpath = self._get_path(self._read(mid))
+
+            c = searchfunc(midpath)
+            if c == 0:
+                #print "    =", _printable(mid), midpath
+                return mid
+            elif c < 0:
+                #print "    >", _printable(mid), midpath
+                ptr1 = self._advance(mid)
+            else:
+                #print "    <", _printable(mid), midpath
+                ptr2 = mid
+
+if __name__ == '__main__':
+    import cumulus
+
+    store = cumulus.ObjectStore(cumulus.LowlevelDataStore('/backups/lbs/turin'))
+    root = '9e0be287-a74c-4527-aa26-0a10e8f40f7d/00000243(sha1=316e034bb5b86e5ba00b1d54335427d7b742f4f9)'
+
+    metadata = Metadata(store, root)
+    ptr = metadata.search(['home', 'mvrable', 'docs'])
+    print ptr
+    print metadata._read(ptr)
+    store.cleanup()
diff --git a/python/cumulus/store/file.py b/python/cumulus/store/file.py
new file mode 100644 (file)
index 0000000..6034835
--- /dev/null
@@ -0,0 +1,37 @@
+import os, sys, tempfile
+
+import cumulus.store
+
+type_patterns = cumulus.store.type_patterns
+
+class FileStore(cumulus.store.Store):
+    def __init__(self, prefix):
+        while prefix.endswith("/") and prefix != "/": prefix = prefix[:-1]
+        self.prefix = prefix
+
+    def _get_path(self, type, name):
+        return "%s/%s" % (self.prefix, name)
+
+    def list(self, type):
+        files = os.listdir(self.prefix)
+        return (f for f in files if type_patterns[type].match(f))
+
+    def get(self, type, name):
+        k = self._get_path(type, name)
+        return open(k, 'rb')
+
+    def put(self, type, name, fp):
+        k = self._get_path(type, name)
+        out = open(k, 'wb')
+        buf = fp.read(4096)
+        while len(buf) > 0:
+            out.write(buf)
+            buf = fp.read(4096)
+
+    def delete(self, type, name):
+        k = self._get_path(type, name)
+        os.unlink(k)
+
+    def stat(self, type, name):
+        stat = os.stat(self._get_path(type, name))
+        return {'size': stat.st_size}