Changes to the Cumulus backup format and tools.
authorMichael Vrable <vrable@cs.hmc.edu>
Fri, 14 Dec 2012 04:34:21 +0000 (20:34 -0800)
committerMichael Vrable <vrable@cs.hmc.edu>
Wed, 22 Jan 2014 05:11:09 +0000 (21:11 -0800)
  - Switch to a hierarchical file layout.
  - Remove old references to the "LBS" name.

12 files changed:
doc/format.txt
localdb.cc
localdb.h
main.cc
python/cumulus/__init__.py
python/cumulus/store/__init__.py
python/cumulus/store/file.py
remote.cc
schema.sql
store.cc
tests/digest_tree
tests/run-test

index 582ef59..19bf2f2 100644 (file)
@@ -19,6 +19,30 @@ This document does not explain the rationale behind the format; for
 that, see design.txt.
 
 
+BACKUP REPOSITORY LAYOUT
+========================
+
+Cumulus backups are stored using a relatively simple layout.  Data files
+described below are written into one of several directories on the
+backup server, depending on their purpose:
+    snapshots/
+        Snapshot descriptor files, which quickly summarize each backup
+        snapshot stored.
+    segments0/
+    segments1/
+        Storage of the bulk of the backup data, in compressed/encrypted
+        form.  Technically any segment could be stored in either
+        directory (both directories will be searched when looking for a
+        segment).  However, data in segments0 might be faster to access
+        (but more expensive) depending on the storage backend.  The
+        intent is that segments0 can store filesystem tree metadata and
+        segments1 can store file contents.
+    meta/
+        Snapshot-specific metadata that is not core to the backup.  This
+        can include checksums of segments, some data for rebuilding
+        local database contents, etc.
+
+
 DATA CHECKSUMS
 ==============
 
@@ -71,6 +95,8 @@ fixed points; an example UUID is
 This segment could be stored in the filesystem as a file
     a704eeae-97f2-4f30-91a4-d4473956366b.tar
 The UUID used to name a segment is assigned when the segment is created.
+These files are stored in either the segments0 or segments1 directory
+on the backup server.
 
 Filters can be layered on top of the segment storage to provide
 compression, encryption, or other features.  For example, the example
@@ -101,8 +127,8 @@ object.
 
 NOTE: When naming an object, the segment portion consists of the UUID
 only.  Any extensions appended to the segment when storing it as a file
-in the filesystem (for example, .tar.bz2) are _not_ part of the name of
-the object.
+in the filesystem (for example, .tar.bz2) and path information (for
+example, segments0) are _not_ part of the name of the object.
 
 There are two additional components which may appear in an object name;
 both are optional.
index ca83559..7cabcb1 100644 (file)
@@ -283,17 +283,6 @@ void LocalDb::StoreObject(const ObjectReference& ref, double age)
     }
 
     sqlite3_finalize(stmt);
-
-    if (age != 0.0) {
-        stmt = Prepare("update segments "
-                       "set mtime = coalesce(max(mtime, ?), ?) "
-                       "where segmentid = ?");
-        sqlite3_bind_double(stmt, 1, age);
-        sqlite3_bind_double(stmt, 2, age);
-        sqlite3_bind_int64(stmt, 3, SegmentToId(ref.get_segment()));
-        rc = sqlite3_step(stmt);
-        sqlite3_finalize(stmt);
-    }
 }
 
 ObjectReference LocalDb::FindObject(const string &checksum, int64_t size)
@@ -495,25 +484,28 @@ void LocalDb::UseObject(const ObjectReference& ref)
     }
 }
 
-void LocalDb::SetSegmentChecksum(const std::string &segment,
+void LocalDb::SetSegmentMetadata(const std::string &segment,
                                  const std::string &path,
                                  const std::string &checksum,
+                                 const std::string &type,
                                  int data_size, int disk_size)
 {
     int rc;
     sqlite3_stmt *stmt;
 
     stmt = Prepare("update segments set path = ?, checksum = ?, "
-                   "data_size = ?, disk_size = ?, "
+                   "type = ?, data_size = ?, disk_size = ?, "
                    "mtime = coalesce(mtime, julianday('now')) "
                    "where segmentid = ?");
     sqlite3_bind_text(stmt, 1, path.c_str(), path.size(),
                       SQLITE_TRANSIENT);
     sqlite3_bind_text(stmt, 2, checksum.c_str(), checksum.size(),
                       SQLITE_TRANSIENT);
-    sqlite3_bind_int64(stmt, 3, data_size);
-    sqlite3_bind_int64(stmt, 4, disk_size);
-    sqlite3_bind_int64(stmt, 5, SegmentToId(segment));
+    sqlite3_bind_text(stmt, 3, type.c_str(), type.size(),
+                      SQLITE_TRANSIENT);
+    sqlite3_bind_int64(stmt, 4, data_size);
+    sqlite3_bind_int64(stmt, 5, disk_size);
+    sqlite3_bind_int64(stmt, 6, SegmentToId(segment));
 
     rc = sqlite3_step(stmt);
     if (rc != SQLITE_DONE) {
@@ -524,7 +516,7 @@ void LocalDb::SetSegmentChecksum(const std::string &segment,
     sqlite3_finalize(stmt);
 }
 
-bool LocalDb::GetSegmentChecksum(const string &segment,
+bool LocalDb::GetSegmentMetadata(const string &segment,
                                  string *seg_path,
                                  string *seg_checksum)
 {
index 7764aed..6d2190d 100644 (file)
--- a/localdb.h
+++ b/localdb.h
@@ -49,10 +49,11 @@ public:
     void UseObject(const ObjectReference& ref);
 
     std::set<std::string> GetUsedSegments();
-    void SetSegmentChecksum(const std::string &segment, const std::string &path,
+    void SetSegmentMetadata(const std::string &segment, const std::string &path,
                             const std::string &checksum,
-                            int data_size, int disk_size);
-    bool GetSegmentChecksum(const std::string &segment,
+                            const std::string &type, int data_size,
+                            int disk_size);
+    bool GetSegmentMetadata(const std::string &segment,
                             std::string *seg_path, std::string *seg_checksum);
 
     bool LoadChunkSignatures(ObjectReference ref,
diff --git a/main.cc b/main.cc
index e10a04a..fa8e4c9 100644 (file)
--- a/main.cc
+++ b/main.cc
@@ -806,7 +806,7 @@ int main(int argc, char *argv[])
      * a temporary directory for staging files.  Otherwise, write backups
      * directly to the destination directory. */
     if (backup_script != "") {
-        tmp_dir = tmp_dir + "/lbs." + generate_uuid();
+        tmp_dir = tmp_dir + "/cumulus." + generate_uuid();
         if (mkdir(tmp_dir.c_str(), 0700) < 0) {
             fprintf(stderr, "Cannot create temporary directory %s: %m\n",
                     tmp_dir.c_str());
@@ -863,14 +863,14 @@ int main(int argc, char *argv[])
         checksum_filename += backup_scheme + "-";
     checksum_filename = checksum_filename + desc_buf + "." + csum_type + "sums";
     RemoteFile *checksum_file = remote->alloc_file(checksum_filename,
-                                                   "checksums");
+                                                   "meta");
     FILE *checksums = fdopen(checksum_file->get_fd(), "w");
 
     std::set<string> segment_list = db->GetUsedSegments();
     for (std::set<string>::iterator i = segment_list.begin();
          i != segment_list.end(); ++i) {
         string seg_path, seg_csum;
-        if (db->GetSegmentChecksum(*i, &seg_path, &seg_csum)) {
+        if (db->GetSegmentMetadata(*i, &seg_path, &seg_csum)) {
             const char *raw_checksum = NULL;
             if (strncmp(seg_csum.c_str(), csum_type,
                         strlen(csum_type)) == 0) {
@@ -912,7 +912,7 @@ int main(int argc, char *argv[])
     string desc_filename = "snapshot-";
     if (backup_scheme.size() > 0)
         desc_filename += backup_scheme + "-";
-    desc_filename = desc_filename + desc_buf + ".lbs";
+    desc_filename = desc_filename + desc_buf + ".cumulus";
 
     RemoteFile *descriptor_file = remote->alloc_file(desc_filename,
                                                      "snapshots");
index d86d5ed..d8b6814 100644 (file)
@@ -146,7 +146,7 @@ class LowlevelDataStore:
         """Return a file-like object for reading data from the given file."""
 
         (type, filename) = self._classify(filename)
-        return self.store.get(type, filename)
+        return self.store.get(type + "/" + filename)
 
     def lowlevel_stat(self, filename):
         """Return a dictionary of information about the given file.
@@ -156,7 +156,7 @@ class LowlevelDataStore:
         """
 
         (type, filename) = self._classify(filename)
-        return self.store.stat(type, filename)
+        return self.store.stat(type + "/" + filename)
 
     # Slightly higher-level list methods.
     def list_snapshots(self):
@@ -248,7 +248,7 @@ class ObjectStore:
                 yield (path[1], data_obj.read())
 
     def load_snapshot(self, snapshot):
-        file = self.store.lowlevel_open("snapshot-" + snapshot + ".lbs")
+        file = self.store.lowlevel_open("snapshot-" + snapshot + ".cumulus")
         return file.read().splitlines(True)
 
     def extract_segment(self, segment):
index 1e7d41f..3b54cbb 100644 (file)
@@ -21,7 +21,7 @@ import exceptions, re, urlparse
 type_patterns = {
     'checksums': re.compile(r"^snapshot-(.*)\.(\w+)sums$"),
     'segments': re.compile(r"^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})(\.\S+)?$"),
-    'snapshots': re.compile(r"^snapshot-(.*)\.lbs$")
+    'snapshots': re.compile(r"^snapshot-(.*)\.(cumulus|lbs)$")
 }
 
 class NotFoundError(exceptions.KeyError):
index 0998448..3d536bf 100644 (file)
@@ -30,17 +30,17 @@ class FileStore(cumulus.store.Store):
         self.prefix = self.path.rstrip("/")
 
     def _get_path(self, type, name):
-        return "%s/%s" % (self.prefix, name)
+        return os.path.join(self.prefix, type, name)
 
-    def list(self, type):
-        files = os.listdir(self.prefix)
-        return (f for f in files if type_patterns[type].match(f))
+    def list(self, subdir):
+        return os.listdir(os.path.join(self.prefix, subdir))
 
-    def get(self, type, name):
-        k = self._get_path(type, name)
-        return open(k, 'rb')
+    def get(self, path):
+        return open(os.path.join(self.prefix, path), 'rb')
 
-    def put(self, type, name, fp):
+    def put(self, path, fp):
+        # TODO: Implement
+        raise NotImplementedError
         k = self._get_path(type, name)
         out = open(k, 'wb')
         buf = fp.read(4096)
@@ -48,15 +48,14 @@ class FileStore(cumulus.store.Store):
             out.write(buf)
             buf = fp.read(4096)
 
-    def delete(self, type, name):
-        k = self._get_path(type, name)
-        os.unlink(k)
+    def delete(self, path):
+        os.unlink(os.path.join(self.prefix, path))
 
-    def stat(self, type, name):
+    def stat(self, path):
         try:
-            stat = os.stat(self._get_path(type, name))
+            stat = os.stat(os.path.join(self.prefix, path))
             return {'size': stat.st_size}
         except OSError:
-            raise cumulus.store.NotFoundError, (type, name)
+            raise cumulus.store.NotFoundError, path
 
 Store = FileStore
index 8f2df90..e8e046e 100644 (file)
--- a/remote.cc
+++ b/remote.cc
@@ -26,6 +26,7 @@
  * scripts that are called when a file is to be transferred. */
 
 #include <assert.h>
+#include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 
 using std::string;
 
+static const char *backup_directories[] = {
+    "meta",
+    "segments0",
+    "segments1",
+    "snapshots",
+    NULL
+};
+
 RemoteStore::RemoteStore(const string &stagedir, const string &script)
 {
     staging_dir = stagedir;
     backup_script = script;
 
+    /* Ensure all necessary directories exist for each type of backup file. */
+    for (size_t i = 0; backup_directories[i]; i++) {
+        string path = stagedir + "/" + backup_directories[i];
+        if (mkdir(path.c_str(), 0777) < 0) {
+            /* Ignore errors for already-existing directories. */
+            if (errno != EEXIST) {
+                fprintf(stderr,
+                        "Warning: Cannot create backup directory %s: %m!",
+                        path.c_str());
+            }
+        }
+    }
+
     /* A background thread is created for each RemoteStore to manage the actual
      * transfers to a remote server.  The main program thread can enqueue
      * RemoteFile objects to be transferred asynchronously. */
@@ -93,7 +115,8 @@ RemoteFile *RemoteStore::alloc_file(const string &name, const string &type)
     pthread_mutex_lock(&lock);
     files_outstanding++;
     pthread_mutex_unlock(&lock);
-    return new RemoteFile(this, name, type, staging_dir + "/" + name);
+    return new RemoteFile(this, name, type,
+                          staging_dir + "/" + type + "/" + name);
 }
 
 /* Request that a file be transferred to the remote server.  The actual
@@ -250,7 +273,7 @@ RemoteFile::RemoteFile(RemoteStore *remote,
     remote_store = remote;
     this->type = type;
     this->local_path = local_path;
-    this->remote_path = name;
+    this->remote_path = type + "/" + name;
 
     fd = open(local_path.c_str(), O_WRONLY | O_CREAT, 0666);
     if (fd < 0)
index d898272..465dde2 100644 (file)
@@ -28,12 +28,12 @@ create table snapshots (
 create table segments (
     segmentid integer primary key,
     segment text unique not null,
+    mtime real,                 -- timestamp when segment was created
     path text,
     checksum text,
-    mtime real,
     data_size integer,          -- sum of bytes in all objects in the segment
-    disk_size integer           -- size of segment on disk, after compression
-    -- TODO: group? metadata vs. non-metadata?
+    disk_size integer,          -- size of segment on disk, after compression
+    type text
 );
 
 -- Index of all data blocks in stored segments.  This is indexed by content
index 772f9c5..115529f 100644 (file)
--- a/store.cc
+++ b/store.cc
@@ -244,7 +244,9 @@ ObjectReference TarSegmentStore::write_object(const char *data, size_t len,
         segment->basename += filter_extension;
         segment->count = 0;
         segment->data_size = 0;
-        segment->rf = remote->alloc_file(segment->basename, "segments");
+        segment->rf = remote->alloc_file(segment->basename,
+                                         group == "metadata" ? "segments0"
+                                                             : "segments1");
         segment->file = new Tarfile(segment->rf, segment->name);
 
         segments[group] = segment;
@@ -313,8 +315,8 @@ void TarSegmentStore::close_segment(const string &group)
             checksum = segment_checksum.checksum_str();
         }
 
-        db->SetSegmentChecksum(segment->name, segment->basename, checksum,
-                               segment->data_size, disk_size);
+        db->SetSegmentMetadata(segment->name, segment->basename, checksum,
+                               group, segment->data_size, disk_size);
     }
 
     segment->rf->send();
index 2ee419e..5882581 100755 (executable)
@@ -35,7 +35,7 @@ def stat_file(path):
     if include_mode:
         metadata.append("mode=%o" % st.st_mode)
     if include_mtime:
-        metadata.append("size=%d" % st.st_mtime)
+        metadata.append("mtime=%d" % st.st_mtime)
     if stat.S_ISREG(st.st_mode):
         digest = hashlib.sha256()
         BUF_SIZE = 1 << 16
index 624a0bd..7d3ec0c 100755 (executable)
@@ -61,6 +61,7 @@ mkdir "$BACKUP_DIR"
 log_action "Modifying files..."
 rm "$TREE/"*.h
 cp -a "$BIN_DIR/third_party" "$TREE"
+"$TEST_DIR"/digest_tree "$TREE" >"$TMP_DIR/digest.2"
 
 log_action "Running second backup..."
 sleep 5
@@ -68,3 +69,15 @@ BACKUP_DIR="$TMP_DIR/backups"
 mkdir "$BACKUP_DIR"
 "$BIN_DIR"/cumulus --dest="$BACKUP_DIR" --localdb="$LOCALDB" \
     --scheme=test -v "$TREE"
+
+log_action "Restoring snapshots"
+export LBS_GPG_PASSPHRASE=""
+snapshots=$("$BIN_DIR"/cumulus-util --store="$BACKUP_DIR" list-snapshots)
+echo "Available snapshots:" $snapshots
+i=0
+for s in $snapshots; do
+    i=$((i + 1))
+    dest="$TMP_DIR/restore-$i"
+    mkdir -p "$dest"
+    "$BIN_DIR"/cumulus-util --store="$BACKUP_DIR" restore-snapshot $s "$dest"
+done