From fbe7425ae37564a99eb49133561eea5f1a6c7877 Mon Sep 17 00:00:00 2001 From: Michael Vrable Date: Thu, 13 Dec 2012 20:34:21 -0800 Subject: [PATCH] Changes to the Cumulus backup format and tools. - Switch to a hierarchical file layout. - Remove old references to the "LBS" name. --- doc/format.txt | 30 ++++++++++++++++++++++++++++-- localdb.cc | 26 +++++++++----------------- localdb.h | 7 ++++--- main.cc | 8 ++++---- python/cumulus/__init__.py | 6 +++--- python/cumulus/store/__init__.py | 2 +- python/cumulus/store/file.py | 27 +++++++++++++-------------- remote.cc | 27 +++++++++++++++++++++++++-- schema.sql | 6 +++--- store.cc | 8 +++++--- tests/digest_tree | 2 +- tests/run-test | 13 +++++++++++++ 12 files changed, 109 insertions(+), 53 deletions(-) diff --git a/doc/format.txt b/doc/format.txt index 582ef59..19bf2f2 100644 --- a/doc/format.txt +++ b/doc/format.txt @@ -19,6 +19,30 @@ This document does not explain the rationale behind the format; for that, see design.txt. +BACKUP REPOSITORY LAYOUT +======================== + +Cumulus backups are stored using a relatively simple layout. Data files +described below are written into one of several directories on the +backup server, depending on their purpose: + snapshots/ + Snapshot descriptor files, which quickly summarize each backup + snapshot stored. + segments0/ + segments1/ + Storage of the bulk of the backup data, in compressed/encrypted + form. Technically any segment could be stored in either + directory (both directories will be searched when looking for a + segment). However, data in segments0 might be faster to access + (but more expensive) depending on the storage backend. The + intent is that segments0 can store filesystem tree metadata and + segments1 can store file contents. + meta/ + Snapshot-specific metadata that is not core to the backup. This + can include checksums of segments, some data for rebuilding + local database contents, etc. + + DATA CHECKSUMS ============== @@ -71,6 +95,8 @@ fixed points; an example UUID is This segment could be stored in the filesystem as a file a704eeae-97f2-4f30-91a4-d4473956366b.tar The UUID used to name a segment is assigned when the segment is created. +These files are stored in either the segments0 or segments1 directories +on the backup server. Filters can be layered on top of the segment storage to provide compression, encryption, or other features. For example, the example @@ -101,8 +127,8 @@ object. NOTE: When naming an object, the segment portion consists of the UUID only. Any extensions appended to the segment when storing it as a file -in the filesystem (for example, .tar.bz2) are _not_ part of the name of -the object. +in the filesystem (for example, .tar.bz2) and path information (for +example, segments0) are _not_ part of the name of the object. There are two additional components which may appear in an object name; both are optional. diff --git a/localdb.cc b/localdb.cc index ca83559..7cabcb1 100644 --- a/localdb.cc +++ b/localdb.cc @@ -283,17 +283,6 @@ void LocalDb::StoreObject(const ObjectReference& ref, double age) } sqlite3_finalize(stmt); - - if (age != 0.0) { - stmt = Prepare("update segments " - "set mtime = coalesce(max(mtime, ?), ?) " - "where segmentid = ?"); - sqlite3_bind_double(stmt, 1, age); - sqlite3_bind_double(stmt, 2, age); - sqlite3_bind_int64(stmt, 3, SegmentToId(ref.get_segment())); - rc = sqlite3_step(stmt); - sqlite3_finalize(stmt); - } } ObjectReference LocalDb::FindObject(const string &checksum, int64_t size) @@ -495,25 +484,28 @@ void LocalDb::UseObject(const ObjectReference& ref) } } -void LocalDb::SetSegmentChecksum(const std::string &segment, +void LocalDb::SetSegmentMetadata(const std::string &segment, const std::string &path, const std::string &checksum, + const std::string &type, int data_size, int disk_size) { int rc; sqlite3_stmt *stmt; stmt = Prepare("update segments set path = ?, checksum = ?, " - "data_size = ?, disk_size = ?, " + "type = ?, data_size = ?, disk_size = ?, " "mtime = coalesce(mtime, julianday('now')) " "where segmentid = ?"); sqlite3_bind_text(stmt, 1, path.c_str(), path.size(), SQLITE_TRANSIENT); sqlite3_bind_text(stmt, 2, checksum.c_str(), checksum.size(), SQLITE_TRANSIENT); - sqlite3_bind_int64(stmt, 3, data_size); - sqlite3_bind_int64(stmt, 4, disk_size); - sqlite3_bind_int64(stmt, 5, SegmentToId(segment)); + sqlite3_bind_text(stmt, 3, type.c_str(), type.size(), + SQLITE_TRANSIENT); + sqlite3_bind_int64(stmt, 4, data_size); + sqlite3_bind_int64(stmt, 5, disk_size); + sqlite3_bind_int64(stmt, 6, SegmentToId(segment)); rc = sqlite3_step(stmt); if (rc != SQLITE_DONE) { @@ -524,7 +516,7 @@ void LocalDb::SetSegmentChecksum(const std::string &segment, sqlite3_finalize(stmt); } -bool LocalDb::GetSegmentChecksum(const string &segment, +bool LocalDb::GetSegmentMetadata(const string &segment, string *seg_path, string *seg_checksum) { diff --git a/localdb.h b/localdb.h index 7764aed..6d2190d 100644 --- a/localdb.h +++ b/localdb.h @@ -49,10 +49,11 @@ public: void UseObject(const ObjectReference& ref); std::set GetUsedSegments(); - void SetSegmentChecksum(const std::string &segment, const std::string &path, + void SetSegmentMetadata(const std::string &segment, const std::string &path, const std::string &checksum, - int data_size, int disk_size); - bool GetSegmentChecksum(const std::string &segment, + const std::string &type, int data_size, + int disk_size); + bool GetSegmentMetadata(const std::string &segment, std::string *seg_path, std::string *seg_checksum); bool LoadChunkSignatures(ObjectReference ref, diff --git a/main.cc b/main.cc index e10a04a..fa8e4c9 100644 --- a/main.cc +++ b/main.cc @@ -806,7 +806,7 @@ int main(int argc, char *argv[]) * a temporary directory for staging files. Otherwise, write backups * directly to the destination directory. */ if (backup_script != "") { - tmp_dir = tmp_dir + "/lbs." + generate_uuid(); + tmp_dir = tmp_dir + "/cumulus." + generate_uuid(); if (mkdir(tmp_dir.c_str(), 0700) < 0) { fprintf(stderr, "Cannot create temporary directory %s: %m\n", tmp_dir.c_str()); @@ -863,14 +863,14 @@ int main(int argc, char *argv[]) checksum_filename += backup_scheme + "-"; checksum_filename = checksum_filename + desc_buf + "." + csum_type + "sums"; RemoteFile *checksum_file = remote->alloc_file(checksum_filename, - "checksums"); + "meta"); FILE *checksums = fdopen(checksum_file->get_fd(), "w"); std::set segment_list = db->GetUsedSegments(); for (std::set::iterator i = segment_list.begin(); i != segment_list.end(); ++i) { string seg_path, seg_csum; - if (db->GetSegmentChecksum(*i, &seg_path, &seg_csum)) { + if (db->GetSegmentMetadata(*i, &seg_path, &seg_csum)) { const char *raw_checksum = NULL; if (strncmp(seg_csum.c_str(), csum_type, strlen(csum_type)) == 0) { @@ -912,7 +912,7 @@ int main(int argc, char *argv[]) string desc_filename = "snapshot-"; if (backup_scheme.size() > 0) desc_filename += backup_scheme + "-"; - desc_filename = desc_filename + desc_buf + ".lbs"; + desc_filename = desc_filename + desc_buf + ".cumulus"; RemoteFile *descriptor_file = remote->alloc_file(desc_filename, "snapshots"); diff --git a/python/cumulus/__init__.py b/python/cumulus/__init__.py index d86d5ed..d8b6814 100644 --- a/python/cumulus/__init__.py +++ b/python/cumulus/__init__.py @@ -146,7 +146,7 @@ class LowlevelDataStore: """Return a file-like object for reading data from the given file.""" (type, filename) = self._classify(filename) - return self.store.get(type, filename) + return self.store.get(type + "/" + filename) def lowlevel_stat(self, filename): """Return a dictionary of information about the given file. @@ -156,7 +156,7 @@ class LowlevelDataStore: """ (type, filename) = self._classify(filename) - return self.store.stat(type, filename) + return self.store.stat(type + "/" + filename) # Slightly higher-level list methods. def list_snapshots(self): @@ -248,7 +248,7 @@ class ObjectStore: yield (path[1], data_obj.read()) def load_snapshot(self, snapshot): - file = self.store.lowlevel_open("snapshot-" + snapshot + ".lbs") + file = self.store.lowlevel_open("snapshot-" + snapshot + ".cumulus") return file.read().splitlines(True) def extract_segment(self, segment): diff --git a/python/cumulus/store/__init__.py b/python/cumulus/store/__init__.py index 1e7d41f..3b54cbb 100644 --- a/python/cumulus/store/__init__.py +++ b/python/cumulus/store/__init__.py @@ -21,7 +21,7 @@ import exceptions, re, urlparse type_patterns = { 'checksums': re.compile(r"^snapshot-(.*)\.(\w+)sums$"), 'segments': re.compile(r"^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})(\.\S+)?$"), - 'snapshots': re.compile(r"^snapshot-(.*)\.lbs$") + 'snapshots': re.compile(r"^snapshot-(.*)\.(cumulus|lbs)$") } class NotFoundError(exceptions.KeyError): diff --git a/python/cumulus/store/file.py b/python/cumulus/store/file.py index 0998448..3d536bf 100644 --- a/python/cumulus/store/file.py +++ b/python/cumulus/store/file.py @@ -30,17 +30,17 @@ class FileStore(cumulus.store.Store): self.prefix = self.path.rstrip("/") def _get_path(self, type, name): - return "%s/%s" % (self.prefix, name) + return os.path.join(self.prefix, type, name) - def list(self, type): - files = os.listdir(self.prefix) - return (f for f in files if type_patterns[type].match(f)) + def list(self, subdir): + return os.listdir(os.path.join(self.prefix, subdir)) - def get(self, type, name): - k = self._get_path(type, name) - return open(k, 'rb') + def get(self, path): + return open(os.path.join(self.prefix, path), 'rb') - def put(self, type, name, fp): + def put(self, path, fp): + # TODO: Implement + raise NotImplementedError k = self._get_path(type, name) out = open(k, 'wb') buf = fp.read(4096) @@ -48,15 +48,14 @@ class FileStore(cumulus.store.Store): out.write(buf) buf = fp.read(4096) - def delete(self, type, name): - k = self._get_path(type, name) - os.unlink(k) + def delete(self, path): + os.unlink(os.path.join(self.prefix, path)) - def stat(self, type, name): + def stat(self, path): try: - stat = os.stat(self._get_path(type, name)) + stat = os.stat(os.path.join(self.prefix, path)) return {'size': stat.st_size} except OSError: - raise cumulus.store.NotFoundError, (type, name) + raise cumulus.store.NotFoundError, path Store = FileStore diff --git a/remote.cc b/remote.cc index 8f2df90..e8e046e 100644 --- a/remote.cc +++ b/remote.cc @@ -26,6 +26,7 @@ * scripts that are called when a file is to be transferred. */ #include +#include #include #include #include @@ -44,11 +45,32 @@ using std::string; +static const char *backup_directories[] = { + "meta", + "segments0", + "segments1", + "snapshots", + NULL +}; + RemoteStore::RemoteStore(const string &stagedir, const string &script) { staging_dir = stagedir; backup_script = script; + /* Ensure all necessary directories exist for each type of backup file. */ + for (size_t i = 0; backup_directories[i]; i++) { + string path = stagedir + "/" + backup_directories[i]; + if (mkdir(path.c_str(), 0777) < 0) { + /* Ignore errors for already-existing directories. */ + if (errno != EEXIST) { + fprintf(stderr, + "Warning: Cannot create backup directory %s: %m!", + path.c_str()); + } + } + } + /* A background thread is created for each RemoteStore to manage the actual * transfers to a remote server. The main program thread can enqueue * RemoteFile objects to be transferred asynchronously. */ @@ -93,7 +115,8 @@ RemoteFile *RemoteStore::alloc_file(const string &name, const string &type) pthread_mutex_lock(&lock); files_outstanding++; pthread_mutex_unlock(&lock); - return new RemoteFile(this, name, type, staging_dir + "/" + name); + return new RemoteFile(this, name, type, + staging_dir + "/" + type + "/" + name); } /* Request that a file be transferred to the remote server. The actual @@ -250,7 +273,7 @@ RemoteFile::RemoteFile(RemoteStore *remote, remote_store = remote; this->type = type; this->local_path = local_path; - this->remote_path = name; + this->remote_path = type + "/" + name; fd = open(local_path.c_str(), O_WRONLY | O_CREAT, 0666); if (fd < 0) diff --git a/schema.sql b/schema.sql index d898272..465dde2 100644 --- a/schema.sql +++ b/schema.sql @@ -28,12 +28,12 @@ create table snapshots ( create table segments ( segmentid integer primary key, segment text unique not null, + mtime real, -- timestamp when segment was created path text, checksum text, - mtime real, data_size integer, -- sum of bytes in all objects in the segment - disk_size integer -- size of segment on disk, after compression - -- TODO: group? metadata vs. non-metadata? + disk_size integer, -- size of segment on disk, after compression + type text ); -- Index of all data blocks in stored segments. This is indexed by content diff --git a/store.cc b/store.cc index 772f9c5..115529f 100644 --- a/store.cc +++ b/store.cc @@ -244,7 +244,9 @@ ObjectReference TarSegmentStore::write_object(const char *data, size_t len, segment->basename += filter_extension; segment->count = 0; segment->data_size = 0; - segment->rf = remote->alloc_file(segment->basename, "segments"); + segment->rf = remote->alloc_file(segment->basename, + group == "metadata" ? "segments0" + : "segments1"); segment->file = new Tarfile(segment->rf, segment->name); segments[group] = segment; @@ -313,8 +315,8 @@ void TarSegmentStore::close_segment(const string &group) checksum = segment_checksum.checksum_str(); } - db->SetSegmentChecksum(segment->name, segment->basename, checksum, - segment->data_size, disk_size); + db->SetSegmentMetadata(segment->name, segment->basename, checksum, + group, segment->data_size, disk_size); } segment->rf->send(); diff --git a/tests/digest_tree b/tests/digest_tree index 2ee419e..5882581 100755 --- a/tests/digest_tree +++ b/tests/digest_tree @@ -35,7 +35,7 @@ def stat_file(path): if include_mode: metadata.append("mode=%o" % st.st_mode) if include_mtime: - metadata.append("size=%d" % st.st_mtime) + metadata.append("mtime=%d" % st.st_mtime) if stat.S_ISREG(st.st_mode): digest = hashlib.sha256() BUF_SIZE = 1 << 16 diff --git a/tests/run-test b/tests/run-test index 624a0bd..7d3ec0c 100755 --- a/tests/run-test +++ b/tests/run-test @@ -61,6 +61,7 @@ mkdir "$BACKUP_DIR" log_action "Modifying files..." rm "$TREE/"*.h cp -a "$BIN_DIR/third_party" "$TREE" +"$TEST_DIR"/digest_tree "$TREE" >"$TMP_DIR/digest.2" log_action "Running second backup..." sleep 5 @@ -68,3 +69,15 @@ BACKUP_DIR="$TMP_DIR/backups" mkdir "$BACKUP_DIR" "$BIN_DIR"/cumulus --dest="$BACKUP_DIR" --localdb="$LOCALDB" \ --scheme=test -v "$TREE" + +log_action "Restoring snapshots" +export LBS_GPG_PASSPHRASE="" +snapshots=$("$BIN_DIR"/cumulus-util --store="$BACKUP_DIR" list-snapshots) +echo "Available snapshots:" $snapshots +i=0 +for s in $snapshots; do + i=$((i + 1)) + dest="$TMP_DIR/restore-$i" + mkdir -p "$dest" + "$BIN_DIR"/cumulus-util --store="$BACKUP_DIR" restore-snapshot $s "$dest" +done -- 2.20.1