#include "localdb.h"
#include "store.h"
#include "sha1.h"
+#include "statcache.h"
using std::list;
using std::string;
* invocations to help in creating incremental snapshots. */
LocalDb *db;
+/* Stat cache, which stores data locally to speed the backup process by quickly
+ * skipping files which have not changed. */
+StatCache *statcache;
+
/* Contents of the root object. This will contain a set of indirect links to
* the metadata objects. */
std::ostringstream metadata_root;
/* Read the contents of a file (specified by an open file descriptor) and copy
* the data to the store. Returns the size of the file (number of bytes
* dumped), or -1 on error. */
-int64_t dumpfile(int fd, dictionary &file_info)
+int64_t dumpfile(int fd, dictionary &file_info, const string &path)
{
struct stat stat_buf;
fstat(fd, &stat_buf);
return -1;
}
- /* The index data consists of a sequence of pointers to the data blocks
- * that actually comprise the file data. This level of indirection is used
- * so that the same data block can be used in multiple files, or multiple
- * versions of the same file. */
- SHA1Checksum hash;
- while (true) {
- size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
- if (bytes == 0)
- break;
+ /* Look up this file in the old stat cache, if we can. If the stat
+ * information indicates that the file has not changed, do not bother
+ * re-reading the entire contents. */
+ bool cached = false;
+
+ if (statcache->Find(path, &stat_buf)) {
+ cached = true;
+ const list<ObjectReference> &blocks = statcache->get_blocks();
+
+ /* If any of the blocks in the object have been expired, then we should
+ * fall back to fully reading in the file. */
+ for (list<ObjectReference>::const_iterator i = blocks.begin();
+ i != blocks.end(); ++i) {
+ const ObjectReference &ref = *i;
+ if (!db->IsAvailable(ref)) {
+ cached = false;
+ break;
+ }
+ }
+
+ /* If everything looks okay, use the cached information */
+ if (cached) {
+ file_info["checksum"] = statcache->get_checksum();
+ for (list<ObjectReference>::const_iterator i = blocks.begin();
+ i != blocks.end(); ++i) {
+ const ObjectReference &ref = *i;
+ object_list.push_back(ref.to_string());
+ segment_list.insert(ref.get_segment());
+ db->UseObject(ref);
+ }
+ size = stat_buf.st_size;
+ }
+ }
+
+ /* If the file is new or changed, we must read in the contents a block at a
+ * time. */
+ if (!cached) {
+ printf(" [new]\n");
- hash.process(block_buf, bytes);
-
- // Either find a copy of this block in an already-existing segment, or
- // index it so it can be re-used in the future
- SHA1Checksum block_hash;
- block_hash.process(block_buf, bytes);
- string block_csum = block_hash.checksum_str();
- ObjectReference ref = db->FindObject(block_csum, bytes);
-
- // Store a copy of the object if one does not yet exist
- if (ref.get_segment().size() == 0) {
- LbsObject *o = new LbsObject;
-
- /* We might still have seen this checksum before, if the object was
- * stored at some time in the past, but we have decided to clean
- * the segment the object was originally stored in (FindObject will
- * not return such objects). When rewriting the object contents,
- * put it in a separate group, so that old objects get grouped
- * together. The hope is that these old objects will continue to
- * be used in the future, and we obtain segments which will
- * continue to be well-utilized. */
- if (db->IsOldObject(block_csum, bytes))
- o->set_group("compacted");
- else
- o->set_group("data");
-
- o->set_data(block_buf, bytes);
- o->write(tss);
- ref = o->get_ref();
- db->StoreObject(ref, block_csum, bytes);
- delete o;
+ SHA1Checksum hash;
+ while (true) {
+ size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
+ if (bytes == 0)
+ break;
+
+ hash.process(block_buf, bytes);
+
+ // Either find a copy of this block in an already-existing segment,
+ // or index it so it can be re-used in the future
+ double block_age = 0.0;
+ SHA1Checksum block_hash;
+ block_hash.process(block_buf, bytes);
+ string block_csum = block_hash.checksum_str();
+ ObjectReference ref = db->FindObject(block_csum, bytes);
+
+ // Store a copy of the object if one does not yet exist
+ if (ref.get_segment().size() == 0) {
+ LbsObject *o = new LbsObject;
+
+ /* We might still have seen this checksum before, if the object
+ * was stored at some time in the past, but we have decided to
+ * clean the segment the object was originally stored in
+ * (FindObject will not return such objects). When rewriting
+ * the object contents, put it in a separate group, so that old
+ * objects get grouped together. The hope is that these old
+ * objects will continue to be used in the future, and we
+ * obtain segments which will continue to be well-utilized.
+ * Additionally, keep track of the age of the data by looking
+ * up the age of the block which was expired and using that
+ * instead of the current time. */
+ if (db->IsOldObject(block_csum, bytes, &block_age))
+ o->set_group("compacted");
+ else
+ o->set_group("data");
+
+ o->set_data(block_buf, bytes);
+ o->write(tss);
+ ref = o->get_ref();
+ db->StoreObject(ref, block_csum, bytes, block_age);
+ delete o;
+ }
+
+ object_list.push_back(ref.to_string());
+ segment_list.insert(ref.get_segment());
+ db->UseObject(ref);
+ size += bytes;
}
- object_list.push_back(ref.to_string());
- segment_list.insert(ref.get_segment());
- db->UseObject(ref);
- size += bytes;
+ file_info["checksum"] = hash.checksum_str();
}
- file_info["checksum"] = hash.checksum_str();
+ statcache->Save(path, &stat_buf, file_info["checksum"], object_list);
/* For files that only need to be broken apart into a few objects, store
* the list of objects directly. For larger files, store the data
flags = fcntl(fd, F_GETFL);
fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
- file_size = dumpfile(fd, file_info);
+ file_size = dumpfile(fd, file_info, path);
file_info["size"] = encode_int(file_size);
close(fd);
while (1) {
static struct option long_options[] = {
- {"localdb", 1, 0, 0}, // 0
- {"exclude", 1, 0, 0}, // 1
+ {"localdb", 1, 0, 0}, // 0
+ {"exclude", 1, 0, 0}, // 1
+ {"filter", 1, 0, 0}, // 2
+ {"filter-extension", 1, 0, 0}, // 3
{NULL, 0, 0, 0},
};
case 1: // --exclude
excludes.push_back(optarg);
break;
+ case 2: // --filter
+ filter_program = optarg;
+ break;
+ case 3: // --filter-extension
+ filter_extension = optarg;
+ break;
default:
fprintf(stderr, "Unhandled long option!\n");
return 1;
localdb_dir = backup_dest;
}
- printf("Source: %s, Dest: %s\n",
- backup_source.c_str(), backup_dest.c_str());
+ printf("Source: %s\nDest: %s\nDatabase: %s\n\n",
+ backup_source.c_str(), backup_dest.c_str(), localdb_dir.c_str());
tss = new TarSegmentStore(backup_dest);
block_buf = new char[LBS_BLOCK_SIZE];
- /* Write a backup descriptor file, which says which segments are needed and
- * where to start to restore this snapshot. The filename is based on the
- * current time. */
+ /* Store the time when the backup started, so it can be included in the
+ * snapshot name. */
time_t now;
struct tm time_buf;
char desc_buf[256];
time(&now);
localtime_r(&now, &time_buf);
strftime(desc_buf, sizeof(desc_buf), "%Y%m%dT%H%M%S", &time_buf);
- string desc_filename = backup_dest + "/" + desc_buf + ".lbs";
- std::ofstream descriptor(desc_filename.c_str());
/* Open the local database which tracks all objects that are stored
* remotely, for efficient incrementals. Provide it with the name of this
* snapshot. */
- string database_path = backup_dest + "/localdb.sqlite";
+ string database_path = localdb_dir + "/localdb.sqlite";
db = new LocalDb;
db->Open(database_path.c_str(), desc_buf);
+ /* Initialize the stat cache, for skipping over unchanged files. */
+ statcache = new StatCache;
+ statcache->Open(localdb_dir.c_str(), desc_buf);
+
try {
scanfile(".");
} catch (IOException e) {
root->set_data(md.data(), md.size());
root->write(tss);
root->checksum();
-
segment_list.insert(root->get_ref().get_segment());
+
+ string backup_root = root->get_ref().to_string();
+ delete root;
+
+ db->Close();
+
+ statcache->Close();
+ delete statcache;
+
+ tss->sync();
+ tss->dump_stats();
+ delete tss;
+
+ /* Write a backup descriptor file, which says which segments are needed and
+ * where to start to restore this snapshot. The filename is based on the
+ * current time. */
+ string desc_filename = backup_dest + "/snapshot-" + desc_buf + ".lbs";
+ std::ofstream descriptor(desc_filename.c_str());
+
descriptor << "Format: LBS Snapshot v0.1\n";
- descriptor << "Root: " << root->get_ref().to_string() << "\n";
strftime(desc_buf, sizeof(desc_buf), "%Y-%m-%d %H:%M:%S %z", &time_buf);
descriptor << "Date: " << desc_buf << "\n";
-
- delete root;
+ descriptor << "Root: " << backup_root << "\n";
descriptor << "Segments:\n";
for (std::set<string>::iterator i = segment_list.begin();
descriptor << " " << *i << "\n";
}
- db->Close();
-
- tss->sync();
- delete tss;
-
return 0;
}