return buf;
}
+/* Return the string representation of an integer. */
+long long parse_int(const string &s)
+{
+ return strtoll(s.c_str(), NULL, 10);
+}
+
/* Output a dictionary of string key/value pairs to the given output stream.
* The format is a sequence of lines of the form "key: value". */
void dict_output(ostream &o, map<string, string> dict)
std::string encode_int(long long n);
void dict_output(std::ostream &o, std::map<std::string, std::string> dict);
+long long parse_int(const std::string &s);
+
#endif // _LBS_TARSTORE_H
return found;
}
+/* Does this object still exist in the database (and not expired)? */
+bool LocalDb::IsAvailable(const ObjectReference &ref)
+{
+ int rc;
+ sqlite3_stmt *stmt;
+ bool found = false;
+
+ stmt = Prepare("select count(*) from block_index "
+ "where segmentid = ? and object = ? and expired is null");
+ sqlite3_bind_int64(stmt, 1, SegmentToId(ref.get_segment()));
+ sqlite3_bind_text(stmt, 2, ref.get_sequence().c_str(),
+ ref.get_sequence().size(), SQLITE_TRANSIENT);
+
+ rc = sqlite3_step(stmt);
+ if (rc == SQLITE_DONE) {
+ found = false;
+ } else if (rc == SQLITE_ROW) {
+ if (sqlite3_column_int(stmt, 0) > 0)
+ found = true;
+ } else {
+ fprintf(stderr, "Could not execute SELECT statement!\n");
+ }
+
+ sqlite3_finalize(stmt);
+
+ return found;
+}
+
void LocalDb::UseObject(const ObjectReference& ref)
{
int rc;
const std::string &checksum, int64_t size, double age);
ObjectReference FindObject(const std::string &checksum, int64_t size);
bool IsOldObject(const std::string &checksum, int64_t size, double *age);
+ bool IsAvailable(const ObjectReference &ref);
void UseObject(const ObjectReference& ref);
private:
sqlite3 *db;
/* Parse a string object reference and return a pointer to a new
* ObjectReference. The caller is responsible for freeing the object. NULL is
* returned if there is an error in the syntax. */
-ObjectReference *ObjectReference::parse(const std::string& s)
+ObjectReference *ObjectReference::parse(const std::string& str)
{
- // TODO: Implement
- return NULL;
+ const char *s = str.c_str();
+ const char *t;
+
+ // Segment
+ t = strchr(s, '/');
+ if (t == NULL)
+ return NULL;
+ string segment(s, t - s);
+
+ // Object sequence number
+ s = t + 1;
+ string object(s);
+
+ // TODO: Checksums, ranges.
+
+ return new ObjectReference(segment, object);
}
return -1;
}
- /* The index data consists of a sequence of pointers to the data blocks
- * that actually comprise the file data. This level of indirection is used
- * so that the same data block can be used in multiple files, or multiple
- * versions of the same file. */
- SHA1Checksum hash;
- while (true) {
- size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
- if (bytes == 0)
- break;
-
- hash.process(block_buf, bytes);
-
- // Either find a copy of this block in an already-existing segment, or
- // index it so it can be re-used in the future
- double block_age = 0.0;
- SHA1Checksum block_hash;
- block_hash.process(block_buf, bytes);
- string block_csum = block_hash.checksum_str();
- ObjectReference ref = db->FindObject(block_csum, bytes);
-
- // Store a copy of the object if one does not yet exist
- if (ref.get_segment().size() == 0) {
- LbsObject *o = new LbsObject;
-
- /* We might still have seen this checksum before, if the object was
- * stored at some time in the past, but we have decided to clean
- * the segment the object was originally stored in (FindObject will
- * not return such objects). When rewriting the object contents,
- * put it in a separate group, so that old objects get grouped
- * together. The hope is that these old objects will continue to
- * be used in the future, and we obtain segments which will
- * continue to be well-utilized. Additionally, keep track of the
- * age of the data by looking up the age of the block which was
- * expired and using that instead of the current time. */
- if (db->IsOldObject(block_csum, bytes, &block_age))
- o->set_group("compacted");
- else
- o->set_group("data");
-
- o->set_data(block_buf, bytes);
- o->write(tss);
- ref = o->get_ref();
- db->StoreObject(ref, block_csum, bytes, block_age);
- delete o;
+ /* Look up this file in the old stat cache, if we can. If the stat
+ * information indicates that the file has not changed, do not bother
+ * re-reading the entire contents. */
+ bool cached = false;
+
+ if (statcache->Find(path, &stat_buf)) {
+ cached = true;
+ const list<ObjectReference> &blocks = statcache->get_blocks();
+
+ /* If any of the blocks in the object have been expired, then we should
+ * fall back to fully reading in the file. */
+ for (list<ObjectReference>::const_iterator i = blocks.begin();
+ i != blocks.end(); ++i) {
+ const ObjectReference &ref = *i;
+ if (!db->IsAvailable(ref)) {
+ cached = false;
+ break;
+ }
}
- object_list.push_back(ref.to_string());
- segment_list.insert(ref.get_segment());
- db->UseObject(ref);
- size += bytes;
+ /* If everything looks okay, use the cached information */
+ if (cached) {
+ file_info["checksum"] = statcache->get_checksum();
+ for (list<ObjectReference>::const_iterator i = blocks.begin();
+ i != blocks.end(); ++i) {
+ const ObjectReference &ref = *i;
+ object_list.push_back(ref.to_string());
+ segment_list.insert(ref.get_segment());
+ db->UseObject(ref);
+ }
+ size = stat_buf.st_size;
+ }
}
- file_info["checksum"] = hash.checksum_str();
+ /* If the file is new or changed, we must read in the contents a block at a
+ * time. */
+ if (!cached) {
+ printf(" [new]\n");
+
+ SHA1Checksum hash;
+ while (true) {
+ size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
+ if (bytes == 0)
+ break;
+
+ hash.process(block_buf, bytes);
+
+ // Either find a copy of this block in an already-existing segment,
+ // or index it so it can be re-used in the future
+ double block_age = 0.0;
+ SHA1Checksum block_hash;
+ block_hash.process(block_buf, bytes);
+ string block_csum = block_hash.checksum_str();
+ ObjectReference ref = db->FindObject(block_csum, bytes);
+
+ // Store a copy of the object if one does not yet exist
+ if (ref.get_segment().size() == 0) {
+ LbsObject *o = new LbsObject;
+
+ /* We might still have seen this checksum before, if the object
+ * was stored at some time in the past, but we have decided to
+ * clean the segment the object was originally stored in
+ * (FindObject will not return such objects). When rewriting
+ * the object contents, put it in a separate group, so that old
+ * objects get grouped together. The hope is that these old
+ * objects will continue to be used in the future, and we
+ * obtain segments which will continue to be well-utilized.
+ * Additionally, keep track of the age of the data by looking
+ * up the age of the block which was expired and using that
+ * instead of the current time. */
+ if (db->IsOldObject(block_csum, bytes, &block_age))
+ o->set_group("compacted");
+ else
+ o->set_group("data");
+
+ o->set_data(block_buf, bytes);
+ o->write(tss);
+ ref = o->get_ref();
+ db->StoreObject(ref, block_csum, bytes, block_age);
+ delete o;
+ }
+
+ object_list.push_back(ref.to_string());
+ segment_list.insert(ref.get_segment());
+ db->UseObject(ref);
+ size += bytes;
+ }
+
+ file_info["checksum"] = hash.checksum_str();
+ }
statcache->Save(path, &stat_buf, file_info["checksum"], object_list);
#include <assert.h>
#include <stdio.h>
#include <string.h>
+#include <ctype.h>
#include <fstream>
#include <iostream>
+#include <map>
#include <string>
#include "format.h"
+#include "ref.h"
#include "statcache.h"
using std::list;
+using std::map;
using std::string;
+using std::getline;
using std::ifstream;
using std::ofstream;
+/* Like strcmp, but sorts in the order that files will be visited in the
+ * filesystem. That is, we break paths apart at slashes, and compare path
+ * components separately. */
+static int pathcmp(const char *path1, const char *path2)
+{
+ /* Find the first component in each path. */
+ const char *slash1 = strchr(path1, '/');
+ const char *slash2 = strchr(path2, '/');
+
+ {
+ string comp1, comp2;
+ if (slash1 == NULL)
+ comp1 = path1;
+ else
+ comp1 = string(path1, slash1 - path1);
+
+ if (slash2 == NULL)
+ comp2 = path2;
+ else
+ comp2 = string(path2, slash2 - path2);
+
+ /* Directly compare the two components first. */
+ if (comp1 < comp2)
+ return -1;
+ if (comp1 > comp2)
+ return 1;
+ }
+
+ if (slash1 == NULL && slash2 == NULL)
+ return 0;
+ if (slash1 == NULL)
+ return -1;
+ if (slash2 == NULL)
+ return 1;
+
+ return pathcmp(slash1 + 1, slash2 + 1);
+}
+
void StatCache::Open(const char *path, const char *snapshot_name)
{
oldpath = path;
oldpath += "/statcache";
newpath = oldpath + "." + snapshot_name;
- oldcache = NULL;
+ oldcache = new ifstream(oldpath.c_str());
newcache = new ofstream(newpath.c_str());
+
+ /* Read the first entry from the old stat cache into memory before we
+ * start. */
+ ReadNext();
}
void StatCache::Close()
}
}
+/* Read the next entry from the old statcache file and cache it in memory. */
+void StatCache::ReadNext()
+{
+ if (oldcache == NULL) {
+ end_of_cache = true;
+ return;
+ }
+
+ std::istream &cache = *oldcache;
+ map<string, string> fields;
+
+ old_mtime = -1;
+ old_ctime = -1;
+ old_inode = -1;
+ old_checksum = "";
+ old_contents.clear();
+
+ /* First, read in the filename. */
+ getline(cache, old_name);
+ if (!cache) {
+ end_of_cache = true;
+ return;
+ }
+
+ /* Start reading in the fields which follow the filename. */
+ string field = "";
+ while (!cache.eof()) {
+ string line;
+ getline(cache, line);
+ const char *s = line.c_str();
+
+ /* Is the line blank? If so, we have reached the end of this entry. */
+ if (s[0] == '\0' || s[0] == '\n')
+ break;
+
+ /* Is this a continuation line? (Does it start with whitespace?) */
+ if (isspace(s[0]) && field != "") {
+ fields[field] += line;
+ continue;
+ }
+
+ /* For lines of the form "Key: Value" look for ':' and split the line
+ * apart. */
+ const char *value = strchr(s, ':');
+ if (value == NULL)
+ continue;
+ field = string(s, value - s);
+
+ value++;
+ while (isspace(*value))
+ value++;
+
+ fields[field] = value;
+ }
+
+ /* Parse the easy fields: mtime, ctime, inode, checksum, ... */
+ if (fields.count("mtime"))
+ old_mtime = parse_int(fields["mtime"]);
+ if (fields.count("ctime"))
+ old_ctime = parse_int(fields["ctime"]);
+ if (fields.count("inode"))
+ old_inode = parse_int(fields["inode"]);
+
+ old_checksum = fields["checksum"];
+
+ /* Parse the list of blocks. */
+ const char *s = fields["blocks"].c_str();
+ while (*s != '\0') {
+ if (isspace(*s)) {
+ s++;
+ continue;
+ }
+
+ string ref = "";
+ while (*s != '\0' && !isspace(*s)) {
+ char buf[2];
+ buf[0] = *s;
+ buf[1] = '\0';
+ ref += buf;
+ s++;
+ }
+
+ ObjectReference *r = ObjectReference::parse(ref);
+ if (r != NULL) {
+ old_contents.push_back(*r);
+ delete r;
+ }
+ }
+
+ end_of_cache = false;
+}
+
+/* Find information about the given filename in the old stat cache, if it
+ * exists. */
+bool StatCache::Find(const string &path, const struct stat *stat_buf)
+{
+ while (!end_of_cache && pathcmp(old_name.c_str(), path.c_str()) < 0)
+ ReadNext();
+
+ /* Could the file be found at all? */
+ if (end_of_cache)
+ return false;
+ if (old_name != path)
+ return false;
+
+ /* Check to see if the file is unchanged. */
+ if (stat_buf->st_mtime != old_mtime)
+ return false;
+ if (stat_buf->st_ctime != old_ctime)
+ return false;
+ if ((long long)stat_buf->st_ino != old_inode)
+ return false;
+
+ /* File looks to be unchanged. */
+ return true;
+}
+
/* Save stat information about a regular file for future invocations. */
void StatCache::Save(const string &path, struct stat *stat_buf,
const string &checksum, const list<string> &blocks)
#include <list>
#include <string>
+#include "ref.h"
+
class StatCache {
public:
void Open(const char *path, const char *snapshot_name);
void Close();
+ bool Find(const std::string &path, const struct stat *stat_buf);
void Save(const std::string &path, struct stat *stat_buf,
const std::string &checksum,
const std::list<std::string> &blocks);
+ std::string get_checksum() const { return old_checksum; }
+ const std::list<ObjectReference> &get_blocks() const
+ { return old_contents; }
+
private:
+ void ReadNext();
+
std::string oldpath, newpath;
std::ifstream *oldcache;
std::ofstream *newcache;
+
+ /* Information about one file read from the old cache. */
+ bool end_of_cache;
+ int64_t old_mtime, old_ctime, old_inode;
+ std::string old_name, old_checksum;
+ std::list<ObjectReference> old_contents;
};
#endif // _LBS_STATCACHE_H