Sort-of-working statcache implementation.

author Michael Vrable <mvrable@cs.ucsd.edu>

Thu, 28 Jun 2007 00:10:11 +0000 (17:10 -0700)

committer Michael Vrable <mvrable@turin.ucsd.edu>

Thu, 28 Jun 2007 00:10:11 +0000 (17:10 -0700)
author Michael Vrable <mvrable@cs.ucsd.edu>
Thu, 28 Jun 2007 00:10:11 +0000 (17:10 -0700)
committer Michael Vrable <mvrable@turin.ucsd.edu>
Thu, 28 Jun 2007 00:10:11 +0000 (17:10 -0700)
diff --git a/format.cc b/format.cc

index 19cf28f..1fccf4e 100644 (file)
--- a/format.cc
+++ b/format.cc
@@ -47,6 +47,12 @@ string encode_int(long long n)
      return buf;
  }
  
+/* Return the string representation of an integer. */
+long long parse_int(const string &s)
+{
+    return strtoll(s.c_str(), NULL, 10);
+}
+
  /* Output a dictionary of string key/value pairs to the given output stream.
   * The format is a sequence of lines of the form "key: value". */
  void dict_output(ostream &o, map<string, string> dict)
diff --git a/format.h b/format.h

index 9f6055a..519bceb 100644 (file)
--- a/format.h
+++ b/format.h
@@ -16,4 +16,6 @@ std::string uri_encode(const std::string &in);
  std::string encode_int(long long n);
  void dict_output(std::ostream &o, std::map<std::string, std::string> dict);
  
+long long parse_int(const std::string &s);
+
  #endif // _LBS_TARSTORE_H
diff --git a/localdb.cc b/localdb.cc

index 1388881..bc2bc69 100644 (file)
--- a/localdb.cc
+++ b/localdb.cc
@@ -230,6 +230,34 @@ bool LocalDb::IsOldObject(const string &checksum, int64_t size, double *age)
      return found;
  }
  
+/* Does this object still exist in the database (and not expired)? */
+bool LocalDb::IsAvailable(const ObjectReference &ref)
+{
+    int rc;
+    sqlite3_stmt *stmt;
+    bool found = false;
+
+    stmt = Prepare("select count(*) from block_index "
+                   "where segmentid = ? and object = ? and expired is null");
+    sqlite3_bind_int64(stmt, 1, SegmentToId(ref.get_segment()));
+    sqlite3_bind_text(stmt, 2, ref.get_sequence().c_str(),
+                      ref.get_sequence().size(), SQLITE_TRANSIENT);
+
+    rc = sqlite3_step(stmt);
+    if (rc == SQLITE_DONE) {
+        found = false;
+    } else if (rc == SQLITE_ROW) {
+        if (sqlite3_column_int(stmt, 0) > 0)
+            found = true;
+    } else {
+        fprintf(stderr, "Could not execute SELECT statement!\n");
+    }
+
+    sqlite3_finalize(stmt);
+
+    return found;
+}
+
  void LocalDb::UseObject(const ObjectReference& ref)
  {
      int rc;
diff --git a/localdb.h b/localdb.h

index 3f5e5ba..4654b37 100644 (file)
--- a/localdb.h
+++ b/localdb.h
@@ -27,6 +27,7 @@ public:
                       const std::string &checksum, int64_t size, double age);
      ObjectReference FindObject(const std::string &checksum, int64_t size);
      bool IsOldObject(const std::string &checksum, int64_t size, double *age);
+    bool IsAvailable(const ObjectReference &ref);
      void UseObject(const ObjectReference& ref);
  private:
      sqlite3 *db;
diff --git a/ref.cc b/ref.cc

index ce16cb8..84b631c 100644 (file)
--- a/ref.cc
+++ b/ref.cc
@@ -71,8 +71,22 @@ string ObjectReference::to_string() const
  /* Parse a string object reference and return a pointer to a new
   * ObjectReference.  The caller is responsible for freeing the object.  NULL is
   * returned if there is an error in the syntax. */
-ObjectReference *ObjectReference::parse(const std::string& s)
+ObjectReference *ObjectReference::parse(const std::string& str)
  {
-    // TODO: Implement
-    return NULL;
+    const char *s = str.c_str();
+    const char *t;
+
+    // Segment
+    t = strchr(s, '/');
+    if (t == NULL)
+        return NULL;
+    string segment(s, t - s);
+
+    // Object sequence number
+    s = t + 1;
+    string object(s);
+
+    // TODO: Checksums, ranges.
+
+    return new ObjectReference(segment, object);
  }
diff --git a/scandir.cc b/scandir.cc

index c48107a..b96b1d3 100644 (file)
--- a/scandir.cc
+++ b/scandir.cc
@@ -128,59 +128,96 @@ int64_t dumpfile(int fd, dictionary &file_info, const string &path)
          return -1;
      }
  
-    /* The index data consists of a sequence of pointers to the data blocks
-     * that actually comprise the file data.  This level of indirection is used
-     * so that the same data block can be used in multiple files, or multiple
-     * versions of the same file. */
-    SHA1Checksum hash;
-    while (true) {
-        size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
-        if (bytes == 0)
-            break;
-
-        hash.process(block_buf, bytes);
-
-        // Either find a copy of this block in an already-existing segment, or
-        // index it so it can be re-used in the future
-        double block_age = 0.0;
-        SHA1Checksum block_hash;
-        block_hash.process(block_buf, bytes);
-        string block_csum = block_hash.checksum_str();
-        ObjectReference ref = db->FindObject(block_csum, bytes);
-
-        // Store a copy of the object if one does not yet exist
-        if (ref.get_segment().size() == 0) {
-            LbsObject *o = new LbsObject;
-
-            /* We might still have seen this checksum before, if the object was
-             * stored at some time in the past, but we have decided to clean
-             * the segment the object was originally stored in (FindObject will
-             * not return such objects).  When rewriting the object contents,
-             * put it in a separate group, so that old objects get grouped
-             * together.  The hope is that these old objects will continue to
-             * be used in the future, and we obtain segments which will
-             * continue to be well-utilized.  Additionally, keep track of the
-             * age of the data by looking up the age of the block which was
-             * expired and using that instead of the current time. */
-            if (db->IsOldObject(block_csum, bytes, &block_age))
-                o->set_group("compacted");
-            else
-                o->set_group("data");
-
-            o->set_data(block_buf, bytes);
-            o->write(tss);
-            ref = o->get_ref();
-            db->StoreObject(ref, block_csum, bytes, block_age);
-            delete o;
+    /* Look up this file in the old stat cache, if we can.  If the stat
+     * information indicates that the file has not changed, do not bother
+     * re-reading the entire contents. */
+    bool cached = false;
+
+    if (statcache->Find(path, &stat_buf)) {
+        cached = true;
+        const list<ObjectReference> &blocks = statcache->get_blocks();
+
+        /* If any of the blocks in the object have been expired, then we should
+         * fall back to fully reading in the file. */
+        for (list<ObjectReference>::const_iterator i = blocks.begin();
+             i != blocks.end(); ++i) {
+            const ObjectReference &ref = *i;
+            if (!db->IsAvailable(ref)) {
+                cached = false;
+                break;
+            }
          }
  
-        object_list.push_back(ref.to_string());
-        segment_list.insert(ref.get_segment());
-        db->UseObject(ref);
-        size += bytes;
+        /* If everything looks okay, use the cached information */
+        if (cached) {
+            file_info["checksum"] = statcache->get_checksum();
+            for (list<ObjectReference>::const_iterator i = blocks.begin();
+                 i != blocks.end(); ++i) {
+                const ObjectReference &ref = *i;
+                object_list.push_back(ref.to_string());
+                segment_list.insert(ref.get_segment());
+                db->UseObject(ref);
+            }
+            size = stat_buf.st_size;
+        }
      }
  
-    file_info["checksum"] = hash.checksum_str();
+    /* If the file is new or changed, we must read in the contents a block at a
+     * time. */
+    if (!cached) {
+        printf("    [new]\n");
+
+        SHA1Checksum hash;
+        while (true) {
+            size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
+            if (bytes == 0)
+                break;
+
+            hash.process(block_buf, bytes);
+
+            // Either find a copy of this block in an already-existing segment,
+            // or index it so it can be re-used in the future
+            double block_age = 0.0;
+            SHA1Checksum block_hash;
+            block_hash.process(block_buf, bytes);
+            string block_csum = block_hash.checksum_str();
+            ObjectReference ref = db->FindObject(block_csum, bytes);
+
+            // Store a copy of the object if one does not yet exist
+            if (ref.get_segment().size() == 0) {
+                LbsObject *o = new LbsObject;
+
+                /* We might still have seen this checksum before, if the object
+                 * was stored at some time in the past, but we have decided to
+                 * clean the segment the object was originally stored in
+                 * (FindObject will not return such objects).  When rewriting
+                 * the object contents, put it in a separate group, so that old
+                 * objects get grouped together.  The hope is that these old
+                 * objects will continue to be used in the future, and we
+                 * obtain segments which will continue to be well-utilized.
+                 * Additionally, keep track of the age of the data by looking
+                 * up the age of the block which was expired and using that
+                 * instead of the current time. */
+                if (db->IsOldObject(block_csum, bytes, &block_age))
+                    o->set_group("compacted");
+                else
+                    o->set_group("data");
+
+                o->set_data(block_buf, bytes);
+                o->write(tss);
+                ref = o->get_ref();
+                db->StoreObject(ref, block_csum, bytes, block_age);
+                delete o;
+            }
+
+            object_list.push_back(ref.to_string());
+            segment_list.insert(ref.get_segment());
+            db->UseObject(ref);
+            size += bytes;
+        }
+
+        file_info["checksum"] = hash.checksum_str();
+    }
  
      statcache->Save(path, &stat_buf, file_info["checksum"], object_list);
  
diff --git a/statcache.cc b/statcache.cc

index 03cfd95..a0bee44 100644 (file)
--- a/statcache.cc
+++ b/statcache.cc
@@ -23,27 +23,74 @@
  #include <assert.h>
  #include <stdio.h>
  #include <string.h>
+#include <ctype.h>
  
  #include <fstream>
  #include <iostream>
+#include <map>
  #include <string>
  
  #include "format.h"
+#include "ref.h"
  #include "statcache.h"
  
  using std::list;
+using std::map;
  using std::string;
+using std::getline;
  using std::ifstream;
  using std::ofstream;
  
+/* Like strcmp, but sorts in the order that files will be visited in the
+ * filesystem.  That is, we break paths apart at slashes, and compare path
+ * components separately. */
+static int pathcmp(const char *path1, const char *path2)
+{
+    /* Find the first component in each path. */
+    const char *slash1 = strchr(path1, '/');
+    const char *slash2 = strchr(path2, '/');
+
+    {
+        string comp1, comp2;
+        if (slash1 == NULL)
+            comp1 = path1;
+        else
+            comp1 = string(path1, slash1 - path1);
+
+        if (slash2 == NULL)
+            comp2 = path2;
+        else
+            comp2 = string(path2, slash2 - path2);
+
+        /* Directly compare the two components first. */
+        if (comp1 < comp2)
+            return -1;
+        if (comp1 > comp2)
+            return 1;
+    }
+
+    if (slash1 == NULL && slash2 == NULL)
+        return 0;
+    if (slash1 == NULL)
+        return -1;
+    if (slash2 == NULL)
+        return 1;
+
+    return pathcmp(slash1 + 1, slash2 + 1);
+}
+
  void StatCache::Open(const char *path, const char *snapshot_name)
  {
      oldpath = path;
      oldpath += "/statcache";
      newpath = oldpath + "." + snapshot_name;
  
-    oldcache = NULL;
+    oldcache = new ifstream(oldpath.c_str());
      newcache = new ofstream(newpath.c_str());
+
+    /* Read the first entry from the old stat cache into memory before we
+     * start. */
+    ReadNext();
  }
  
  void StatCache::Close()
@@ -59,6 +106,123 @@ void StatCache::Close()
      }
  }
  
+/* Read the next entry from the old statcache file and cache it in memory. */
+void StatCache::ReadNext()
+{
+    if (oldcache == NULL) {
+        end_of_cache = true;
+        return;
+    }
+
+    std::istream &cache = *oldcache;
+    map<string, string> fields;
+
+    old_mtime = -1;
+    old_ctime = -1;
+    old_inode = -1;
+    old_checksum = "";
+    old_contents.clear();
+
+    /* First, read in the filename. */
+    getline(cache, old_name);
+    if (!cache) {
+        end_of_cache = true;
+        return;
+    }
+
+    /* Start reading in the fields which follow the filename. */
+    string field = "";
+    while (!cache.eof()) {
+        string line;
+        getline(cache, line);
+        const char *s = line.c_str();
+
+        /* Is the line blank?  If so, we have reached the end of this entry. */
+        if (s[0] == '\0' || s[0] == '\n')
+            break;
+
+        /* Is this a continuation line?  (Does it start with whitespace?) */
+        if (isspace(s[0]) && field != "") {
+            fields[field] += line;
+            continue;
+        }
+
+        /* For lines of the form "Key: Value" look for ':' and split the line
+         * apart. */
+        const char *value = strchr(s, ':');
+        if (value == NULL)
+            continue;
+        field = string(s, value - s);
+
+        value++;
+        while (isspace(*value))
+            value++;
+
+        fields[field] = value;
+    }
+
+    /* Parse the easy fields: mtime, ctime, inode, checksum, ... */
+    if (fields.count("mtime"))
+        old_mtime = parse_int(fields["mtime"]);
+    if (fields.count("ctime"))
+        old_ctime = parse_int(fields["ctime"]);
+    if (fields.count("inode"))
+        old_inode = parse_int(fields["inode"]);
+
+    old_checksum = fields["checksum"];
+
+    /* Parse the list of blocks. */
+    const char *s = fields["blocks"].c_str();
+    while (*s != '\0') {
+        if (isspace(*s)) {
+            s++;
+            continue;
+        }
+
+        string ref = "";
+        while (*s != '\0' && !isspace(*s)) {
+            char buf[2];
+            buf[0] = *s;
+            buf[1] = '\0';
+            ref += buf;
+            s++;
+        }
+
+        ObjectReference *r = ObjectReference::parse(ref);
+        if (r != NULL) {
+            old_contents.push_back(*r);
+            delete r;
+        }
+    }
+
+    end_of_cache = false;
+}
+
+/* Find information about the given filename in the old stat cache, if it
+ * exists. */
+bool StatCache::Find(const string &path, const struct stat *stat_buf)
+{
+    while (!end_of_cache && pathcmp(old_name.c_str(), path.c_str()) < 0)
+        ReadNext();
+
+    /* Could the file be found at all? */
+    if (end_of_cache)
+        return false;
+    if (old_name != path)
+        return false;
+
+    /* Check to see if the file is unchanged. */
+    if (stat_buf->st_mtime != old_mtime)
+        return false;
+    if (stat_buf->st_ctime != old_ctime)
+        return false;
+    if ((long long)stat_buf->st_ino != old_inode)
+        return false;
+
+    /* File looks to be unchanged. */
+    return true;
+}
+
  /* Save stat information about a regular file for future invocations. */
  void StatCache::Save(const string &path, struct stat *stat_buf,
                       const string &checksum, const list<string> &blocks)
diff --git a/statcache.h b/statcache.h

index 10c0397..8b13b7e 100644 (file)
--- a/statcache.h
+++ b/statcache.h
@@ -31,18 +31,33 @@
  #include <list>
  #include <string>
  
+#include "ref.h"
+
  class StatCache {
  public:
      void Open(const char *path, const char *snapshot_name);
      void Close();
+    bool Find(const std::string &path, const struct stat *stat_buf);
      void Save(const std::string &path, struct stat *stat_buf,
                const std::string &checksum,
                const std::list<std::string> &blocks);
  
+    std::string get_checksum() const { return old_checksum; }
+    const std::list<ObjectReference> &get_blocks() const
+        { return old_contents; }
+
  private:
+    void ReadNext();
+
      std::string oldpath, newpath;
      std::ifstream *oldcache;
      std::ofstream *newcache;
+
+    /* Information about one file read from the old cache. */
+    bool end_of_cache;
+    int64_t old_mtime, old_ctime, old_inode;
+    std::string old_name, old_checksum;
+    std::list<ObjectReference> old_contents;
  };
  
  #endif // _LBS_STATCACHE_H
author	Michael Vrable <mvrable@cs.ucsd.edu>
	Thu, 28 Jun 2007 00:10:11 +0000 (17:10 -0700)
committer	Michael Vrable <mvrable@turin.ucsd.edu>
	Thu, 28 Jun 2007 00:10:11 +0000 (17:10 -0700)
format.cc		patch \| blob \| history
format.h		patch \| blob \| history
localdb.cc		patch \| blob \| history
localdb.h		patch \| blob \| history
ref.cc		patch \| blob \| history
scandir.cc		patch \| blob \| history
statcache.cc		patch \| blob \| history
statcache.h		patch \| blob \| history