statcache.cc

   1 /* LBS: An LFS-inspired filesystem backup system Copyright (C) 2007  Michael
   2  * Vrable
   3  *
   4  * To speed backups, we maintain a "stat cache" containing selected information
   5  * about all regular files, including modification times and the list of blocks
   6  * that comprised the file in the last backup.  If the file has not changed
   7  * according to a stat() call, we may re-use the information contained in the
   8  * stat cache instead of re-reading the entire file.  It is always safe to
   9  * discard information from the stat cache; this will only cause a file to be
  10  * re-read to determine that it contains the same data as before.
  11  *
  12  * The stat cache is stored in a file called "statcache" in the local backup
  13  * directory.  During a backup, a new statcache file is written out with a
  14  * suffix based on the current time; at the end of a successful backup this
  15  * file is renamed over the original statcache file.
  16  *
  17  * The information in the statcache file is stored in sorted order as we
  18  * traverse the filesystem, so that we can read and write it in a purely
  19  * streaming manner.  (This is why we don't include the information in the
  20  * SQLite local database; doing so is likely less efficient.)
  21  */
  22
  23 #include <assert.h>
  24 #include <stdio.h>
  25 #include <string.h>
  26 #include <ctype.h>
  27
  28 #include <fstream>
  29 #include <iostream>
  30 #include <map>
  31 #include <string>
  32
  33 #include "format.h"
  34 #include "ref.h"
  35 #include "statcache.h"
  36
  37 using std::list;
  38 using std::map;
  39 using std::string;
  40 using std::getline;
  41 using std::ifstream;
  42 using std::ofstream;
  43
  44 /* Like strcmp, but sorts in the order that files will be visited in the
  45  * filesystem.  That is, we break paths apart at slashes, and compare path
  46  * components separately. */
  47 static int pathcmp(const char *path1, const char *path2)
  48 {
  49     /* Find the first component in each path. */
  50     const char *slash1 = strchr(path1, '/');
  51     const char *slash2 = strchr(path2, '/');
  52
  53     {
  54         string comp1, comp2;
  55         if (slash1 == NULL)
  56             comp1 = path1;
  57         else
  58             comp1 = string(path1, slash1 - path1);
  59
  60         if (slash2 == NULL)
  61             comp2 = path2;
  62         else
  63             comp2 = string(path2, slash2 - path2);
  64
  65         /* Directly compare the two components first. */
  66         if (comp1 < comp2)
  67             return -1;
  68         if (comp1 > comp2)
  69             return 1;
  70     }
  71
  72     if (slash1 == NULL && slash2 == NULL)
  73         return 0;
  74     if (slash1 == NULL)
  75         return -1;
  76     if (slash2 == NULL)
  77         return 1;
  78
  79     return pathcmp(slash1 + 1, slash2 + 1);
  80 }
  81
  82 void StatCache::Open(const char *path, const char *snapshot_name)
  83 {
  84     oldpath = path;
  85     oldpath += "/statcache";
  86     newpath = oldpath + "." + snapshot_name;
  87
  88     oldcache = new ifstream(oldpath.c_str());
  89     newcache = new ofstream(newpath.c_str());
  90
  91     /* Read the first entry from the old stat cache into memory before we
  92      * start. */
  93     ReadNext();
  94 }
  95
  96 void StatCache::Close()
  97 {
  98     if (oldcache != NULL)
  99         delete oldcache;
 100
 101     delete newcache;
 102
 103     if (rename(newpath.c_str(), oldpath.c_str()) < 0) {
 104         fprintf(stderr, "Error renaming statcache from %s to %s: %m\n",
 105                 newpath.c_str(), oldpath.c_str());
 106     }
 107 }
 108
 109 /* Read the next entry from the old statcache file and cache it in memory. */
 110 void StatCache::ReadNext()
 111 {
 112     if (oldcache == NULL) {
 113         end_of_cache = true;
 114         return;
 115     }
 116
 117     std::istream &cache = *oldcache;
 118     map<string, string> fields;
 119
 120     old_is_validated = false;
 121     old_mtime = -1;
 122     old_ctime = -1;
 123     old_inode = -1;
 124     old_size = -1;
 125     old_checksum = "";
 126     old_contents.clear();
 127
 128     /* First, read in the filename.  TODO: Unescaping. */
 129     getline(cache, old_name);
 130     if (!cache) {
 131         end_of_cache = true;
 132         return;
 133     }
 134
 135     /* Start reading in the fields which follow the filename. */
 136     string field = "";
 137     while (!cache.eof()) {
 138         string line;
 139         getline(cache, line);
 140         const char *s = line.c_str();
 141
 142         /* Is the line blank?  If so, we have reached the end of this entry. */
 143         if (s[0] == '\0' || s[0] == '\n')
 144             break;
 145
 146         /* Is this a continuation line?  (Does it start with whitespace?) */
 147         if (isspace(s[0]) && field != "") {
 148             fields[field] += line;
 149             continue;
 150         }
 151
 152         /* For lines of the form "Key: Value" look for ':' and split the line
 153          * apart. */
 154         const char *value = strchr(s, ':');
 155         if (value == NULL)
 156             continue;
 157         field = string(s, value - s);
 158
 159         value++;
 160         while (isspace(*value))
 161             value++;
 162
 163         fields[field] = value;
 164     }
 165
 166     /* Parse the easy fields: mtime, ctime, inode, checksum, ... */
 167     if (fields.count("validated"))
 168         old_is_validated = true;
 169     if (fields.count("mtime"))
 170         old_mtime = parse_int(fields["mtime"]);
 171     if (fields.count("ctime"))
 172         old_ctime = parse_int(fields["ctime"]);
 173     if (fields.count("inode"))
 174         old_inode = parse_int(fields["inode"]);
 175     if (fields.count("size"))
 176         old_size = parse_int(fields["size"]);
 177
 178     old_checksum = fields["checksum"];
 179
 180     /* Parse the list of blocks. */
 181     const char *s = fields["blocks"].c_str();
 182     while (*s != '\0') {
 183         if (isspace(*s)) {
 184             s++;
 185             continue;
 186         }
 187
 188         string ref = "";
 189         while (*s != '\0' && !isspace(*s)) {
 190             char buf[2];
 191             buf[0] = *s;
 192             buf[1] = '\0';
 193             ref += buf;
 194             s++;
 195         }
 196
 197         ObjectReference *r = ObjectReference::parse(ref);
 198         if (r != NULL) {
 199             old_contents.push_back(*r);
 200             delete r;
 201         }
 202     }
 203
 204     end_of_cache = false;
 205 }
 206
 207 /* Find information about the given filename in the old stat cache, if it
 208  * exists. */
 209 bool StatCache::Find(const string &path, const struct stat *stat_buf)
 210 {
 211     while (!end_of_cache && pathcmp(old_name.c_str(), path.c_str()) < 0)
 212         ReadNext();
 213
 214     /* Could the file be found at all? */
 215     if (end_of_cache)
 216         return false;
 217     if (old_name != path)
 218         return false;
 219
 220     /* Do we trust cached stat information? */
 221     if (!old_is_validated)
 222         return false;
 223
 224     /* Check to see if the file is unchanged. */
 225     if (stat_buf->st_mtime != old_mtime)
 226         return false;
 227     if (stat_buf->st_ctime != old_ctime)
 228         return false;
 229     if ((long long)stat_buf->st_ino != old_inode)
 230         return false;
 231     if (stat_buf->st_size != old_size)
 232         return false;
 233
 234     /* File looks to be unchanged. */
 235     return true;
 236 }
 237
 238 /* Save stat information about a regular file for future invocations. */
 239 void StatCache::Save(const string &path, struct stat *stat_buf,
 240                      const string &checksum, const list<string> &blocks)
 241 {
 242     /* Was this file in the old stat cache, and is the information unchanged?
 243      * If so, mark the information "validated", which means we are confident
 244      * that we can use it to accurately detect changes.  (Stat information may
 245      * not be updated if, for example, there are two writes within a single
 246      * second and we happen to make the first stat call between them.  However,
 247      * if two stat calls separated in time agree, then we will trust the
 248      * values.) */
 249     bool validated = false;
 250     if (!end_of_cache && path == old_name) {
 251         if (stat_buf->st_mtime == old_mtime
 252             && stat_buf->st_ctime == old_ctime
 253             && (long long)stat_buf->st_ino == old_inode
 254             && old_checksum == checksum)
 255             validated = true;
 256     }
 257
 258     *newcache << uri_encode(path) << "\n";
 259     *newcache << "mtime: " << encode_int(stat_buf->st_mtime) << "\n"
 260               << "ctime: " << encode_int(stat_buf->st_ctime) << "\n"
 261               << "inode: " << encode_int(stat_buf->st_ino) << "\n"
 262               << "size: " << encode_int(stat_buf->st_size) << "\n"
 263               << "checksum: " << checksum << "\n";
 264
 265     *newcache << "blocks:";
 266     if (blocks.size() == 0)
 267         *newcache << "\n";
 268     for (list<string>::const_iterator i = blocks.begin();
 269          i != blocks.end(); ++i) {
 270         *newcache << " " << *i << "\n";
 271     }
 272
 273     if (validated)
 274         *newcache << "validated: true\n";
 275
 276     *newcache << "\n";
 277 }