main.cc

   1 /* Cumulus: Smart Filesystem Backup to Dumb Servers
   2  *
   3  * Copyright (C) 2006-2008  The Regents of the University of California
   4  * Written by Michael Vrable <mvrable@cs.ucsd.edu>
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License along
  17  * with this program; if not, write to the Free Software Foundation, Inc.,
  18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19  */
  20
  21 /* Main entry point for Cumulus.  Contains logic for traversing the filesystem
  22  * and constructing a backup. */
  23
  24 #include <dirent.h>
  25 #include <errno.h>
  26 #include <fcntl.h>
  27 #include <getopt.h>
  28 #include <grp.h>
  29 #include <pwd.h>
  30 #include <stdint.h>
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #include <sys/stat.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/types.h>
  37 #include <sys/wait.h>
  38 #include <unistd.h>
  39
  40 #include <algorithm>
  41 #include <fstream>
  42 #include <iostream>
  43 #include <list>
  44 #include <map>
  45 #include <set>
  46 #include <sstream>
  47 #include <string>
  48 #include <vector>
  49
  50 #include "localdb.h"
  51 #include "metadata.h"
  52 #include "remote.h"
  53 #include "store.h"
  54 #include "sha1.h"
  55 #include "subfile.h"
  56 #include "util.h"
  57
  58 using std::list;
  59 using std::map;
  60 using std::string;
  61 using std::vector;
  62 using std::ostream;
  63
/* Version information.  CUMULUS_VERSION is expected to be supplied by the
 * Makefile; when it is not defined, the version is reported as "Unknown". */
#ifndef CUMULUS_VERSION
#define CUMULUS_VERSION Unknown
#endif
/* Two-level stringification: the outer macro lets CUMULUS_VERSION be expanded
 * before the inner macro turns the result into a string literal. */
#define CUMULUS_STRINGIFY(s) CUMULUS_STRINGIFY2(s)
#define CUMULUS_STRINGIFY2(s) #s
/* Version string printed at the top of the usage message. */
static const char cumulus_version[] = CUMULUS_STRINGIFY(CUMULUS_VERSION);
  71
/* Objects making up the storage pipeline.  All three are created in main():
 * remote is constructed from either the --dest directory or a temporary
 * staging directory (when --upload-script is used), tss is built on top of
 * remote and the local database, and metawriter is built on top of tss. */
static RemoteStore *remote = NULL;
static TarSegmentStore *tss = NULL;
static MetadataWriter *metawriter = NULL;

/* Buffer for holding a single block of data read from a file.  The buffer is
 * allocated in main() with LBS_BLOCK_SIZE bytes and is filled one block at a
 * time by dumpfile(). */
static const size_t LBS_BLOCK_SIZE = 1024 * 1024;
static char *block_buf;

/* Local database, which tracks objects written in this and previous
 * invocations to help in creating incremental snapshots. */
LocalDb *db;

/* Keep track of all segments which are needed to reconstruct the snapshot.
 * Entries are added through add_segment(). */
std::set<string> segment_list;

/* Snapshot intent: 1=daily, 7=weekly, etc.  This is not used directly, but is
 * stored in the local database and can help guide segment cleaning and
 * snapshot expiration policies.  Set from --intent; main() resets
 * non-positive values to 1. */
double snapshot_intent = 1.0;

/* Selection of files to include/exclude in the snapshot.  All four lists are
 * consulted by scanfile() for every path visited. */
std::list<string> includes;         // Paths in which files should be saved
std::list<string> excludes;         // Paths which will not be saved
std::list<string> excluded_names;   // Names (final path component) which will
                                    //   not be saved, wherever they occur
std::list<string> searches;         // Directories we don't want to save, but
                                    //   do want to descend searching for data
                                    //   in included paths

/* True while only relative paths have been given on the command line.
 * add_include() clears it when an absolute path is seen, in which case
 * scanfile() prefixes every path with "/" before touching the filesystem. */
bool relative_paths = true;

/* Set by --rebuild-statcache: dumpfile() then re-reads file contents even
 * when the statcache entry says the file is unchanged. */
bool flag_rebuild_statcache = false;

/* Whether verbose output is enabled. */
bool verbose = false;
 106
 107 /* Ensure that the given segment is listed as a dependency of the current
 108  * snapshot. */
 109 void add_segment(const string& segment)
 110 {
 111     segment_list.insert(segment);
 112 }
 113
 114 /* Read data from a file descriptor and return the amount of data read.  A
 115  * short read (less than the requested size) will only occur if end-of-file is
 116  * hit. */
 117 ssize_t file_read(int fd, char *buf, size_t maxlen)
 118 {
 119     size_t bytes_read = 0;
 120
 121     while (true) {
 122         ssize_t res = read(fd, buf, maxlen);
 123         if (res < 0) {
 124             if (errno == EINTR)
 125                 continue;
 126             fprintf(stderr, "error reading file: %m\n");
 127             return -1;
 128         } else if (res == 0) {
 129             break;
 130         } else {
 131             bytes_read += res;
 132             buf += res;
 133             maxlen -= res;
 134         }
 135     }
 136
 137     return bytes_read;
 138 }
 139
/* Read the contents of a file (specified by an open file descriptor) and copy
 * the data to the store.
 *
 * Parameters:
 *   fd        - open descriptor for the file; it is only read from when the
 *               statcache entry for the file cannot be reused
 *   file_info - metadata dictionary for the file; the "checksum" and "data"
 *               (block list) entries are filled in here
 *   path      - path of the file within the snapshot, used for the statcache
 *               lookup and in diagnostics
 *   stat_buf  - stat information for the file, compared against the statcache
 *
 * Returns the number of bytes accounted for: st_size when the cached block
 * list is reused, otherwise the number of bytes actually read.
 *
 * NOTE(review): earlier documentation promised -1 on error, but as written a
 * read failure only prints a warning and stops reading; the count of bytes
 * processed so far is still returned, so no negative value is produced
 * here. */
int64_t dumpfile(int fd, dictionary &file_info, const string &path,
                 struct stat& stat_buf)
{
    int64_t size = 0;
    list<string> object_list;
    const char *status = NULL;          /* Status indicator printed out */

    /* Look up this file in the old stat cache, if we can.  If the stat
     * information indicates that the file has not changed, do not bother
     * re-reading the entire contents.  Even if the information has been
     * changed, we can use the list of old blocks in the search for a sub-block
     * incremental representation. */
    bool cached = false;
    list<ObjectReference> old_blocks;

    bool found = metawriter->find(path);
    if (found)
        old_blocks = metawriter->get_blocks();

    /* The cached path is never taken when --rebuild-statcache is in effect,
     * so that the file contents are always re-read in that mode. */
    if (found
        && !flag_rebuild_statcache
        && metawriter->is_unchanged(&stat_buf)) {
        cached = true;

        /* If any of the blocks in the object have been expired, then we should
         * fall back to fully reading in the file. */
        for (list<ObjectReference>::const_iterator i = old_blocks.begin();
             i != old_blocks.end(); ++i) {
            const ObjectReference &ref = *i;
            if (!db->IsAvailable(ref)) {
                cached = false;
                status = "repack";
                break;
            }
        }

        /* If everything looks okay, use the cached information */
        if (cached) {
            file_info["checksum"] = metawriter->get_checksum();
            for (list<ObjectReference>::const_iterator i = old_blocks.begin();
                 i != old_blocks.end(); ++i) {
                const ObjectReference &ref = *i;
                object_list.push_back(ref.to_string());
                if (ref.is_normal())
                    add_segment(ref.get_segment());
                db->UseObject(ref);
            }
            size = stat_buf.st_size;
        }
    }

    /* If the file is new or changed, we must read in the contents a block at a
     * time. */
    if (!cached) {
        SHA1Checksum hash;              /* Checksum over the whole file */
        Subfile subfile(db);
        subfile.load_old_blocks(old_blocks);

        while (true) {
            ssize_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
            if (bytes == 0)
                break;
            if (bytes < 0) {
                /* Read error: warn and stop, keeping whatever blocks were
                 * already processed. */
                fprintf(stderr, "Backup contents for %s may be incorrect\n",
                        path.c_str());
                break;
            }

            hash.process(block_buf, bytes);

            // Sparse file processing: if we read a block of all zeroes, encode
            // that explicitly.
            bool all_zero = true;
            for (int i = 0; i < bytes; i++) {
                if (block_buf[i] != 0) {
                    all_zero = false;
                    break;
                }
            }

            // Either find a copy of this block in an already-existing segment,
            // or index it so it can be re-used in the future
            double block_age = 0.0;
            ObjectReference ref;

            // Per-block checksum, used as the lookup key in the local database
            SHA1Checksum block_hash;
            block_hash.process(block_buf, bytes);
            string block_csum = block_hash.checksum_str();

            if (all_zero) {
                ref = ObjectReference(ObjectReference::REF_ZERO);
                ref.set_range(0, bytes);
            } else {
                ref = db->FindObject(block_csum, bytes);
            }

            // References which will represent this block in the block list
            list<ObjectReference> refs;

            // Store a copy of the object if one does not yet exist
            if (ref.is_null()) {
                LbsObject *o = new LbsObject;
                int object_group;

                /* We might still have seen this checksum before, if the object
                 * was stored at some time in the past, but we have decided to
                 * clean the segment the object was originally stored in
                 * (FindObject will not return such objects).  When rewriting
                 * the object contents, put it in a separate group, so that old
                 * objects get grouped together.  The hope is that these old
                 * objects will continue to be used in the future, and we
                 * obtain segments which will continue to be well-utilized.
                 * Additionally, keep track of the age of the data by looking
                 * up the age of the block which was expired and using that
                 * instead of the current time. */
                if (db->IsOldObject(block_csum, bytes,
                                    &block_age, &object_group)) {
                    if (object_group == 0) {
                        o->set_group("data");
                    } else {
                        char group[32];
                        sprintf(group, "compacted-%d", object_group);
                        o->set_group(group);
                    }
                    if (status == NULL)
                        status = "partial";
                } else {
                    o->set_group("data");
                    status = "new";
                }

                subfile.analyze_new_block(block_buf, bytes);
                refs = subfile.create_incremental(tss, o, block_age);
            } else {
                /* The block already exists.  When rebuilding the statcache,
                 * also recompute and store the sub-block signatures for it. */
                if (flag_rebuild_statcache && ref.is_normal()) {
                    subfile.analyze_new_block(block_buf, bytes);
                    subfile.store_analyzed_signatures(ref);
                }
                refs.push_back(ref);
            }

            // Append the references to the block list and mark them as used
            while (!refs.empty()) {
                ref = refs.front(); refs.pop_front();
                object_list.push_back(ref.to_string());
                if (ref.is_normal())
                    add_segment(ref.get_segment());
                db->UseObject(ref);
            }
            size += bytes;

            // No earlier status was recorded, so the block was simply re-used
            if (status == NULL)
                status = "old";
        }

        file_info["checksum"] = hash.checksum_str();
    }

    // Sanity check: if we are rebuilding the statcache, but the file looks
    // like it hasn't changed, then the newly-computed checksum should match
    // the checksum in the statcache.  If not, we have possible disk corruption
    // and report a warning.
    if (flag_rebuild_statcache) {
        if (found
            && metawriter->is_unchanged(&stat_buf)
            && file_info["checksum"] != metawriter->get_checksum()) {
            fprintf(stderr,
                    "Warning: Checksum for %s does not match expected value\n"
                    "    expected: %s\n"
                    "    actual:   %s\n",
                    path.c_str(),
                    metawriter->get_checksum().c_str(),
                    file_info["checksum"].c_str());
        }
    }

    if (verbose && status != NULL)
        printf("    [%s]\n", status);

    // Join the object references into the "data" field, one per line, with
    // continuation lines indented by four spaces.
    string blocklist = "";
    for (list<string>::iterator i = object_list.begin();
         i != object_list.end(); ++i) {
        if (i != object_list.begin())
            blocklist += "\n    ";
        blocklist += *i;
    }
    file_info["data"] = blocklist;

    return size;
}
 331
 332 /* Look up a user/group and convert it to string form (either strictly numeric
 333  * or numeric plus symbolic).  Caches the results of the call to
 334  * getpwuid/getgrgid. */
 335 string user_to_string(uid_t uid) {
 336     static map<uid_t, string> user_cache;
 337     map<uid_t, string>::const_iterator i = user_cache.find(uid);
 338     if (i != user_cache.end())
 339         return i->second;
 340
 341     string result = encode_int(uid);
 342     struct passwd *pwd = getpwuid(uid);
 343     if (pwd != NULL && pwd->pw_name != NULL) {
 344         result += " (" + uri_encode(pwd->pw_name) + ")";
 345     }
 346     user_cache[uid] = result;
 347     return result;
 348 }
 349
 350 string group_to_string(gid_t gid) {
 351     static map<gid_t, string> group_cache;
 352     map<gid_t, string>::const_iterator i = group_cache.find(gid);
 353     if (i != group_cache.end())
 354         return i->second;
 355
 356     string result = encode_int(gid);
 357     struct group *grp = getgrgid(gid);
 358     if (grp != NULL && grp->gr_name != NULL) {
 359         result += " (" + uri_encode(grp->gr_name) + ")";
 360     }
 361     group_cache[gid] = result;
 362     return result;
 363 }
 364
 365 /* Dump a specified filesystem object (file, directory, etc.) based on its
 366  * inode information.  If the object is a regular file, an open filehandle is
 367  * provided. */
 368 void dump_inode(const string& path,         // Path within snapshot
 369                 const string& fullpath,     // Path to object in filesystem
 370                 struct stat& stat_buf,      // Results of stat() call
 371                 int fd)                     // Open filehandle if regular file
 372 {
 373     char *buf;
 374     dictionary file_info;
 375     int64_t file_size;
 376     ssize_t len;
 377
 378     if (verbose)
 379         printf("%s\n", path.c_str());
 380     metawriter->find(path);
 381
 382     file_info["name"] = uri_encode(path);
 383     file_info["mode"] = encode_int(stat_buf.st_mode & 07777, 8);
 384     file_info["ctime"] = encode_int(stat_buf.st_ctime);
 385     file_info["mtime"] = encode_int(stat_buf.st_mtime);
 386     file_info["user"] = user_to_string(stat_buf.st_uid);
 387     file_info["group"] = group_to_string(stat_buf.st_gid);
 388
 389     time_t now = time(NULL);
 390     if (now - stat_buf.st_ctime < 30 || now - stat_buf.st_mtime < 30)
 391         if ((stat_buf.st_mode & S_IFMT) != S_IFDIR)
 392             file_info["volatile"] = "1";
 393
 394     if (stat_buf.st_nlink > 1 && (stat_buf.st_mode & S_IFMT) != S_IFDIR) {
 395         file_info["links"] = encode_int(stat_buf.st_nlink);
 396     }
 397
 398     file_info["inode"] = encode_int(major(stat_buf.st_dev))
 399         + "/" + encode_int(minor(stat_buf.st_dev))
 400         + "/" + encode_int(stat_buf.st_ino);
 401
 402     char inode_type;
 403
 404     switch (stat_buf.st_mode & S_IFMT) {
 405     case S_IFIFO:
 406         inode_type = 'p';
 407         break;
 408     case S_IFSOCK:
 409         inode_type = 's';
 410         break;
 411     case S_IFBLK:
 412     case S_IFCHR:
 413         inode_type = ((stat_buf.st_mode & S_IFMT) == S_IFBLK) ? 'b' : 'c';
 414         file_info["device"] = encode_int(major(stat_buf.st_rdev))
 415             + "/" + encode_int(minor(stat_buf.st_rdev));
 416         break;
 417     case S_IFLNK:
 418         inode_type = 'l';
 419
 420         /* Use the reported file size to allocate a buffer large enough to read
 421          * the symlink.  Allocate slightly more space, so that we ask for more
 422          * bytes than we expect and so check for truncation. */
 423         buf = new char[stat_buf.st_size + 2];
 424         len = readlink(fullpath.c_str(), buf, stat_buf.st_size + 1);
 425         if (len < 0) {
 426             fprintf(stderr, "error reading symlink: %m\n");
 427         } else if (len <= stat_buf.st_size) {
 428             buf[len] = '\0';
 429             file_info["target"] = uri_encode(buf);
 430         } else if (len > stat_buf.st_size) {
 431             fprintf(stderr, "error reading symlink: name truncated\n");
 432         }
 433
 434         delete[] buf;
 435         break;
 436     case S_IFREG:
 437         inode_type = 'f';
 438
 439         file_size = dumpfile(fd, file_info, path, stat_buf);
 440         file_info["size"] = encode_int(file_size);
 441
 442         if (file_size < 0)
 443             return;             // error occurred; do not dump file
 444
 445         if (file_size != stat_buf.st_size) {
 446             fprintf(stderr, "Warning: Size of %s changed during reading\n",
 447                     path.c_str());
 448             file_info["volatile"] = "1";
 449         }
 450
 451         break;
 452     case S_IFDIR:
 453         inode_type = 'd';
 454         break;
 455
 456     default:
 457         fprintf(stderr, "Unknown inode type: mode=%x\n", stat_buf.st_mode);
 458         return;
 459     }
 460
 461     file_info["type"] = string(1, inode_type);
 462
 463     metawriter->add(file_info);
 464 }
 465
 466 void scanfile(const string& path, bool include)
 467 {
 468     int fd = -1;
 469     long flags;
 470     struct stat stat_buf;
 471     list<string> refs;
 472
 473     string true_path;
 474     if (relative_paths)
 475         true_path = path;
 476     else
 477         true_path = "/" + path;
 478
 479     // Set to true if we should scan through the contents of this directory,
 480     // but not actually back files up
 481     bool scan_only = false;
 482
 483     // Check this file against the include/exclude list to see if it should be
 484     // considered
 485     for (list<string>::iterator i = includes.begin();
 486          i != includes.end(); ++i) {
 487         if (path == *i) {
 488             include = true;
 489         }
 490     }
 491
 492     for (list<string>::iterator i = excludes.begin();
 493          i != excludes.end(); ++i) {
 494         if (path == *i) {
 495             include = false;
 496         }
 497     }
 498
 499     if (excluded_names.size() > 0) {
 500         std::string name = path;
 501         std::string::size_type last_slash = name.rfind('/');
 502         if (last_slash != std::string::npos) {
 503             name.replace(0, last_slash + 1, "");
 504         }
 505
 506         for (list<string>::iterator i = excluded_names.begin();
 507              i != excluded_names.end(); ++i) {
 508             if (name == *i) {
 509                 include = false;
 510             }
 511         }
 512     }
 513
 514     for (list<string>::iterator i = searches.begin();
 515          i != searches.end(); ++i) {
 516         if (path == *i) {
 517             scan_only = true;
 518         }
 519     }
 520
 521     if (!include && !scan_only)
 522         return;
 523
 524     if (lstat(true_path.c_str(), &stat_buf) < 0) {
 525         fprintf(stderr, "lstat(%s): %m\n", path.c_str());
 526         return;
 527     }
 528
 529     if ((stat_buf.st_mode & S_IFMT) == S_IFREG) {
 530         /* Be paranoid when opening the file.  We have no guarantee that the
 531          * file was not replaced between the stat() call above and the open()
 532          * call below, so we might not even be opening a regular file.  We
 533          * supply flags to open to to guard against various conditions before
 534          * we can perform an lstat to check that the file is still a regular
 535          * file:
 536          *   - O_NOFOLLOW: in the event the file was replaced by a symlink
 537          *   - O_NONBLOCK: prevents open() from blocking if the file was
 538          *     replaced by a fifo
 539          * We also add in O_NOATIME, since this may reduce disk writes (for
 540          * inode updates).  However, O_NOATIME may result in EPERM, so if the
 541          * initial open fails, try again without O_NOATIME.  */
 542         fd = open(true_path.c_str(), O_RDONLY|O_NOATIME|O_NOFOLLOW|O_NONBLOCK);
 543         if (fd < 0) {
 544             fd = open(true_path.c_str(), O_RDONLY|O_NOFOLLOW|O_NONBLOCK);
 545         }
 546         if (fd < 0) {
 547             fprintf(stderr, "Unable to open file %s: %m\n", path.c_str());
 548             return;
 549         }
 550
 551         /* Drop the use of the O_NONBLOCK flag; we only wanted that for file
 552          * open. */
 553         flags = fcntl(fd, F_GETFL);
 554         fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
 555
 556         /* Perform the stat call again, and check that we still have a regular
 557          * file. */
 558         if (fstat(fd, &stat_buf) < 0) {
 559             fprintf(stderr, "fstat: %m\n");
 560             close(fd);
 561             return;
 562         }
 563
 564         if ((stat_buf.st_mode & S_IFMT) != S_IFREG) {
 565             fprintf(stderr, "file is no longer a regular file!\n");
 566             close(fd);
 567             return;
 568         }
 569     }
 570
 571     dump_inode(path, true_path, stat_buf, fd);
 572
 573     if (fd >= 0)
 574         close(fd);
 575
 576     // If we hit a directory, now that we've written the directory itself,
 577     // recursively scan the directory.
 578     if ((stat_buf.st_mode & S_IFMT) == S_IFDIR) {
 579         DIR *dir = opendir(true_path.c_str());
 580
 581         if (dir == NULL) {
 582             fprintf(stderr, "Error: %m\n");
 583             return;
 584         }
 585
 586         struct dirent *ent;
 587         vector<string> contents;
 588         while ((ent = readdir(dir)) != NULL) {
 589             string filename(ent->d_name);
 590             if (filename == "." || filename == "..")
 591                 continue;
 592             contents.push_back(filename);
 593         }
 594
 595         closedir(dir);
 596
 597         sort(contents.begin(), contents.end());
 598
 599         for (vector<string>::iterator i = contents.begin();
 600              i != contents.end(); ++i) {
 601             const string& filename = *i;
 602             if (path == ".")
 603                 scanfile(filename, include);
 604             else
 605                 scanfile(path + "/" + filename, include);
 606         }
 607     }
 608 }
 609
 610 /* Include the specified file path in the backups.  Append the path to the
 611  * includes list, and to ensure that we actually see the path when scanning the
 612  * directory tree, add all the parent directories to the search list, which
 613  * means we will scan through the directory listing even if the files
 614  * themselves are excluded from being backed up. */
 615 void add_include(const char *path)
 616 {
 617     /* Was an absolute path specified?  If so, we'll need to start scanning
 618      * from the root directory.  Make sure that the user was consistent in
 619      * providing either all relative paths or all absolute paths. */
 620     if (path[0] == '/') {
 621         if (includes.size() > 0 && relative_paths == true) {
 622             fprintf(stderr,
 623                     "Error: Cannot mix relative and absolute paths!\n");
 624             exit(1);
 625         }
 626
 627         relative_paths = false;
 628
 629         // Skip over leading '/'
 630         path++;
 631     } else if (relative_paths == false && path[0] != '/') {
 632         fprintf(stderr, "Error: Cannot mix relative and absolute paths!\n");
 633         exit(1);
 634     }
 635
 636     includes.push_back(path);
 637
 638     /* Split the specified path into directory components, and ensure that we
 639      * descend into all the directories along the path. */
 640     const char *slash = path;
 641
 642     if (path[0] == '\0')
 643         return;
 644
 645     while ((slash = strchr(slash + 1, '/')) != NULL) {
 646         string component(path, slash - path);
 647         searches.push_back(component);
 648     }
 649 }
 650
 651 void usage(const char *program)
 652 {
 653     fprintf(
 654         stderr,
 655         "Cumulus %s\n\n"
 656         "Usage: %s [OPTION]... --dest=DEST PATHS...\n"
 657         "Produce backup snapshot of files in SOURCE and store to DEST.\n"
 658         "\n"
 659         "Options:\n"
 660         "  --dest=PATH          path where backup is to be written\n"
 661         "  --upload-script=COMMAND\n"
 662         "                       program to invoke for each backup file generated\n"
 663         "  --exclude=PATH       exclude files in PATH from snapshot\n"
 664         "  --exclude-name=NAME  exclude files called NAME from snapshot\n"
 665         "  --localdb=PATH       local backup metadata is stored in PATH\n"
 666         "  --tmpdir=PATH        path for temporarily storing backup files\n"
 667         "                           (defaults to TMPDIR environment variable or /tmp)\n"
 668         "  --filter=COMMAND     program through which to filter segment data\n"
 669         "                           (defaults to \"bzip2 -c\")\n"
 670         "  --filter-extension=EXT\n"
 671         "                       string to append to segment files\n"
 672         "                           (defaults to \".bz2\")\n"
 673         "  --signature-filter=COMMAND\n"
 674         "                       program though which to filter descriptor\n"
 675         "  --scheme=NAME        optional name for this snapshot\n"
 676         "  --intent=FLOAT       intended backup type: 1=daily, 7=weekly, ...\n"
 677         "                           (defaults to \"1\")\n"
 678         "  --full-metadata      do not re-use metadata from previous backups\n"
 679         "  --rebuild-statcache  re-read all file data to verify statcache\n"
 680         "  -v --verbose         list files as they are backed up\n"
 681         "\n"
 682         "Exactly one of --dest or --upload-script must be specified.\n",
 683         cumulus_version, program
 684     );
 685 }
 686
 687 int main(int argc, char *argv[])
 688 {
 689     string backup_dest = "", backup_script = "";
 690     string localdb_dir = "";
 691     string backup_scheme = "";
 692     string signature_filter = "";
 693
 694     string tmp_dir = "/tmp";
 695     if (getenv("TMPDIR") != NULL)
 696         tmp_dir = getenv("TMPDIR");
 697
 698     while (1) {
 699         static struct option long_options[] = {
 700             {"localdb", 1, 0, 0},           // 0
 701             {"exclude", 1, 0, 0},           // 1
 702             {"filter", 1, 0, 0},            // 2
 703             {"filter-extension", 1, 0, 0},  // 3
 704             {"dest", 1, 0, 0},              // 4
 705             {"scheme", 1, 0, 0},            // 5
 706             {"signature-filter", 1, 0, 0},  // 6
 707             {"intent", 1, 0, 0},            // 7
 708             {"full-metadata", 0, 0, 0},     // 8
 709             {"tmpdir", 1, 0, 0},            // 9
 710             {"upload-script", 1, 0, 0},     // 10
 711             {"rebuild-statcache", 0, 0, 0}, // 11
 712             {"exclude-name", 1, 0, 0},      // 12
 713             // Aliases for short options
 714             {"verbose", 0, 0, 'v'},
 715             {NULL, 0, 0, 0},
 716         };
 717
 718         int long_index;
 719         int c = getopt_long(argc, argv, "v", long_options, &long_index);
 720
 721         if (c == -1)
 722             break;
 723
 724         if (c == 0) {
 725             switch (long_index) {
 726             case 0:     // --localdb
 727                 localdb_dir = optarg;
 728                 break;
 729             case 1:     // --exclude
 730                 if (optarg[0] != '/')
 731                     excludes.push_back(optarg);
 732                 else
 733                     excludes.push_back(optarg + 1);
 734                 break;
 735             case 2:     // --filter
 736                 filter_program = optarg;
 737                 break;
 738             case 3:     // --filter-extension
 739                 filter_extension = optarg;
 740                 break;
 741             case 4:     // --dest
 742                 backup_dest = optarg;
 743                 break;
 744             case 5:     // --scheme
 745                 backup_scheme = optarg;
 746                 break;
 747             case 6:     // --signature-filter
 748                 signature_filter = optarg;
 749                 break;
 750             case 7:     // --intent
 751                 snapshot_intent = atof(optarg);
 752                 if (snapshot_intent <= 0)
 753                     snapshot_intent = 1;
 754                 break;
 755             case 8:     // --full-metadata
 756                 flag_full_metadata = true;
 757                 break;
 758             case 9:     // --tmpdir
 759                 tmp_dir = optarg;
 760                 break;
 761             case 10:    // --upload-script
 762                 backup_script = optarg;
 763                 break;
 764             case 11:    // --rebuild-statcache
 765                 flag_rebuild_statcache = true;
 766                 break;
 767             case 12:     // --exclude-name
 768                 excluded_names.push_back(optarg);
 769                 break;
 770             default:
 771                 fprintf(stderr, "Unhandled long option!\n");
 772                 return 1;
 773             }
 774         } else {
 775             switch (c) {
 776             case 'v':
 777                 verbose = true;
 778                 break;
 779             default:
 780                 usage(argv[0]);
 781                 return 1;
 782             }
 783         }
 784     }
 785
 786     if (optind == argc) {
 787         usage(argv[0]);
 788         return 1;
 789     }
 790
 791     searches.push_back(".");
 792     for (int i = optind; i < argc; i++)
 793         add_include(argv[i]);
 794
 795     if (backup_dest == "" && backup_script == "") {
 796         fprintf(stderr,
 797                 "Error: Backup destination must be specified using --dest= or --upload-script=\n");
 798         usage(argv[0]);
 799         return 1;
 800     }
 801
 802     if (backup_dest != "" && backup_script != "") {
 803         fprintf(stderr,
 804                 "Error: Cannot specify both --dest= and --upload-script=\n");
 805         usage(argv[0]);
 806         return 1;
 807     }
 808
 809     // Default for --localdb is the same as --dest
 810     if (localdb_dir == "") {
 811         localdb_dir = backup_dest;
 812     }
 813     if (localdb_dir == "") {
 814         fprintf(stderr,
 815                 "Error: Must specify local database path with --localdb=\n");
 816         usage(argv[0]);
 817         return 1;
 818     }
 819
 820     block_buf = new char[LBS_BLOCK_SIZE];
 821
 822     /* Initialize the remote storage layer.  If using an upload script, create
 823      * a temporary directory for staging files.  Otherwise, write backups
 824      * directly to the destination directory. */
 825     if (backup_script != "") {
 826         tmp_dir = tmp_dir + "/lbs." + generate_uuid();
 827         if (mkdir(tmp_dir.c_str(), 0700) < 0) {
 828             fprintf(stderr, "Cannot create temporary directory %s: %m\n",
 829                     tmp_dir.c_str());
 830             return 1;
 831         }
 832         remote = new RemoteStore(tmp_dir, backup_script=backup_script);
 833     } else {
 834         remote = new RemoteStore(backup_dest);
 835     }
 836
 837     /* Store the time when the backup started, so it can be included in the
 838      * snapshot name. */
 839     time_t now;
 840     struct tm time_buf_local, time_buf_utc;
 841     char desc_buf[256];
 842     time(&now);
 843     localtime_r(&now, &time_buf_local);
 844     gmtime_r(&now, &time_buf_utc);
 845     strftime(desc_buf, sizeof(desc_buf), "%Y%m%dT%H%M%S", &time_buf_utc);
 846
 847     /* Open the local database which tracks all objects that are stored
 848      * remotely, for efficient incrementals.  Provide it with the name of this
 849      * snapshot. */
 850     string database_path = localdb_dir + "/localdb.sqlite";
 851     db = new LocalDb;
 852     db->Open(database_path.c_str(), desc_buf, backup_scheme.c_str(),
 853              snapshot_intent);
 854
 855     tss = new TarSegmentStore(remote, db);
 856
 857     /* Initialize the stat cache, for skipping over unchanged files. */
 858     metawriter = new MetadataWriter(tss, localdb_dir.c_str(), desc_buf,
 859                                     backup_scheme.c_str());
 860
 861     scanfile(".", false);
 862
 863     ObjectReference root_ref = metawriter->close();
 864     add_segment(root_ref.get_segment());
 865     string backup_root = root_ref.to_string();
 866
 867     delete metawriter;
 868
 869     tss->sync();
 870     tss->dump_stats();
 871     delete tss;
 872
 873     /* Write out a checksums file which lists the checksums for all the
 874      * segments included in this snapshot.  The format is designed so that it
 875      * may be easily verified using the sha1sum command. */
 876     const char csum_type[] = "sha1";
 877     string checksum_filename = "snapshot-";
 878     if (backup_scheme.size() > 0)
 879         checksum_filename += backup_scheme + "-";
 880     checksum_filename = checksum_filename + desc_buf + "." + csum_type + "sums";
 881     RemoteFile *checksum_file = remote->alloc_file(checksum_filename,
 882                                                    "checksums");
 883     FILE *checksums = fdopen(checksum_file->get_fd(), "w");
 884
 885     for (std::set<string>::iterator i = segment_list.begin();
 886          i != segment_list.end(); ++i) {
 887         string seg_path, seg_csum;
 888         if (db->GetSegmentChecksum(*i, &seg_path, &seg_csum)) {
 889             const char *raw_checksum = NULL;
 890             if (strncmp(seg_csum.c_str(), csum_type,
 891                         strlen(csum_type)) == 0) {
 892                 raw_checksum = seg_csum.c_str() + strlen(csum_type);
 893                 if (*raw_checksum == '=')
 894                     raw_checksum++;
 895                 else
 896                     raw_checksum = NULL;
 897             }
 898
 899             if (raw_checksum != NULL)
 900                 fprintf(checksums, "%s *%s\n",
 901                         raw_checksum, seg_path.c_str());
 902         }
 903     }
 904     fclose(checksums);
 905
 906     SHA1Checksum checksum_csum;
 907     string csum;
 908     checksum_filename = checksum_file->get_local_path();
 909     if (checksum_csum.process_file(checksum_filename.c_str())) {
 910         csum = checksum_csum.checksum_str();
 911     }
 912
 913     checksum_file->send();
 914
 915     db->Close();
 916
 917     /* All other files should be flushed to remote storage before writing the
 918      * backup descriptor below, so that it is not possible to have a backup
 919      * descriptor written out depending on non-existent (not yet written)
 920      * files. */
 921     remote->sync();
 922
 923     /* Write a backup descriptor file, which says which segments are needed and
 924      * where to start to restore this snapshot.  The filename is based on the
 925      * current time.  If a signature filter program was specified, filter the
 926      * data through that to give a chance to sign the descriptor contents. */
 927     string desc_filename = "snapshot-";
 928     if (backup_scheme.size() > 0)
 929         desc_filename += backup_scheme + "-";
 930     desc_filename = desc_filename + desc_buf + ".lbs";
 931
 932     RemoteFile *descriptor_file = remote->alloc_file(desc_filename,
 933                                                      "snapshots");
 934     int descriptor_fd = descriptor_file->get_fd();
 935     if (descriptor_fd < 0) {
 936         fprintf(stderr, "Unable to open descriptor output file: %m\n");
 937         return 1;
 938     }
 939     pid_t signature_pid = 0;
 940     if (signature_filter.size() > 0) {
 941         int new_fd = spawn_filter(descriptor_fd, signature_filter.c_str(),
 942                                   &signature_pid);
 943         close(descriptor_fd);
 944         descriptor_fd = new_fd;
 945     }
 946     FILE *descriptor = fdopen(descriptor_fd, "w");
 947
 948     fprintf(descriptor, "Format: Cumulus Snapshot v0.11\n");
 949     fprintf(descriptor, "Producer: Cumulus %s\n", cumulus_version);
 950     strftime(desc_buf, sizeof(desc_buf), "%Y-%m-%d %H:%M:%S %z",
 951              &time_buf_local);
 952     fprintf(descriptor, "Date: %s\n", desc_buf);
 953     if (backup_scheme.size() > 0)
 954         fprintf(descriptor, "Scheme: %s\n", backup_scheme.c_str());
 955     fprintf(descriptor, "Backup-Intent: %g\n", snapshot_intent);
 956     fprintf(descriptor, "Root: %s\n", backup_root.c_str());
 957
 958     if (csum.size() > 0) {
 959         fprintf(descriptor, "Checksums: %s\n", csum.c_str());
 960     }
 961
 962     fprintf(descriptor, "Segments:\n");
 963     for (std::set<string>::iterator i = segment_list.begin();
 964          i != segment_list.end(); ++i) {
 965         fprintf(descriptor, "    %s\n", i->c_str());
 966     }
 967
 968     fclose(descriptor);
 969
 970     if (signature_pid) {
 971         int status;
 972         waitpid(signature_pid, &status, 0);
 973
 974         if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
 975             fatal("Signature filter process error");
 976         }
 977     }
 978
 979     descriptor_file->send();
 980
 981     remote->sync();
 982     delete remote;
 983
 984     if (backup_script != "") {
 985         if (rmdir(tmp_dir.c_str()) < 0) {
 986             fprintf(stderr,
 987                     "Warning: Cannot delete temporary directory %s: %m\n",
 988                     tmp_dir.c_str());
 989         }
 990     }
 991
 992     return 0;
 993 }