scandir.cc

   1 /* Recursively descend the filesystem and visit each file. */
   2
   3 #include <stdio.h>
   4 #include <stdlib.h>
   5 #include <stdint.h>
   6 #include <dirent.h>
   7 #include <errno.h>
   8 #include <fcntl.h>
   9 #include <sys/types.h>
  10 #include <sys/stat.h>
  11 #include <unistd.h>
  12
  13 #include <algorithm>
  14 #include <string>
  15 #include <vector>
  16 #include <iostream>
  17 #include <sstream>
  18
  19 #include "store.h"
  20 #include "tarstore.h"
  21 #include "sha1.h"
  22
  23 using std::string;
  24 using std::vector;
  25 using std::ostream;
  26
  27 static SegmentStore *segment_store;
  28 static OutputStream *info_dump = NULL;
  29
  30 static SegmentPartitioner *index_segment, *data_segment;
  31
  32 /* Buffer for holding a single block of data read from a file. */
  33 static const int LBS_BLOCK_SIZE = 1024 * 1024;
  34 static char *block_buf;
  35
  36 void scandir(const string& path, std::ostream& metadata);
  37
  /* Express a time_t value (seconds since the epoch) as a count of microseconds
   * since the epoch. */
  int64_t encode_time(time_t time)
  {
      const int64_t usec_per_second = 1000000;
      return usec_per_second * static_cast<int64_t>(time);
  }
  43
  44 /* Read data from a file descriptor and return the amount of data read.  A
  45  * short read (less than the requested size) will only occur if end-of-file is
  46  * hit. */
  47 size_t file_read(int fd, char *buf, size_t maxlen)
  48 {
  49     size_t bytes_read = 0;
  50
  51     while (true) {
  52         ssize_t res = read(fd, buf, maxlen);
  53         if (res < 0) {
  54             if (errno == EINTR)
  55                 continue;
  56             throw IOException("file_read: error reading");
  57         } else if (res == 0) {
  58             break;
  59         } else {
  60             bytes_read += res;
  61             buf += res;
  62             maxlen -= res;
  63         }
  64     }
  65
  66     return bytes_read;
  67 }
  68
  69 /* Read the contents of a file (specified by an open file descriptor) and copy
  70  * the data to the store. */
  71 void dumpfile(int fd, dictionary &file_info)
  72 {
  73     struct stat stat_buf;
  74     fstat(fd, &stat_buf);
  75     int64_t size = 0;
  76
  77     if ((stat_buf.st_mode & S_IFMT) != S_IFREG) {
  78         printf("file is no longer a regular file!\n");
  79         return;
  80     }
  81
  82     /* The index data consists of a sequence of pointers to the data blocks
  83      * that actually comprise the file data.  This level of indirection is used
  84      * so that the same data block can be used in multiple files, or multiple
  85      * versions of the same file. */
  86     struct uuid segment_uuid;
  87     int object_id;
  88     OutputStream *index_data = index_segment->new_object(&segment_uuid,
  89                                                          &object_id,
  90                                                          "DREF");
  91
  92     SHA1Checksum hash;
  93     while (true) {
  94         struct uuid block_segment_uuid;
  95         int block_object_id;
  96
  97         size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
  98         if (bytes == 0)
  99             break;
 100
 101         hash.process(block_buf, bytes);
 102         OutputStream *block = data_segment->new_object(&block_segment_uuid,
 103                                                        &block_object_id,
 104                                                        "DATA");
 105         block->write(block_buf, bytes);
 106         index_data->write_uuid(block_segment_uuid);
 107         index_data->write_u32(block_object_id);
 108
 109         size += bytes;
 110     }
 111
 112     file_info["sha1"] = string((const char *)hash.checksum(),
 113                                hash.checksum_size());
 114     file_info["data"] = encode_objref(segment_uuid, object_id);
 115 }
 116
 117 void scanfile(const string& path, ostream &metadata)
 118 {
 119     int fd;
 120     long flags;
 121     struct stat stat_buf;
 122     char *buf;
 123     ssize_t len;
 124
 125     // Set to true if the item is a directory and we should recursively scan
 126     bool recurse = false;
 127
 128     dictionary file_info;
 129
 130     lstat(path.c_str(), &stat_buf);
 131
 132     printf("%s\n", path.c_str());
 133
 134     metadata << "name: " << path << "\n";
 135     metadata << "mode: " << (stat_buf.st_mode & 07777) << "\n";
 136     metadata << "atime: " << stat_buf.st_atime << "\n";
 137     metadata << "ctime: " << stat_buf.st_ctime << "\n";
 138     metadata << "mtime: " << stat_buf.st_mtime << "\n";
 139     metadata << "user: " << stat_buf.st_uid << "\n";
 140     metadata << "group: " << stat_buf.st_gid << "\n";
 141
 142     file_info["mode"] = encode_u16(stat_buf.st_mode & 07777);
 143     file_info["atime"] = encode_u64(encode_time(stat_buf.st_atime));
 144     file_info["ctime"] = encode_u64(encode_time(stat_buf.st_ctime));
 145     file_info["mtime"] = encode_u64(encode_time(stat_buf.st_mtime));
 146     file_info["user"] = encode_u32(stat_buf.st_uid);
 147     file_info["group"] = encode_u32(stat_buf.st_gid);
 148
 149     char inode_type;
 150
 151     switch (stat_buf.st_mode & S_IFMT) {
 152     case S_IFIFO:
 153         inode_type = 'p';
 154         break;
 155     case S_IFSOCK:
 156         inode_type = 's';
 157         break;
 158     case S_IFCHR:
 159         inode_type = 'c';
 160         break;
 161     case S_IFBLK:
 162         inode_type = 'b';
 163         break;
 164     case S_IFLNK:
 165         inode_type = 'l';
 166
 167         /* Use the reported file size to allocate a buffer large enough to read
 168          * the symlink.  Allocate slightly more space, so that we ask for more
 169          * bytes than we expect and so check for truncation. */
 170         buf = new char[stat_buf.st_size + 2];
 171         len = readlink(path.c_str(), buf, stat_buf.st_size + 1);
 172         if (len < 0) {
 173             printf("error reading symlink: %m\n");
 174         } else if (len <= stat_buf.st_size) {
 175             buf[len] = '\0';
 176             printf("    contents=%s\n", buf);
 177         } else if (len > stat_buf.st_size) {
 178             printf("error reading symlink: name truncated\n");
 179         }
 180
 181         file_info["contents"] = buf;
 182
 183         delete[] buf;
 184         break;
 185     case S_IFREG:
 186         inode_type = '-';
 187
 188         /* Be paranoid when opening the file.  We have no guarantee that the
 189          * file was not replaced between the stat() call above and the open()
 190          * call below, so we might not even be opening a regular file.  That
 191          * the file descriptor refers to a regular file is checked in
 192          * dumpfile().  But we also supply flags to open to to guard against
 193          * various conditions before we can perform that verification:
 194          *   - O_NOFOLLOW: in the event the file was replaced by a symlink
 195          *   - O_NONBLOCK: prevents open() from blocking if the file was
 196          *     replaced by a fifo
 197          * We also add in O_NOATIME, since this may reduce disk writes (for
 198          * inode updates). */
 199         fd = open(path.c_str(), O_RDONLY|O_NOATIME|O_NOFOLLOW|O_NONBLOCK);
 200
 201         /* Drop the use of the O_NONBLOCK flag; we only wanted that for file
 202          * open. */
 203         flags = fcntl(fd, F_GETFL);
 204         fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
 205
 206         file_info["size"] = encode_u64(stat_buf.st_size);
 207         dumpfile(fd, file_info);
 208         close(fd);
 209
 210         break;
 211     case S_IFDIR:
 212         inode_type = 'd';
 213         recurse = true;
 214         break;
 215
 216     default:
 217         fprintf(stderr, "Unknown inode type: mode=%x\n", stat_buf.st_mode);
 218         return;
 219     }
 220
 221     file_info["type"] = string(1, inode_type);
 222     metadata << "type: " << inode_type << "\n";
 223
 224     info_dump->write_string(path);
 225     info_dump->write_dictionary(file_info);
 226
 227     metadata << "\n";
 228
 229     // If we hit a directory, now that we've written the directory itself,
 230     // recursively scan the directory.
 231     if (recurse)
 232         scandir(path, metadata);
 233 }
 234
 235 void scandir(const string& path, ostream &metadata)
 236 {
 237     DIR *dir = opendir(path.c_str());
 238
 239     if (dir == NULL) {
 240         printf("Error: %m\n");
 241         return;
 242     }
 243
 244     struct dirent *ent;
 245     vector<string> contents;
 246     while ((ent = readdir(dir)) != NULL) {
 247         string filename(ent->d_name);
 248         if (filename == "." || filename == "..")
 249             continue;
 250         contents.push_back(filename);
 251     }
 252
 253     sort(contents.begin(), contents.end());
 254
 255     for (vector<string>::iterator i = contents.begin();
 256          i != contents.end(); ++i) {
 257         const string& filename = *i;
 258         scanfile(path + "/" + filename, metadata);
 259     }
 260
 261     closedir(dir);
 262 }
 263
 264 int main(int argc, char *argv[])
 265 {
 266     block_buf = new char[LBS_BLOCK_SIZE];
 267
 268     segment_store = new SegmentStore(".");
 269     SegmentWriter *sw = segment_store->new_segment();
 270     info_dump = sw->new_object(NULL, "ROOT");
 271
 272     index_segment = new SegmentPartitioner(segment_store);
 273     data_segment = new SegmentPartitioner(segment_store);
 274
 275     string uuid = SegmentWriter::format_uuid(sw->get_uuid());
 276     printf("Backup UUID: %s\n", uuid.c_str());
 277
 278     std::ostringstream metadata;
 279
 280     try {
 281         scanfile(".", metadata);
 282     } catch (IOException e) {
 283         fprintf(stderr, "IOException: %s\n", e.getError().c_str());
 284     }
 285
 286     Tarfile *t = new Tarfile("tarstore.tar", uuid);
 287     const char testdata[] = "Test string.";
 288     t->write_object(0, testdata, strlen(testdata));
 289     t->write_object(1, testdata, strlen(testdata));
 290     t->write_object(2, testdata, strlen(testdata));
 291
 292     const string md = metadata.str();
 293     t->write_object(3, md.data(), md.size());
 294
 295     delete t;
 296
 297     delete index_segment;
 298     delete data_segment;
 299     delete sw;
 300
 301     return 0;
 302 }