-DCUMULUS_VERSION=$(shell cat version)
LDFLAGS=$(DEBUG) $(shell pkg-config --libs $(PACKAGES))
-SRCS=chunk.cc localdb.cc main.cc metadata.cc ref.cc remote.cc sha1.cc \
- store.cc subfile.cc util.cc
+SRCS=chunk.cc exclude.cc localdb.cc main.cc metadata.cc ref.cc remote.cc \
+ sha1.cc store.cc subfile.cc util.cc
OBJS=$(SRCS:.cc=.o)
cumulus : $(OBJS)
--- /dev/null
+Cumulus File Selection (Includes/Excludes)
+==========================================
+
+A backup tool should support a flexible mechanism for selecting which
+files should be included in or excluded from the backup. This file
+describes the mechanism used by Cumulus. It is loosely based on the
+mechanism used by rsync, though greatly simplified, and it allows for
+users to specify include/exclude rules on a per-directory basis (if
+enabled when running a backup).
+
+Cumulus will back up, recursively, each of the files or directories
+specified on the command-line, subject to include/exclude rules which
+may cause some files to be skipped. Each file is matched against the
+list of rules in sequence. The first rule which matches determines
+whether the file should be included or excluded--thus, more specific
+rules should be listed before more general rules.
+
+There are four types of rules, modeled after those in rsync:
+
+exclude (``-``)
+ matches files which should not be backed up
+
+include (``+``)
+ matches files to be backed up (even if a later exclude rule would
+ match)
+
+dir-merge (``:``)
+ specify a file which, if found in a directory during the backup
+ process, will be read to insert additional rules for that directory
+ and its subdirectories
+
+merge (``.``)
+ immediately read a file containing additional rules and insert those
+ in the current ruleset **(not yet implemented)**
+
+Patterns found in the rules are interpreted as follows:
+
+- Most characters are treated literally and must match exactly.
+- A ``*`` matches zero or more characters, but not ``/``.
+- A ``**`` matches zero or more characters, including ``/``.
+- A ``?`` matches any single character, except for ``/``.
+- A pattern starting with a ``/`` is matched against the complete path
+ to the file. A pattern without a leading ``/`` can match any suffix
+ of the directory components.
+- A pattern ending in a ``/`` matches directories only.
+
+Note that dotfiles are not considered specially by these rules: a
+pattern of "*" matches all files in a directory, including those
+starting with a '.'. This is different from the handling of the shell.
+
+Merged Patterns
+---------------
+
+If a file matching a dir-merge rule is encountered, that file is parsed
+to yield additional filter rules; those filter rules are inserted
+immediately after the dir-merge rule, then removed when leaving the
+directory containing the dir-merge file.
+
+Blank lines and lines beginning with ``#`` in the file are ignored.
+Otherwise, lines should consist of a single character indicating the
+rule type (``+``, ``-``, or ``:`` for include, exclude, or dir-merge), a
+space, and a file pattern.
+
+Any patterns added by a dir-merge rule are matched relative to the
+directory with the patterns file: so, for example, a pattern
+"``+ /file.txt``" would match ``file.txt`` in the same directory as the
+merge file, not at the root.
+
+Example
+-------
+
+Suppose that cumulus is launched with a single filter argument:
+``--dir-merge=/.cumulus-root-filter``.
+
+``/.cumulus-root-filter``::
+
+ # Ignore pseudo-filesystems and temporary directories
+ - /proc/
+ - /sys/
+ # Exclude anywhere directories named tmp (except /var/tmp).
+ # Files named tmp will still be included.
+ + /var/tmp/
+ - tmp/
+ # Merge any user-provided rules in a file named .cumulus-filter here
+ : .cumulus-filter
+ # Ignore backup files anywhere in the file system
+ - *~
+ - *.bak
+ # Ignore per-user cache directories
+ - /home/*/.cache/
+
+``/home/user/.cumulus-filter``::
+
+ # Ignore the /home/user/scratch directory
+ - /scratch/
+ # Ignore vim swap files (in /home/user only)
+ - .*.swp
+ # Do keep backup files (overrides the global exclude rule)
+ + *~
+    # This rule is ineffective: the top-level tmp/ rule has a higher precedence
+ + tmp/
+
+``/home/user/workspace/.cumulus-filter``::
+
+ # Exclude backup files again in /home/user/workspace: this has a
+ # higher precedence than the rules in /home/user/.cumulus-filter
+ - *~
+
--- /dev/null
+/* Cumulus: Smart Filesystem Backup to Dumb Servers
+ *
+ * Copyright (C) 2012 Google Inc.
+ * Written by Michael Vrable <vrable@cs.hmc.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* Implementation of Cumulus include/exclude rules for selecting files to be
+ * backed up. */
+
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "exclude.h"
+
+using std::make_pair;
+using std::pair;
+using std::string;
+
+FilePattern::FilePattern(const string& pattern, const string& basedir)
+ : refcount(1), orig_pattern(pattern), valid(false)
+{
+ string pattern_as_regex = pattern_to_regex(pattern, basedir);
+    int status = regcomp(&regex, pattern_as_regex.c_str(),
+ REG_EXTENDED|REG_NOSUB);
+ if (status != 0) {
+ char errbuf[256];
+        regerror(status, &regex, errbuf, sizeof(errbuf));
+ fprintf(stderr,
+ "Pattern %s: failed to compile as regular expression %s: %s\n",
+ pattern.c_str(), pattern_as_regex.c_str(), errbuf);
+        regfree(&regex);
+ } else {
+ valid = true;
+ }
+}
+
+FilePattern::~FilePattern()
+{
+ if (valid)
+        regfree(&regex);
+}
+
+bool FilePattern::matches(const std::string& path) const
+{
+ if (!valid)
+ return false;
+ else
+        return regexec(&regex, path.c_str(), 0, NULL, 0) == 0;
+}
+
+string FilePattern::pattern_to_regex(const string& pattern,
+ const string& basedir)
+{
+ /* Matches are always anchored to cover the entire string; we insert
+ * wildcards where needed if we only need to match a suffix of the path. */
+ string result = "^";
+ size_t i = 0;
+ size_t len = pattern.size();
+ if (len == 0) {
+ /* Special case: an empty pattern matches all files. */
+ return result;
+ }
+
+ /* For a non-empty basedir, the match must ensure that the file actually
+ * falls within basedir. */
+ if (!basedir.empty() && basedir != ".") {
+ result += regex_quote(basedir) + "/";
+ }
+
+ /* A leading slash indicates a pattern that must match the entire path. If
+ * there is no leading slash, match any number of leading directory
+ * components. */
+ if (pattern[0] == '/') {
+ i++;
+ } else {
+ result += "(|.*/)";
+ }
+
+ while (i < len) {
+ switch (pattern[i]) {
+ /* Characters that must be quoted in a regular expression that are not
+ * otherwise part of the Cumulus pattern language. */
+ case '^':
+ case '.':
+ case '[':
+ case ']':
+ case '$':
+ case '(':
+ case ')':
+ case '|':
+ case '+':
+ case '{':
+ case '}':
+ case '\\':
+ result += '\\';
+ result += pattern[i];
+ break;
+
+ case '?':
+ /* Any character except a directory separator. */
+ result += "[^/]";
+ break;
+
+ case '*':
+ if (i + 1 < len && pattern[i + 1] == '*') {
+ /* Any number of characters, including slashes. */
+ i++;
+ result += ".*";
+ } else {
+ /* Zero or more characters (but no directory separators). */
+ result += "[^/]*";
+ }
+ break;
+
+ default:
+ /* A character matched literally that does not require quoting. */
+ result += pattern[i];
+ break;
+ }
+ i++;
+ }
+
+ /* A trailing slash should match only a directory. No trailing slash means
+ * match any type of file. Callers should include a slash at the end of a
+ * path that is a directory; if there is no trailing slash in the pattern
+ * match either a trailing slash or none. */
+ if (pattern[len - 1] != '/') {
+ result += "/?";
+ }
+
+ result += "$";
+
+ return result;
+}
+
+string FilePattern::regex_quote(const string& pattern)
+{
+ string result = "";
+ for (size_t i = 0; i < pattern.length(); i++) {
+ switch (pattern[i]) {
+ /* Characters that must be quoted in a regular expression. */
+ case '^':
+ case '.':
+ case '[':
+ case ']':
+ case '$':
+ case '(':
+ case ')':
+ case '|':
+ case '*':
+ case '+':
+ case '?':
+ case '{':
+ case '}':
+ case '\\':
+ result += '\\';
+ // fall through
+
+ default:
+ result += pattern[i];
+ }
+ }
+
+ return result;
+}
+
+PathFilterList::PathFilterList()
+{
+ /* Invariant: pattern_stack is always non-empty (except when the destructor
+ * runs). Thus, reading pattern_stack.front() is always safe. */
+ pattern_stack.push_back(make_pair(1, new PatternList));
+}
+
+PathFilterList::~PathFilterList()
+{
+ /* Pops all items off the saved rule stack. As an optimization, rather
+ * than repeatedly popping items which have a repeat count, just set the
+ * repeat count to one. */
+ while (!pattern_stack.empty()) {
+ pattern_stack.front().first = 1;
+ restore();
+ }
+}
+
+/* save() operates lazily: simply increment the repeat count on the rule set at
+ * the head of the list. If modifications are made, mutable_patterns() will
+ * create a copy of the rules. */
+void PathFilterList::save()
+{
+ pattern_stack.front().first++;
+}
+
+void PathFilterList::restore()
+{
+ if (--pattern_stack.front().first == 0) {
+ PatternList *old_patterns = pattern_stack.front().second;
+ pattern_stack.pop_front();
+ for (PatternList::iterator i = old_patterns->begin();
+ i != old_patterns->end(); ++i) {
+ i->second->unref();
+ }
+ delete old_patterns;
+ }
+}
+
+void PathFilterList::add_pattern(PatternType type, const string& pattern,
+ const string& basedir)
+{
+ FilePattern *pat = new FilePattern(pattern, basedir);
+ mutable_patterns()->push_back(make_pair(type, pat));
+}
+
+bool PathFilterList::is_included(const std::string& path,
+ bool is_directory) const
+{
+ string full_path;
+ if (is_directory) {
+ full_path = path + "/";
+ } else {
+ full_path = path;
+ }
+
+ PatternList::const_iterator i;
+ for (i = patterns().begin(); i != patterns().end(); ++i) {
+ if (i->second->matches(full_path)) {
+ switch (i->first) {
+ case INCLUDE:
+ return true;
+ case EXCLUDE:
+ return false;
+ case DIRMERGE:
+ /* Merge rules are ignored for the purposes of selecting
+ * whether a file is included or not. */
+ continue;
+ }
+ }
+ }
+
+ /* Default is include if no rule matches. */
+ return true;
+}
+
+bool PathFilterList::is_mergefile(const std::string& path) const
+{
+ PatternList::const_iterator i;
+ for (i = patterns().begin(); i != patterns().end(); ++i) {
+ if (i->first == DIRMERGE && i->second->matches(path))
+ return true;
+ }
+ return false;
+}
+
+/* Parses the specified contents of a per-directory rule merge file. The rules
+ * are first parsed into a temporary PatternList, which is then spliced into
+ * the rule set just before the DIRMERGE rule. Thus, if a dir-merge rule
+ * matches multiple times (in successive sub-directories), deeper rules take
+ * precedence over earlier rules. */
+void PathFilterList::merge_patterns(const string& path,
+ const string& basedir,
+ const string& contents)
+{
+ PatternList *rules = mutable_patterns();
+ PatternList::iterator i;
+ for (PatternList::iterator i = rules->begin(); i != rules->end(); ++i) {
+ /* Try to locate where the rules should be inserted by looking for the
+ * DIRMERGE rule which matches the path to the rule file. */
+ if (i->first == DIRMERGE && i->second->matches(path)) {
+ PatternList *new_rules = parse_rules(basedir, contents);
+ rules->splice(i, *new_rules);
+ delete new_rules;
+ break;
+ }
+ }
+}
+
+PathFilterList::PatternList *PathFilterList::parse_rules(const string& basedir,
+ const string& data)
+{
+ PatternList *patterns = new PatternList;
+ std::stringstream rules(data, std::stringstream::in);
+ while (!rules.eof()) {
+ string rule;
+ std::getline(rules, rule);
+ /* Ignore blank lines and lines starting with "#". */
+ if (rule.empty() || rule[0] == '#')
+ continue;
+ if (rule.length() > 2 && rule[1] == ' ') {
+ if (rule[0] == '+' || rule[0] == '-' || rule[0] == ':') {
+ FilePattern *pat = new FilePattern(rule.substr(2), basedir);
+ switch (rule[0]) {
+ case '+':
+ patterns->push_back(make_pair(INCLUDE, pat));
+ break;
+ case '-':
+ patterns->push_back(make_pair(EXCLUDE, pat));
+ break;
+ case ':':
+ patterns->push_back(make_pair(DIRMERGE, pat));
+ break;
+ default:
+ break;
+ }
+ continue;
+ }
+ fprintf(stderr, "Invalid rule: %s\n", rule.c_str());
+ }
+ }
+ return patterns;
+}
+
+PathFilterList::PatternList *PathFilterList::mutable_patterns()
+{
+ PatternList *old_list = pattern_stack.front().second;
+ if (pattern_stack.front().first == 1)
+ return old_list;
+
+ PatternList *new_list = new PatternList;
+ for (PatternList::iterator i = old_list->begin();
+ i != old_list->end(); ++i) {
+ i->second->ref();
+ new_list->push_back(*i);
+ }
+ pattern_stack.front().first--;
+ pattern_stack.push_front(make_pair(1, new_list));
+ return new_list;
+}
+
+
+/*****************************************************************************
+ * Unit tests for pattern matching. These are not compiled in by default, but
+ * exclude.cc can be compiled to a standalone binary with -DRUN_TESTS to run
+ * the unit tests.
+ *****************************************************************************/
+
+#ifdef RUN_TESTS
+/* Tests of pattern matching rules. test_pattern takes a pattern, a base
+ * directory, and a path to match, and prints out whether the rule matches.
+ * expect_match is the expected result; if this doesn't equal the actual result
+ * print a warning message. */
+static void test_pattern(const string& pattern, const string& basedir,
+ const string& path, bool expect_match)
+{
+ FilePattern pat(pattern, basedir);
+ bool result = pat.matches(path);
+ printf("%3s %c %c %-30s %-30s\n",
+ result == expect_match ? "" : "ERR",
+ result ? '+' : '-',
+ expect_match ? '+' : '-',
+ pattern.c_str(),
+ path.c_str());
+}
+
+int main(int argc, char *argv[])
+{
+ printf("Act/Exp Pattern Path\n");
+ test_pattern("*.o", "", "a/b/c.txt", false);
+ test_pattern("*.o", "", "a/b/c.o", true);
+ test_pattern("*.git/", "", "repo/project.git/", true);
+ test_pattern("/.cache", "", ".cache", true);
+ test_pattern("/.cache", "", "home/user/.cache", false);
+ test_pattern("/*/.cache", "", "home/user/.cache", false);
+ test_pattern("/*/*/.cache", "", "home/user/.cache", true);
+ test_pattern("/**/.cache", "", "home/user/.cache", true);
+ test_pattern(".cache", "", "home/user/.cache", true);
+ test_pattern("?.o", "", "home/user/a.o", true);
+ test_pattern("?.o", "", "home/user/a/o", false);
+ test_pattern("*.o", "", "a/b/\n.o", true);
+ test_pattern("/**/.cache", "", "home/new\nline/.cache", true);
+ test_pattern("/*/.cache", "home", "home/user/.cache", true);
+ test_pattern(".cache", "home", "home/user/.cache", true);
+ test_pattern("user/.cache", "home", "home/user/.cache", true);
+ test_pattern("user/.cache", "home/user", "home/user/.cache", false);
+
+ PathFilterList pfl;
+ pfl.add_pattern(PathFilterList::DIRMERGE, ".cumulus-filter", "");
+ pfl.save();
+ pfl.merge_patterns("dir/.cumulus-filter", "dir",
+ "# comment\n"
+ "\n"
+ "- *.o\n"
+ "+ /.git/\n"
+ "* invalid\n");
+ pfl.restore();
+ return 0;
+}
+#endif
--- /dev/null
+/* Cumulus: Smart Filesystem Backup to Dumb Servers
+ *
+ * Copyright (C) 2012 Google Inc.
+ * Written by Michael Vrable <vrable@cs.hmc.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* Include/exclude processing for selecting files to be backed up: mechanisms
+ * for matching filenames against patterns and constructing lists of
+ * include/exclude rules. */
+
+#ifndef _CUMULUS_EXCLUDE_H
+#define _CUMULUS_EXCLUDE_H
+
+#include <sys/types.h>
+#include <regex.h>
+#include <list>
+#include <map>
+#include <string>
+
+/* Base class for objects which should not have implicit copy constructors and
+ * assignment operators. */
+class noncopyable {
+protected:
+ noncopyable() { }
+private:
+ noncopyable(const noncopyable&);
+ const noncopyable& operator=(const noncopyable&);
+};
+
+/* A pattern which can be matched against file paths while scanning the file
+ * system for backups. The pattern language is described in doc/exclude.rst.
+ * */
+class FilePattern : public noncopyable {
+public:
+    /* Constructs a FilePattern with the specified pattern.  If patterns are
+ * loaded from a per-directory merge file, basedir should be the path to
+ * the directory where the patterns were loaded (and the pattern will only
+ * match files in or below that directory). basedir should be empty for a
+ * pattern matching starting at the root. */
+ FilePattern(const std::string& pattern, const std::string& basedir);
+
+ ~FilePattern();
+
+ /* Reference counting for freeing FilePattern objects. Newly created
+ * objects have a reference count of 1. If the reference count drops to
+ * zero via unref(), the object is deleted. */
+ void ref() { refcount++; }
+ void unref() { if (--refcount == 0) delete this; }
+
+ /* Returns the original pattern used to construct the FilePattern object;
+ * this is intended primarily for logging/debugging. */
+ const std::string& pattern() { return orig_pattern; }
+
+ /* Does this pattern match the specified file? Paths should be specified
+ * without any leading slash. A trailing slash should be included in the
+ * path when the object is a directory, to indicate this (so that
+ * directory-only rules can be processed properly). */
+ bool matches(const std::string& path) const;
+
+private:
+ /* Compiles a Cumulus pattern to a regular expression. This is used for
+ * the underlying matching implementation. */
+ static std::string pattern_to_regex(const std::string& pattern,
+ const std::string& basedir);
+
+ /* Quotes any special characters in the input to produce a regular
+ * expression matching the literal string pattern. */
+ static std::string regex_quote(const std::string& pattern);
+
+ int refcount; // Reference count for object lifetime management.
+ std::string orig_pattern; // Original pattern, returned by pattern()
+
+ bool valid; // True if regex is valid and initialized
+ regex_t regex; // The pattern, converted to a compiled regular expression
+};
+
+/* A PathFilterList represents a collection of rules for selecting files to be
+ * included or excluded from a backup. Patterns can be appended to the list,
+ * and PathFilterList also supports updating the list via per-directory rule
+ * files. */
+class PathFilterList : public noncopyable {
+public:
+ PathFilterList();
+ ~PathFilterList();
+
+ /* Possible pattern types, as described in doc/exclude.rst. */
+ enum PatternType { INCLUDE, EXCLUDE, DIRMERGE };
+
+ /* During the backup, a call to save() will store a snapshot of the current
+ * rule set. After any modifications to the filter list, a call to
+ * restore() will change the rules back to those from the time of the
+ * snapshot. Calls to save() and restore() can be nested; the saved
+ * snapshots act as a stack. */
+ void save();
+ void restore();
+
+ /* Append a new pattern to the end of the list of rules. */
+ void add_pattern(PatternType type, const std::string& pattern,
+ const std::string& basedir);
+
+ /* Should a specified file be included in the backup, according to the
+ * current rules? The first matching rule applies; if no rule matches the
+ * default is to include the file. is_directory is a boolean indicating
+ * whether the path specifies a directory (so that directory-only rules can
+     * be matched properly). */
+ bool is_included(const std::string& path, bool is_directory) const;
+
+ /* Does the given file match a dir-merge rule in the current rule set? */
+ bool is_mergefile(const std::string& path) const;
+
+ /* Updates the current rule set from the contents of a per-directory merge
+ * file. If is_mergefile returns true, then call merge_patterns specifying
+ * the path to the merge file once again, the base directory containing the
+ * merge file (which is the starting point for matching the new rules), and
+ * the contents of the merge file as an in-memory string. */
+ void merge_patterns(const std::string& path, const std::string& basedir,
+ const std::string& contents);
+
+private:
+ /* A particular set of rules is stored simply as an ordered list of (rule
+ * type, pattern) tuples. Lifetime of the FilePattern objects is managed
+ * with reference counts. */
+ typedef std::list<std::pair<PatternType, FilePattern *> > PatternList;
+
+ /* A stack of patterns, for handling save()/restore() calls. The current
+ * set of rules appears at the head of the list. As an optimization to
+ * better support save()/restore() calls without any modification to the
+ * rules, the stack uses run-length encoding: each item on the stack
+ * consists of a set of rules and a count of how many times those rules
+ * have been pushed. */
+ std::list<std::pair<int, PatternList *> > pattern_stack;
+
+ /* Parses rules (such as those in a per-directory merge file) and returns a
+ * PatternList. basedir should be the directory where the files were
+ * parsed from (all rules will be matched relative to this directory), and
+ * the contents of the rules file should be read in and passed as rules. */
+ static PatternList *parse_rules(const std::string& basedir,
+ const std::string& rules);
+
+ /* Returns the current set of rules (from the head of pattern_stack). */
+ const PatternList &patterns() const {
+ return *pattern_stack.front().second;
+ }
+
+ /* Returns a copy of the current rules, suitable for modification. If the
+ * current head of pattern_stack has a repetition greater than 1, an
+ * unshared copy of the current rule set is created. */
+ PatternList *mutable_patterns();
+};
+
+#endif // _CUMULUS_EXCLUDE_H
/* Cumulus: Smart Filesystem Backup to Dumb Servers
*
- * Copyright (C) 2006-2008 The Regents of the University of California
+ * Copyright (C) 2006-2009 The Regents of the University of California
+ * Copyright (C) 2012 Google Inc.
* Written by Michael Vrable <mvrable@cs.ucsd.edu>
*
* This program is free software; you can redistribute it and/or modify
#include <string>
#include <vector>
+#include "exclude.h"
#include "localdb.h"
#include "metadata.h"
#include "remote.h"
double snapshot_intent = 1.0;
/* Selection of files to include/exclude in the snapshot. */
-std::list<string> includes; // Paths in which files should be saved
-std::list<string> excludes; // Paths which will not be saved
-std::list<string> excluded_names; // Directories which will not be saved
-std::list<string> searches; // Directories we don't want to save, but
- // do want to descend searching for data
- // in included paths
-
-bool relative_paths = true;
+PathFilterList filter_rules;
bool flag_rebuild_statcache = false;
segment_list.insert(segment);
}
+/* Attempts to open a regular file read-only, but with safety checks for files
+ * that might not be fully trusted. */
+int safe_open(const string& path, struct stat *stat_buf)
+{
+ int fd;
+
+    /* Be paranoid when opening the file.  We have no guarantee that the
+     * file was not replaced between the caller's earlier stat() call and
+     * the open() call below, so we might not even be opening a regular
+     * file.  We supply flags to open() to guard against various conditions
+     * before we can perform an fstat() to check that the file is still a
+     * regular file:
+ * - O_NOFOLLOW: in the event the file was replaced by a symlink
+ * - O_NONBLOCK: prevents open() from blocking if the file was
+ * replaced by a fifo
+ * We also add in O_NOATIME, since this may reduce disk writes (for
+ * inode updates). However, O_NOATIME may result in EPERM, so if the
+ * initial open fails, try again without O_NOATIME. */
+ fd = open(path.c_str(), O_RDONLY|O_NOATIME|O_NOFOLLOW|O_NONBLOCK);
+ if (fd < 0) {
+ fd = open(path.c_str(), O_RDONLY|O_NOFOLLOW|O_NONBLOCK);
+ }
+ if (fd < 0) {
+ fprintf(stderr, "Unable to open file %s: %m\n", path.c_str());
+ return -1;
+ }
+
+ /* Drop the use of the O_NONBLOCK flag; we only wanted that for file
+ * open. */
+ long flags = fcntl(fd, F_GETFL);
+ fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
+
+ /* Re-check file attributes, storing them into stat_buf if that is
+ * non-NULL. */
+ struct stat internal_stat_buf;
+ if (stat_buf == NULL)
+ stat_buf = &internal_stat_buf;
+
+ /* Perform the stat call again, and check that we still have a regular
+ * file. */
+ if (fstat(fd, stat_buf) < 0) {
+ fprintf(stderr, "fstat: %m\n");
+ close(fd);
+ return -1;
+ }
+
+ if ((stat_buf->st_mode & S_IFMT) != S_IFREG) {
+ fprintf(stderr, "file is no longer a regular file!\n");
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
/* Read data from a file descriptor and return the amount of data read. A
* short read (less than the requested size) will only occur if end-of-file is
* hit. */
metawriter->add(file_info);
}
-void scanfile(const string& path, bool include)
+/* Converts a path to the normalized form used in the metadata log. Paths are
+ * written as relative (without any leading slashes). The root directory is
+ * referred to as ".". */
+string metafile_path(const string& path)
{
- int fd = -1;
- long flags;
- struct stat stat_buf;
- list<string> refs;
-
- string true_path;
- if (relative_paths)
- true_path = path;
- else
- true_path = "/" + path;
-
- // Set to true if we should scan through the contents of this directory,
- // but not actually back files up
- bool scan_only = false;
-
- // Check this file against the include/exclude list to see if it should be
- // considered
- for (list<string>::iterator i = includes.begin();
- i != includes.end(); ++i) {
- if (path == *i) {
- include = true;
- }
- }
-
- for (list<string>::iterator i = excludes.begin();
- i != excludes.end(); ++i) {
- if (path == *i) {
- include = false;
- }
- }
+ const char *newpath = path.c_str();
+ if (*newpath == '/')
+ newpath++;
+ if (*newpath == '\0')
+ newpath = ".";
+ return newpath;
+}
- if (excluded_names.size() > 0) {
- std::string name = path;
- std::string::size_type last_slash = name.rfind('/');
- if (last_slash != std::string::npos) {
- name.replace(0, last_slash + 1, "");
- }
+void try_merge_filter(const string& path, const string& basedir)
+{
+ struct stat stat_buf;
+ if (lstat(path.c_str(), &stat_buf) < 0)
+ return;
+ if ((stat_buf.st_mode & S_IFMT) != S_IFREG)
+ return;
+ int fd = safe_open(path, NULL);
+ if (fd < 0)
+ return;
- for (list<string>::iterator i = excluded_names.begin();
- i != excluded_names.end(); ++i) {
- if (name == *i) {
- include = false;
- }
- }
+    /* As a very crude limit on the complexity of merge rules, only read up to
+     * one block (1 MB) worth of data.  If the file seems like it might be
+     * larger than that, don't parse the rules in it. */
+ ssize_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
+ if (bytes < 0 || bytes >= static_cast<ssize_t>(LBS_BLOCK_SIZE - 1)) {
+ /* TODO: Add more strict resource limits on merge files? */
+ fprintf(stderr,
                "Unable to read filter merge file (possibly size too large)\n");
+ return;
}
+ filter_rules.merge_patterns(metafile_path(path), basedir,
+ string(block_buf, bytes));
+}
- for (list<string>::iterator i = searches.begin();
- i != searches.end(); ++i) {
- if (path == *i) {
- scan_only = true;
- }
- }
+void scanfile(const string& path)
+{
+ int fd = -1;
+ struct stat stat_buf;
+ list<string> refs;
- if (!include && !scan_only)
- return;
+ string output_path = metafile_path(path);
- if (lstat(true_path.c_str(), &stat_buf) < 0) {
+ if (lstat(path.c_str(), &stat_buf) < 0) {
fprintf(stderr, "lstat(%s): %m\n", path.c_str());
return;
}
- if ((stat_buf.st_mode & S_IFMT) == S_IFREG) {
- /* Be paranoid when opening the file. We have no guarantee that the
- * file was not replaced between the stat() call above and the open()
- * call below, so we might not even be opening a regular file. We
- * supply flags to open to to guard against various conditions before
- * we can perform an lstat to check that the file is still a regular
- * file:
- * - O_NOFOLLOW: in the event the file was replaced by a symlink
- * - O_NONBLOCK: prevents open() from blocking if the file was
- * replaced by a fifo
- * We also add in O_NOATIME, since this may reduce disk writes (for
- * inode updates). However, O_NOATIME may result in EPERM, so if the
- * initial open fails, try again without O_NOATIME. */
- fd = open(true_path.c_str(), O_RDONLY|O_NOATIME|O_NOFOLLOW|O_NONBLOCK);
- if (fd < 0) {
- fd = open(true_path.c_str(), O_RDONLY|O_NOFOLLOW|O_NONBLOCK);
- }
- if (fd < 0) {
- fprintf(stderr, "Unable to open file %s: %m\n", path.c_str());
- return;
- }
-
- /* Drop the use of the O_NONBLOCK flag; we only wanted that for file
- * open. */
- flags = fcntl(fd, F_GETFL);
- fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
-
- /* Perform the stat call again, and check that we still have a regular
- * file. */
- if (fstat(fd, &stat_buf) < 0) {
- fprintf(stderr, "fstat: %m\n");
- close(fd);
- return;
- }
+ bool is_directory = ((stat_buf.st_mode & S_IFMT) == S_IFDIR);
+ if (!filter_rules.is_included(output_path, is_directory))
+ return;
- if ((stat_buf.st_mode & S_IFMT) != S_IFREG) {
- fprintf(stderr, "file is no longer a regular file!\n");
- close(fd);
+ if ((stat_buf.st_mode & S_IFMT) == S_IFREG) {
+ fd = safe_open(path, &stat_buf);
+ if (fd < 0)
return;
- }
}
- dump_inode(path, true_path, stat_buf, fd);
+ dump_inode(output_path, path, stat_buf, fd);
if (fd >= 0)
close(fd);
- // If we hit a directory, now that we've written the directory itself,
- // recursively scan the directory.
- if ((stat_buf.st_mode & S_IFMT) == S_IFDIR) {
- DIR *dir = opendir(true_path.c_str());
+ /* If we hit a directory, now that we've written the directory itself,
+ * recursively scan the directory. */
+ if (is_directory) {
+ DIR *dir = opendir(path.c_str());
if (dir == NULL) {
fprintf(stderr, "Error: %m\n");
sort(contents.begin(), contents.end());
+ filter_rules.save();
+
+ /* First pass through the directory items: look for any filter rules to
+ * merge and do so. */
for (vector<string>::iterator i = contents.begin();
i != contents.end(); ++i) {
- const string& filename = *i;
+ string filename;
if (path == ".")
- scanfile(filename, include);
+ filename = *i;
+ else if (path == "/")
+ filename = "/" + *i;
else
- scanfile(path + "/" + filename, include);
+ filename = path + "/" + *i;
+ if (filter_rules.is_mergefile(metafile_path(filename))) {
+ if (verbose) {
+ printf("Merging directory filter rules %s\n",
+ filename.c_str());
+ }
+ try_merge_filter(filename, output_path);
+ }
}
- }
-}
-/* Include the specified file path in the backups. Append the path to the
- * includes list, and to ensure that we actually see the path when scanning the
- * directory tree, add all the parent directories to the search list, which
- * means we will scan through the directory listing even if the files
- * themselves are excluded from being backed up. */
-void add_include(const char *path)
-{
- /* Was an absolute path specified? If so, we'll need to start scanning
- * from the root directory. Make sure that the user was consistent in
- * providing either all relative paths or all absolute paths. */
- if (path[0] == '/') {
- if (includes.size() > 0 && relative_paths == true) {
- fprintf(stderr,
- "Error: Cannot mix relative and absolute paths!\n");
- exit(1);
+ /* Second pass: recursively scan all items in the directory for backup;
+ * scanfile() will check if the item should be included or not. */
+ for (vector<string>::iterator i = contents.begin();
+ i != contents.end(); ++i) {
+ const string& filename = *i;
+ if (path == ".")
+ scanfile(filename);
+ else if (path == "/")
+ scanfile("/" + filename);
+ else
+ scanfile(path + "/" + filename);
}
- relative_paths = false;
-
- // Skip over leading '/'
- path++;
- } else if (relative_paths == false && path[0] != '/') {
- fprintf(stderr, "Error: Cannot mix relative and absolute paths!\n");
- exit(1);
- }
-
- includes.push_back(path);
-
- /* Split the specified path into directory components, and ensure that we
- * descend into all the directories along the path. */
- const char *slash = path;
-
- if (path[0] == '\0')
- return;
-
- while ((slash = strchr(slash + 1, '/')) != NULL) {
- string component(path, slash - path);
- searches.push_back(component);
+ filter_rules.restore();
}
}
while (1) {
static struct option long_options[] = {
{"localdb", 1, 0, 0}, // 0
- {"exclude", 1, 0, 0}, // 1
- {"filter", 1, 0, 0}, // 2
- {"filter-extension", 1, 0, 0}, // 3
- {"dest", 1, 0, 0}, // 4
- {"scheme", 1, 0, 0}, // 5
- {"signature-filter", 1, 0, 0}, // 6
- {"intent", 1, 0, 0}, // 7
- {"full-metadata", 0, 0, 0}, // 8
- {"tmpdir", 1, 0, 0}, // 9
- {"upload-script", 1, 0, 0}, // 10
- {"rebuild-statcache", 0, 0, 0}, // 11
- {"exclude-name", 1, 0, 0}, // 12
+ {"filter", 1, 0, 0}, // 1
+ {"filter-extension", 1, 0, 0}, // 2
+ {"dest", 1, 0, 0}, // 3
+ {"scheme", 1, 0, 0}, // 4
+ {"signature-filter", 1, 0, 0}, // 5
+ {"intent", 1, 0, 0}, // 6
+ {"full-metadata", 0, 0, 0}, // 7
+ {"tmpdir", 1, 0, 0}, // 8
+ {"upload-script", 1, 0, 0}, // 9
+ {"rebuild-statcache", 0, 0, 0}, // 10
+ {"include", 1, 0, 0}, // 11
+ {"exclude", 1, 0, 0}, // 12
+ {"dir-merge", 1, 0, 0}, // 13
// Aliases for short options
{"verbose", 0, 0, 'v'},
{NULL, 0, 0, 0},
case 0: // --localdb
localdb_dir = optarg;
break;
- case 1: // --exclude
- if (optarg[0] != '/')
- excludes.push_back(optarg);
- else
- excludes.push_back(optarg + 1);
- break;
- case 2: // --filter
+ case 1: // --filter
filter_program = optarg;
break;
- case 3: // --filter-extension
+ case 2: // --filter-extension
filter_extension = optarg;
break;
- case 4: // --dest
+ case 3: // --dest
backup_dest = optarg;
break;
- case 5: // --scheme
+ case 4: // --scheme
backup_scheme = optarg;
break;
- case 6: // --signature-filter
+ case 5: // --signature-filter
signature_filter = optarg;
break;
- case 7: // --intent
+ case 6: // --intent
snapshot_intent = atof(optarg);
if (snapshot_intent <= 0)
snapshot_intent = 1;
break;
- case 8: // --full-metadata
+ case 7: // --full-metadata
flag_full_metadata = true;
break;
- case 9: // --tmpdir
+ case 8: // --tmpdir
tmp_dir = optarg;
break;
- case 10: // --upload-script
+ case 9: // --upload-script
backup_script = optarg;
break;
- case 11: // --rebuild-statcache
+ case 10: // --rebuild-statcache
flag_rebuild_statcache = true;
break;
- case 12: // --exclude-name
- excluded_names.push_back(optarg);
+ case 11: // --include
+ filter_rules.add_pattern(PathFilterList::INCLUDE, optarg, "");
+ break;
+ case 12: // --exclude
+ filter_rules.add_pattern(PathFilterList::EXCLUDE, optarg, "");
+ break;
+ case 13: // --dir-merge
+ filter_rules.add_pattern(PathFilterList::DIRMERGE, optarg, "");
break;
default:
fprintf(stderr, "Unhandled long option!\n");
return 1;
}
- searches.push_back(".");
- for (int i = optind; i < argc; i++)
- add_include(argv[i]);
-
if (backup_dest == "" && backup_script == "") {
fprintf(stderr,
"Error: Backup destination must be specified using --dest= or --upload-script=\n");
metawriter = new MetadataWriter(tss, localdb_dir.c_str(), desc_buf,
backup_scheme.c_str());
- scanfile(".", false);
+ for (int i = optind; i < argc; i++) {
+ scanfile(argv[i]);
+ }
ObjectReference root_ref = metawriter->close();
add_segment(root_ref.get_segment());