From fc223ec17dc5e6691291a7b31f6daa8679b484cd Mon Sep 17 00:00:00 2001 From: Michael Vrable Date: Tue, 11 Sep 2012 20:30:32 -0700 Subject: [PATCH] Reimplement the file include/exclude filtering mechanism. Implement something more similar to that rsync does. Accept some number of starting points for the backup, and an ordered set of include/exclude patterns. Also permit rules to be merged in during the backup process, by per-subtree rule files. --- Makefile | 4 +- doc/exclude.rst | 108 +++++++++++++ exclude.cc | 405 ++++++++++++++++++++++++++++++++++++++++++++++++ exclude.h | 165 ++++++++++++++++++++ main.cc | 352 ++++++++++++++++++++--------------------- 5 files changed, 852 insertions(+), 182 deletions(-) create mode 100644 doc/exclude.rst create mode 100644 exclude.cc create mode 100644 exclude.h diff --git a/Makefile b/Makefile index 8ae5754..162d9f9 100644 --- a/Makefile +++ b/Makefile @@ -5,8 +5,8 @@ CXXFLAGS=-O -Wall -Wextra -D_FILE_OFFSET_BITS=64 $(DEBUG) \ -DCUMULUS_VERSION=$(shell cat version) LDFLAGS=$(DEBUG) $(shell pkg-config --libs $(PACKAGES)) -SRCS=chunk.cc localdb.cc main.cc metadata.cc ref.cc remote.cc sha1.cc \ - store.cc subfile.cc util.cc +SRCS=chunk.cc exclude.cc localdb.cc main.cc metadata.cc ref.cc remote.cc \ + sha1.cc store.cc subfile.cc util.cc OBJS=$(SRCS:.cc=.o) cumulus : $(OBJS) diff --git a/doc/exclude.rst b/doc/exclude.rst new file mode 100644 index 0000000..893bc60 --- /dev/null +++ b/doc/exclude.rst @@ -0,0 +1,108 @@ +Cumulus File Selection (Includes/Excludes) +========================================== + +A backup tool should support a flexible mechanism for selecting which +files should be included in or excluded from the backup. This file +describes the mechanism used by Cumulus. It is loosely based on the +mechanism used by rsync, though greatly simplified, and it allows for +users to specify include/exclude rules on a per-directory basis (if +enabled when running a backup). + +Cumulus will back up, recursively, each of the files or directories +specified on the command-line, subject to include/exclude rules which +may cause some files to be skipped. Each file is matched against the +list of rules in sequence. The first rule which matches determines +whether the file should be included or excluded--thus, more specific +rules should be listed before more general rules. + +There are four types of rules, modeled after those in rsync: + +exclude (``-``) + matches files which should not be backed up + +include (``+``) + matches files to be backed up (even if a later exclude rule would + match) + +dir-merge (``:``) + specify a file which, if found in a directory during the backup + process, will be read to insert additional rules for that directory + and its subdirectories + +merge (``.``) + immediately read a file containing additional rules and insert those + in the current ruleset **(not yet implemented)** + +Patterns found in the rules are interpreted as follows: + +- Most characters are treated literally and must match exactly. +- A ``*`` matches zero or more characters, but not ``/``. +- A ``**`` matches zero or more characters, including ``/``. +- A ``?`` matches any single character, except for ``/``. +- A pattern starting with a ``/`` is matched against the complete path + to the file. A pattern without a leading ``/`` can match any suffix + of the directory components. +- A pattern ending in a ``/`` matches directories only. 
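+
+For example (these cases mirror the unit tests in ``exclude.cc``; paths
+are shown without a leading slash, the form used for matching, and a
+trailing slash on a path marks a directory)::
+
+    Pattern        Path                  Match?
+    *.o            src/main.o            yes
+    *.o            src/main.c            no
+    /*/.cache      home/user/.cache      no  (* does not cross /)
+    /**/.cache     home/user/.cache      yes
+    .cache         home/user/.cache      yes
+    project.git/   repo/project.git/     yes (directories only)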
+ +Note that dotfiles are not considered specially by these rules: a +pattern of "*" matches all files in a directory, including those +starting with a '.'. This is different from the handling of the shell. + +Merged Patterns +--------------- + +If a file matching a dir-merge rule is encountered, that file is parsed +to yield additional filter rules; those filter rules are inserted +immediately after the dir-merge rule, then removed when leaving the +directory containing the dir-merge file. + +Blank lines and lines beginning with ``#`` in the file are ignored. +Otherwise, lines should consist of a single character indicating the +rule type (``+``, ``-``, or ``:`` for include, exclude, or dir-merge), a +space, and a file pattern. + +Any patterns added by a dir-merge rule are matched relative to the +directory with the patterns file: so, for example, a pattern +"``+ /file.txt``" would match ``file.txt`` in the same directory as the +merge file, not at the root. + +Example +------- + +Suppose that cumulus is launched with a single filter argument: +``--dir-merge=/.cumulus-root-filter``. + +``/.cumulus-root-filter``:: + + # Ignore pseudo-filesystems and temporary directories + - /proc/ + - /sys/ + # Exclude anywhere directories named tmp (except /var/tmp). + # Files named tmp will still be included. + + /var/tmp/ + - tmp/ + # Merge any user-provided rules in a file named .cumulus-filter here + : .cumulus-filter + # Ignore backup files anywhere in the file system + - *~ + - *.bak + # Ignore per-user cache directories + - /home/*/.cache/ + +``/home/user/.cumulus-filter``:: + + # Ignore the /home/user/scratch directory + - /scratch/ + # Ignore vim swap files (in /home/user only) + - .*.swp + # Do keep backup files (overrides the global exclude rule) + + *~ + # This rule ineffective: the top-level tmp/ rule has a higher precedence + + tmp/ + +``/home/user/workspace/.cumulus-filter``:: + + # Exclude backup files again in /home/user/workspace: this has a + # higher precedence than the rules in /home/user/.cumulus-filter + - *~ + diff --git a/exclude.cc b/exclude.cc new file mode 100644 index 0000000..9d2a75f --- /dev/null +++ b/exclude.cc @@ -0,0 +1,405 @@ +/* Cumulus: Smart Filesystem Backup to Dumb Servers + * + * Copyright (C) 2012 Google Inc. + * Written by Michael Vrable + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* Implementation of Cumulus include/exclude rules for selecting files to be + * backed up. 
*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "exclude.h" + +using std::make_pair; +using std::pair; +using std::string; + +FilePattern::FilePattern(const string& pattern, const string& basedir) + : refcount(1), orig_pattern(pattern), valid(false) +{ + string pattern_as_regex = pattern_to_regex(pattern, basedir); + int status = regcomp(®ex, pattern_as_regex.c_str(), + REG_EXTENDED|REG_NOSUB); + if (status != 0) { + char errbuf[256]; + regerror(status, ®ex, errbuf, sizeof(errbuf)); + fprintf(stderr, + "Pattern %s: failed to compile as regular expression %s: %s\n", + pattern.c_str(), pattern_as_regex.c_str(), errbuf); + regfree(®ex); + } else { + valid = true; + } +} + +FilePattern::~FilePattern() +{ + if (valid) + regfree(®ex); +} + +bool FilePattern::matches(const std::string& path) const +{ + if (!valid) + return false; + else + return regexec(®ex, path.c_str(), 0, NULL, 0) == 0; +} + +string FilePattern::pattern_to_regex(const string& pattern, + const string& basedir) +{ + /* Matches are always anchored to cover the entire string; we insert + * wildcards where needed if we only need to match a suffix of the path. */ + string result = "^"; + size_t i = 0; + size_t len = pattern.size(); + if (len == 0) { + /* Special case: an empty pattern matches all files. */ + return result; + } + + /* For a non-empty basedir, the match must ensure that the file actually + * falls within basedir. */ + if (!basedir.empty() && basedir != ".") { + result += regex_quote(basedir) + "/"; + } + + /* A leading slash indicates a pattern that must match the entire path. If + * there is no leading slash, match any number of leading directory + * components. */ + if (pattern[0] == '/') { + i++; + } else { + result += "(|.*/)"; + } + + while (i < len) { + switch (pattern[i]) { + /* Characters that must be quoted in a regular expression that are not + * otherwise part of the Cumulus pattern language. */ + case '^': + case '.': + case '[': + case ']': + case '$': + case '(': + case ')': + case '|': + case '+': + case '{': + case '}': + case '\\': + result += '\\'; + result += pattern[i]; + break; + + case '?': + /* Any character except a directory separator. */ + result += "[^/]"; + break; + + case '*': + if (i + 1 < len && pattern[i + 1] == '*') { + /* Any number of characters, including slashes. */ + i++; + result += ".*"; + } else { + /* Zero or more characters (but no directory separators). */ + result += "[^/]*"; + } + break; + + default: + /* A character matched literally that does not require quoting. */ + result += pattern[i]; + break; + } + i++; + } + + /* A trailing slash should match only a directory. No trailing slash means + * match any type of file. Callers should include a slash at the end of a + * path that is a directory; if there is no trailing slash in the pattern + * match either a trailing slash or none. */ + if (pattern[len - 1] != '/') { + result += "/?"; + } + + result += "$"; + + return result; +} + +string FilePattern::regex_quote(const string& pattern) +{ + string result = ""; + for (size_t i = 0; i < pattern.length(); i++) { + switch (pattern[i]) { + /* Characters that must be quoted in a regular expression. 
*/ + case '^': + case '.': + case '[': + case ']': + case '$': + case '(': + case ')': + case '|': + case '*': + case '+': + case '?': + case '{': + case '}': + case '\\': + result += '\\'; + // fall through + + default: + result += pattern[i]; + } + } + + return result; +} + +PathFilterList::PathFilterList() +{ + /* Invariant: pattern_stack is always non-empty (except when the destructor + * runs). Thus, reading pattern_stack.front() is always safe. */ + pattern_stack.push_back(make_pair(1, new PatternList)); +} + +PathFilterList::~PathFilterList() +{ + /* Pops all items off the saved rule stack. As an optimization, rather + * than repeatedly popping items which have a repeat count, just set the + * repeat count to one. */ + while (!pattern_stack.empty()) { + pattern_stack.front().first = 1; + restore(); + } +} + +/* save() operates lazily: simply increment the repeat count on the rule set at + * the head of the list. If modifications are made, mutable_patterns() will + * create a copy of the rules. */ +void PathFilterList::save() +{ + pattern_stack.front().first++; +} + +void PathFilterList::restore() +{ + if (--pattern_stack.front().first == 0) { + PatternList *old_patterns = pattern_stack.front().second; + pattern_stack.pop_front(); + for (PatternList::iterator i = old_patterns->begin(); + i != old_patterns->end(); ++i) { + i->second->unref(); + } + delete old_patterns; + } +} + +void PathFilterList::add_pattern(PatternType type, const string& pattern, + const string& basedir) +{ + FilePattern *pat = new FilePattern(pattern, basedir); + mutable_patterns()->push_back(make_pair(type, pat)); +} + +bool PathFilterList::is_included(const std::string& path, + bool is_directory) const +{ + string full_path; + if (is_directory) { + full_path = path + "/"; + } else { + full_path = path; + } + + PatternList::const_iterator i; + for (i = patterns().begin(); i != patterns().end(); ++i) { + if (i->second->matches(full_path)) { + switch (i->first) { + case INCLUDE: + return true; + case EXCLUDE: + return false; + case DIRMERGE: + /* Merge rules are ignored for the purposes of selecting + * whether a file is included or not. */ + continue; + } + } + } + + /* Default is include if no rule matches. */ + return true; +} + +bool PathFilterList::is_mergefile(const std::string& path) const +{ + PatternList::const_iterator i; + for (i = patterns().begin(); i != patterns().end(); ++i) { + if (i->first == DIRMERGE && i->second->matches(path)) + return true; + } + return false; +} + +/* Parses the specified contents of a per-directory rule merge file. The rules + * are first parsed into a temporary PatternList, which is then spliced into + * the rule set just before the DIRMERGE rule. Thus, if a dir-merge rule + * matches multiple times (in successive sub-directories), deeper rules take + * precedence over earlier rules. */ +void PathFilterList::merge_patterns(const string& path, + const string& basedir, + const string& contents) +{ + PatternList *rules = mutable_patterns(); + PatternList::iterator i; + for (PatternList::iterator i = rules->begin(); i != rules->end(); ++i) { + /* Try to locate where the rules should be inserted by looking for the + * DIRMERGE rule which matches the path to the rule file. 
*/ + if (i->first == DIRMERGE && i->second->matches(path)) { + PatternList *new_rules = parse_rules(basedir, contents); + rules->splice(i, *new_rules); + delete new_rules; + break; + } + } +} + +PathFilterList::PatternList *PathFilterList::parse_rules(const string& basedir, + const string& data) +{ + PatternList *patterns = new PatternList; + std::stringstream rules(data, std::stringstream::in); + while (!rules.eof()) { + string rule; + std::getline(rules, rule); + /* Ignore blank lines and lines starting with "#". */ + if (rule.empty() || rule[0] == '#') + continue; + if (rule.length() > 2 && rule[1] == ' ') { + if (rule[0] == '+' || rule[0] == '-' || rule[0] == ':') { + FilePattern *pat = new FilePattern(rule.substr(2), basedir); + switch (rule[0]) { + case '+': + patterns->push_back(make_pair(INCLUDE, pat)); + break; + case '-': + patterns->push_back(make_pair(EXCLUDE, pat)); + break; + case ':': + patterns->push_back(make_pair(DIRMERGE, pat)); + break; + default: + break; + } + continue; + } + fprintf(stderr, "Invalid rule: %s\n", rule.c_str()); + } + } + return patterns; +} + +PathFilterList::PatternList *PathFilterList::mutable_patterns() +{ + PatternList *old_list = pattern_stack.front().second; + if (pattern_stack.front().first == 1) + return old_list; + + PatternList *new_list = new PatternList; + for (PatternList::iterator i = old_list->begin(); + i != old_list->end(); ++i) { + i->second->ref(); + new_list->push_back(*i); + } + pattern_stack.front().first--; + pattern_stack.push_front(make_pair(1, new_list)); + return new_list; +} + + +/***************************************************************************** + * Unit tests for pattern matching. These are not compiled in by default, but + * exclude.cc can be compiled to a standalone binary with -DRUN_TESTS to run + * the unit tests. + *****************************************************************************/ + +#ifdef RUN_TESTS +/* Tests of pattern matching rules. test_pattern takes a pattern, a base + * directory, and a path to match, and prints out whether the rule matches. + * expect_match is the expected result; if this doesn't equal the actual result + * print a warning message. */ +static void test_pattern(const string& pattern, const string& basedir, + const string& path, bool expect_match) +{ + FilePattern pat(pattern, basedir); + bool result = pat.matches(path); + printf("%3s %c %c %-30s %-30s\n", + result == expect_match ? "" : "ERR", + result ? '+' : '-', + expect_match ? 
'+' : '-', + pattern.c_str(), + path.c_str()); +} + +int main(int argc, char *argv[]) +{ + printf("Act/Exp Pattern Path\n"); + test_pattern("*.o", "", "a/b/c.txt", false); + test_pattern("*.o", "", "a/b/c.o", true); + test_pattern("*.git/", "", "repo/project.git/", true); + test_pattern("/.cache", "", ".cache", true); + test_pattern("/.cache", "", "home/user/.cache", false); + test_pattern("/*/.cache", "", "home/user/.cache", false); + test_pattern("/*/*/.cache", "", "home/user/.cache", true); + test_pattern("/**/.cache", "", "home/user/.cache", true); + test_pattern(".cache", "", "home/user/.cache", true); + test_pattern("?.o", "", "home/user/a.o", true); + test_pattern("?.o", "", "home/user/a/o", false); + test_pattern("*.o", "", "a/b/\n.o", true); + test_pattern("/**/.cache", "", "home/new\nline/.cache", true); + test_pattern("/*/.cache", "home", "home/user/.cache", true); + test_pattern(".cache", "home", "home/user/.cache", true); + test_pattern("user/.cache", "home", "home/user/.cache", true); + test_pattern("user/.cache", "home/user", "home/user/.cache", false); + + PathFilterList pfl; + pfl.add_pattern(PathFilterList::DIRMERGE, ".cumulus-filter", ""); + pfl.save(); + pfl.merge_patterns("dir/.cumulus-filter", "dir", + "# comment\n" + "\n" + "- *.o\n" + "+ /.git/\n" + "* invalid\n"); + pfl.restore(); + return 0; +} +#endif diff --git a/exclude.h b/exclude.h new file mode 100644 index 0000000..851cc20 --- /dev/null +++ b/exclude.h @@ -0,0 +1,165 @@ +/* Cumulus: Smart Filesystem Backup to Dumb Servers + * + * Copyright (C) 2012 Google Inc. + * Written by Michael Vrable + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* Include/exclude processing for selecting files to be backed up: mechanisms + * for matching filenames against patterns and constructing lists of + * include/exclude rules. */ + +#ifndef _CUMULUS_EXCLUDE_H +#define _CUMULUS_EXCLUDE_H + +#include +#include +#include +#include +#include + +/* Base class for objects which should not have implicit copy constructors and + * assignment operators. */ +class noncopyable { +protected: + noncopyable() { } +private: + noncopyable(const noncopyable&); + const noncopyable& operator=(const noncopyable&); +}; + +/* A pattern which can be matched against file paths while scanning the file + * system for backups. The pattern language is described in doc/exclude.rst. + * */ +class FilePattern : public noncopyable { +public: + /* Constructs a FilePattern which the specified pattern. If patterns are + * loaded from a per-directory merge file, basedir should be the path to + * the directory where the patterns were loaded (and the pattern will only + * match files in or below that directory). basedir should be empty for a + * pattern matching starting at the root. 
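+     *
+     * For illustration (an added example, not exhaustive):
+     * FilePattern("*.o", "") matches "src/main.o" anywhere in the tree,
+     * while FilePattern("*.o", "src") matches object files only in or
+     * below the src/ directory.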
*/ + FilePattern(const std::string& pattern, const std::string& basedir); + + ~FilePattern(); + + /* Reference counting for freeing FilePattern objects. Newly created + * objects have a reference count of 1. If the reference count drops to + * zero via unref(), the object is deleted. */ + void ref() { refcount++; } + void unref() { if (--refcount == 0) delete this; } + + /* Returns the original pattern used to construct the FilePattern object; + * this is intended primarily for logging/debugging. */ + const std::string& pattern() { return orig_pattern; } + + /* Does this pattern match the specified file? Paths should be specified + * without any leading slash. A trailing slash should be included in the + * path when the object is a directory, to indicate this (so that + * directory-only rules can be processed properly). */ + bool matches(const std::string& path) const; + +private: + /* Compiles a Cumulus pattern to a regular expression. This is used for + * the underlying matching implementation. */ + static std::string pattern_to_regex(const std::string& pattern, + const std::string& basedir); + + /* Quotes any special characters in the input to produce a regular + * expression matching the literal string pattern. */ + static std::string regex_quote(const std::string& pattern); + + int refcount; // Reference count for object lifetime management. + std::string orig_pattern; // Original pattern, returned by pattern() + + bool valid; // True if regex is valid and initialized + regex_t regex; // The pattern, converted to a compiled regular expression +}; + +/* A PathFilterList represents a collection of rules for selecting files to be + * included or excluded from a backup. Patterns can be appended to the list, + * and PathFilterList also supports updating the list via per-directory rule + * files. */ +class PathFilterList : public noncopyable { +public: + PathFilterList(); + ~PathFilterList(); + + /* Possible pattern types, as described in doc/exclude.rst. */ + enum PatternType { INCLUDE, EXCLUDE, DIRMERGE }; + + /* During the backup, a call to save() will store a snapshot of the current + * rule set. After any modifications to the filter list, a call to + * restore() will change the rules back to those from the time of the + * snapshot. Calls to save() and restore() can be nested; the saved + * snapshots act as a stack. */ + void save(); + void restore(); + + /* Append a new pattern to the end of the list of rules. */ + void add_pattern(PatternType type, const std::string& pattern, + const std::string& basedir); + + /* Should a specified file be included in the backup, according to the + * current rules? The first matching rule applies; if no rule matches the + * default is to include the file. is_directory is a boolean indicating + * whether the path specifies a directory (so that directory-only rules can + * be matched properly. */ + bool is_included(const std::string& path, bool is_directory) const; + + /* Does the given file match a dir-merge rule in the current rule set? */ + bool is_mergefile(const std::string& path) const; + + /* Updates the current rule set from the contents of a per-directory merge + * file. If is_mergefile returns true, then call merge_patterns specifying + * the path to the merge file once again, the base directory containing the + * merge file (which is the starting point for matching the new rules), and + * the contents of the merge file as an in-memory string. 
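+     *
+     * A typical call sequence (mirroring scanfile() in main.cc) is:
+     * save() when entering a directory, merge_patterns() for any entry
+     * for which is_mergefile() returned true, and restore() when leaving
+     * the directory so the merged rules go out of scope.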
*/ + void merge_patterns(const std::string& path, const std::string& basedir, + const std::string& contents); + +private: + /* A particular set of rules is stored simply as an ordered list of (rule + * type, pattern) tuples. Lifetime of the FilePattern objects is managed + * with reference counts. */ + typedef std::list > PatternList; + + /* A stack of patterns, for handling save()/restore() calls. The current + * set of rules appears at the head of the list. As an optimization to + * better support save()/restore() calls without any modification to the + * rules, the stack uses run-length encoding: each item on the stack + * consists of a set of rules and a count of how many times those rules + * have been pushed. */ + std::list > pattern_stack; + + /* Parses rules (such as those in a per-directory merge file) and returns a + * PatternList. basedir should be the directory where the files were + * parsed from (all rules will be matched relative to this directory), and + * the contents of the rules file should be read in and passed as rules. */ + static PatternList *parse_rules(const std::string& basedir, + const std::string& rules); + + /* Returns the current set of rules (from the head of pattern_stack). */ + const PatternList &patterns() const { + return *pattern_stack.front().second; + } + + /* Returns a copy of the current rules, suitable for modification. If the + * current head of pattern_stack has a repetition greater than 1, an + * unshared copy of the current rule set is created. */ + PatternList *mutable_patterns(); +}; + +#endif // _CUMULUS_EXCLUDE_H diff --git a/main.cc b/main.cc index c700173..15ddfc6 100644 --- a/main.cc +++ b/main.cc @@ -1,6 +1,7 @@ /* Cumulus: Smart Filesystem Backup to Dumb Servers * - * Copyright (C) 2006-2008 The Regents of the University of California + * Copyright (C) 2006-2009 The Regents of the University of California + * Copyright (C) 2012 Google Inc. * Written by Michael Vrable * * This program is free software; you can redistribute it and/or modify @@ -47,6 +48,7 @@ #include #include +#include "exclude.h" #include "localdb.h" #include "metadata.h" #include "remote.h" @@ -90,14 +92,7 @@ std::set segment_list; double snapshot_intent = 1.0; /* Selection of files to include/exclude in the snapshot. */ -std::list includes; // Paths in which files should be saved -std::list excludes; // Paths which will not be saved -std::list excluded_names; // Directories which will not be saved -std::list searches; // Directories we don't want to save, but - // do want to descend searching for data - // in included paths - -bool relative_paths = true; +PathFilterList filter_rules; bool flag_rebuild_statcache = false; @@ -111,6 +106,61 @@ void add_segment(const string& segment) segment_list.insert(segment); } +/* Attempts to open a regular file read-only, but with safety checks for files + * that might not be fully trusted. */ +int safe_open(const string& path, struct stat *stat_buf) +{ + int fd; + + /* Be paranoid when opening the file. We have no guarantee that the + * file was not replaced between the stat() call above and the open() + * call below, so we might not even be opening a regular file. 
We + * supply flags to open to to guard against various conditions before + * we can perform an lstat to check that the file is still a regular + * file: + * - O_NOFOLLOW: in the event the file was replaced by a symlink + * - O_NONBLOCK: prevents open() from blocking if the file was + * replaced by a fifo + * We also add in O_NOATIME, since this may reduce disk writes (for + * inode updates). However, O_NOATIME may result in EPERM, so if the + * initial open fails, try again without O_NOATIME. */ + fd = open(path.c_str(), O_RDONLY|O_NOATIME|O_NOFOLLOW|O_NONBLOCK); + if (fd < 0) { + fd = open(path.c_str(), O_RDONLY|O_NOFOLLOW|O_NONBLOCK); + } + if (fd < 0) { + fprintf(stderr, "Unable to open file %s: %m\n", path.c_str()); + return -1; + } + + /* Drop the use of the O_NONBLOCK flag; we only wanted that for file + * open. */ + long flags = fcntl(fd, F_GETFL); + fcntl(fd, F_SETFL, flags & ~O_NONBLOCK); + + /* Re-check file attributes, storing them into stat_buf if that is + * non-NULL. */ + struct stat internal_stat_buf; + if (stat_buf == NULL) + stat_buf = &internal_stat_buf; + + /* Perform the stat call again, and check that we still have a regular + * file. */ + if (fstat(fd, stat_buf) < 0) { + fprintf(stderr, "fstat: %m\n"); + close(fd); + return -1; + } + + if ((stat_buf->st_mode & S_IFMT) != S_IFREG) { + fprintf(stderr, "file is no longer a regular file!\n"); + close(fd); + return -1; + } + + return fd; +} + /* Read data from a file descriptor and return the amount of data read. A * short read (less than the requested size) will only occur if end-of-file is * hit. */ @@ -463,120 +513,76 @@ void dump_inode(const string& path, // Path within snapshot metawriter->add(file_info); } -void scanfile(const string& path, bool include) +/* Converts a path to the normalized form used in the metadata log. Paths are + * written as relative (without any leading slashes). The root directory is + * referred to as ".". */ +string metafile_path(const string& path) { - int fd = -1; - long flags; - struct stat stat_buf; - list refs; - - string true_path; - if (relative_paths) - true_path = path; - else - true_path = "/" + path; - - // Set to true if we should scan through the contents of this directory, - // but not actually back files up - bool scan_only = false; - - // Check this file against the include/exclude list to see if it should be - // considered - for (list::iterator i = includes.begin(); - i != includes.end(); ++i) { - if (path == *i) { - include = true; - } - } - - for (list::iterator i = excludes.begin(); - i != excludes.end(); ++i) { - if (path == *i) { - include = false; - } - } + const char *newpath = path.c_str(); + if (*newpath == '/') + newpath++; + if (*newpath == '\0') + newpath = "."; + return newpath; +} - if (excluded_names.size() > 0) { - std::string name = path; - std::string::size_type last_slash = name.rfind('/'); - if (last_slash != std::string::npos) { - name.replace(0, last_slash + 1, ""); - } +void try_merge_filter(const string& path, const string& basedir) +{ + struct stat stat_buf; + if (lstat(path.c_str(), &stat_buf) < 0) + return; + if ((stat_buf.st_mode & S_IFMT) != S_IFREG) + return; + int fd = safe_open(path, NULL); + if (fd < 0) + return; - for (list::iterator i = excluded_names.begin(); - i != excluded_names.end(); ++i) { - if (name == *i) { - include = false; - } - } + /* As a very crude limit on the complexity of merge rules, only read up to + * one block (1 MB) worth of data. 
If the file doesn't seems like it might + * be larger than that, don't parse the rules in it. */ + ssize_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE); + if (bytes < 0 || bytes >= static_cast(LBS_BLOCK_SIZE - 1)) { + /* TODO: Add more strict resource limits on merge files? */ + fprintf(stderr, + "Unable to read filter merge file (possibly size too large\n"); + return; } + filter_rules.merge_patterns(metafile_path(path), basedir, + string(block_buf, bytes)); +} - for (list::iterator i = searches.begin(); - i != searches.end(); ++i) { - if (path == *i) { - scan_only = true; - } - } +void scanfile(const string& path) +{ + int fd = -1; + struct stat stat_buf; + list refs; - if (!include && !scan_only) - return; + string output_path = metafile_path(path); - if (lstat(true_path.c_str(), &stat_buf) < 0) { + if (lstat(path.c_str(), &stat_buf) < 0) { fprintf(stderr, "lstat(%s): %m\n", path.c_str()); return; } - if ((stat_buf.st_mode & S_IFMT) == S_IFREG) { - /* Be paranoid when opening the file. We have no guarantee that the - * file was not replaced between the stat() call above and the open() - * call below, so we might not even be opening a regular file. We - * supply flags to open to to guard against various conditions before - * we can perform an lstat to check that the file is still a regular - * file: - * - O_NOFOLLOW: in the event the file was replaced by a symlink - * - O_NONBLOCK: prevents open() from blocking if the file was - * replaced by a fifo - * We also add in O_NOATIME, since this may reduce disk writes (for - * inode updates). However, O_NOATIME may result in EPERM, so if the - * initial open fails, try again without O_NOATIME. */ - fd = open(true_path.c_str(), O_RDONLY|O_NOATIME|O_NOFOLLOW|O_NONBLOCK); - if (fd < 0) { - fd = open(true_path.c_str(), O_RDONLY|O_NOFOLLOW|O_NONBLOCK); - } - if (fd < 0) { - fprintf(stderr, "Unable to open file %s: %m\n", path.c_str()); - return; - } - - /* Drop the use of the O_NONBLOCK flag; we only wanted that for file - * open. */ - flags = fcntl(fd, F_GETFL); - fcntl(fd, F_SETFL, flags & ~O_NONBLOCK); - - /* Perform the stat call again, and check that we still have a regular - * file. */ - if (fstat(fd, &stat_buf) < 0) { - fprintf(stderr, "fstat: %m\n"); - close(fd); - return; - } + bool is_directory = ((stat_buf.st_mode & S_IFMT) == S_IFDIR); + if (!filter_rules.is_included(output_path, is_directory)) + return; - if ((stat_buf.st_mode & S_IFMT) != S_IFREG) { - fprintf(stderr, "file is no longer a regular file!\n"); - close(fd); + if ((stat_buf.st_mode & S_IFMT) == S_IFREG) { + fd = safe_open(path, &stat_buf); + if (fd < 0) return; - } } - dump_inode(path, true_path, stat_buf, fd); + dump_inode(output_path, path, stat_buf, fd); if (fd >= 0) close(fd); - // If we hit a directory, now that we've written the directory itself, - // recursively scan the directory. - if ((stat_buf.st_mode & S_IFMT) == S_IFDIR) { - DIR *dir = opendir(true_path.c_str()); + /* If we hit a directory, now that we've written the directory itself, + * recursively scan the directory. */ + if (is_directory) { + DIR *dir = opendir(path.c_str()); if (dir == NULL) { fprintf(stderr, "Error: %m\n"); @@ -596,55 +602,42 @@ void scanfile(const string& path, bool include) sort(contents.begin(), contents.end()); + filter_rules.save(); + + /* First pass through the directory items: look for any filter rules to + * merge and do so. 
*/ for (vector::iterator i = contents.begin(); i != contents.end(); ++i) { - const string& filename = *i; + string filename; if (path == ".") - scanfile(filename, include); + filename = *i; + else if (path == "/") + filename = "/" + *i; else - scanfile(path + "/" + filename, include); + filename = path + "/" + *i; + if (filter_rules.is_mergefile(metafile_path(filename))) { + if (verbose) { + printf("Merging directory filter rules %s\n", + filename.c_str()); + } + try_merge_filter(filename, output_path); + } } - } -} -/* Include the specified file path in the backups. Append the path to the - * includes list, and to ensure that we actually see the path when scanning the - * directory tree, add all the parent directories to the search list, which - * means we will scan through the directory listing even if the files - * themselves are excluded from being backed up. */ -void add_include(const char *path) -{ - /* Was an absolute path specified? If so, we'll need to start scanning - * from the root directory. Make sure that the user was consistent in - * providing either all relative paths or all absolute paths. */ - if (path[0] == '/') { - if (includes.size() > 0 && relative_paths == true) { - fprintf(stderr, - "Error: Cannot mix relative and absolute paths!\n"); - exit(1); + /* Second pass: recursively scan all items in the directory for backup; + * scanfile() will check if the item should be included or not. */ + for (vector::iterator i = contents.begin(); + i != contents.end(); ++i) { + const string& filename = *i; + if (path == ".") + scanfile(filename); + else if (path == "/") + scanfile("/" + filename); + else + scanfile(path + "/" + filename); } - relative_paths = false; - - // Skip over leading '/' - path++; - } else if (relative_paths == false && path[0] != '/') { - fprintf(stderr, "Error: Cannot mix relative and absolute paths!\n"); - exit(1); - } - - includes.push_back(path); - - /* Split the specified path into directory components, and ensure that we - * descend into all the directories along the path. 
*/ - const char *slash = path; - - if (path[0] == '\0') - return; - - while ((slash = strchr(slash + 1, '/')) != NULL) { - string component(path, slash - path); - searches.push_back(component); + filter_rules.restore(); } } @@ -698,18 +691,19 @@ int main(int argc, char *argv[]) while (1) { static struct option long_options[] = { {"localdb", 1, 0, 0}, // 0 - {"exclude", 1, 0, 0}, // 1 - {"filter", 1, 0, 0}, // 2 - {"filter-extension", 1, 0, 0}, // 3 - {"dest", 1, 0, 0}, // 4 - {"scheme", 1, 0, 0}, // 5 - {"signature-filter", 1, 0, 0}, // 6 - {"intent", 1, 0, 0}, // 7 - {"full-metadata", 0, 0, 0}, // 8 - {"tmpdir", 1, 0, 0}, // 9 - {"upload-script", 1, 0, 0}, // 10 - {"rebuild-statcache", 0, 0, 0}, // 11 - {"exclude-name", 1, 0, 0}, // 12 + {"filter", 1, 0, 0}, // 1 + {"filter-extension", 1, 0, 0}, // 2 + {"dest", 1, 0, 0}, // 3 + {"scheme", 1, 0, 0}, // 4 + {"signature-filter", 1, 0, 0}, // 5 + {"intent", 1, 0, 0}, // 6 + {"full-metadata", 0, 0, 0}, // 7 + {"tmpdir", 1, 0, 0}, // 8 + {"upload-script", 1, 0, 0}, // 9 + {"rebuild-statcache", 0, 0, 0}, // 10 + {"include", 1, 0, 0}, // 11 + {"exclude", 1, 0, 0}, // 12 + {"dir-merge", 1, 0, 0}, // 13 // Aliases for short options {"verbose", 0, 0, 'v'}, {NULL, 0, 0, 0}, @@ -726,46 +720,46 @@ int main(int argc, char *argv[]) case 0: // --localdb localdb_dir = optarg; break; - case 1: // --exclude - if (optarg[0] != '/') - excludes.push_back(optarg); - else - excludes.push_back(optarg + 1); - break; - case 2: // --filter + case 1: // --filter filter_program = optarg; break; - case 3: // --filter-extension + case 2: // --filter-extension filter_extension = optarg; break; - case 4: // --dest + case 3: // --dest backup_dest = optarg; break; - case 5: // --scheme + case 4: // --scheme backup_scheme = optarg; break; - case 6: // --signature-filter + case 5: // --signature-filter signature_filter = optarg; break; - case 7: // --intent + case 6: // --intent snapshot_intent = atof(optarg); if (snapshot_intent <= 0) snapshot_intent = 1; break; - case 8: // --full-metadata + case 7: // --full-metadata flag_full_metadata = true; break; - case 9: // --tmpdir + case 8: // --tmpdir tmp_dir = optarg; break; - case 10: // --upload-script + case 9: // --upload-script backup_script = optarg; break; - case 11: // --rebuild-statcache + case 10: // --rebuild-statcache flag_rebuild_statcache = true; break; - case 12: // --exclude-name - excluded_names.push_back(optarg); + case 11: // --include + filter_rules.add_pattern(PathFilterList::INCLUDE, optarg, ""); + break; + case 12: // --exclude + filter_rules.add_pattern(PathFilterList::EXCLUDE, optarg, ""); + break; + case 13: // --dir-merge + filter_rules.add_pattern(PathFilterList::DIRMERGE, optarg, ""); break; default: fprintf(stderr, "Unhandled long option!\n"); @@ -788,10 +782,6 @@ int main(int argc, char *argv[]) return 1; } - searches.push_back("."); - for (int i = optind; i < argc; i++) - add_include(argv[i]); - if (backup_dest == "" && backup_script == "") { fprintf(stderr, "Error: Backup destination must be specified using --dest= or --upload-script=\n"); @@ -858,7 +848,9 @@ int main(int argc, char *argv[]) metawriter = new MetadataWriter(tss, localdb_dir.c_str(), desc_buf, backup_scheme.c_str()); - scanfile(".", false); + for (int i = optind; i < argc; i++) { + scanfile(argv[i]); + } ObjectReference root_ref = metawriter->close(); add_segment(root_ref.get_segment()); -- 2.20.1
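The following is a minimal sketch (not part of the patch) of how the new
PathFilterList API behaves. It assumes the snippet is compiled and linked
together with exclude.cc; the rules and paths are made up, but the results
follow the first-match-wins semantics described in doc/exclude.rst:

    #include <cstdio>
    #include "exclude.h"

    int main()
    {
        PathFilterList rules;
        /* More specific rules are added first, since the first match wins. */
        rules.add_pattern(PathFilterList::INCLUDE, "/var/tmp/", "");
        rules.add_pattern(PathFilterList::EXCLUDE, "tmp/", "");
        rules.add_pattern(PathFilterList::EXCLUDE, "*~", "");

        /* Paths are passed in the relative form produced by metafile_path(). */
        printf("%d\n", rules.is_included("var/tmp", true));        // 1: include rule matches first
        printf("%d\n", rules.is_included("home/user/tmp", true));  // 0: excluded by "tmp/"
        printf("%d\n", rules.is_included("home/user/notes.txt", false)); // 1: no rule matches, default include
        return 0;
    }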