From 7357b690367c22d35edf3f375bd4a91ccdf014c0 Mon Sep 17 00:00:00 2001 From: Michael Vrable Date: Thu, 20 Sep 2012 17:41:02 -0700 Subject: [PATCH] First step towards a new, improved cumulus front-end. This commit adds several things: - Rules for selecting sets of backups for expiration, for managing old snapshots. - A configuration file format and parser for listing settings such as backup expiration policies. - Code in a small utility library that can expire old snapshots according to configuration settings. Eventually this code should be part of a new cumulus front-end in Python that can integrate snapshot and database management with backup runs. --- python/cumulus/config.py | 76 ++++++++++++++++++ python/cumulus/main.py | 70 +++++++++++++++++ python/cumulus/retention.py | 152 ++++++++++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 python/cumulus/config.py create mode 100644 python/cumulus/main.py create mode 100644 python/cumulus/retention.py diff --git a/python/cumulus/config.py b/python/cumulus/config.py new file mode 100644 index 0000000..62225bd --- /dev/null +++ b/python/cumulus/config.py @@ -0,0 +1,76 @@ +# Cumulus: Smart Filesystem Backup to Dumb Servers +# +# Copyright (C) 2012 Google Inc. +# Written by Michael Vrable +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""Parsing of Cumulus backup configuration files.

See the Cumulus documentation for a description of the configuration file
format.
"""

import ConfigParser
import datetime
import re

from cumulus import retention

# Configuration sections whose name starts with this prefix describe a backup
# scheme; the remainder of the section name is the scheme name.
_BACKUP_PREFIX = "backup:"

# Length, in seconds, of each unit letter accepted in a retention interval.
# "M" and "Y" are fixed approximations (30 and 365 days), not calendar-aware.
_TIME_UNITS = {"s": 1, "m": 60, "h": 3600, "D": 86400, "W": 7 * 86400,
               "M": 30 * 86400, "Y": 365 * 86400}

# A single interval component: a decimal count followed by one unit letter.
_INTERVAL_RE = r"(\d+)([smhDWMY])"


def _build_retention_engine(spec):
    """Parse a retention specification and return a RetentionEngine object.

    spec is a whitespace-separated list of items of the form
    "<class>:<intervals>", where <intervals> is one or more count/unit pairs
    written back to back (for example "daily:14D" or "weekly:1Y6M").  The
    components of one item are summed to give the retention period for that
    backup class.

    Items that do not match this syntax are reported on standard output and
    skipped; they do not raise.  Note that a skipped item means the class it
    was meant to configure gets no policy at all.

    Each accepted class and its computed period is also printed to standard
    output before being registered with the engine.
    """
    policy = retention.RetentionEngine()
    class_re = re.compile(r"^(\w+):((%s)+)$" % _INTERVAL_RE)
    interval_re = re.compile(_INTERVAL_RE)
    for item in spec.split():
        m = class_re.match(item)
        if not m:
            print("Invalid retain spec: %s" % item)
            continue
        classname = m.group(1)
        period = datetime.timedelta()
        # class_re has already verified that group 2 is nothing but
        # back-to-back interval components, so findall() yields exactly those
        # components in order.
        for count, unit in interval_re.findall(m.group(2)):
            seconds = int(count) * _TIME_UNITS[unit]
            period = period + datetime.timedelta(seconds=seconds)
        print("%s %s" % (classname, period))
        policy.add_policy(classname, period)
    return policy


class CumulusConfig(object):
    """Read-only view of a Cumulus backup configuration file.

    The file is parsed with ConfigParser.RawConfigParser.  Settings shared by
    all backups are read from the "global" section; each backup scheme is read
    from its own section named "backup:<scheme>".
    """

    def __init__(self, filename):
        """Parse a Cumulus backup configuration from the specified file.

        Errors from opening the file or from the parser propagate to the
        caller.
        """
        self._config = ConfigParser.RawConfigParser()
        # readfp() consumes the whole file during the call, so the handle can
        # be closed as soon as it returns instead of being left open.
        with open(filename) as config_file:
            self._config.readfp(config_file)

    def get_global(self, key):
        """Return the value of option key from the "global" section."""
        return self._config.get("global", key)

    def backup_schemes(self):
        """Returns a list of backup schemes.

        The names are the section names that begin with "backup:", with that
        prefix removed.
        """
        return [s[len(_BACKUP_PREFIX):] for s in self._config.sections()
                if s.startswith(_BACKUP_PREFIX)]

    def get_retention_for_scheme(self, scheme):
        """Return a RetentionEngine built from the scheme's "retain" option."""
        spec = self._config.get(_BACKUP_PREFIX + scheme, "retain")
        return _build_retention_engine(spec)

# Patch framing kept from the collapsed source; the header of the next file in
# the patch starts here and continues on the following line.
# diff --git
# Patch framing kept from the collapsed source (continuation of the header for
# python/cumulus/main.py):
# a/python/cumulus/main.py b/python/cumulus/main.py
# new file mode 100644
# index 0000000..50b3170
# --- /dev/null
# +++ b/python/cumulus/main.py
# @@ -0,0 +1,70 @@
# Cumulus: Smart Filesystem Backup to Dumb Servers
#
# Copyright (C) 2012  Google Inc.
# Written by Michael Vrable
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""The Python-based Cumulus script.

This implements maintenance functions and is a wrapper around the C++
cumulus-backup program.
"""

import re
import sys

import cumulus
from cumulus import cmd_util
from cumulus import config


class FakeOptions:
    """Plain attribute holder assigned to cmd_util.options.

    prune_backups() sets the attributes "store" and "dry_run" on an instance
    before calling into cmd_util.
    """
    pass


def prune_backups(backup_config, scheme):
    """Expire old snapshots of one backup scheme, then collect garbage.

    backup_config is a config.CumulusConfig; scheme is the name of one of its
    backup schemes.  Snapshots are listed from the store named by the global
    "dest" setting.  Names are split at their last hyphen into a scheme part
    and a timestamp part; snapshots of other schemes, and names containing no
    hyphen at all, are ignored.

    The scheme's snapshots are fed in sorted order to its retention policy.
    Every snapshot the policy does not retain is deleted from the store,
    except the last one in that list (see the note in the body).  Afterwards
    cmd_util.cmd_garbage_collect() is run against the same store.

    The list of snapshots about to be deleted and a progress message are
    printed to standard output.
    """
    store = cumulus.LowlevelDataStore(backup_config.get_global("dest"))
    snapshot_re = re.compile(r"^(.*)-(.*)$")
    retention = backup_config.get_retention_for_scheme(scheme)
    expired_snapshots = []
    for snapshot in sorted(store.list_snapshots()):
        m = snapshot_re.match(snapshot)
        # A name without any hyphen cannot belong to this scheme; skip it
        # rather than failing on m.group() with m being None.
        if m is None or m.group(1) != scheme:
            continue
        timestamp = m.group(2)
        if not retention.consider_snapshot(timestamp):
            expired_snapshots.append(snapshot)
    # The most recent snapshot is never removed.
    # NOTE(review): this drops the newest *expired* snapshot from the deletion
    # list.  When the newest snapshot overall was retained by the policy, an
    # additional older snapshot is spared instead.  That only ever keeps more
    # data, so it is left as is -- confirm whether it is intended.
    if expired_snapshots:
        expired_snapshots.pop()
    print(expired_snapshots)

    # TODO: Clean up the expiration part...
    for snapshot in expired_snapshots:
        store.store.delete("snapshot", "snapshot-%s.lbs" % snapshot)

    print("Collecting garbage...")
    options = FakeOptions()
    options.store = backup_config.get_global("dest")
    options.dry_run = False
    cmd_util.options = options
    cmd_util.cmd_garbage_collect([])


def main(argv):
    """Prune every backup scheme listed in a configuration file.

    argv is the full argument vector; argv[1] is the path of the Cumulus
    configuration file.  If it is missing, a usage message is written to
    standard error and the process exits with status 2.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else "cumulus"
        sys.stderr.write("Usage: %s CONFIG_FILE\n" % prog)
        sys.exit(2)
    backup_config = config.CumulusConfig(argv[1])
    for scheme in backup_config.backup_schemes():
        print(scheme)
        prune_backups(backup_config, scheme)


if __name__ == "__main__":
    main(sys.argv)

# Patch framing kept from the collapsed source (header of the next file):
# diff --git a/python/cumulus/retention.py b/python/cumulus/retention.py
# new file mode 100644
# index 0000000..e89263a
# --- /dev/null
# +++ b/python/cumulus/retention.py
# @@ -0,0 +1,152 @@
# Cumulus: Smart Filesystem Backup to Dumb Servers
#
# Copyright (C) 2012  Google Inc.
# Written by Michael Vrable
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""Backup retention policies.

Retention policies control how long different backup snapshots should be kept,
for example keeping daily snapshots for short periods of time but retaining
weekly snapshots going back further in time.
"""

import calendar
import datetime

TIMESTAMP_FORMAT = "%Y%m%dT%H%M%S"

# Different classes of backups--such as "daily" or "monthly"--can have
# different retention periods applied.
# A single backup snapshot might belong
# to multiple classes (i.e., perhaps be both a "daily" and a "monthly", though
# not a "weekly").
#
# Backups are classified using partitioning functions, defined below.  For a
# "monthly" backup classifier, all backups for a given month should map to the
# same partition.  Then, we apply the class label to the earliest snapshot in
# each partition--so the set of "monthly" backups would consist of all backups
# which were the first to run after the start of a month.
#
# A partitioning function must take a datetime instance as input and return a
# partition representative as output; timestamps that should be part of the
# same partition should map to equal partition representatives.  For a
# "monthly" classifier, an easy way to do this is to truncate the timestamp to
# keep only the month and year, and in general truncating timestamps works
# well, but the values are not used in any other way than equality testing so
# any type is allowed.
#
# _backup_classes is a registry of useful backup types; it maps a descriptive
# name to a partition function which implements it.
_backup_classes = {}


def add_backup_class(name, partioning_function):
    """Registers a new class of backups for which policies can be applied.

    The new class will be available as name to RetentionEngine.add_policy.
    partioning_function should be a function for grouping together backups in
    the same time period: it takes a datetime and returns a value that
    compares equal for all timestamps belonging to the same period.

    Registering a name that already exists replaces the earlier function.

    Predefined backup classes are: "yearly", "monthly", "weekly", "daily", and
    "all".
    """
    _backup_classes[name] = partioning_function


add_backup_class("yearly", lambda t: t.date().replace(day=1, month=1))
add_backup_class("monthly", lambda t: t.date().replace(day=1))
# ISO (year, week number) pair, so weeks follow the ISO calendar.
add_backup_class("weekly", lambda t: t.isocalendar()[0:2])
add_backup_class("daily", lambda t: t.date())
# Every distinct timestamp is its own partition.
add_backup_class("all", lambda t: t)


class RetentionEngine(object):
    """Class for applying a retention policy to a set of snapshots.

    Allows a retention policy to be set, then matches a sequence of backup
    snapshots to the policy to decide which ones should be kept.
    """

    def __init__(self):
        self.set_utc(False)
        # backup class name -> retention period (compared against a
        # datetime difference in consider_snapshot()).
        self._policies = {}
        # backup class name -> (partition, snapshot) of the snapshot that most
        # recently opened a new partition for that class.
        self._last_snapshots = {}
        self._now = datetime.datetime.utcnow()

    def set_utc(self, use_utc=True):
        """Perform policy matching with timestamps in UTC.

        By default, the policy converts timestamps to local time, but calling
        set_utc(True) will select snapshots based on UTC timestamps.
        """
        self._convert_to_localtime = not use_utc

    def set_now(self, timestamp):
        """Sets the "current time" for the purposes of snapshot expiration.

        timestamp should be a datetime object, expressed in UTC.  If set_now()
        is not called, the current time defaults to the time at which the
        RetentionEngine object was instantiated.
        """
        self._now = timestamp

    def add_policy(self, backup_class, retention_period):
        """Keep snapshots labelled backup_class for retention_period.

        backup_class is a name registered with add_backup_class();
        retention_period is a datetime.timedelta.  The name is not checked
        here: an unregistered name raises KeyError on the next call to
        consider_snapshot().  Adding a policy for a class that already has
        one replaces it and resets the partition tracking for that class.
        """
        self._policies[backup_class] = retention_period
        self._last_snapshots[backup_class] = (None, None)

    @staticmethod
    def parse_timestamp(s):
        """Parse a snapshot timestamp string in TIMESTAMP_FORMAT."""
        return datetime.datetime.strptime(s, TIMESTAMP_FORMAT)

    def consider_snapshot(self, snapshot):
        """Compute whether a given snapshot should be expired.

        Successive calls to consider_snapshot() must be for snapshots in
        chronological order.  For each call, consider_snapshot() will return a
        boolean indicating whether the snapshot should be retained (True) or
        expired (False).

        snapshot is the snapshot's timestamp string (see parse_timestamp()),
        taken to be in UTC.  A snapshot is retained when, for at least one
        configured class, it is the first snapshot seen in its partition and
        its age relative to the "current time" is less than that class's
        retention period.
        """
        timestamp_utc = self.parse_timestamp(snapshot)
        snapshot_age = self._now - timestamp_utc

        # timestamp_policy is the timestamp in the format that will be used for
        # doing policy matching: either in the local timezone or UTC, depending
        # on the setting of set_utc().
        if self._convert_to_localtime:
            unixtime = calendar.timegm(timestamp_utc.timetuple())
            timestamp_policy = datetime.datetime.fromtimestamp(unixtime)
        else:
            timestamp_policy = timestamp_utc

        self._labels = set()
        retain = False
        for backup_class, retention_period in self._policies.items():
            partition = _backup_classes[backup_class](timestamp_policy)
            if self._last_snapshots[backup_class][0] == partition:
                # Not the first snapshot of this partition: no label.
                continue
            self._last_snapshots[backup_class] = (partition, snapshot)
            self._labels.add(backup_class)
            # NOTE(review): indentation was lost in the collapsed source; the
            # age test is kept inside the new-partition branch because labels
            # go to the earliest snapshot of each partition -- confirm against
            # the original file.
            if snapshot_age < retention_period:
                retain = True
        return retain

    def last_labels(self):
        """Return the set of policies that applied to the last snapshot.

        This will fail if consider_snapshot has not yet been called.
        """
        return self._labels

    def last_snapshots(self):
        """Returns the most recent snapshot in each backup class.

        The result maps each class name to the snapshot string that most
        recently opened a new partition for it, or None if there is none yet.
        """
        return dict((k, v[1]) for (k, v) in self._last_snapshots.items())

# Patch trailer kept from the collapsed source:
# -- 2.20.1