From 260aaf269765b6fdf0c74ec51615e38a70b27fa3 Mon Sep 17 00:00:00 2001 From: Michael Vrable Date: Mon, 30 Jun 2008 14:17:08 -0700 Subject: [PATCH] Extend object reference syntax with size assertions. Object references can now include a size assertion, such as [=1024] which indicates that the referenced object is exactly 1024 bytes in length. If a metadata log or statcache file is produced using this reference form where appropriate, then it should be possible to rebuild much of the object index in the local database (by looking for files which are unchanged and computing hashes of blocks from that file where it is known that an entire object was used, not just a fragment of an object). This commit merely adds support for parsing the new references; they are not yet generated by any code. --- cumulus-util | 4 ++-- doc/format.txt | 9 ++++++++- lbs.py | 16 ++++++++++------ ref.cc | 19 ++++++++++++++++--- ref.h | 17 ++++++++++++----- scandir.cc | 2 +- 6 files changed, 49 insertions(+), 18 deletions(-) diff --git a/cumulus-util b/cumulus-util index 5425bca..59c5fc7 100755 --- a/cumulus-util +++ b/cumulus-util @@ -6,9 +6,9 @@ import getpass, os, stat, sys, time from optparse import OptionParser import lbs -# We support up to "LBS Snapshot v0.6" formats, but are also limited by the lbs +# We support up to "LBS Snapshot v0.8" formats, but are also limited by the lbs # module. -FORMAT_VERSION = min(lbs.FORMAT_VERSION, (0, 6)) +FORMAT_VERSION = min(lbs.FORMAT_VERSION, (0, 8)) def check_version(format): ver = lbs.parse_metadata_version(format) diff --git a/doc/format.txt b/doc/format.txt index 1511115..2ab2696 100644 --- a/doc/format.txt +++ b/doc/format.txt @@ -1,6 +1,6 @@ Backup Format Description for Cumulus: Efficient Filesystem Backup to the Cloud - Version: "LBS Snapshot v0.6" + Version: "LBS Snapshot v0.8" NOTE: This format specification is intended to be mostly stable, but is still subject to change before the 1.0 release. 
The code may provide @@ -129,6 +129,13 @@ abbreviation, the slice syntax [<length>] is shorthand for [0+<length>] +In place of a traditional slice, the annotation + [=<length>] +may be used. This is somewhat similar to specifying [<length>], but +additionally asserts that the referenced object is exactly <length> +bytes long--that is, this slice syntax does not change the bytes +returned at all, but can be used to provide information about the +underlying object store. Both a checksum and a slice can be used. In this case, the checksum is given first, followed by the slice. The checksum is computed over the diff --git a/lbs.py b/lbs.py index 507b9e1..ee4f445 100644 --- a/lbs.py +++ b/lbs.py @@ -13,7 +13,7 @@ import os, re, sha, tarfile, tempfile, thread from pysqlite2 import dbapi2 as sqlite3 # The largest supported snapshot format that can be understood. -FORMAT_VERSION = (0, 6) # LBS Snapshot v0.6 +FORMAT_VERSION = (0, 8) # LBS Snapshot v0.8 # Maximum number of nested indirect references allowed in a snapshot. MAX_RECURSION_DEPTH = 3 @@ -143,7 +143,7 @@ class ObjectStore: if m: return ("zero", None, None, (0, int(m.group(1)))) - m = re.match(r"^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[((\d+)\+)?(\d+)\])?$", refstr) + m = re.match(r"^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[(((\d+)\+)?(\d+)|=(\d+))\])?$", refstr) if not m: return segment = m.group(1) @@ -155,11 +155,14 @@ class ObjectStore: checksum = checksum.lstrip("(").rstrip(")") if slice is not None: - if m.group(5) is None: + if m.group(9) is not None: + # Size-assertion slice + slice = (0, int(m.group(9)), True) + elif m.group(6) is None: # Abbreviated slice - slice = (0, int(m.group(7))) + slice = (0, int(m.group(8)), False) else: - slice = (int(m.group(6)), int(m.group(7))) + slice = (int(m.group(7)), int(m.group(8)), False) return (segment, object, checksum, slice) @@ -231,7 +234,8 @@ class ObjectStore: raise ValueError if slice is not None: - (start, length) = slice + (start, length, exact) = slice + if exact and len(data) != length: raise 
ValueError data = data[start:start+length] if len(data) != length: raise IndexError diff --git a/ref.cc b/ref.cc index bed4daf..7e92b7a 100644 --- a/ref.cc +++ b/ref.cc @@ -97,7 +97,9 @@ string ObjectReference::to_string() const if (range_valid) { char buf[64]; - if (range_start == 0) { + if (range_exact) { + sprintf(buf, "[=%zu]", range_length); + } else if (range_start == 0) { sprintf(buf, "[%zu]", range_length); } else { sprintf(buf, "[%zu+%zu]", range_start, range_length); @@ -159,10 +161,16 @@ ObjectReference ObjectReference::parse(const std::string& str) } // Range - bool have_range = false; + bool have_range = false, range_exact = false; int64_t range1 = 0, range2 = 0; if (*t == '[') { t++; + + if (*t == '=') { + range_exact = true; + t++; + } + s = t; while (*t >= '0' && *t <= '9') t++; @@ -174,6 +182,8 @@ ObjectReference ObjectReference::parse(const std::string& str) } else { if (*t != '+') return ObjectReference(); + if (range_exact) + return ObjectReference(); string val(s, t - s); range1 = atoll(val.c_str()); @@ -208,7 +218,7 @@ ObjectReference ObjectReference::parse(const std::string& str) ref.set_checksum(checksum); if (have_range) - ref.set_range(range1, range2); + ref.set_range(range1, range2, range_exact); return ref; } @@ -238,6 +248,9 @@ bool ObjectReference::merge(ObjectReference ref) if (!range_valid || !ref.range_valid) return false; + if (range_exact || ref.range_exact) + return false; + if (range_start + range_length == ref.range_start) { range_length += ref.range_length; return true; diff --git a/ref.h b/ref.h index d1a0e0c..a27b4d6 100644 --- a/ref.h +++ b/ref.h @@ -55,9 +55,12 @@ * a substring rather than the entire string using a range specifier. If no * range specifier is given, then by default the entire object is used. * <range> ::= <start> "+" <length> + * | <length> + * | "=" <length> * Both <start> and <length> are decimal values. If included, the range is * enclosed in brackets. As an abbreviation, if <start> is 0 then the range - * can be given as just <length> (no "+" needed). 
+ * can be given as just <length> (no "+" needed). The "=" form asserts + * that the underlying object is exactly <length> bytes in size. * * When both a checksum and a range are included, note that the checksum is * taken over the entire original object, before the range is taken into @@ -108,9 +111,13 @@ public: bool has_range() const { return range_valid; } size_t get_range_start() const { return range_start; } size_t get_range_length() const { return range_length; } - void clear_range() { range_start = range_length = 0; range_valid = false; } - void set_range(size_t start, size_t length) - { range_start = start; range_length = length; range_valid = true; } + size_t get_range_exact() const { return range_exact; } + void clear_range() + { range_start = range_length = 0; + range_valid = false; range_exact = false; } + void set_range(size_t start, size_t length, bool exact = false) + { range_start = start; range_length = length; + range_valid = true; range_exact = exact; } bool merge(ObjectReference ref); @@ -124,7 +131,7 @@ private: RefType type; std::string segment, object, checksum; size_t range_start, range_length; - bool checksum_valid, range_valid; + bool checksum_valid, range_valid, range_exact; }; #endif // _LBS_REF_H diff --git a/scandir.cc b/scandir.cc index e82a918..a2a7a49 100644 --- a/scandir.cc +++ b/scandir.cc @@ -868,7 +868,7 @@ int main(int argc, char *argv[]) } FILE *descriptor = fdopen(descriptor_fd, "w"); - fprintf(descriptor, "Format: LBS Snapshot v0.6\n"); + fprintf(descriptor, "Format: LBS Snapshot v0.8\n"); fprintf(descriptor, "Producer: Cumulus %s\n", cumulus_version); strftime(desc_buf, sizeof(desc_buf), "%Y-%m-%d %H:%M:%S %z", &time_buf); fprintf(descriptor, "Date: %s\n", desc_buf); -- 2.20.1