From d3c10b747ecec0acc14863fc12db9661c3f88128 Mon Sep 17 00:00:00 2001 From: Michael Vrable Date: Fri, 7 Dec 2007 13:01:30 -0800 Subject: [PATCH] Add format support for efficient sparse file handling. While making other format changes, also add in support for explicitly representing regions of a file that are entirely zero, as can happen with sparse files. These are represented with an object reference of the form "zero[0+]". Update the lbs tool to generate and parse these references, and the utility code to also handle it. The restore tools do not seek over zero regions when writing out a file, so the file is not restored as a sparse file, but that support can easily be added later with no change needed to the format. --- contrib/restore.pl | 11 +++++-- doc/format.txt | 12 ++++++++ lbs.py | 7 +++++ localdb.cc | 8 +++++ ref.cc | 75 ++++++++++++++++++++++++++++++++++------------ ref.h | 7 ++++- scandir.cc | 28 ++++++++++++++--- 7 files changed, 121 insertions(+), 27 deletions(-) diff --git a/contrib/restore.pl b/contrib/restore.pl index 0c7ee21..96cf4b4 100755 --- a/contrib/restore.pl +++ b/contrib/restore.pl @@ -85,11 +85,16 @@ sub verifier_check { # necessary integrity checks (if a checksum is included), and return the object # data. sub load_ref { - # First, try to parse the object reference string into constituent pieces. - # The format is segment/object(checksum)[range]. Both the checksum and - # range are optional. my $ref_str = shift; + # Check for special objects before attempting general parsing. + if ($ref_str =~ m/^zero\[(\d+)\+(\d+)\]$/) { + return "\0" x ($2 + 0); + } + + # Try to parse the object reference string into constituent pieces. The + # format is segment/object(checksum)[range]. Both the checksum and range + # are optional. if ($ref_str !~ m/^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[\S+\])?$/) { die "Malformed object reference: $ref_str"; } diff --git a/doc/format.txt b/doc/format.txt index 5c9d5fa..7449681 100644 --- a/doc/format.txt +++ b/doc/format.txt @@ -122,6 +122,18 @@ Both a checksum and a slice can be used. In this case, the checksum is given first, followed by the slice. The checksum is computed over the original object contents, before slicing. +Special Objects +--------------- + +In addition to the standard syntax for objects described above, the +special name "zero" may be used instead of segment/sequence number. +This represents an object consisting entirely of zeroes. The zero +object must have a slice specification appended to indicate the size of +the object. For example + zero[0+1024] +represents a block consisting of 1024 null bytes. A checksum should not +be given. + FILE METADATA LISTING ===================== diff --git a/lbs.py b/lbs.py index 7a391c7..712b072 100644 --- a/lbs.py +++ b/lbs.py @@ -136,6 +136,10 @@ class ObjectStore: @staticmethod def parse_ref(refstr): + m = re.match(r"^zero\[(\d+)\+(\d+)\]$", refstr) + if m: + return ("zero", None, None, (int(m.group(1)), int(m.group(2)))) + m = re.match(r"^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[(\d+)\+(\d+)\])?$", refstr) if not m: return @@ -206,6 +210,9 @@ class ObjectStore: (segment, object, checksum, slice) = self.parse_ref(refstr) + if segment == "zero": + return "\0" * slice[1] + data = self.load_object(segment, object) if checksum is not None: diff --git a/localdb.cc b/localdb.cc index c0ad7d6..1509b61 100644 --- a/localdb.cc +++ b/localdb.cc @@ -304,6 +304,11 @@ bool LocalDb::IsAvailable(const ObjectReference &ref) sqlite3_stmt *stmt; bool found = false; + // Special objects (such as the zero object) aren't stored in segments, and + // so are always available. + if (!ref.is_normal()) + return true; + stmt = Prepare("select count(*) from block_index " "where segmentid = ? and object = ? and expired is null"); sqlite3_bind_int64(stmt, 1, SegmentToId(ref.get_segment())); @@ -331,6 +336,9 @@ void LocalDb::UseObject(const ObjectReference& ref) int rc; sqlite3_stmt *stmt; + if (!ref.is_normal()) + return; + stmt = Prepare("insert or ignore into snapshot_refs " "select segmentid, object, size from block_index " "where segmentid = ? and object = ?"); diff --git a/ref.cc b/ref.cc index 3019c20..80f4d1d 100644 --- a/ref.cc +++ b/ref.cc @@ -29,12 +29,21 @@ string generate_uuid() } ObjectReference::ObjectReference() - : segment(""), object("") + : type(REF_NULL), segment(""), object("") { + clear_checksum(); + clear_range(); +} + +ObjectReference::ObjectReference(RefType t) + : type(t), segment(""), object("") +{ + clear_checksum(); + clear_range(); } ObjectReference::ObjectReference(const std::string& segment, int sequence) - : segment(segment) + : type(REF_NORMAL), segment(segment) { char seq_buf[64]; sprintf(seq_buf, "%08x", sequence); @@ -46,7 +55,7 @@ ObjectReference::ObjectReference(const std::string& segment, int sequence) ObjectReference::ObjectReference(const std::string& segment, const std::string& sequence) - : segment(segment), object(sequence) + : type(REF_NORMAL), segment(segment), object(sequence) { clear_checksum(); clear_range(); @@ -54,13 +63,18 @@ ObjectReference::ObjectReference(const std::string& segment, string ObjectReference::to_string() const { - if (is_null()) - return "/"; + if (type == REF_NULL) + return "null"; - string result = segment + "/" + object; + string result; + if (type == REF_ZERO) { + result = "zero"; + } else if (type == REF_NORMAL) { + result = segment + "/" + object; - if (checksum_valid) - result += "(" + checksum + ")"; + if (checksum_valid) + result += "(" + checksum + ")"; + } if (range_valid) { char buf[64]; @@ -78,22 +92,34 @@ ObjectReference ObjectReference::parse(const std::string& str) { const char *s = str.c_str(); const char *t; + ObjectReference::RefType type = ObjectReference::REF_NORMAL; + + // Special case: explicit zero objects + if (strncmp(s, "zero", 4) == 0) { + type = ObjectReference::REF_ZERO; + s += 4; + } // Segment t = s; - while ((*t >= '0' && *t <= '9') || (*t >= 'a' && *t <= 'f') || (*t == '-')) - t++; - if (*t != '/') - return ObjectReference(); + if (type == ObjectReference::REF_NORMAL) { + while ((*t >= '0' && *t <= '9') || (*t >= 'a' && *t <= 'f') + || (*t == '-')) + t++; + if (*t != '/') + return ObjectReference(); + } string segment(s, t - s); // Object sequence number - t++; - s = t; - while ((*t >= '0' && *t <= '9') || (*t >= 'a' && *t <= 'f')) + if (type == ObjectReference::REF_NORMAL) { t++; - if (*t != '\0' && *t != '(' && *t != '[') - return ObjectReference(); + s = t; + while ((*t >= '0' && *t <= '9') || (*t >= 'a' && *t <= 'f')) + t++; + if (*t != '\0' && *t != '(' && *t != '[') + return ObjectReference(); + } string object(s, t - s); // Checksum @@ -111,7 +137,7 @@ ObjectReference ObjectReference::parse(const std::string& str) // Range bool have_range = false; - int64_t range1, range2; + int64_t range1 = 0, range2 = 0; if (*t == '[') { t++; s = t; @@ -136,7 +162,18 @@ ObjectReference ObjectReference::parse(const std::string& str) have_range = true; } - ObjectReference ref(segment, object); + ObjectReference ref; + switch (type) { + case ObjectReference::REF_ZERO: + ref = ObjectReference(ObjectReference::REF_ZERO); + break; + case ObjectReference::REF_NORMAL: + ref = ObjectReference(segment, object); + break; + default: + return ObjectReference(); + } + if (checksum.size() > 0) ref.set_checksum(checksum); diff --git a/ref.h b/ref.h index e7d41e1..e499e6e 100644 --- a/ref.h +++ b/ref.h @@ -64,11 +64,15 @@ std::string generate_uuid(); * and converted to and from the text representation. */ class ObjectReference { public: + enum RefType { REF_NULL, REF_ZERO, REF_NORMAL }; + ObjectReference(); + ObjectReference(RefType t); ObjectReference(const std::string& segment, int sequence); ObjectReference(const std::string& segment, const std::string& sequence); - bool is_null() const { return segment.size() == 0; } + bool is_null() const { return type == REF_NULL; } + bool is_normal() const { return type == REF_NORMAL; } std::string to_string() const; static ObjectReference parse(const std::string& s); @@ -92,6 +96,7 @@ public: bool merge(ObjectReference ref); private: + RefType type; std::string segment, object, checksum; size_t range_start, range_length; bool checksum_valid, range_valid; diff --git a/scandir.cc b/scandir.cc index 03d8414..654f9fb 100644 --- a/scandir.cc +++ b/scandir.cc @@ -137,7 +137,8 @@ int64_t dumpfile(int fd, dictionary &file_info, const string &path, i != blocks.end(); ++i) { const ObjectReference &ref = *i; object_list.push_back(ref.to_string()); - segment_list.insert(ref.get_segment()); + if (ref.is_normal()) + segment_list.insert(ref.get_segment()); db->UseObject(ref); } size = stat_buf.st_size; @@ -160,16 +161,34 @@ int64_t dumpfile(int fd, dictionary &file_info, const string &path, hash.process(block_buf, bytes); + // Sparse file processing: if we read a block of all zeroes, encode + // that explicitly. + bool all_zero = true; + for (int i = 0; i < bytes; i++) { + if (block_buf[i] != 0) { + all_zero = false; + break; + } + } + // Either find a copy of this block in an already-existing segment, // or index it so it can be re-used in the future double block_age = 0.0; + ObjectReference ref; + SHA1Checksum block_hash; block_hash.process(block_buf, bytes); string block_csum = block_hash.checksum_str(); - ObjectReference ref = db->FindObject(block_csum, bytes); + + if (all_zero) { + ref = ObjectReference(ObjectReference::REF_ZERO); + ref.set_range(0, bytes); + } else { + ObjectReference ref = db->FindObject(block_csum, bytes); + } // Store a copy of the object if one does not yet exist - if (ref.get_segment().size() == 0) { + if (ref.is_null()) { LbsObject *o = new LbsObject; int object_group; @@ -208,7 +227,8 @@ int64_t dumpfile(int fd, dictionary &file_info, const string &path, } object_list.push_back(ref.to_string()); - segment_list.insert(ref.get_segment()); + if (ref.is_normal()) + segment_list.insert(ref.get_segment()); db->UseObject(ref); size += bytes; -- 2.20.1