From f9deef3757145dab5eead5bc7e068defdb0aacd8 Mon Sep 17 00:00:00 2001
From: Michael Vrable <mvrable@cs.ucsd.edu>
Date: Fri, 11 May 2007 21:55:36 -0700
Subject: [PATCH] Begin work on a reference decoder for backups.

The intent is that the reference decoder will eventually be a tool for
recovery, if needed, before a better tool is written.  It should also help to
verify the format specification and backup tool.

The reference decoder can currently parse a single object reference and
extract the data for it.
---
 format.txt |  21 ++-------
 restore.pl | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+), 16 deletions(-)
 create mode 100755 restore.pl

diff --git a/format.txt b/format.txt
index 7bf0474..1e4ce24 100644
--- a/format.txt
+++ b/format.txt
@@ -32,22 +32,11 @@ Object naming:
   - Each segment is assigned a unique 128-bit identifier (uuid).  Each
     segment is stored as a separate file whose name is based on its
     uuid.
-  - Objects within a segment are numbered sequentially, with a 32-bit
-    counter.
-Thus, each object may be referred to with a unique 160 (128 + 32) bit
-identifier.
-
-Segment structure:
-There are two main options:
-  - Streaming format: Each object is prepended with a header, and then
-    all (header, object) pairs are concatenated.  This is inspired by
-    the tar file format.  Can be written out in one pass and also
-    processed when read back in one pass.  Well-adapted to streaming
-    transformations, such as compression.
-  - Indexed format: Each segment contains a table giving the starting
-    position and length of each object.  This is somewhat similar to
-    PDF.  Data can still be written out in a single pass, but reading
-    will require random access.
+  - Objects within a segment are numbered, using a 32-bit counter.
+
+Each segment is structured as a TAR file (optionally filtered through a
+compressor such as gzip/bzip2, or encrypted).  Objects are stored as
+individual files.
 
 File attributes: Metadata for each file is stored in a dictionary.
 Dictionary keys include:
diff --git a/restore.pl b/restore.pl
new file mode 100755
index 0000000..90ff3cc
--- /dev/null
+++ b/restore.pl
@@ -0,0 +1,131 @@
+#!/usr/bin/perl -w
+#
+# Proof-of-concept/reference decoder for LBS-format backup snapshots.
+#
+# This decoder aims to decompress an LBS snapshot.  It is not meant to be
+# particularly efficient, but should be a small and portable tool for doing so
+# (important for recovering from data loss).  It is also meant to serve as a
+# check on the snapshot tool and data format itself, and serve as documentation
+# for the format.
+#
+# This decoder does not understand TAR archives; it assumes that all segments
+# in the snapshot have already been decompressed, and that objects are
+# available simply as files in the filesystem.  This simplifies the design.
+#
+# Copyright (C) 2007  Michael Vrable
+
+use strict;
+use Digest::SHA1;
+
+my $OBJECT_DIR = ".";           # Directory where objects are unpacked
+
+############################ CHECKSUM VERIFICATION ############################
+# A very simple layer for verifying checksums.  Checksums may be used on object
+# references directly, and can also be used to verify entire reconstructed
+# files.
+#
+# A checksum to verify is given in the form "algorithm=hexdigest".  Given such
+# a string, we can construct a "verifier" object.  Bytes can be incrementally
+# added to the verifier, and at the end a test can be made to see if the
+# checksum matches.  The caller need not know what algorithm is used.  However,
+# at the moment we only support SHA-1 for computing digests (algorithm name
+# "sha1").
+# verifier_create($checksum): build a verifier from an "algorithm=hexdigest"
+# string.  Returns a reference to a hash holding the ALGORITHM name, the
+# expected HASH (hex string), and a fresh DIGESTER object.  Dies if the string
+# is malformed or names any algorithm other than "sha1".
+sub verifier_create {
+    my ($checksum) = @_;
+
+    my ($algorithm, $hash) = $checksum =~ m/^(\w+)=([0-9a-f]+)$/
+        or die "Malformed checksum: $checksum";
+    die "Unsupported checksum algorithm: $algorithm" if $algorithm ne 'sha1';
+
+    # Use the direct Class->new form; the indirect "new Class" syntax is
+    # parsed heuristically and can be misread as a function call.
+    return {
+        ALGORITHM => $algorithm,
+        HASH      => $hash,
+        DIGESTER  => Digest::SHA1->new,
+    };
+}
+
+# Feed another chunk of data into the running digest held by $verifier.
+sub verifier_add_bytes {
+    my ($verifier, $data) = @_;
+
+    # The result is whatever the digester's add() method returns.
+    return $verifier->{DIGESTER}->add($data);
+}
+
+# Compare the digest of all bytes added so far with the expected HASH.
+# Returns true when the two hex strings are equal, false otherwise.
+sub verifier_check {
+    my ($verifier) = @_;
+    my $computed = $verifier->{DIGESTER}->hexdigest();
+    return $computed eq $verifier->{HASH};
+}
+
+################################ OBJECT ACCESS ################################
+# The base of the decompressor is the object reference layer.  See ref.h for a
+# description of the format for object references.  These functions will parse
+# an object reference, locate the object data from the filesystem, perform any
+# necessary integrity checks (if a checksum is included), and return the object
+# data.
+# load_ref($ref_str): return the data named by an object reference.
+#
+# $ref_str has the form segment/object(checksum)[start+length]; the checksum
+# and the range are each optional.  The object is read from the file
+# $OBJECT_DIR/segment/object.  A checksum, if given, is verified against the
+# whole object before any range is applied.  Returns the object contents as a
+# string, cut down to the requested bytes when a range is present.
+#
+# Dies on a malformed reference or range, an unreadable object file, a
+# checksum mismatch, or a range that does not fit inside the object.
+sub load_ref {
+    my ($ref_str) = @_;
+
+    if ($ref_str !~ m/^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[\S+\])?$/) {
+        die "Malformed object reference: $ref_str";
+    }
+    my ($segment, $object, $checksum, $range) = ($1, $2, $3, $4);
+
+    # Objects are arbitrary binary data, so read them through a raw layer: no
+    # line-ending translation may alter the bytes that get checksummed.  The
+    # lexical handle keeps this sub from clobbering a global OBJECT handle.
+    my $path = "$OBJECT_DIR/$segment/$object";
+    open my $fh, "<:raw", $path
+        or die "Unable to open object: $path: $!";
+    my $contents = do { local $/; <$fh> };
+    defined $contents or die "Unable to read object: $path: $!";
+    close $fh;
+
+    if ($checksum) {
+        my ($spec) = $checksum =~ m/^\((\S+)\)$/;   # strip the parentheses
+        my $verifier = verifier_create($spec);
+        verifier_add_bytes($verifier, $contents);
+        verifier_check($verifier)
+            or die "Integrity check for object $ref_str failed";
+    }
+
+    if ($range) {
+        my ($start, $length) = $range =~ m/^\[(\d+)\+(\d+)\]$/
+            or die "Malformed object range: $range";
+        my $object_size = length $contents;
+        if ($start >= $object_size || $start + $length > $object_size) {
+            die "Object range $range falls outside object bounds "
+                . "(actual size $object_size)";
+        }
+        $contents = substr $contents, $start, $length;
+    }
+
+    return $contents;
+}
+
+############################### MAIN ENTRY POINT ##############################
+# Usage: restore.pl <object-reference>; writes the referenced data to stdout.
+my $object = $ARGV[0];
+defined $object or die "Usage: $0 <object-reference>\n";
+
+binmode STDOUT;     # object data is binary; emit the bytes untranslated
+print load_ref($object);
-- 
2.20.1