restore.pl

   1 #!/usr/bin/perl -w
   2 #
   3 # Proof-of-concept/reference decoder for LBS-format backup snapshots.
   4 #
   5 # This decoder aims to decompress an LBS snapshot.  It is not meant to be
   6 # particularly efficient, but should be a small and portable tool for doing so
   7 # (important for recovering from data loss).  It is also meant to serve as a
   8 # check on the snapshot tool and data format itself, and serve as documentation
   9 # for the format.
  10 #
  11 # This decoder does not understand TAR archives; it assumes that all segments
  12 # in the snapshot have already been decompressed, and that objects are
  13 # available simply as files in the filesystem.  This simplifies the design.
  14 #
  15 # Limitations: Since this code is probably using 32-bit arithmetic, files
  16 # larger than 2-4 GB may not be properly handled.
  17 #
  18 # Copyright (C) 2007  Michael Vrable
  19
  20 use strict;
  21 use Digest::SHA1;
  22 use File::Basename;
  23
  24 my $OBJECT_DIR;                 # Where are the unpacked objects available?
  25 my $DEST_DIR = ".";             # Where should restored files should be placed?
  26 my $RECURSION_LIMIT = 3;        # Bound on recursive object references
  27
############################ CHECKSUM VERIFICATION ############################
# A very simple layer for verifying checksums.  Checksums may be used on object
# references directly, and can also be used to verify entire reconstructed
# files.
#
# A checksum to verify is given in the form "algorithm=hexdigest".  Given such
# a string, we can construct a "verifier" object.  Bytes can be incrementally
# added to the verifier, and at the end a test can be made to see if the
# checksum matches.  The caller need not know what algorithm is used.  However,
# at the moment we only support SHA-1 for computing digests (algorithm name
# "sha1").
  39 sub verifier_create {
  40     my $checksum = shift;
  41
  42     if ($checksum !~ m/^(\w+)=([0-9a-f]+)$/) {
  43         die "Malformed checksum: $checksum";
  44     }
  45     my ($algorithm, $hash) = ($1, $2);
  46     if ($algorithm ne 'sha1') {
  47         die "Unsupported checksum algorithm: $algorithm";
  48     }
  49
  50     my %verifier = (
  51         ALGORITHM => $algorithm,
  52         HASH => $hash,
  53         DIGESTER => new Digest::SHA1
  54     );
  55
  56     return \%verifier;
  57 }
  58
  59 sub verifier_add_bytes {
  60     my $verifier = shift;
  61     my $digester = $verifier->{DIGESTER};
  62     my $data = shift;
  63
  64     $digester->add($data);
  65 }
  66
  67 sub verifier_check {
  68     my $verifier = shift;
  69     my $digester = $verifier->{DIGESTER};
  70
  71     my $newhash = $digester->hexdigest();
  72     if ($verifier->{HASH} ne $newhash) {
  73         print STDERR "Verification failure: ",
  74             $newhash, " != ", $verifier->{HASH}, "\n";
  75     }
  76     return ($verifier->{HASH} eq $newhash);
  77 }
  78
  79 ################################ OBJECT ACCESS ################################
  80 # The base of the decompressor is the object reference layer.  See ref.h for a
  81 # description of the format for object references.  These functions will parse
  82 # an object reference, locate the object data from the filesystem, perform any
  83 # necessary integrity checks (if a checksum is included), and return the object
  84 # data.
  85 sub load_ref {
  86     # First, try to parse the object reference string into constituent pieces.
  87     # The format is segment/object(checksum)[range].  Both the checksum and
  88     # range are optional.
  89     my $ref_str = shift;
  90
  91     if ($ref_str !~ m/^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[\S+\])?$/) {
  92         die "Malformed object reference: $ref_str";
  93     }
  94
  95     my ($segment, $object, $checksum, $range) = ($1, $2, $3, $4);
  96
  97     # Next, use the segment/object components to locate and read the object
  98     # contents from disk.
  99     open OBJECT, "<", "$OBJECT_DIR/$segment/$object"
 100         or die "Unable to open object $OBJECT_DIR/$segment/$object: $!";
 101     my $contents = join '', <OBJECT>;
 102     close OBJECT;
 103
 104     # If a checksum was specified in the object reference, verify the object
 105     # integrity by computing a checksum of the read data and comparing.
 106     if ($checksum) {
 107         $checksum =~ m/^\((\S+)\)$/;
 108         my $verifier = verifier_create($1);
 109         verifier_add_bytes($verifier, $contents);
 110         if (!verifier_check($verifier)) {
 111             die "Integrity check for object $ref_str failed";
 112         }
 113     }
 114
 115     # If a range was specified, then only a subset of the bytes of the object
 116     # are desired.  Extract just the desired bytes.
 117     if ($range) {
 118         if ($range !~ m/^\[(\d+)\+(\d+)\]$/) {
 119             die "Malformed object range: $range";
 120         }
 121
 122         my $object_size = length $contents;
 123         my ($start, $length) = ($1 + 0, $2 + 0);
 124         if ($start >= $object_size || $start + $length > $object_size) {
 125             die "Object range $range falls outside object bounds "
 126                 . "(actual size $object_size)";
 127         }
 128
 129         $contents = substr $contents, $start, $length;
 130     }
 131
 132     return $contents;
 133 }
 134
 135 ############################### FILE PROCESSING ###############################
 136 # Process the metadata for a single file.  process_file is the main entry
 137 # point; it should be given a list of file metadata key/value pairs.
 138 # iterate_objects is a helper function used to iterate over the set of object
 139 # references that contain the file data for a regular file.
 140
 141 sub uri_decode {
 142     my $str = shift;
 143     $str =~ s/%([0-9a-f]{2})/chr(hex($1))/ge;
 144     return $str;
 145 }
 146
 147 sub iterate_objects {
 148     my $callback = shift;       # Function to be called for each reference
 149     my $arg = shift;            # Argument passed to callback
 150     my $text = shift;           # Whitespace-separate list of object references
 151
 152     # Simple limit to guard against cycles in the object references
 153     my $recursion_level = shift || 0;
 154     if ($recursion_level >= $RECURSION_LIMIT) {
 155         die "Recursion limit reached";
 156     }
 157
 158     # Split the provided text at whitespace boundaries to produce the list of
 159     # object references.  If any of these start with "@", then we have an
 160     # indirect reference, and must look up that object and call iterate_objects
 161     # with the contents.
 162     my $obj;
 163     foreach $obj (split /\s+/, $text) {
 164         next if $obj eq "";
 165         if ($obj =~ /^@(\S+)$/) {
 166             my $indirect = load_ref($1);
 167             iterate_objects($callback, $arg, $1, $recursion_level + 1);
 168         } else {
 169             &$callback($arg, $obj);
 170         }
 171     }
 172 }
 173
 174 sub obj_callback {
 175     my $state = shift;
 176     my $obj = shift;
 177     my $data = load_ref($obj);
 178     print FILE $data
 179         or die "Error writing file data: $!";
 180     verifier_add_bytes($state->{VERIFIER}, $data);
 181     $state->{BYTES} += length($data);
 182 }
 183
 184 # Extract the contents of a regular file by concatenating all the objects that
 185 # comprise it.
 186 sub unpack_file {
 187     my $name = shift;
 188     my %info = @_;
 189     my %state = ();
 190
 191     if (!defined $info{data}) {
 192         die "File contents not specified for $name";
 193     }
 194     if (!defined $info{checksum} || !defined $info{size}) {
 195         die "File $name is missing checksum or size";
 196     }
 197
 198     # Open the file to be recreated.  The data will be written out by the call
 199     # to iterate_objects.
 200     open FILE, ">", "$DEST_DIR/$name"
 201         or die "Cannot write file $name: $!";
 202
 203     # Set up state so that we can incrementally compute the checksum and length
 204     # of the reconstructed data.  Then iterate over all objects in the file.
 205     $state{VERIFIER} = verifier_create($info{checksum});
 206     $state{BYTES} = 0;
 207     iterate_objects(\&obj_callback, \%state, $info{data});
 208
 209     close FILE;
 210
 211     # Verify that the reconstructed object matches the size/checksum we were
 212     # given.
 213     if (!verifier_check($state{VERIFIER}) || $state{BYTES} != $info{size}) {
 214         die "File reconstruction failed for $name: size or checksum differs";
 215     }
 216 }
 217
 218 sub process_file {
 219     my %info = @_;
 220
 221     if (!defined($info{name})) {
 222         die "Filename not specified in metadata block";
 223     }
 224
 225     my $type = $info{type};
 226
 227     my $filename = uri_decode($info{name});
 228     print "process_file: $filename\n";
 229
 230     # Restore the specified file.  How to do so depends upon the file type, so
 231     # dispatch based on that.
 232     my $dest = "$DEST_DIR/$filename";
 233     if ($type eq '-') {
 234         # Regular file
 235         unpack_file($filename, %info);
 236     } elsif ($type eq 'd') {
 237         # Directory
 238         if ($filename ne '.') {
 239             mkdir $dest or die "Cannot create directory $filename: $!";
 240         }
 241     } elsif ($type eq 'l') {
 242         # Symlink
 243         if (!defined($info{contents})) {
 244             die "Symlink $filename has no value specified";
 245         }
 246         my $contents = uri_decode($info{contents});
 247         symlink $contents, $dest
 248             or die "Cannot create symlink $filename: $!";
 249
 250         # TODO: We can't properly restore all metadata for symbolic links
 251         # (attempts to do so below will change metadata for the pointed-to
 252         # file).  This should be later fixed, but for now we simply return
 253         # before getting to the restore metadata step below.
 254         return;
 255     } elsif ($type eq 'p' || $type eq 's' || $type eq 'c' || $type eq 'b') {
 256         # Pipe, socket, character device, block device.
 257         # TODO: Handle these cases.
 258         print STDERR "Ignoring special file $filename of type $type\n";
 259         return;
 260     } else {
 261         die "Unknown file type '$type' for file $filename";
 262     }
 263
 264     # Restore mode, ownership, and any other metadata for the file.  This is
 265     # split out from the code above since the code is the same regardless of
 266     # file type.
 267     my $atime = $info{atime} || time();
 268     my $mtime = $info{mtime} || time();
 269     utime $atime, $mtime, $dest
 270         or warn "Unable to update atime/mtime for $dest";
 271
 272     my $uid = $info{user} || -1;
 273     my $gid = $info{group} || -1;
 274     chown $uid, $gid, $dest
 275         or warn "Unable to change ownership for $dest";
 276
 277     if (defined $info{mode}) {
 278         my $mode = $info{mode};
 279         chmod $mode, $dest
 280             or warn "Unable to change permissions for $dest";
 281     }
 282 }
 283
 284 ########################### METADATA LIST PROCESSING ##########################
 285 # Process the file metadata listing provided, and as information for each file
 286 # is extracted, pass it to process_file.  This will recursively follow indirect
 287 # references to other metadata objects.
 288 sub process_metadata {
 289     my ($metadata, $recursion_level) = @_;
 290
 291     # Check recursion; this will prevent us from infinitely recursing on an
 292     # indirect reference which loops back to itself.
 293     $recursion_level ||= 0;
 294     if ($recursion_level >= $RECURSION_LIMIT) {
 295         die "Recursion limit reached";
 296     }
 297
 298     # Split the metadata into lines, then start processing each line.  There
 299     # are two primary cases:
 300     #   - Lines starting with "@" are indirect references to other metadata
 301     #     objects.  Recursively process that object before continuing.
 302     #   - Other lines should come in groups separated by a blank line; these
 303     #     contain metadata for a single file that should be passed to
 304     #     process_file.
 305     # Note that blocks of metadata about a file cannot span a boundary between
 306     # metadata objects.
 307     my %info = ();
 308     my $line;
 309     my $last_key;
 310     foreach $line (split /\n/, $metadata) {
 311         # If we find a blank line or a reference to another block, process any
 312         # data for the previous file first.
 313         if ($line eq '' || $line =~ m/^@/) {
 314             process_file(%info) if %info;
 315             %info = ();
 316             undef $last_key;
 317             next if $line eq '';
 318         }
 319
 320         # Recursively handle indirect metadata blocks.
 321         if ($line =~ m/^@(\S+)$/) {
 322             print "Indirect: $1\n";
 323             my $indirect = load_ref($1);
 324             process_metadata($indirect, $recursion_level + 1);
 325             next;
 326         }
 327
 328         # Try to parse the data as "key: value" pairs of file metadata.  Also
 329         # handle continuation lines, which start with whitespace and continue
 330         # the previous "key: value" pair.
 331         if ($line =~ m/^(\w+):\s+(.*)\s*$/) {
 332             $info{$1} = $2;
 333             $last_key = $1;
 334         } elsif ($line =~/^\s/ && defined $last_key) {
 335             $info{$last_key} .= $line;
 336         } else {
 337             print STDERR "Junk in file metadata section: $line\n";
 338         }
 339     }
 340
 341     # Process any last file metadata which has not already been processed.
 342     process_file(%info) if %info;
 343 }
 344
 345 ############################### MAIN ENTRY POINT ##############################
 346 # Program start.  We expect to be called with a single argument, which is the
 347 # name of the backup descriptor file written by a backup pass.  This will name
 348 # the root object in the snapshot, from which we can reach all other data we
 349 # need.
 350
 351 # Parse command-line arguments.  The first (required) is the name of the
 352 # snapshot descriptor file.  The backup objects are assumed to be stored in the
 353 # same directory as the descriptor.  The second (optional) argument is the
 354 # directory where the restored files should be written; it defaults to ".";
 355 my $descriptor = $ARGV[0];
 356 unless (defined($descriptor) && -r $descriptor) {
 357     print STDERR "Usage: $0 <snapshot file>\n";
 358     exit 1;
 359 }
 360
 361 if (defined($ARGV[1])) {
 362     $DEST_DIR = $ARGV[1];
 363 }
 364
 365 $OBJECT_DIR = dirname($descriptor);
 366 print "Source directory: $OBJECT_DIR\n";
 367
 368 # Read the snapshot descriptor to find the root object.
 369 open DESCRIPTOR, "<", $descriptor
 370     or die "Cannot open backup descriptor file $descriptor: $!";
 371 my $line = <DESCRIPTOR>;
 372 if ($line !~ m/^root: (\S+)$/) {
 373     die "Expected 'root:' specification in backup descriptor file";
 374 }
 375 my $root = $1;
 376 close DESCRIPTOR;
 377
 378 # Set the umask to something restrictive.  As we unpack files, we'll originally
 379 # write the files/directories without setting the permissions, so be
 380 # conservative and ensure that they can't be read.  Afterwards, we'll properly
 381 # fix up permissions.
 382 umask 077;
 383
 384 # Start processing metadata stored in the root to recreate the files.
 385 print "Root object: $root\n";
 386 my $contents = load_ref($root);
 387 process_metadata($contents);