contrib/restore.pl

   1 #!/usr/bin/perl -w
   2 #
   3 # Proof-of-concept/reference decoder for LBS-format backup snapshots.
   4 #
   5 # This decoder aims to decompress an LBS snapshot.  It is not meant to be
   6 # particularly efficient, but should be a small and portable tool for doing so
   7 # (important for recovering from data loss).  It is also meant to serve as a
   8 # check on the snapshot tool and data format itself, and serve as documentation
   9 # for the format.
  10 #
  11 # This decoder does not understand TAR archives; it assumes that all segments
  12 # in the snapshot have already been decompressed, and that objects are
  13 # available simply as files in the filesystem.  This simplifies the design.
  14 #
  15 # Limitations: Since this code is probably using 32-bit arithmetic, files
  16 # larger than 2-4 GB may not be properly handled.
  17 #
  18 # Copyright (C) 2007  Michael Vrable
  19
  20 use strict;
  21 use Digest::SHA1;
  22 use File::Basename;
  23
  24 my $OBJECT_DIR;                 # Where are the unpacked objects available?
  25 my $DEST_DIR = ".";             # Where should restored files be placed?
  26 my $RECURSION_LIMIT = 3;        # Bound on recursive object references
  27
  28 my $VERBOSE = 0;                # Set to 1 to enable debugging messages
  29
  30 ############################ CHECKSUM VERIFICATION ############################
  31 # A very simple layer for verifying checksums.  Checksums may be used on object
  32 # references directly, and can also be used to verify entire reconstructed
  33 # files.
  34 #
  35 # A checksum to verify is given in the form "algorithm=hexdigest".  Given such
  36 # a string, we can construct a "verifier" object.  Bytes can be incrementally
  37 # added to the verifier, and at the end a test can be made to see if the
  38 # checksum matches.  The caller need not know what algorithm is used.  However,
  39 # at the moment we only support SHA-1 for computing digests (algorithm name
  40 # "sha1").
  41 sub verifier_create {
  42     my $checksum = shift;
  43
  44     if ($checksum !~ m/^(\w+)=([0-9a-f]+)$/) {
  45         die "Malformed checksum: $checksum";
  46     }
  47     my ($algorithm, $hash) = ($1, $2);
  48     if ($algorithm ne 'sha1') {
  49         die "Unsupported checksum algorithm: $algorithm";
  50     }
  51
  52     my %verifier = (
  53         ALGORITHM => $algorithm,
  54         HASH => $hash,
  55         DIGESTER => new Digest::SHA1
  56     );
  57
  58     return \%verifier;
  59 }
  60
  61 sub verifier_add_bytes {
  62     my $verifier = shift;
  63     my $digester = $verifier->{DIGESTER};
  64     my $data = shift;
  65
  66     $digester->add($data);
  67 }
  68
  69 sub verifier_check {
  70     my $verifier = shift;
  71     my $digester = $verifier->{DIGESTER};
  72
  73     my $newhash = $digester->hexdigest();
  74     if ($VERBOSE && $verifier->{HASH} ne $newhash) {
  75         print STDERR "Verification failure: ",
  76             $newhash, " != ", $verifier->{HASH}, "\n";
  77     }
  78     return ($verifier->{HASH} eq $newhash);
  79 }
  80
  81 ################################ OBJECT ACCESS ################################
  82 # The base of the decompressor is the object reference layer.  See ref.h for a
  83 # description of the format for object references.  These functions will parse
  84 # an object reference, locate the object data from the filesystem, perform any
  85 # necessary integrity checks (if a checksum is included), and return the object
  86 # data.
  87 sub load_ref {
  88     my $ref_str = shift;
  89
  90     # Check for special objects before attempting general parsing.
  91     if ($ref_str =~ m/^zero\[((\d+)\+)?(\d+)\]$/) {
  92         return "\0" x ($3 + 0);
  93     }
  94
  95     # Try to parse the object reference string into constituent pieces.  The
  96     # format is segment/object(checksum)[range].  Both the checksum and range
  97     # are optional.
  98     if ($ref_str !~ m/^([-0-9a-f]+)\/([0-9a-f]+)(\(\S+\))?(\[\S+\])?$/) {
  99         die "Malformed object reference: $ref_str";
 100     }
 101
 102     my ($segment, $object, $checksum, $range) = ($1, $2, $3, $4);
 103
 104     # Next, use the segment/object components to locate and read the object
 105     # contents from disk.
 106     open OBJECT, "<", "$OBJECT_DIR/$segment/$object"
 107         or die "Unable to open object $OBJECT_DIR/$segment/$object: $!";
 108     my $contents = join '', <OBJECT>;
 109     close OBJECT;
 110
 111     # If a checksum was specified in the object reference, verify the object
 112     # integrity by computing a checksum of the read data and comparing.
 113     if ($checksum) {
 114         $checksum =~ m/^\((\S+)\)$/;
 115         my $verifier = verifier_create($1);
 116         verifier_add_bytes($verifier, $contents);
 117         if (!verifier_check($verifier)) {
 118             die "Integrity check for object $ref_str failed";
 119         }
 120     }
 121
 122     # If a range was specified, then only a subset of the bytes of the object
 123     # are desired.  Extract just the desired bytes.
 124     if ($range) {
 125         my $object_size = length $contents;
 126         my ($start, $length);
 127
 128         if ($range =~ m/^\[=(\d+)\]$/) {
 129             die "Object size incorrect (ref $ref_str, actual size $object_size"
 130                 if $object_size != $1;
 131             ($start, $length) = (0, $1 + 0);
 132         } elsif ($range =~ m/^\[(\d+)\]$/) {
 133             ($start, $length) = (0, $1 + 0);
 134         } elsif ($range =~ m/^\[(\d+)\+(\d+)\]$/) {
 135             ($start, $length) = ($1 + 0, $2 + 0);
 136         } else {
 137             die "Malformed object range: $range";
 138         }
 139
 140         if ($start >= $object_size || $start + $length > $object_size) {
 141             die "Object range $range falls outside object bounds "
 142                 . "(actual size $object_size)";
 143         }
 144
 145         $contents = substr $contents, $start, $length;
 146     }
 147
 148     return $contents;
 149 }
 150
 151 ############################### FILE PROCESSING ###############################
 152 # Process the metadata for a single file.  process_file is the main entry
 153 # point; it should be given a list of file metadata key/value pairs.
 154 # iterate_objects is a helper function used to iterate over the set of object
 155 # references that contain the file data for a regular file.
 156
 157 sub parse_int {
 158     my $str = shift;
 159     if ($str =~ /^0/) {
 160         return oct($str);
 161     } else {
 162         return $str + 0;
 163     }
 164 }
 165
 166 sub uri_decode {
 167     my $str = shift;
 168     $str =~ s/%([0-9a-f]{2})/chr(hex($1))/ge;
 169     return $str;
 170 }
 171
 172 sub iterate_objects {
 173     my $callback = shift;       # Function to be called for each reference
 174     my $arg = shift;            # Argument passed to callback
 175     my $text = shift;           # Whitespace-separate list of object references
 176
 177     # Simple limit to guard against cycles in the object references
 178     my $recursion_level = shift || 0;
 179     if ($recursion_level >= $RECURSION_LIMIT) {
 180         die "Recursion limit reached";
 181     }
 182
 183     # Split the provided text at whitespace boundaries to produce the list of
 184     # object references.  If any of these start with "@", then we have an
 185     # indirect reference, and must look up that object and call iterate_objects
 186     # with the contents.
 187     my $obj;
 188     foreach $obj (split /\s+/, $text) {
 189         next if $obj eq "";
 190         if ($obj =~ /^@(\S+)$/) {
 191             my $indirect = load_ref($1);
 192             iterate_objects($callback, $arg, $indirect, $recursion_level + 1);
 193         } else {
 194             &$callback($arg, $obj);
 195         }
 196     }
 197 }
 198
 199 sub obj_callback {
 200     my $state = shift;
 201     my $obj = shift;
 202     my $data = load_ref($obj);
 203     print FILE $data
 204         or die "Error writing file data: $!";
 205     verifier_add_bytes($state->{VERIFIER}, $data);
 206     $state->{BYTES} += length($data);
 207 }
 208
 209 # Extract the contents of a regular file by concatenating all the objects that
 210 # comprise it.
 211 sub unpack_file {
 212     my $name = shift;
 213     my %info = @_;
 214     my %state = ();
 215
 216     if (!defined $info{data}) {
 217         die "File contents not specified for $name";
 218     }
 219     if (!defined $info{checksum} || !defined $info{size}) {
 220         die "File $name is missing checksum or size";
 221     }
 222
 223     $info{size} = parse_int($info{size});
 224
 225     # Open the file to be recreated.  The data will be written out by the call
 226     # to iterate_objects.
 227     open FILE, ">", "$DEST_DIR/$name"
 228         or die "Cannot write file $name: $!";
 229
 230     # Set up state so that we can incrementally compute the checksum and length
 231     # of the reconstructed data.  Then iterate over all objects in the file.
 232     $state{VERIFIER} = verifier_create($info{checksum});
 233     $state{BYTES} = 0;
 234     iterate_objects(\&obj_callback, \%state, $info{data});
 235
 236     close FILE;
 237
 238     # Verify that the reconstructed object matches the size/checksum we were
 239     # given.
 240     if (!verifier_check($state{VERIFIER}) || $state{BYTES} != $info{size}) {
 241         die "File reconstruction failed for $name: size or checksum differs";
 242     }
 243 }
 244
 245 sub process_file {
 246     my %info = @_;
 247
 248     if (!defined($info{name})) {
 249         die "Filename not specified in metadata block";
 250     }
 251
 252     my $type = $info{type};
 253
 254     my $filename = uri_decode($info{name});
 255     print "$filename\n" if $VERBOSE;
 256
 257     # Restore the specified file.  How to do so depends upon the file type, so
 258     # dispatch based on that.
 259     my $dest = "$DEST_DIR/$filename";
 260     if ($type eq '-' || $type eq 'f') {
 261         # Regular file
 262         unpack_file($filename, %info);
 263     } elsif ($type eq 'd') {
 264         # Directory
 265         if ($filename ne '.') {
 266             mkdir $dest or die "Cannot create directory $filename: $!";
 267         }
 268     } elsif ($type eq 'l') {
 269         # Symlink
 270         my $target = $info{target} || $info{contents};
 271         if (!defined($target)) {
 272             die "Symlink $filename has no value specified";
 273         }
 274         $target = uri_decode($target);
 275         symlink $target, $dest
 276             or die "Cannot create symlink $filename: $!";
 277
 278         # TODO: We can't properly restore all metadata for symbolic links
 279         # (attempts to do so below will change metadata for the pointed-to
 280         # file).  This should be later fixed, but for now we simply return
 281         # before getting to the restore metadata step below.
 282         return;
 283     } elsif ($type eq 'p' || $type eq 's' || $type eq 'c' || $type eq 'b') {
 284         # Pipe, socket, character device, block device.
 285         # TODO: Handle these cases.
 286         print STDERR "Ignoring special file $filename of type $type\n";
 287         return;
 288     } else {
 289         die "Unknown file type '$type' for file $filename";
 290     }
 291
 292     # Restore mode, ownership, and any other metadata for the file.  This is
 293     # split out from the code above since the code is the same regardless of
 294     # file type.
 295     my $mtime = $info{mtime} || time();
 296     utime time(), $mtime, $dest
 297         or warn "Unable to update mtime for $dest";
 298
 299     my $uid = -1;
 300     my $gid = -1;
 301     if (defined $info{user}) {
 302         my @items = split /\s/, $info{user};
 303         $uid = parse_int($items[0]) if exists $items[0];
 304     }
 305     if (defined $info{group}) {
 306         my @items = split /\s/, $info{group};
 307         $gid = parse_int($items[0]) if exists $items[0];
 308     }
 309     chown $uid, $gid, $dest
 310         or warn "Unable to change ownership for $dest";
 311
 312     if (defined $info{mode}) {
 313         my $mode = parse_int($info{mode});
 314         chmod $mode, $dest
 315             or warn "Unable to change permissions for $dest";
 316     }
 317 }
 318
 319 ########################### METADATA LIST PROCESSING ##########################
 320 # Process the file metadata listing provided, and as information for each file
 321 # is extracted, pass it to process_file.  This will recursively follow indirect
 322 # references to other metadata objects.
 323 sub process_metadata {
 324     my ($metadata, $recursion_level) = @_;
 325
 326     # Check recursion; this will prevent us from infinitely recursing on an
 327     # indirect reference which loops back to itself.
 328     $recursion_level ||= 0;
 329     if ($recursion_level >= $RECURSION_LIMIT) {
 330         die "Recursion limit reached";
 331     }
 332
 333     # Split the metadata into lines, then start processing each line.  There
 334     # are two primary cases:
 335     #   - Lines starting with "@" are indirect references to other metadata
 336     #     objects.  Recursively process that object before continuing.
 337     #   - Other lines should come in groups separated by a blank line; these
 338     #     contain metadata for a single file that should be passed to
 339     #     process_file.
 340     # Note that blocks of metadata about a file cannot span a boundary between
 341     # metadata objects.
 342     my %info = ();
 343     my $line;
 344     my $last_key;
 345     foreach $line (split /\n/, $metadata) {
 346         # If we find a blank line or a reference to another block, process any
 347         # data for the previous file first.
 348         if ($line eq '' || $line =~ m/^@/) {
 349             process_file(%info) if %info;
 350             %info = ();
 351             undef $last_key;
 352             next if $line eq '';
 353         }
 354
 355         # Recursively handle indirect metadata blocks.
 356         if ($line =~ m/^@(\S+)$/) {
 357             print "Indirect: $1\n" if $VERBOSE;
 358             my $indirect = load_ref($1);
 359             process_metadata($indirect, $recursion_level + 1);
 360             next;
 361         }
 362
 363         # Try to parse the data as "key: value" pairs of file metadata.  Also
 364         # handle continuation lines, which start with whitespace and continue
 365         # the previous "key: value" pair.
 366         if ($line =~ m/^([-\w]+):\s*(.*)$/) {
 367             $info{$1} = $2;
 368             $last_key = $1;
 369         } elsif ($line =~/^\s/ && defined $last_key) {
 370             $info{$last_key} .= $line;
 371         } else {
 372             print STDERR "Junk in file metadata section: $line\n";
 373         }
 374     }
 375
 376     # Process any last file metadata which has not already been processed.
 377     process_file(%info) if %info;
 378 }
 379
 380 ############################### MAIN ENTRY POINT ##############################
 381 # Program start.  We expect to be called with a single argument, which is the
 382 # name of the backup descriptor file written by a backup pass.  This will name
 383 # the root object in the snapshot, from which we can reach all other data we
 384 # need.
 385
 386 # Parse command-line arguments.  The first (required) is the name of the
 387 # snapshot descriptor file.  The backup objects are assumed to be stored in the
 388 # same directory as the descriptor.  The second (optional) argument is the
 389 # directory where the restored files should be written; it defaults to ".".
 390 my $descriptor = $ARGV[0];
 391 unless (defined($descriptor) && -r $descriptor) {
 392     print STDERR "Usage: $0 <snapshot file>\n";
 393     exit 1;
 394 }
 395
 396 if (defined($ARGV[1])) {
 397     $DEST_DIR = $ARGV[1];
 398 }
 399
 400 $OBJECT_DIR = dirname($descriptor);
 401 print "Source directory: $OBJECT_DIR\n" if $VERBOSE;
 402
 403 # Read the snapshot descriptor to find the root object.  Parse it to get a set
 404 # of key/value pairs.
 405 open DESCRIPTOR, "<", $descriptor
 406     or die "Cannot open backup descriptor file $descriptor: $!";
 407 my %descriptor = ();
 408 my ($line, $last_key);
 409 while (defined($line = <DESCRIPTOR>)) {
 410     # Any lines of the form "key: value" should be inserted into the
 411     # %descriptor dictionary.  Any continuation line (a line starting with
 412     # whitespace) will append text to the previous key's value.  Ignore other
 413     # lines.
 414     chomp $line;
 415
 416     if ($line =~ m/^([-\w]+):\s*(.*)$/) {
 417         $descriptor{$1} = $2;
 418         $last_key = $1;
 419     } elsif ($line =~/^\s/ && defined $last_key) {
 420         $descriptor{$last_key} .= $line;
 421     } else {
 422         undef $last_key;
 423         print STDERR "Ignoring line in backup descriptor: $line\n";
 424     }
 425 }
 426
 427 # A valid backup descriptor should at the very least specify the root metadata
 428 # object.
 429 if (!exists $descriptor{Root}) {
 430     die "Expected 'Root:' specification in backup descriptor file";
 431 }
 432 my $root = $descriptor{Root};
 433 close DESCRIPTOR;
 434
 435 # Set the umask to something restrictive.  As we unpack files, we'll originally
 436 # write the files/directories without setting the permissions, so be
 437 # conservative and ensure that they can't be read.  Afterwards, we'll properly
 438 # fix up permissions.
 439 umask 077;
 440
 441 # Start processing metadata stored in the root to recreate the files.
 442 print "Root object: $root\n" if $VERBOSE;
 443 my $contents = load_ref($root);
 444 process_metadata($contents);