# Generate parity blocks for backup segments, so that the corruption or loss of
# some segment data can be recovered and does not render a snapshot unusable.
7 # par2 is used to generate the parity blocks. This script merely automates the
8 # creation and maintenance of these parity files.
# This program will incrementally update parity files.  Input files which are
# not protected will have parity files generated.  If there are many new files,
# they will be grouped into blocks of about 16 files and parity files generated
# for each block.  If files have been deleted, obsolete parity sets will be
# removed and the remaining files protected again.
16 # The generated parity files are given a name of parity-<hexdigits>.par2.
17 # Associated with each is a parity-<hexdigits>.manifest file which specifies
18 # how many file losses are protected against, and which files specifically are
19 # protected by that set.
21 import md5, os, re, sys
22 from subprocess import Popen
24 SEG_REGEXP = r"^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})(\.\S+)?$"
def build_parity_set(files, recoverycount):
    """Low-level function to call out to par2 to generate parity files.

    Given a specific list of files, use par2 to generate a set of checksums for
    recovery of that exact file list.  Arrange so that at least recoverycount
    files can be lost from the set and still recover the data.

    The parity set is named parity-<md5 of the sorted file names>, and a
    matching parity-<hexdigits>.manifest file is written next to it.  The
    manifest holds recoverycount on its first line followed by one protected
    file name per line.

    Args:
        files: Names of the files to protect, relative to the module-level
            ``basedir``.  Must not be empty.
        recoverycount: Number of whole-file losses the set must survive.

    Raises:
        ValueError: If ``files`` is empty.
        RuntimeError: If par2 exits with a nonzero return code.
    """
    # hashlib supersedes the deprecated md5 module and yields the same digest.
    import hashlib

    if not files:
        raise ValueError("build_parity_set() requires at least one file")

    # Sort a copy so the set name depends only on which files are protected,
    # not on the order the caller happened to pass them in.
    files = sorted(files)

    digest = hashlib.md5()
    for f in files:
        digest.update(f if isinstance(f, bytes) else f.encode("utf-8"))
    name = digest.hexdigest()

    # Pick a block size so that the largest file spans at most 16 blocks, then
    # round up to a multiple of 4.  The floor of 4 keeps the size non-zero
    # when every input file is empty.
    maxsize = max(os.stat(os.path.join(basedir, f)).st_size for f in files)
    blocksize = (maxsize + 15) // 16
    blocksize = max(4, (blocksize + 3) & ~3)

    print("%s %s %s" % (name, blocksize, files))

    # Each file occupies at most 16 blocks, so 16 * recoverycount recovery
    # blocks are enough to rebuild recoverycount lost files.
    p = Popen(["par2", "create", "-s%d" % (blocksize,),
               "-c%d" % (recoverycount * 16), "-n%d" % (recoverycount,),
               "-u", "-v", "parity-%s" % (name,)] + files,
              cwd=basedir)
    retcode = p.wait()
    if retcode:
        raise RuntimeError("Nonzero return code from par2: %s" % (retcode,))

    # Only record the manifest once par2 has succeeded.
    manifest_path = os.path.join(basedir, "parity-%s.manifest" % (name,))
    with open(manifest_path, 'w') as descfile:
        descfile.write("%d\n" % (recoverycount,))
        for f in files:
            descfile.write(f + "\n")
def generate_parity(files, group_size=16):
    """Generate parity files for the given list of input files.

    This will automatically break the list of input files into smaller chunks,
    and generate separate parity files for each chunk.

    Args:
        files: Iterable of file names to protect (a set is accepted).  An
            empty iterable is a no-op.
        group_size: Maximum number of files protected by one parity set.

    Raises:
        ValueError: If ``group_size`` is less than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    # Sorting gives deterministic grouping and turns a set into a list that
    # can be sliced.
    pending = sorted(files)

    for start in range(0, len(pending), group_size):
        chunk = pending[start:start + group_size]
        # Tolerate the loss of roughly a quarter of each chunk, and always at
        # least one file.
        redundancy = 1 + len(chunk) // 4
        build_parity_set(chunk, redundancy)
def remove_parity(manifest):
    """Delete the given set of parity files and associated metadata.

    Args:
        manifest: Name of the parity set, either the bare base name
            (parity-<hexdigits>) or its manifest file name
            (parity-<hexdigits>.manifest).

    Candidate files are taken from the module-level ``file_list`` and removed
    from the module-level ``basedir``.
    """
    suffix = ".manifest"
    if manifest.endswith(suffix):
        manifest = manifest[:-len(suffix)]

    # Match on the base name plus "." so that only files belonging to this
    # set are selected, never another set whose name merely shares the prefix.
    prefix = manifest + "."
    matches = [f for f in file_list if f.startswith(prefix)]
    print("Delete %s" % (matches,))
    for f in matches:
        os.unlink(os.path.join(basedir, f))
# Script body: scan the segment directory, retire parity sets that no longer
# match the files on disk, and build parity for every unprotected segment.
if len(sys.argv) < 2:
    sys.exit("Usage: %s <segment-directory>" % (sys.argv[0],))

basedir = sys.argv[1]
file_list = os.listdir(basedir)

parity_list = [f for f in file_list if re.match(r"^parity-.*\.manifest$", f)]
segment_list = set(f for f in file_list if re.match(SEG_REGEXP, f))

to_delete = []

for p in parity_list:
    # Manifest layout: first line is the redundancy count, the remaining
    # lines name the protected files.
    with open(os.path.join(basedir, p)) as manifest_file:
        files = [l.strip() for l in manifest_file]
    redundancy = int(files[0])
    files = files[1:]

    missing = sum(1 for f in files
                  if not os.access(os.path.join(basedir, f), os.F_OK))
    min_redundancy = 1 + len(files) // 8
    if missing:
        print("Parity set %s is missing %d files, can recover %d more"
              % (p, missing, redundancy - missing))

    # Any set with missing files, or with too little redundancy left, is
    # retired; its surviving files stay in segment_list and get new parity.
    if redundancy - missing < min_redundancy or missing:
        to_delete.append(p)
    else:
        segment_list.difference_update(files)

print("Need parity: %s" % (list(segment_list),))
generate_parity(segment_list)

# Obsolete sets are removed only after replacement parity has been created.
print("Deleting: %s" % (to_delete,))
for p in to_delete:
    remove_parity(p)