1 /* Cumulus: Smart Filesystem Backup to Dumb Servers
3 * Copyright (C) 2008 The Regents of the University of California
4 * Written by Michael Vrable <mvrable@cs.ucsd.edu>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 /* Allow for sub-file incremental backups: if only a portion of a file changes,
22 * allow the new data to be written out, and the old data to simply be
23 * referenced from the new metadata log. */
28 #include <arpa/inet.h>
31 #include "third_party/chunk.h"
32 #include "third_party/sha1.h"
/* Construct the subfile-incremental helper, bound to the local database used
 * for loading and storing chunk signatures.  Signature data is loaded lazily
 * (checksums_loaded starts false; see ensure_signatures_loaded), and no
 * analyzed-block summary exists yet. */
42 Subfile::Subfile(LocalDb *localdb)
43     : db(localdb), checksums_loaded(false), new_block_summary_valid(false)
/* (Destructor body fragment.)  Each block_list entry owns a chunk_info array
 * allocated with new[] in index_chunks(); release each one here. */
49     for (size_t i = 0; i < block_list.size(); i++) {
50         delete[] block_list[i].chunks;
/* Discard the analysis of the most recently analyzed block: free the
 * chunk_info array if one was allocated, then mark the summary invalid so it
 * cannot be reused by create_incremental(). */
56 void Subfile::free_analysis()
58     if (new_block_summary_valid)
59         delete[] new_block_summary.chunks;
61     new_block_summary_valid = false;
/* Record the blocks making up the old version of a file, so that their chunk
 * signatures can later be matched against newly analyzed data.  Only the base
 * (range-free) reference of each block is kept, and each base is inserted at
 * most once; the signatures themselves are loaded lazily by
 * ensure_signatures_loaded(). */
64 void Subfile::load_old_blocks(const list<ObjectReference> &blocks)
66     for (list<ObjectReference>::const_iterator i = blocks.begin();
67          i != blocks.end(); ++i) {
71         ObjectReference base = i->base();
        /* Skip duplicates: the set already contains this base reference. */
72         if (old_blocks.find(base) == old_blocks.end()) {
73             old_blocks.insert(base);
80 /* Actually load chunk signatures from the database, and index them in memory.
81  * This should only be called once per segment. */
82 void Subfile::index_chunks(ObjectReference ref)
84     string refstr = ref.to_string();
    /* Skip blocks whose data is no longer available in the local database. */
86     if (!db->IsAvailable(ref))
89     /* Look for checksums for this block in the database. They may not exist,
90      * in which case simply return without error. */
94     if (!db->LoadChunkSignatures(ref, (void **)&packed_sigs, &len, &algorithm))
    /* Signatures recorded under a different chunking/hash algorithm are
     * unusable; ignore them. */
96     if (algorithm != get_algorithm()) {
    /* This block gets the next slot in block_list; the chunk_index entries
     * created below refer back to it by this id. */
101     int block_id = block_list.size();
103     block_summary summary;
104     summary.ref = ref.base();
    /* Packed on-disk layout is, per chunk, a 2-byte length followed by a
     * HASH_SIZE-byte hash (the format written by store_block_signatures). */
105     summary.num_chunks = len / (2 + HASH_SIZE);
106     summary.chunks = new chunk_info[summary.num_chunks];
109     for (int i = 0; i < summary.num_chunks; i++) {
110         char *packed_info = &packed_sigs[i * (2 + HASH_SIZE)];
111         memcpy(summary.chunks[i].hash, &packed_info[2], HASH_SIZE);
        /* Lengths are stored in network byte order; convert to host order. */
114         memcpy(&chunk_len, &packed_info[0], 2);
115         summary.chunks[i].len = ntohs(chunk_len);
        /* Chunks tile the block back-to-back, so each offset is the running
         * sum of the preceding chunk lengths. */
116         summary.chunks[i].offset = block_start;
117         block_start += summary.chunks[i].len;
        /* Index hash -> (block index, chunk index) for later lookups in
         * create_incremental(). */
119         chunk_index[string(summary.chunks[i].hash, HASH_SIZE)]
120             = make_pair(block_id, i);
123     block_list.push_back(summary);
127 /* Signatures can be loaded lazily; this method should be called before any
128  * actual access to the chunk signatures is required, to ensure the data has
129  * actually been loaded. */
130 void Subfile::ensure_signatures_loaded()
    /* Loading happens at most once; later calls are no-ops. */
132     if (checksums_loaded)
    /* Index the signatures of every old block recorded by load_old_blocks. */
135     for (set<ObjectReference>::iterator i = old_blocks.begin();
136          i != old_blocks.end(); ++i) {
140     checksums_loaded = true;
/* Split a new block of file data into content-defined chunks and record each
 * chunk's offset, length, and hash into new_block_summary, for later matching
 * against old-block signatures in create_incremental(). */
143 void Subfile::analyze_new_block(const char *buf, size_t len)
    /* Upper bound on the number of chunk breakpoints for a buffer of this
     * size; used to size the breakpoints array below. */
147     int max_chunks = chunk_compute_max_num_breaks(len);
151     size_t *breakpoints = new size_t[max_chunks];
152     int num_breakpoints = chunk_compute_breaks(buf, len, breakpoints);
    /* No chunks produced: free the scratch array and leave no summary. */
154     if (num_breakpoints == 0) {
155         delete[] breakpoints;
159     new_block_summary.num_chunks = num_breakpoints;
160     new_block_summary.chunks = new chunk_info[num_breakpoints];
    /* breakpoints[i] is the inclusive offset of the last byte of chunk i, so
     * the chunks tile the buffer: len = breakpoint - start + 1, and the next
     * chunk starts at breakpoint + 1. */
163     for (int i = 0; i < num_breakpoints; i++) {
164         new_block_summary.chunks[i].offset = block_start;
165         new_block_summary.chunks[i].len = breakpoints[i] - block_start + 1;
166         block_start = breakpoints[i] + 1;
        /* Hash each chunk; the digest must be exactly HASH_SIZE bytes. */
169         hash.process(&buf[new_block_summary.chunks[i].offset],
170                      new_block_summary.chunks[i].len);
171         assert(hash.checksum_size() == (size_t)HASH_SIZE);
172         memcpy(new_block_summary.chunks[i].hash, hash.checksum(), HASH_SIZE);
175     new_block_summary_valid = true;
176     delete[] breakpoints;
/* Serialize the chunk signatures of one block and save them to the local
 * database, keyed by the block's object reference.  Per chunk the packed form
 * is a 2-byte length in network byte order followed by the HASH_SIZE-byte
 * hash — the same layout index_chunks() parses back.
 * NOTE(review): the free() of the malloc'd buffer is not visible in this
 * excerpt — presumably it follows the StoreChunkSignatures call; confirm. */
179 void Subfile::store_block_signatures(ObjectReference ref, block_summary summary)
181     int n = summary.num_chunks;
182     char *packed = (char *)malloc(n * (2 + HASH_SIZE));
184     for (int i = 0; i < n; i++) {
        /* Chunk lengths must fit the 16-bit on-disk length field. */
185         assert(summary.chunks[i].len >= 0 && summary.chunks[i].len <= 0xffff);
186         uint16_t len = htons(summary.chunks[i].len);
187         char *packed_info = &packed[i * (2 + HASH_SIZE)];
188         memcpy(&packed_info[0], &len, 2);
189         memcpy(&packed_info[2], summary.chunks[i].hash, HASH_SIZE);
192     db->StoreChunkSignatures(ref, packed, n * (2 + HASH_SIZE),
/* Save signatures for the most recently analyzed block, but only when the
 * block is at least 16 KiB — smaller blocks are not worth tracking for
 * subfile matching. */
198 void Subfile::store_analyzed_signatures(ObjectReference ref)
200     if (analyzed_len >= 16384)
201         store_block_signatures(ref, new_block_summary);
204 /* Compute an incremental representation of the most recent block analyzed. */
/* One element of the incremental encoding produced by create_incremental():
 * either a reference to a chunk already stored in an old block (SUBFILE_COPY)
 * or a run of new literal data that must be written out (SUBFILE_NEW).
 * NOTE(review): some field declarations (e.g. the `ref` and `len` members
 * used by create_incremental) fall in lines elided from this view. */
205 enum subfile_item_type { SUBFILE_COPY, SUBFILE_NEW };
207 struct subfile_item {
208     subfile_item_type type;
210     // For type SUBFILE_COPY
213     // For type SUBFILE_NEW
    /* src_offset: chunk position in the analyzed buffer; dst_offset: position
     * it will occupy in the new literal-data object. */
214     int src_offset, dst_offset;
    /* Chunk hash, kept so signatures can be saved for the new object. */
216     char hash[Subfile::HASH_SIZE];
219 /* Compute an incremental representation of the data last analyzed. A list of
220  * references will be returned corresponding to the data. If new data must be
221  * written out to the backup, it will be written out via the LbsObject
222  * provided, to the provided TarSegmentStore. */
223 list<ObjectReference> Subfile::create_incremental(TarSegmentStore *tss,
    /* A block must have been analyzed (analyze_new_block) before calling. */
227     assert(new_block_summary_valid);
228     bool matched_old = false;
231     list<subfile_item> items;
232     list<ObjectReference> refs;
234     ensure_signatures_loaded();
236     assert(new_block_summary.num_chunks > 0);
    /* Pass 1: classify each chunk of the analyzed block as either a copy of
     * an already-stored chunk (found via chunk_index) or new literal data. */
238     for (int i = 0; i < new_block_summary.num_chunks; i++) {
239         map<string, pair<int, int> >::iterator m
240             = chunk_index.find(string(new_block_summary.chunks[i].hash,
243         struct subfile_item item;
        /* Unknown hash: new data, appended at the next free offset
         * (new_data) of the literal buffer to be written out. */
244         if (m == chunk_index.end()) {
245             item.type = SUBFILE_NEW;
246             item.src_offset = new_block_summary.chunks[i].offset;
247             item.dst_offset = new_data;
248             item.len = new_block_summary.chunks[i].len;
249             memcpy(item.hash, new_block_summary.chunks[i].hash, HASH_SIZE);
250             new_data += item.len;
        /* Known hash: reference the exact byte range of the old chunk rather
         * than storing the data again. */
252             struct chunk_info &old_chunk
253                 = block_list[m->second.first].chunks[m->second.second];
254             item.type = SUBFILE_COPY;
255             item.ref = block_list[m->second.first].ref;
256             item.ref.set_range(old_chunk.offset, old_chunk.len);
260         items.push_back(item);
263     // No data was matched. The entire block can be written out as is into a
264     // new object, and the new_block_summary used to save chunk signatures.
266         SHA1Checksum block_hash;
267         block_hash.process(analyzed_buf, analyzed_len);
268         string block_csum = block_hash.checksum_str();
270         o->set_data(analyzed_buf, analyzed_len);
        /* Record the whole-block object plus its chunk signatures so that
         * future backups can match against it. */
272         ObjectReference ref = o->get_ref();
273         db->StoreObject(ref, block_csum, analyzed_len, block_age);
274         store_analyzed_signatures(ref);
275         ref.set_range(0, analyzed_len, true);
281     // Otherwise, construct a new block containing all literal data needed (if
282     // any exists), write it out, and construct the appropriate list of
284     list<subfile_item>::iterator i;
        /* Gather all SUBFILE_NEW chunks into one contiguous buffer of
         * new_data bytes, at the dst_offsets assigned in pass 1. */
286         char *literal_buf = new char[new_data];
287         for (i = items.begin(); i != items.end(); ++i) {
288             if (i->type == SUBFILE_NEW) {
289                 memcpy(&literal_buf[i->dst_offset],
290                        &analyzed_buf[i->src_offset], i->len);
        /* Checksum the literal data and write it as a new object in the
         * "data" group. */
294         SHA1Checksum block_hash;
295         block_hash.process(literal_buf, new_data);
296         string block_csum = block_hash.checksum_str();
298         o->set_group("data");
299         o->set_data(literal_buf, new_data);
        /* Point each SUBFILE_NEW item at its byte range inside the freshly
         * written literal-data object. */
301         ObjectReference ref = o->get_ref();
302         for (i = items.begin(); i != items.end(); ++i) {
303             if (i->type == SUBFILE_NEW) {
305                 i->ref.set_range(i->dst_offset, i->len);
309         db->StoreObject(ref, block_csum, new_data, 0.0);
        /* Save chunk signatures for just the new literal chunks, so they can
         * be matched by later backups.  summary.chunks is sized for the worst
         * case (every item new); num_chunks counts only those actually used. */
311         block_summary summary;
313         summary.num_chunks = 0;
314         summary.chunks = new chunk_info[items.size()];
315         for (i = items.begin(); i != items.end(); ++i) {
316             if (i->type == SUBFILE_NEW) {
317                 chunk_info &info = summary.chunks[summary.num_chunks];
318                 memcpy(info.hash, i->hash, HASH_SIZE);
319                 info.offset = i->dst_offset;
321                 summary.num_chunks++;
325         store_block_signatures(ref, summary);
327         delete[] summary.chunks;
328         delete[] literal_buf;
    /* Finally, build the result list, merging adjacent per-chunk references
     * into a single ObjectReference where ref.merge() permits. */
334     for (i = items.begin(); i != items.end(); ++i) {
335         string refstr = i->ref.to_string();
336         if (!ref.merge(i->ref)) {
341     assert(!ref.is_null());