1 /* Cumulus: Efficient Filesystem Backup to the Cloud
2 * Copyright (C) 2008 The Cumulus Developers
3 * See the AUTHORS file for a list of contributors.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 /* Allow for sub-file incremental backups: if only a portion of a file changes,
21 * allow the new data to be written out, and the old data to simply be
22 * referenced from the new metadata log. */
27 #include <arpa/inet.h>
31 #include "third_party/chunk.h"
// Construct a sub-file delta helper bound to the given local database.
// Chunk signatures are loaded lazily (checksums_loaded starts false) and
// no block has been analyzed yet (new_block_summary_valid starts false).
// NOTE(review): this listing elides lines (e.g. braces and any cleanup of
// `hasher`) — confirm against the full file.
41 Subfile::Subfile(LocalDb *localdb)
42 : db(localdb), checksums_loaded(false), new_block_summary_valid(false)
44 Hash *hasher = Hash::New();
// algorithm_name identifies the "<chunker>/<hash>" pair in use, so stored
// signatures can later be rejected if they were made with different
// algorithms (see the comparison in index_chunks()).
46 algorithm_name = chunk_algorithm_name() + "/" + hasher->name();
47 hash_size = hasher->digest_size();
// Destructor body (the function header is elided from this listing):
// release the chunk_info arrays owned by each indexed old block.
53 for (size_t i = 0; i < block_list.size(); i++) {
54 delete[] block_list[i].chunks;
// Discard the results of the most recent analyze_new_block() call, freeing
// the chunk array if one was allocated.  Safe to call repeatedly or when no
// analysis is outstanding — new_block_summary_valid guards the delete and is
// always reset to false.
60 void Subfile::free_analysis()
62 if (new_block_summary_valid)
63 delete[] new_block_summary.chunks;
65 new_block_summary_valid = false;
// Record the base object references of the blocks that made up the old
// version of a file, so their chunk signatures can later be indexed and
// matched against newly analyzed data.  Each base is inserted only once
// (old_blocks is a set, and the find() guard skips duplicates).
68 void Subfile::load_old_blocks(const list<ObjectReference> &blocks)
70 for (list<ObjectReference>::const_iterator i = blocks.begin();
71 i != blocks.end(); ++i) {
// Only the base reference (without any byte-range qualifier) is tracked.
75 ObjectReference base = i->base();
76 if (old_blocks.find(base) == old_blocks.end()) {
77 old_blocks.insert(base);
84 /* Actually load chunk signatures from the database, and index them in memory.
85 * This should only be called once per segment. */
86 void Subfile::index_chunks(ObjectReference ref)
88 string refstr = ref.to_string();
// Skip blocks whose data is no longer available in the store (the elided
// line after this condition presumably returns early — confirm in full file).
90 if (!db->IsAvailable(ref))
93 /* Look for checksums for this block in the database. They may not exist,
94 * in which case simply return without error. */
98 if (!db->LoadChunkSignatures(ref, (void **)&packed_sigs, &len, &algorithm))
// Signatures recorded under a different chunker/hash combination cannot be
// reused against the current algorithm_name; ignore them.
100 if (algorithm != algorithm_name) {
// block_id is this block's index in block_list; chunk_index entries below
// point back to it.
105 int block_id = block_list.size();
107 block_summary summary;
108 summary.ref = ref.base();
// Packed on-disk format: per chunk, a 2-byte length followed by the raw
// digest (hash_size bytes), hence the (2 + hash_size) stride.
109 summary.num_chunks = len / (2 + hash_size);
110 summary.chunks = new chunk_info[summary.num_chunks];
113 for (int i = 0; i < summary.num_chunks; i++) {
114 char *packed_info = &packed_sigs[i * (2 + hash_size)];
115 summary.chunks[i].hash = string(&packed_info[2], hash_size);
// Lengths are stored in network byte order (see htons in
// store_block_signatures); chunk offsets are reconstructed by accumulating
// the lengths into block_start.
118 memcpy(&chunk_len, &packed_info[0], 2);
119 summary.chunks[i].len = ntohs(chunk_len);
120 summary.chunks[i].offset = block_start;
121 block_start += summary.chunks[i].len;
// chunk_index maps a chunk hash to (block_list index, chunk index within
// that block) for O(log n) lookup during create_incremental().
123 chunk_index[summary.chunks[i].hash] = make_pair(block_id, i);
126 block_list.push_back(summary);
130 /* Signatures can be loaded lazily; this method should be called before any
131 * actual access to the chunk signatures is required, to ensure the data has
132 * actually been loaded. */
133 void Subfile::ensure_signatures_loaded()
// Idempotent: a second call is a no-op once checksums_loaded is set.
135 if (checksums_loaded)
// Index signatures for every old block registered via load_old_blocks().
// (The loop body is elided in this listing — presumably index_chunks(*i);
// confirm against the full file.)
138 for (set<ObjectReference>::iterator i = old_blocks.begin();
139 i != old_blocks.end(); ++i) {
143 checksums_loaded = true;
// Split a new block of file data into content-defined chunks and record,
// for each chunk, its offset, length, and hash in new_block_summary.  The
// results stay valid until free_analysis() is called.
146 void Subfile::analyze_new_block(const char *buf, size_t len)
150 int max_chunks = chunk_compute_max_num_breaks(len);
154 size_t *breakpoints = new size_t[max_chunks];
155 int num_breakpoints = chunk_compute_breaks(buf, len, breakpoints);
// No chunk boundaries found (e.g. empty input): free the scratch array and
// bail out (early return elided in this listing) without producing a summary.
157 if (num_breakpoints == 0) {
158 delete[] breakpoints;
162 new_block_summary.num_chunks = num_breakpoints;
163 new_block_summary.chunks = new chunk_info[num_breakpoints];
// breakpoints[i] is the index of the LAST byte of chunk i, hence the "+ 1"
// in both the length computation and the next chunk's starting offset.
166 for (int i = 0; i < num_breakpoints; i++) {
167 new_block_summary.chunks[i].offset = block_start;
168 new_block_summary.chunks[i].len = breakpoints[i] - block_start + 1;
169 block_start = breakpoints[i] + 1;
// Hash exactly the bytes of this chunk; the digest becomes the key used to
// match against old chunks in chunk_index.
171 Hash *hasher = Hash::New();
172 hasher->update(&buf[new_block_summary.chunks[i].offset],
173 new_block_summary.chunks[i].len);
174 new_block_summary.chunks[i].hash
175 = string(reinterpret_cast<const char *>(hasher->digest()),
176 hasher->digest_size());
// NOTE(review): no `delete hasher` is visible in this listing (lines
// 177-179 elided) — verify the per-iteration Hash object is released.
180 new_block_summary_valid = true;
181 delete[] breakpoints;
// Serialize a block's chunk signatures into the packed format read back by
// index_chunks() — per chunk, a 2-byte network-byte-order length followed by
// the hash digest — and hand the buffer to the local database keyed by the
// block's object reference and the current algorithm name.
184 void Subfile::store_block_signatures(ObjectReference ref, block_summary summary)
186 int n = summary.num_chunks;
187 char *packed = (char *)malloc(n * (2 + hash_size));
189 for (int i = 0; i < n; i++) {
// Chunk lengths must fit the 16-bit on-disk field.
190 assert(summary.chunks[i].len >= 0 && summary.chunks[i].len <= 0xffff);
191 uint16_t len = htons(summary.chunks[i].len);
192 char *packed_info = &packed[i * (2 + hash_size)];
193 memcpy(&packed_info[0], &len, 2);
194 memcpy(&packed_info[2], summary.chunks[i].hash.data(), hash_size);
// NOTE(review): the free(packed) is not visible in this listing (lines
// 195-199 elided) — verify the malloc'd buffer is released after storing.
197 db->StoreChunkSignatures(ref, packed, n * (2 + hash_size), algorithm_name);
// Persist the signatures of the most recently analyzed block, but only when
// it is at least 16 KiB — presumably smaller blocks aren't worth the
// database space for future sub-file matching (threshold rationale not
// stated here; confirm).
202 void Subfile::store_analyzed_signatures(ObjectReference ref)
204 if (analyzed_len >= 16384)
205 store_block_signatures(ref, new_block_summary);
208 /* Compute an incremental representation of the most recent block analyzed. */
// An incremental plan is a sequence of items: either reference a byte range
// of an existing old object (SUBFILE_COPY) or emit newly written literal
// data (SUBFILE_NEW).
209 enum subfile_item_type { SUBFILE_COPY, SUBFILE_NEW };
211 struct subfile_item {
212 subfile_item_type type;
214 // For type SUBFILE_COPY
217 // For type SUBFILE_NEW
// src_offset: position of the literal bytes within the analyzed buffer;
// dst_offset: position within the new literal object being assembled.
// (Additional fields — e.g. the copy reference, len, hash — are elided in
// this listing but are used in create_incremental().)
218 int src_offset, dst_offset;
223 /* Compute an incremental representation of the data last analyzed.  A list of
224 * references will be returned corresponding to the data.  If new data must be
225 * written out to the backup, it will be written out via the LbsObject
226 * provided, to the provided TarSegmentStore. */
// NOTE(review): this function continues past the end of this listing; the
// comments below cover only the visible portion.
227 list<ObjectReference> Subfile::create_incremental(TarSegmentStore *tss,
231 assert(new_block_summary_valid);
232 bool matched_old = false;
235 list<subfile_item> items;
236 list<ObjectReference> refs;
238 ensure_signatures_loaded();
240 assert(new_block_summary.num_chunks > 0);
// Pass 1: for each chunk of the analyzed block, either locate an identical
// chunk among the indexed old blocks (SUBFILE_COPY) or schedule its bytes
// as new literal data (SUBFILE_NEW), assigning it the next dst_offset in
// the literal object being accumulated.
242 for (int i = 0; i < new_block_summary.num_chunks; i++) {
243 map<string, pair<int, int> >::iterator m
244 = chunk_index.find(new_block_summary.chunks[i].hash);
246 struct subfile_item item;
247 if (m == chunk_index.end()) {
248 item.type = SUBFILE_NEW;
249 item.src_offset = new_block_summary.chunks[i].offset;
250 item.dst_offset = new_data;
251 item.len = new_block_summary.chunks[i].len;
252 item.hash = new_block_summary.chunks[i].hash;
253 new_data += item.len;
// Matched an old chunk: reference the old block, narrowed to the byte
// range [offset, offset+len) of the matching chunk.
255 struct chunk_info &old_chunk
256 = block_list[m->second.first].chunks[m->second.second];
257 item.type = SUBFILE_COPY;
258 item.ref = block_list[m->second.first].ref;
259 item.ref.set_range(old_chunk.offset, old_chunk.len);
263 items.push_back(item);
266 // No data was matched.  The entire block can be written out as is into a
267 // new object, and the new_block_summary used to save chunk signatures.
269 o->set_age(block_age);
270 o->set_data(analyzed_buf, analyzed_len, NULL);
272 ObjectReference ref = o->get_ref();
273 store_analyzed_signatures(ref);
279 // Otherwise, construct a new block containing all literal data needed (if
280 // any exists), write it out, and construct the appropriate list of
282 list<subfile_item>::iterator i;
// Gather all SUBFILE_NEW bytes into one contiguous literal buffer at their
// assigned dst_offsets (new_data is the total literal size from pass 1).
284 char *literal_buf = new char[new_data];
285 for (i = items.begin(); i != items.end(); ++i) {
286 if (i->type == SUBFILE_NEW) {
287 memcpy(&literal_buf[i->dst_offset],
288 &analyzed_buf[i->src_offset], i->len);
292 Hash *hasher = Hash::New();
293 hasher->update(literal_buf, new_data);
294 string block_csum = hasher->digest_str();
297 o->set_group("data");
298 o->set_data(literal_buf, new_data, NULL);
// Point each SUBFILE_NEW item at its byte range inside the freshly written
// literal object.
300 ObjectReference ref = o->get_ref();
301 for (i = items.begin(); i != items.end(); ++i) {
302 if (i->type == SUBFILE_NEW) {
304 i->ref.set_range(i->dst_offset, i->len);
308 //db->StoreObject(ref, 0.0);
// Build and persist a chunk-signature summary for the literal object so a
// future backup can match against the data just written.  Only SUBFILE_NEW
// items contribute chunks; items.size() is an upper bound on the count.
310 block_summary summary;
312 summary.num_chunks = 0;
313 summary.chunks = new chunk_info[items.size()];
314 for (i = items.begin(); i != items.end(); ++i) {
315 if (i->type == SUBFILE_NEW) {
316 chunk_info &info = summary.chunks[summary.num_chunks];
318 info.offset = i->dst_offset;
320 summary.num_chunks++;
324 store_block_signatures(ref, summary);
326 delete[] summary.chunks;
327 delete[] literal_buf;
// Final pass (visible portion): coalesce adjacent item references into as
// few ObjectReferences as possible via ref.merge().
333 for (i = items.begin(); i != items.end(); ++i) {
334 string refstr = i->ref.to_string();
335 if (!ref.merge(i->ref)) {
340 assert(!ref.is_null());