1 /* Blue Sky: File Systems in the Cloud
3 * Copyright (C) 2009 The Regents of the University of California
4 * Written by Michael Vrable <mvrable@cs.ucsd.edu>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the University nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 #include "bluesky-private.h"
40 /* Magic number at the start of the checkpoint record, to check for version
42 #define CHECKPOINT_MAGIC 0x7ad7dafb42a498b4ULL
44 /* Inode maps. There is both an in-memory representation as well as the
45 * serialized form in the cloud.
47 * The inode map is broken up into pieces by partitioning in the inode number
48 * space. The checkpoint region will contain pointers to each of these map
51 /* Roughly how many inodes to try to put in each inode map range. */
52 static int target_imap_size = 4096;
54 /* Comparison function for lookuping up inode map entries. Reads a 64-bit
55 * value from the pointed-to address to do the comparison, so we require that
56 * the InodeMapEntry and InodeMapRange structures start with the 64-bit value
57 * they are sorted by. */
58 static gint compare(gconstpointer a, gconstpointer b, gpointer user_data)
60 uint64_t x = *(uint64_t *)a;
61 uint64_t y = *(uint64_t *)b;
71 /* Look up the inode map entry for the given inode number. If action is +1,
72 * create a new entry if it does not already exist; if it is -1 then delete the
73 * entry instead. If 0, return the entry if it is already present. A non-zero
74 * action value will mark the inode map range as dirty. */
75 InodeMapEntry *bluesky_inode_map_lookup(GSequence *inode_map, uint64_t inum,
80 /* First, scan through to find the inode map section that covers the
81 * appropriate range. Create one if it does not exist but we were asked to
83 InodeMapRange *range = NULL;
85 i = g_sequence_search(inode_map, &inum, compare, NULL);
86 i = g_sequence_iter_prev(i);
87 if (!g_sequence_iter_is_end(i))
88 range = (InodeMapRange *)g_sequence_get(i);
89 if (range == NULL || inum < range->start || inum > range->end) {
93 /* Create a new range. First, determine bounds on the range enpoints
94 * based on neighboring ranges. Then, shrink the size of the range to
95 * a reasonable size that covers the needed inode number. */
96 range = g_new0(InodeMapRange, 1);
98 range->end = G_MAXUINT64;
99 range->map_entries = g_sequence_new(NULL);
100 range->serialized = NULL;
102 g_print("Creating inode map range, 1: start=%"PRIu64" end=%"PRIu64"\n",
103 range->start, range->end);
105 if (!g_sequence_iter_is_begin(i) && !g_sequence_iter_is_end(i))
106 range->start = ((InodeMapRange *)g_sequence_get(i))->end + 1;
107 i = g_sequence_iter_next(i);
108 if (!g_sequence_iter_is_end(i))
109 range->end = ((InodeMapRange *)g_sequence_get(i))->start - 1;
111 g_print("Creating inode map range, 2: start=%"PRIu64" end=%"PRIu64"\n",
112 range->start, range->end);
113 g_assert(inum >= range->start && inum <= range->end);
115 range->start = MAX(range->start,
116 inum & ~(uint64_t)(target_imap_size - 1));
117 range->end = MIN(range->end, range->start + target_imap_size - 1);
119 g_print("Creating inode map range, 3: start=%"PRIu64" end=%"PRIu64"\n",
120 range->start, range->end);
122 g_sequence_insert_sorted(inode_map, range, compare, NULL);
125 /* Next, try to find the entry within this range of the inode map. */
126 InodeMapEntry *entry = NULL;
127 i = g_sequence_search(range->map_entries, &inum, compare, NULL);
128 i = g_sequence_iter_prev(i);
129 if (!g_sequence_iter_is_end(i))
130 entry = (InodeMapEntry *)g_sequence_get(i);
131 if (entry == NULL || inum != entry->inum) {
135 entry = g_new0(InodeMapEntry, 1);
137 g_sequence_insert_sorted(range->map_entries, entry, compare, NULL);
140 g_print("Created inode map entry for %"PRIu64"\n", inum);
144 bluesky_cloudlog_unref_delayed(range->serialized);
145 range->serialized = NULL;
148 /* Were we requested to delete the item? */
150 g_sequence_remove(i);
157 /* Convert a section of the inode map to serialized form, in preparation for
158 * writing it out to the cloud. */
159 static void bluesky_inode_map_serialize_section(BlueSkyFS *fs,
160 InodeMapRange *range)
162 if (range->serialized != NULL)
165 GString *buf = g_string_new("");
166 BlueSkyCloudLog *log = bluesky_cloudlog_new(fs, NULL);
167 log->type = LOGTYPE_INODE_MAP;
170 GSequenceIter *i = g_sequence_get_begin_iter(range->map_entries);
171 while (!g_sequence_iter_is_end(i)) {
172 InodeMapEntry *entry = (InodeMapEntry *)g_sequence_get(i);
173 uint64_t inum = GUINT64_TO_LE(entry->inum);
174 g_string_append_len(buf, (const char *)&inum, sizeof(inum));
175 bluesky_cloudlog_ref(entry->item);
176 g_array_append_val(log->links, entry->item);
177 i = g_sequence_iter_next(i);
180 log->data = bluesky_string_new_from_gstring(buf);
181 bluesky_cloudlog_unref(range->serialized);
182 range->serialized = log;
183 bluesky_cloudlog_stats_update(log, 1);
186 BlueSkyCloudLog *bluesky_inode_map_serialize(BlueSkyFS *fs)
188 gboolean updated = FALSE;
189 GString *buf = g_string_new("");
190 BlueSkyCloudLog *log = bluesky_cloudlog_new(fs, NULL);
191 log->type = LOGTYPE_CHECKPOINT;
194 /* The checkpoint record starts with a magic number, followed by the
195 * version vector which lists the latest sequence number of all other logs
196 * (currently, only the cleaner) which have been seen. */
197 uint64_t magic = GUINT64_TO_LE(CHECKPOINT_MAGIC);
198 g_string_append_len(buf, (const char *)&magic, sizeof(magic));
200 versions = GUINT32_TO_LE(fs->log_state->latest_cleaner_seq_seen >= 0);
201 g_string_append_len(buf, (const char *)&versions, sizeof(versions));
202 if (fs->log_state->latest_cleaner_seq_seen >= 0) {
203 versions = GUINT32_TO_LE(BLUESKY_CLOUD_DIR_CLEANER);
204 g_string_append_len(buf, (const char *)&versions, sizeof(versions));
205 versions = GUINT32_TO_LE(fs->log_state->latest_cleaner_seq_seen);
206 g_string_append_len(buf, (const char *)&versions, sizeof(versions));
209 GSequenceIter *i = g_sequence_get_begin_iter(fs->inode_map);
210 while (!g_sequence_iter_is_end(i)) {
211 InodeMapRange *range = (InodeMapRange *)g_sequence_get(i);
212 uint64_t inum = GUINT64_TO_LE(range->start);
213 g_string_append_len(buf, (const char *)&inum, sizeof(inum));
214 inum = GUINT64_TO_LE(range->end);
215 g_string_append_len(buf, (const char *)&inum, sizeof(inum));
217 if (range->serialized == NULL) {
218 bluesky_inode_map_serialize_section(fs, range);
221 bluesky_cloudlog_ref(range->serialized);
222 g_array_append_val(log->links, range->serialized);
223 i = g_sequence_iter_next(i);
226 log->data = bluesky_string_new_from_gstring(buf);
227 bluesky_cloudlog_stats_update(log, 1);
232 bluesky_cloudlog_unref(log);
237 /* Minimize resources consumed the inode map. This should only be called once
238 * an updated inode map has been serialized to the cloud, and will replace
239 * cloud log objects with skeletal versions that just reference the data
240 * location in the cloud (rather than pinning all object data in memory). */
241 void bluesky_inode_map_minimize(BlueSkyFS *fs)
243 GSequenceIter *i = g_sequence_get_begin_iter(fs->inode_map);
244 while (!g_sequence_iter_is_end(i)) {
245 InodeMapRange *range = (InodeMapRange *)g_sequence_get(i);
247 if (range->serialized != NULL)
248 bluesky_cloudlog_erase(range->serialized);
251 for (j = g_sequence_get_begin_iter(range->map_entries);
252 !g_sequence_iter_is_end(j); j = g_sequence_iter_next(j))
254 InodeMapEntry *entry = (InodeMapEntry *)g_sequence_get(j);
255 BlueSkyCloudLog *item = entry->item;
257 g_mutex_lock(item->lock);
258 if (g_atomic_int_get(&item->refcount) == 1) {
259 bluesky_cloudlog_erase(item);
261 g_mutex_unlock(item->lock);
263 g_warning("Null item for inode map entry %"PRIu64"!",
268 i = g_sequence_iter_next(i);
272 /* Reconstruct the inode map from data stored in the cloud. */
273 static void bluesky_inode_map_deserialize(BlueSkyFS *fs, BlueSkyCloudLog *imap)
275 g_mutex_lock(imap->lock);
276 bluesky_cloudlog_fetch(imap);
277 g_assert(imap->data != NULL);
278 g_assert(imap->data->len >= 12);
280 uint32_t vector_data;
281 memcpy((char *)&magic, imap->data->data, sizeof(magic));
282 g_assert(GUINT64_FROM_LE(magic) == CHECKPOINT_MAGIC);
283 memcpy((char *)&vector_data, imap->data->data + 8, sizeof(vector_data));
284 g_assert(GUINT32_FROM_LE(vector_data) <= 2);
286 int vector_size = GUINT32_FROM_LE(vector_data);
287 g_assert(imap->data->len == 16 * imap->links->len + 12 + 8 * vector_size);
289 for (int i = 0; i < vector_size; i++) {
290 memcpy((char *)&vector_data, imap->data->data + 12 + 8*i,
291 sizeof(vector_data));
292 if (GUINT32_FROM_LE(vector_data) == 1) {
293 memcpy((char *)&vector_data, imap->data->data + 16 + 8*i,
294 sizeof(vector_data));
295 fs->log_state->latest_cleaner_seq_seen
296 = GUINT32_FROM_LE(vector_data);
297 g_print("Deserializing checkpoint: last cleaner sequence is %d\n",
298 GUINT32_FROM_LE(vector_data));
302 //uint64_t *inum_range = (uint64_t *)imap->data->data;
303 for (int i = 0; i < imap->links->len; i++) {
304 //int64_t start = GUINT64_FROM_LE(*inum_range++);
305 //int64_t end = GUINT64_FROM_LE(*inum_range++);
306 BlueSkyCloudLog *section = g_array_index(imap->links,
307 BlueSkyCloudLog *, i);
308 g_mutex_lock(section->lock);
309 bluesky_cloudlog_fetch(section);
310 g_print("Loaded cloudlog item (%zd bytes)\n", section->data->len);
312 uint64_t *inum = (uint64_t *)section->data->data;
313 for (int j = 0; j < section->links->len; j++) {
314 InodeMapEntry *entry;
315 entry = bluesky_inode_map_lookup(fs->inode_map, *inum, 1);
316 entry->inum = GUINT64_FROM_LE(*inum);
317 bluesky_cloudlog_unref_delayed(entry->item);
318 entry->item = g_array_index(section->links,
319 BlueSkyCloudLog *, j);
320 bluesky_cloudlog_ref(entry->item);
321 fs->next_inum = MAX(fs->next_inum, entry->inum + 1);
324 g_mutex_unlock(section->lock);
326 g_mutex_unlock(imap->lock);
329 /* Find the most recent checkpoint record in the cloud and reload inode map
330 * data from it to initialize the filesystem. Returns a boolean indicating
331 * whether a checkpoint was found and loaded or not. */
332 gboolean bluesky_checkpoint_load(BlueSkyFS *fs)
334 g_print("Claiming cloud log directory: %d\n",
335 fs->log_state->location.directory);
336 char *prefix = g_strdup_printf("log-%08d",
337 fs->log_state->location.directory);
338 char *last_segment = bluesky_store_lookup_last(fs->store, prefix);
340 if (last_segment == NULL)
343 g_print("Last cloud log segment: %s\n", last_segment);
344 int seq = atoi(last_segment + 13);
345 fs->log_state->location.sequence = seq + 1;
347 BlueSkyRCStr *last = bluesky_store_get(fs->store, last_segment);
348 g_free(last_segment);
350 g_warning("Unable to fetch last log segment from cloud!");
354 last = bluesky_string_dup(last);
355 bluesky_cloudlog_decrypt(last->data, last->len, fs->keys, NULL, FALSE);
357 /* Scan through the contents of the last log segment to find a checkpoint
358 * record. We need to do a linear scan since at this point we don't have a
359 * direct pointer; once we have the last commit record then all other data
360 * can be loaded by directly following pointers. */
361 const char *buf = last->data;
362 size_t len = last->len;
363 const char *checkpoint = NULL;
364 size_t checkpoint_size = 0;
365 while (len > sizeof(struct cloudlog_header)) {
366 struct cloudlog_header *header = (struct cloudlog_header *)buf;
367 if (memcmp(header->magic, CLOUDLOG_MAGIC, 4) != 0) {
368 g_warning("Could not parse cloudlog entry!");
371 int size = sizeof(struct cloudlog_header);
372 size += GUINT32_FROM_LE(header->size1);
373 size += GUINT32_FROM_LE(header->size2);
374 size += GUINT32_FROM_LE(header->size3);
376 g_warning("Cloudlog entry is malformed (size too large)!");
379 if (header->type - '0' == LOGTYPE_CHECKPOINT) {
381 checkpoint_size = size;
387 if (checkpoint_size == 0) {
388 g_error("Unable to locate checkpoint record!\n");
391 g_print("Found checkpoint record at %zd (size %zd)\n",
392 checkpoint - last->data, checkpoint_size);
394 /* Bootstrap the loading process by manually setting the location of this
396 BlueSkyCloudLog *commit;
397 commit = bluesky_cloudlog_get(fs,
398 ((struct cloudlog_header *)checkpoint)->id);
399 g_mutex_lock(commit->lock);
400 commit->location_flags |= CLOUDLOG_CLOUD;
401 commit->location.directory = 0;
402 commit->location.sequence = seq;
403 commit->location.offset = checkpoint - last->data;
404 commit->location.size = checkpoint_size;
405 g_mutex_unlock(commit->lock);
406 bluesky_cloudlog_stats_update(commit, 1);
408 bluesky_inode_map_deserialize(fs, commit);
409 bluesky_cloudlog_unref(commit);