X-Git-Url: http://git.vrable.net/?a=blobdiff_plain;f=bluesky%2Finode.c;h=f3b804dc4e7f5378b08227b8ad22d8a7a5545f64;hb=ddaec40a37a5e65e53546b14632b1b0b35613264;hp=7ea8dbf2af026b0e74f4235003f040b6bac303c8;hpb=70fdd2326239a9a5e02b3c3699d2588d5fee48fa;p=bluesky.git diff --git a/bluesky/inode.c b/bluesky/inode.c index 7ea8dbf..f3b804d 100644 --- a/bluesky/inode.c +++ b/bluesky/inode.c @@ -6,11 +6,13 @@ * TODO: Licensing */ +#include #include +#include #include #include -#include "bluesky.h" +#include "bluesky-private.h" /* Core filesystem. Different proxies, such as the NFSv3 one, interface to * this, but the core actually tracks the data which is stored. So far we just @@ -27,7 +29,8 @@ int64_t bluesky_get_current_time() /* Update an inode to indicate that a modification was made. This increases * the change counter, updates the ctime to the current time, and optionally - * updates the mtime. */ + * updates the mtime. This also makes the inode contents subject to writeback + * to storage in the future. inode must already be locked. */ void bluesky_inode_update_ctime(BlueSkyInode *inode, gboolean update_mtime) { int64_t now = bluesky_get_current_time(); @@ -35,54 +38,23 @@ void bluesky_inode_update_ctime(BlueSkyInode *inode, gboolean update_mtime) inode->ctime = now; if (update_mtime) inode->mtime = now; -} - -/* Compute the HMAC keyed-hash function using the given hash algorithm, data, - * and key. */ -void compute_hmac(GChecksumType algo, - const guchar *data, gsize data_len, - const guchar *key, gsize key_len, - guint8 *buffer, gsize *digest_len) -{ - int block_size; - - switch (algo) { - case G_CHECKSUM_MD5: - case G_CHECKSUM_SHA1: - case G_CHECKSUM_SHA256: - block_size = 64; - break; - default: - g_error("Unknown hash algorithm for HMAC: %d\n", algo); - } - - gsize digest_size = g_checksum_type_get_length(algo); - - guchar keybuf[block_size]; - memset(keybuf, 0, block_size); - memcpy(keybuf, key, MIN(block_size, key_len)); - for (int i = 0; i < block_size; i++) - keybuf[i] ^= 0x36; - - GChecksum *csum1 = g_checksum_new(algo); - g_checksum_update(csum1, keybuf, block_size); - g_checksum_update(csum1, data, data_len); - guint8 digest[digest_size]; - g_checksum_get_digest(csum1, digest, &digest_size); - - memset(keybuf, 0, block_size); - memcpy(keybuf, key, MIN(block_size, key_len)); - for (int i = 0; i < block_size; i++) - keybuf[i] ^= 0x5c; - - GChecksum *csum2 = g_checksum_new(algo); - g_checksum_update(csum2, keybuf, block_size); - g_checksum_update(csum2, digest, digest_size); - - g_checksum_get_digest(csum2, buffer, digest_len); - g_checksum_free(csum1); - g_checksum_free(csum2); + if (inode->change_time == 0) + inode->change_time = now; + +#if 0 + if (bluesky_options.writethrough_cache) + bluesky_file_flush(inode, NULL); +#endif + + g_mutex_lock(inode->fs->lock); + bluesky_list_unlink(&inode->fs->unlogged_list, inode->unlogged_list); + inode->unlogged_list = bluesky_list_prepend(&inode->fs->unlogged_list, inode); + bluesky_list_unlink(&inode->fs->dirty_list, inode->dirty_list); + inode->dirty_list = bluesky_list_prepend(&inode->fs->dirty_list, inode); + bluesky_list_unlink(&inode->fs->accessed_list, inode->accessed_list); + inode->accessed_list = bluesky_list_prepend(&inode->fs->accessed_list, inode); + g_mutex_unlock(inode->fs->lock); } /* Unfortunately a glib hash table is only guaranteed to be able to store @@ -114,34 +86,139 @@ BlueSkyFS *bluesky_new_fs(gchar *name) fs->inodes = g_hash_table_new(bluesky_fs_key_hash_func, bluesky_fs_key_equal_func); fs->next_inum = BLUESKY_ROOT_INUM + 1; - fs->store = s3store_new(); + fs->store = bluesky_store_new("file"); + fs->flushd_lock = g_mutex_new(); + fs->locations = g_hash_table_new(bluesky_cloudlog_hash, + bluesky_cloudlog_equal); + + fs->log_state = g_new0(BlueSkyCloudLogState, 1); + fs->log_state->data = g_string_new(""); return fs; } +BlueSkyFS *bluesky_init_fs(gchar *name, BlueSkyStore *store) +{ + BlueSkyRCStr *data = bluesky_store_get(store, "superblock"); + if (data != NULL) { + BlueSkyFS *fs = bluesky_deserialize_superblock(data->data); + if (fs != NULL) { + fs->store = store; + fs->log = bluesky_log_new("journal"); + g_print("Loaded filesystem superblock\n"); + g_free(fs->name); + fs->name = g_strdup(name); + return fs; + } + bluesky_string_unref(data); + } + + g_print("Initializing fresh filesystem\n"); + BlueSkyFS *fs = bluesky_new_fs(name); + fs->store = store; + fs->log = bluesky_log_new("journal"); + + BlueSkyInode *root = bluesky_new_inode(BLUESKY_ROOT_INUM, fs, + BLUESKY_DIRECTORY); + root->nlink = 1; + root->mode = 0755; + bluesky_insert_inode(fs, root); + bluesky_inode_update_ctime(root, TRUE); + + bluesky_inode_do_sync(root); + bluesky_superblock_flush(fs); + + return fs; +} + +/* Inode reference counting. */ +void bluesky_inode_ref(BlueSkyInode *inode) +{ + g_atomic_int_inc(&inode->refcount); +} + +void bluesky_inode_unref(BlueSkyInode *inode) +{ + if (g_atomic_int_dec_and_test(&inode->refcount)) { + if (bluesky_verbose) { + g_log("bluesky/inode", G_LOG_LEVEL_DEBUG, + "Reference count for inode %"PRIu64" dropped to zero.", + inode->inum); + } + + /* Sanity check: Is the inode clean? */ + if (inode->change_commit < inode->change_count + || inode->accessed_list != NULL + || inode->unlogged_list != NULL + || inode->dirty_list != NULL) { + g_warning("Dropping inode which is not clean (commit %"PRIi64" < change %"PRIi64"; accessed_list = %p; dirty_list = %p)\n", inode->change_commit, inode->change_count, inode->accessed_list, inode->dirty_list); + } + + /* These shouldn't be needed, but in case the above warning fires and + * we delete the inode anyway, we ought to be sure the inode is not on + * any LRU list. */ + g_mutex_lock(inode->fs->lock); + bluesky_list_unlink(&inode->fs->accessed_list, inode->accessed_list); + bluesky_list_unlink(&inode->fs->dirty_list, inode->dirty_list); + bluesky_list_unlink(&inode->fs->unlogged_list, inode->unlogged_list); + g_mutex_unlock(inode->fs->lock); + + /* Free file type specific data. It should be an error for there to be + * dirty data to commit when the reference count has reaches zero. */ + switch (inode->type) { + case BLUESKY_REGULAR: + for (int i = 0; i < inode->blocks->len; i++) { + BlueSkyBlock *b = &g_array_index(inode->blocks, + BlueSkyBlock, i); + if (b->type == BLUESKY_BLOCK_DIRTY) { + g_error("Deleting an inode with dirty file data!"); + } + bluesky_cloudlog_unref(b->ref); + bluesky_string_unref(b->dirty); + } + g_array_unref(inode->blocks); + break; + + case BLUESKY_DIRECTORY: + g_hash_table_destroy(inode->dirhash); + g_hash_table_destroy(inode->dirhash_folded); + g_sequence_free(inode->dirents); + break; + + case BLUESKY_SYMLINK: + g_free(inode->symlink_contents); + break; + + default: + break; + } + + g_mutex_free(inode->lock); + + g_free(inode); + } +} + /* Allocate a fresh inode number which has not been used before within a - * filesystem. */ + * filesystem. fs must already be locked. */ uint64_t bluesky_fs_alloc_inode(BlueSkyFS *fs) { uint64_t inum; - g_mutex_lock(fs->lock); inum = fs->next_inum; fs->next_inum++; - g_mutex_unlock(fs->lock); + + bluesky_superblock_flush(fs); return inum; } -BlueSkyInode *bluesky_new_inode(uint64_t inum, BlueSkyFS *fs, - BlueSkyFileType type) +/* Perform type-specification initialization of an inode. Normally performed + * in bluesky_new_inode, but can be separated if an inode is created first, + * then deserialized. */ +void bluesky_init_inode(BlueSkyInode *i, BlueSkyFileType type) { - BlueSkyInode *i = g_new0(BlueSkyInode, 1); - - i->lock = g_mutex_new(); i->type = type; - i->fs = fs; - i->inum = inum; switch (type) { case BLUESKY_REGULAR: @@ -150,206 +227,188 @@ BlueSkyInode *bluesky_new_inode(uint64_t inum, BlueSkyFS *fs, case BLUESKY_DIRECTORY: i->dirents = g_sequence_new(bluesky_dirent_destroy); i->dirhash = g_hash_table_new(g_str_hash, g_str_equal); + i->dirhash_folded = g_hash_table_new(g_str_hash, g_str_equal); break; - case BLUESKY_BLOCK: - case BLUESKY_CHARACTER: - case BLUESKY_SYMLINK: - case BLUESKY_SOCKET: - case BLUESKY_FIFO: + default: break; } +} + +BlueSkyInode *bluesky_new_inode(uint64_t inum, BlueSkyFS *fs, + BlueSkyFileType type) +{ + BlueSkyInode *i = g_new0(BlueSkyInode, 1); + + i->lock = g_mutex_new(); + i->refcount = 1; + i->fs = fs; + i->inum = inum; + i->change_count = 1; + bluesky_init_inode(i, type); return i; } /* Retrieve an inode from the filesystem. Eventually this will be a cache and * so we might need to go fetch the inode from elsewhere; for now all - * filesystem state is stored here. */ + * filesystem state is stored here. inode is returned with a reference held + * but not locked. */ BlueSkyInode *bluesky_get_inode(BlueSkyFS *fs, uint64_t inum) { BlueSkyInode *inode = NULL; + if (inum == 0) { + return NULL; + } + g_mutex_lock(fs->lock); inode = (BlueSkyInode *)g_hash_table_lookup(fs->inodes, &inum); + + if (inode == NULL) { + bluesky_inode_fetch(fs, inum); + inode = (BlueSkyInode *)g_hash_table_lookup(fs->inodes, &inum); + } + + if (inode != NULL) { + bluesky_inode_ref(inode); + + /* FIXME: We assume we can atomically update the in-memory access time + * without a lock. */ + inode->access_time = bluesky_get_current_time(); + } + g_mutex_unlock(fs->lock); return inode; } -/* Insert an inode into the filesystem inode cache. */ +/* Insert an inode into the filesystem inode cache. fs should be locked. */ void bluesky_insert_inode(BlueSkyFS *fs, BlueSkyInode *inode) { - g_mutex_lock(fs->lock); g_hash_table_insert(fs->inodes, &inode->inum, inode); - g_mutex_unlock(fs->lock); } -/* Mark a given block dirty and make sure that data is faulted in so that it - * can be written to. */ -void bluesky_block_touch(BlueSkyInode *inode, uint64_t i) +/* Start writeback of an inode and all associated data. */ +void bluesky_inode_start_sync(BlueSkyInode *inode) { - g_return_if_fail(i < inode->blocks->len); - BlueSkyBlock *block = &g_array_index(inode->blocks, BlueSkyBlock, i); + GList *log_items = NULL; - switch (block->type) { - case BLUESKY_BLOCK_ZERO: - block->data = bluesky_string_new(g_malloc0(BLUESKY_BLOCK_SIZE), - BLUESKY_BLOCK_SIZE); - break; - case BLUESKY_BLOCK_REF: - bluesky_block_fetch(inode->fs, block); - g_assert(block->type == BLUESKY_BLOCK_CACHED); - /* Fall through */ - case BLUESKY_BLOCK_CACHED: - case BLUESKY_BLOCK_DIRTY: - block->data = bluesky_string_dup(block->data); - break; - } + if (inode->type == BLUESKY_REGULAR) + bluesky_file_flush(inode, &log_items); - block->type = BLUESKY_BLOCK_DIRTY; -} + BlueSkyCloudLog *cloudlog = bluesky_serialize_inode(inode); -/* Set the size of a file. This will truncate or extend the file as needed. - * Newly-allocated bytes are zeroed. */ -void bluesky_file_truncate(BlueSkyInode *inode, uint64_t size) -{ - g_return_if_fail(size <= BLUESKY_MAX_FILE_SIZE); - - if (size == inode->size) - return; - - uint64_t blocks = (size + BLUESKY_BLOCK_SIZE - 1) / BLUESKY_BLOCK_SIZE; - - if (blocks > inode->blocks->len) { - /* Need to add new blocks to the end of a file. New block structures - * are automatically zeroed, which initializes them to be pointers to - * zero blocks so we don't need to do any more work. */ - g_array_set_size(inode->blocks, blocks); - } else if (blocks < inode->blocks->len) { - /* Delete blocks from a file. Must reclaim memory. */ - for (guint i = inode->blocks->len; i < blocks; i++) { - BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i); - g_free(b->ref); - bluesky_string_unref(b->data); - } - g_array_set_size(inode->blocks, blocks); - } + bluesky_cloudlog_unref(inode->committed_item); + inode->committed_item = cloudlog; - /* If the file size is being decreased, ensure that any trailing data in - * the last block is zeroed. */ - if (size < inode->size) { - BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, - blocks - 1); - if (b->type != BLUESKY_BLOCK_ZERO) { - bluesky_block_touch(inode, blocks - 1); - int end_offset = size % BLUESKY_BLOCK_SIZE; - if (end_offset > 0) { - memset(&b->data->data[end_offset], 0, - BLUESKY_BLOCK_SIZE - end_offset); - } - } - } + bluesky_cloudlog_sync(cloudlog); + bluesky_cloudlog_ref(cloudlog); + log_items = g_list_prepend(log_items, cloudlog); + + /* Wait for all log items to be committed to disk. */ + bluesky_log_finish_all(log_items); - inode->size = size; - bluesky_inode_update_ctime(inode, 1); + /* Mark the inode as clean */ + inode->change_commit = inode->change_count; + inode->change_time = 0; + g_mutex_lock(inode->fs->lock); + bluesky_list_unlink(&inode->fs->unlogged_list, inode->unlogged_list); + inode->unlogged_list = NULL; + g_mutex_unlock(inode->fs->lock); } -void bluesky_file_write(BlueSkyInode *inode, uint64_t offset, - const char *data, gint len) +/* Write back an inode and all associated data and wait for completion. Inode + * should already be locked. */ +void bluesky_inode_do_sync(BlueSkyInode *inode) { - g_return_if_fail(inode->type == BLUESKY_REGULAR); - g_return_if_fail(offset < inode->size); - g_return_if_fail(len <= inode->size - offset); - - if (len == 0) - return; - - while (len > 0) { - uint64_t block_num = offset / BLUESKY_BLOCK_SIZE; - gint block_offset = offset % BLUESKY_BLOCK_SIZE; - gint bytes = MIN(BLUESKY_BLOCK_SIZE - block_offset, len); - - bluesky_block_touch(inode, block_num); - BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, - block_num); - memcpy(&b->data->data[block_offset], data, bytes); - bluesky_block_flush(inode->fs, b); - - offset += bytes; - data += bytes; - len -= bytes; + if (bluesky_verbose) { + g_log("bluesky/inode", G_LOG_LEVEL_DEBUG, + "Synchronous writeback for inode %"PRIu64"...", inode->inum); + } + bluesky_inode_start_sync(inode); + if (bluesky_verbose) { + g_log("bluesky/inode", G_LOG_LEVEL_DEBUG, + "Writeback for inode %"PRIu64" complete", inode->inum); } - - bluesky_inode_update_ctime(inode, 1); } -void bluesky_file_read(BlueSkyInode *inode, uint64_t offset, - char *buf, gint len) +static void complete_inode_fetch(BlueSkyStoreAsync *async, BlueSkyInode *inode) { - g_return_if_fail(inode->type == BLUESKY_REGULAR); - g_return_if_fail(offset < inode->size); - g_return_if_fail(len <= inode->size - offset); - - while (len > 0) { - uint64_t block_num = offset / BLUESKY_BLOCK_SIZE; - gint block_offset = offset % BLUESKY_BLOCK_SIZE; - gint bytes = MIN(BLUESKY_BLOCK_SIZE - block_offset, len); - - BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, - block_num); - switch (b->type) { - case BLUESKY_BLOCK_ZERO: - memset(buf, 0, bytes); - break; - case BLUESKY_BLOCK_REF: - bluesky_block_fetch(inode->fs, b); - /* Fall through */ - case BLUESKY_BLOCK_CACHED: - case BLUESKY_BLOCK_DIRTY: - memcpy(buf, &b->data->data[block_offset], bytes); - break; - } - - offset += bytes; - buf += bytes; - len -= bytes; + if (bluesky_verbose) { + g_log("bluesky/inode", G_LOG_LEVEL_DEBUG, + "Completing fetch of inode %"PRIu64"...", inode->inum); } -} -/* Read the given block from cloud-backed storage if the data is not already - * cached. */ -void bluesky_block_fetch(BlueSkyFS *fs, BlueSkyBlock *block) -{ - if (block->type != BLUESKY_BLOCK_REF) - return; + if (async->result != 0 + || !bluesky_deserialize_inode(inode, async->data->data)) + { + if (bluesky_verbose) { + g_log("bluesky/inode", G_LOG_LEVEL_DEBUG, + " failed to load inode, cleaning up"); + } + g_mutex_lock(inode->fs->lock); + g_hash_table_remove(inode->fs->inodes, &inode->inum); + bluesky_list_unlink(&inode->fs->accessed_list, inode->accessed_list); + inode->accessed_list = NULL; + g_mutex_unlock(inode->fs->lock); + bluesky_inode_unref(inode); + } - g_print("Fetching block from %s\n", block->ref); - BlueSkyRCStr *string = s3store_get(fs->store, block->ref); + inode->access_time = bluesky_get_current_time(); + g_mutex_lock(inode->fs->lock); + bluesky_list_unlink(&inode->fs->accessed_list, inode->accessed_list); + inode->accessed_list = bluesky_list_prepend(&inode->fs->accessed_list, inode); + g_mutex_unlock(inode->fs->lock); - bluesky_string_unref(block->data); - block->data = string; - block->type = BLUESKY_BLOCK_CACHED; + g_mutex_unlock(inode->lock); + bluesky_inode_unref(inode); } -/* Write the given block to cloud-backed storage and mark it clean. */ -void bluesky_block_flush(BlueSkyFS *fs, BlueSkyBlock *block) +/* Fetch an inode from stable storage. The fetch can be performed + * asynchronously: the in-memory inode is allocated, but not filled with data + * immediately. It is kept locked until it has been filled in, so any users + * should try to acquire the lock on the inode before accessing any data. The + * fs lock must be held. */ +void bluesky_inode_fetch(BlueSkyFS *fs, uint64_t inum) { - if (block->type != BLUESKY_BLOCK_DIRTY) - return; + char key[64]; + sprintf(key, "inode-%016"PRIx64, inum); + + BlueSkyInode *inode = bluesky_new_inode(inum, fs, BLUESKY_PENDING); + inode->change_count = 0; + bluesky_inode_ref(inode); // Extra ref held by fetching process + g_mutex_lock(inode->lock); + bluesky_insert_inode(fs, inode); + + BlueSkyStoreAsync *async = bluesky_store_async_new(fs->store); + async->op = STORE_OP_GET; + async->key = g_strdup(key); - GChecksum *csum = g_checksum_new(G_CHECKSUM_SHA256); - g_checksum_update(csum, block->data->data, block->data->len); - gchar *name = g_strdup(g_checksum_get_string(csum)); + bluesky_store_async_add_notifier(async, (GFunc)complete_inode_fetch, inode); + bluesky_store_async_submit(async); - g_print("Flushing block as %s\n", name); - s3store_put(fs->store, name, block->data); - g_free(block->ref); - block->ref = name; + if (bluesky_options.sync_inode_fetches) { + bluesky_store_async_wait(async); + } - /* block->type = BLUESKY_BLOCK_CACHED; */ - bluesky_string_unref(block->data); - block->data = NULL; - block->type = BLUESKY_BLOCK_REF; + bluesky_store_async_unref(async); +} - g_checksum_free(csum); +/* Synchronize filesystem superblock to stable storage. */ +void bluesky_superblock_flush(BlueSkyFS *fs) +{ +#if 0 + GString *buf = g_string_new(""); + bluesky_serialize_superblock(buf, fs); + BlueSkyRCStr *data = bluesky_string_new_from_gstring(buf); + + BlueSkyStoreAsync *async = bluesky_store_async_new(fs->store); + async->op = STORE_OP_PUT; + async->key = g_strdup("superblock"); + async->data = data; + bluesky_store_async_submit(async); + bluesky_store_async_unref(async); + + //bluesky_store_sync(fs->store); +#endif }