Rework caching of data blocks to eliminate double-caching.
authorMichael Vrable <mvrable@cs.ucsd.edu>
Wed, 4 Aug 2010 22:32:00 +0000 (15:32 -0700)
committerMichael Vrable <mvrable@cs.ucsd.edu>
Wed, 4 Aug 2010 22:32:00 +0000 (15:32 -0700)
Previously data could be cached both as a cloud log item and as a string at
the inode level.  Now cache as a string for dirty data and a cloud log item
for clean data.

bluesky/bluesky-private.h
bluesky/bluesky.h
bluesky/cloudlog.c
bluesky/debug.c
bluesky/file.c
bluesky/inode.c
bluesky/log.c
bluesky/serialize.c

index 0f86f45..418c1e5 100644 (file)
@@ -165,6 +165,7 @@ typedef struct {
 } BlueSkyCloudPointer;
 
 typedef enum {
+    LOGTYPE_INVALID = -1,
     LOGTYPE_UNKNOWN = 0,
     LOGTYPE_DATA = 1,
     LOGTYPE_INODE = 2,
@@ -209,8 +210,11 @@ struct _BlueSkyCloudLog {
     // Pointers to other objects
     GArray *pointers;
 
-    // Serialized data, if available in memory (otherwise NULL).
+    // Serialized data, if available in memory (otherwise NULL), and a lock
+    // count which tracks if there are users that require the data to be kept
+    // around.
     BlueSkyRCStr *data;
+    int data_lock_count;
 };
 
 /* Serialize objects into a log segment to be written to the cloud. */
index 8846488..e9e40d8 100644 (file)
@@ -272,15 +272,14 @@ typedef struct {
 #define BLUESKY_MAX_FILE_SIZE (BLUESKY_BLOCK_SIZE << 24)
 typedef enum {
     BLUESKY_BLOCK_ZERO = 0,     /* Data is all zeroes, not explicitly stored */
-    BLUESKY_BLOCK_REF = 1,      /* Reference to key/value store, not cached */
-    BLUESKY_BLOCK_CACHED = 2,   /* Data is cached in memory, clean */
-    BLUESKY_BLOCK_DIRTY = 3,    /* Data needs to be committed to store */
+    BLUESKY_BLOCK_REF = 1,      /* Reference to cloud log item, data clean */
+    BLUESKY_BLOCK_DIRTY = 2,    /* Data needs to be committed to store */
 } BlueSkyBlockType;
 
 typedef struct {
     BlueSkyBlockType type;
-    BlueSkyRCStr *data;         /* Pointer to data in memory if cached */
-    BlueSkyCloudLog *cloudref;  /* Reference to cloud log entry with data */
+    BlueSkyCloudLog *ref;       /* if REF: cloud log entry with data */
+    BlueSkyRCStr *dirty;        /* if DIRTY: raw data in memory */
 } BlueSkyBlock;
 
 BlueSkyFS *bluesky_init_fs(gchar *name, BlueSkyStore *store);
index 0d91e0d..6b80233 100644 (file)
@@ -132,6 +132,7 @@ void bluesky_cloudlog_unref(BlueSkyCloudLog *log)
         g_hash_table_remove(fs->locations, &log->id);
         g_mutex_unlock(fs->lock);
 
+        log->type = LOGTYPE_INVALID;
         g_mutex_free(log->lock);
         g_cond_free(log->cond);
         g_array_unref(log->pointers);
@@ -190,6 +191,7 @@ BlueSkyCloudPointer bluesky_cloudlog_serialize(BlueSkyCloudLog *log,
         BlueSkyCloudLog *log2
             = (BlueSkyCloudLog *)g_hash_table_lookup(log->fs->locations, &id);
         // TODO: refcount
+        bluesky_cloudlog_fetch(log2);
         g_assert(log2 != NULL);
         bluesky_cloudlog_ref(log2);
         g_mutex_unlock(log->fs->lock);
@@ -247,6 +249,7 @@ void bluesky_cloudlog_write_log(BlueSkyFS *fs)
     while (state->inode_list != NULL) {
         BlueSkyCloudLog *log = (BlueSkyCloudLog *)state->inode_list->data;
         bluesky_cloudlog_serialize(log, state);
+        bluesky_cloudlog_unref(log);
         state->inode_list = g_list_delete_link(state->inode_list,
                                                state->inode_list);
     }
index 1df8382..6242ba6 100644 (file)
@@ -45,7 +45,8 @@ static void cloudlog_dump(gpointer key, gpointer value, gpointer user_data)
     }
     g_print(": refs=%d ty=%d inode=%"PRIu64" locs=%x log@(%d,%d) cloud@(%d,%d,%d)\n",
             log->refcount,
-            log->type, log->inum, log->location_flags,
+            log->type, log->inum,
+            log->location_flags | (log->data != NULL ? 0x100 : 0),
             log->log_seq, log->log_offset, log->location.directory,
             log->location.sequence, log->location.offset);
 }
index b329531..977618d 100644 (file)
@@ -31,31 +31,33 @@ void bluesky_block_touch(BlueSkyInode *inode, uint64_t i)
 
     switch (block->type) {
     case BLUESKY_BLOCK_ZERO:
-        block->data = bluesky_string_new(g_malloc0(block_len), block_len);
+        block->dirty = bluesky_string_new(g_malloc0(block_len), block_len);
         break;
     case BLUESKY_BLOCK_REF:
+        // FIXME: locking on the cloudlog?
         bluesky_block_fetch(inode, block, NULL);
-        g_assert(block->type == BLUESKY_BLOCK_CACHED);
-        /* Fall through */
-    case BLUESKY_BLOCK_CACHED:
+        bluesky_string_ref(block->ref->data);
+        block->dirty = bluesky_string_dup(block->ref->data);
+        break;
     case BLUESKY_BLOCK_DIRTY:
-        block->data = bluesky_string_dup(block->data);
+        block->dirty = bluesky_string_dup(block->dirty);
         break;
     }
 
-    if (block->type != BLUESKY_BLOCK_CACHED
+    /*if (block->type != BLUESKY_BLOCK_CACHED
             && block->type != BLUESKY_BLOCK_DIRTY)
-        g_atomic_int_add(&inode->fs->cache_total, 1);
+        g_atomic_int_add(&inode->fs->cache_total, 1);   //FIXME */
     if (block->type != BLUESKY_BLOCK_DIRTY)
         g_atomic_int_add(&inode->fs->cache_dirty, 1);
 
     block->type = BLUESKY_BLOCK_DIRTY;
-    bluesky_cloudlog_unref(block->cloudref);
-    block->cloudref = NULL;
+    bluesky_cloudlog_unref(block->ref);
+    block->ref = NULL;
 }
 
 /* Set the size of a file.  This will truncate or extend the file as needed.
  * Newly-allocated bytes are zeroed. */
+// FIXME
 void bluesky_file_truncate(BlueSkyInode *inode, uint64_t size)
 {
     g_return_if_fail(size <= BLUESKY_MAX_FILE_SIZE);
@@ -82,11 +84,11 @@ void bluesky_file_truncate(BlueSkyInode *inode, uint64_t size)
 
             if (b->type != BLUESKY_BLOCK_ZERO
                     && (b->type == BLUESKY_BLOCK_REF
-                        || b->data->len < BLUESKY_BLOCK_SIZE)) {
+                        || b->dirty->len < BLUESKY_BLOCK_SIZE)) {
                 bluesky_block_touch(inode, inode->blocks->len - 1);
-                gsize old_size = b->data->len;
-                bluesky_string_resize(b->data, BLUESKY_BLOCK_SIZE);
-                memset(&b->data->data[old_size], 0,
+                gsize old_size = b->dirty->len;
+                bluesky_string_resize(b->dirty, BLUESKY_BLOCK_SIZE);
+                memset(&b->dirty->data[old_size], 0,
                        BLUESKY_BLOCK_SIZE - old_size);
             }
         }
@@ -96,13 +98,13 @@ void bluesky_file_truncate(BlueSkyInode *inode, uint64_t size)
         /* Delete blocks from a file.  Must reclaim memory. */
         for (guint i = inode->blocks->len; i < blocks; i++) {
             BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
-            if (b->type == BLUESKY_BLOCK_CACHED
+            /* if (b->type == BLUESKY_BLOCK_CACHED
                     || b->type == BLUESKY_BLOCK_DIRTY)
-                g_atomic_int_add(&inode->fs->cache_total, -1);
+                g_atomic_int_add(&inode->fs->cache_total, -1); FIXME */
             if (b->type == BLUESKY_BLOCK_DIRTY)
                 g_atomic_int_add(&inode->fs->cache_dirty, -1);
-            bluesky_string_unref(b->data);
-            bluesky_cloudlog_unref(b->cloudref);
+            bluesky_string_unref(b->dirty);
+            bluesky_cloudlog_unref(b->ref);
         }
         g_array_set_size(inode->blocks, blocks);
     }
@@ -115,13 +117,13 @@ void bluesky_file_truncate(BlueSkyInode *inode, uint64_t size)
 
         if (b->type != BLUESKY_BLOCK_ZERO) {
             bluesky_block_touch(inode, blocks - 1);
-            gsize old_size = b->data->len;
+            gsize old_size = b->dirty->len;
             gsize new_size = size - (blocks - 1) * BLUESKY_BLOCK_SIZE;
 
-            bluesky_string_resize(b->data, new_size);
+            bluesky_string_resize(b->dirty, new_size);
 
             if (new_size > old_size) {
-                memset(&b->data->data[old_size], 0, new_size - old_size);
+                memset(&b->dirty->data[old_size], 0, new_size - old_size);
             }
         }
     }
@@ -140,6 +142,8 @@ void bluesky_file_write(BlueSkyInode *inode, uint64_t offset,
     if (len == 0)
         return;
 
+    // TODO: Optimization: If we are entirely overwriting a block we don't need
+    // to fetch it frm storage first.
     while (len > 0) {
         uint64_t block_num = offset / BLUESKY_BLOCK_SIZE;
         gint block_offset = offset % BLUESKY_BLOCK_SIZE;
@@ -148,7 +152,7 @@ void bluesky_file_write(BlueSkyInode *inode, uint64_t offset,
         bluesky_block_touch(inode, block_num);
         BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
                                          block_num);
-        memcpy(&b->data->data[block_offset], data, bytes);
+        memcpy(&b->dirty->data[block_offset], data, bytes);
 
         offset += bytes;
         data += bytes;
@@ -168,6 +172,7 @@ void bluesky_file_read(BlueSkyInode *inode, uint64_t offset,
     g_return_if_fail(offset < inode->size);
     g_return_if_fail(len <= inode->size - offset);
 
+#if 0
     /* Start fetches on any data blocks that we will need for this read. */
     BlueSkyStoreAsync *barrier = bluesky_store_async_new(inode->fs->store);
     barrier->op = STORE_OP_BARRIER;
@@ -191,6 +196,7 @@ void bluesky_file_read(BlueSkyInode *inode, uint64_t offset,
     if (bluesky_verbose) {
         g_log("bluesky/file", G_LOG_LEVEL_DEBUG, "Prefetch complete.");
     }
+#endif
 
     while (len > 0) {
         uint64_t block_num = offset / BLUESKY_BLOCK_SIZE;
@@ -199,17 +205,17 @@ void bluesky_file_read(BlueSkyInode *inode, uint64_t offset,
 
         BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
                                          block_num);
-        switch (b->type) {
-        case BLUESKY_BLOCK_ZERO:
+        if (b->type == BLUESKY_BLOCK_ZERO) {
             memset(buf, 0, bytes);
-            break;
-        case BLUESKY_BLOCK_REF:
-            bluesky_block_fetch(inode, b, NULL);
-            /* Fall through */
-        case BLUESKY_BLOCK_CACHED:
-        case BLUESKY_BLOCK_DIRTY:
-            memcpy(buf, &b->data->data[block_offset], bytes);
-            break;
+        } else {
+            BlueSkyRCStr *data = NULL;
+            if (b->type == BLUESKY_BLOCK_REF) {
+                bluesky_block_fetch(inode, b, NULL);
+                data = b->ref->data;
+            } else if (b->type == BLUESKY_BLOCK_DIRTY) {
+                data = b->dirty;
+            }
+            memcpy(buf, &data->data[block_offset], bytes);
         }
 
         offset += bytes;
@@ -224,13 +230,11 @@ void bluesky_block_fetch(BlueSkyInode *inode, BlueSkyBlock *block,
     if (block->type != BLUESKY_BLOCK_REF)
         return;
 
-    g_mutex_lock(block->cloudref->lock);
-    bluesky_cloudlog_fetch(block->cloudref);
-    block->data = block->cloudref->data;
-    bluesky_string_ref(block->data);
-    g_mutex_unlock(block->cloudref->lock);
-    block->type = BLUESKY_BLOCK_CACHED;
-    g_atomic_int_add(&inode->fs->cache_total, 1);
+    g_mutex_lock(block->ref->lock);
+    bluesky_cloudlog_fetch(block->ref);
+    g_mutex_unlock(block->ref->lock);
+    block->type = BLUESKY_BLOCK_REF;
+    g_atomic_int_add(&inode->fs->cache_total, 1);  //FIXME
 }
 
 /* Write the given block to cloud-backed storage and mark it clean. */
@@ -242,22 +246,21 @@ void bluesky_block_flush(BlueSkyInode *inode, BlueSkyBlock *block,
     if (block->type != BLUESKY_BLOCK_DIRTY)
         return;
 
-    bluesky_cloudlog_unref(block->cloudref);
-
-    BlueSkyRCStr *data = block->data;
+    g_assert(block->ref == NULL);
 
     BlueSkyCloudLog *cloudlog = bluesky_cloudlog_new(fs);
     cloudlog->type = LOGTYPE_DATA;
     cloudlog->inum = inode->inum;
-    cloudlog->data = data;
-    bluesky_string_ref(data);
+    cloudlog->data = block->dirty;      // String ownership is transferred
     bluesky_cloudlog_sync(cloudlog);
+    bluesky_cloudlog_ref(cloudlog);     // Reference for log_items list
     *log_items = g_list_prepend(*log_items, cloudlog);
     bluesky_cloudlog_insert(cloudlog);
 
-    block->cloudref = cloudlog;
+    block->ref = cloudlog;              // Uses initial reference from _new()
 
-    block->type = BLUESKY_BLOCK_CACHED;
+    block->type = BLUESKY_BLOCK_REF;
+    block->dirty = NULL;
     g_atomic_int_add(&fs->cache_dirty, -1);
 }
 
@@ -279,25 +282,17 @@ void bluesky_file_drop_cached(BlueSkyInode *inode)
 
     for (int i = 0; i < inode->blocks->len; i++) {
         BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
-        if (b->type == BLUESKY_BLOCK_CACHED) {
-            if (bluesky_verbose) {
-                g_log("bluesky/cache", G_LOG_LEVEL_DEBUG,
-                      "Dropping block %d of inode %"PRIu64" from cache",
-                      i, inode->inum);
-                g_log("bluesky/cache", G_LOG_LEVEL_DEBUG,
-                      "  (reference count was %d)", b->data->refcount);
+        if (b->type == BLUESKY_BLOCK_REF) {
+            g_mutex_lock(b->ref->lock);
+            if (b->ref->data != NULL
+                && g_atomic_int_get(&b->ref->data_lock_count) == 0
+                && (b->ref->location_flags != 0))
+            {
+                bluesky_string_unref(b->ref->data);
+                b->ref->data = NULL;
             }
-
-            bluesky_string_unref(b->data);
-            b->data = NULL;
-            b->type = BLUESKY_BLOCK_REF;
+            g_mutex_unlock(b->ref->lock);
             g_atomic_int_add(&inode->fs->cache_total, -1);
-            g_mutex_lock(b->cloudref->lock);
-            if (b->cloudref->location_flags & CLOUDLOG_JOURNAL) {
-                bluesky_string_unref(b->cloudref->data);
-                b->cloudref->data = NULL;
-            }
-            g_mutex_unlock(b->cloudref->lock);
         }
     }
 }
index 6e9c0c8..a76b4b9 100644 (file)
@@ -173,8 +173,8 @@ void bluesky_inode_unref(BlueSkyInode *inode)
                 if (b->type == BLUESKY_BLOCK_DIRTY) {
                     g_error("Deleting an inode with dirty file data!");
                 }
-                bluesky_cloudlog_unref(b->cloudref);
-                bluesky_string_unref(b->data);
+                bluesky_cloudlog_unref(b->ref);
+                bluesky_string_unref(b->dirty);
             }
             g_array_unref(inode->blocks);
             break;
@@ -299,7 +299,6 @@ void bluesky_inode_start_sync(BlueSkyInode *inode)
 
     GString *buf = g_string_new("");
     bluesky_serialize_inode(buf, inode);
-    BlueSkyRCStr *data = bluesky_string_new_from_gstring(buf);
 
     char key[64];
     sprintf(key, "inode-%016"PRIx64, inode->inum);
@@ -307,26 +306,23 @@ void bluesky_inode_start_sync(BlueSkyInode *inode)
     BlueSkyCloudLog *cloudlog = bluesky_cloudlog_new(fs);
     cloudlog->type = LOGTYPE_INODE;
     cloudlog->inum = inode->inum;
-    cloudlog->data = data;
-    bluesky_string_ref(data);
+    cloudlog->data = bluesky_string_new_from_gstring(buf);
 
     if (inode->type == BLUESKY_REGULAR) {
         for (int i = 0; i < inode->blocks->len; i++) {
             BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
-            if (b->type == BLUESKY_BLOCK_CACHED
-                || b->type == BLUESKY_BLOCK_REF)
-            {
-                BlueSkyCloudID id = b->cloudref->id;
+            if (b->type == BLUESKY_BLOCK_REF) {
+                BlueSkyCloudID id = b->ref->id;
                 g_array_append_val(cloudlog->pointers, id);
             }
         }
     }
 
-    if (inode->committed_item != NULL)
-        bluesky_cloudlog_unref(inode->committed_item);
+    bluesky_cloudlog_unref(inode->committed_item);
     inode->committed_item = cloudlog;
 
     bluesky_cloudlog_sync(cloudlog);
+    bluesky_cloudlog_ref(cloudlog);
     log_items = g_list_prepend(log_items, cloudlog);
     bluesky_cloudlog_insert(cloudlog);
 
index 55daf7a..cec46cb 100644 (file)
@@ -82,6 +82,7 @@ static void log_commit(BlueSkyLog *log)
         g_cond_signal(item->cond);
         g_mutex_unlock(item->lock);
         log->committed = g_slist_delete_link(log->committed, log->committed);
+        bluesky_cloudlog_unref(item);
         batchsize++;
     }
 
@@ -153,6 +154,7 @@ static gpointer log_thread(gpointer d)
         if ((item->location_flags | item->pending_write) & CLOUDLOG_JOURNAL) {
             g_mutex_unlock(item->lock);
             bluesky_cloudlog_unref(item);
+            g_atomic_int_add(&item->data_lock_count, -1);
             continue;
         }
 
@@ -199,6 +201,7 @@ static gpointer log_thread(gpointer d)
         offset += sizeof(header) + sizeof(footer) + item->data->len;
 
         log->committed  = g_slist_prepend(log->committed, item);
+        g_atomic_int_add(&item->data_lock_count, -1);
         g_mutex_unlock(item->lock);
 
         /* Force an if there are no other log items currently waiting to be
@@ -235,6 +238,7 @@ BlueSkyLog *bluesky_log_new(const char *log_directory)
 void bluesky_log_item_submit(BlueSkyCloudLog *item, BlueSkyLog *log)
 {
     bluesky_cloudlog_ref(item);
+    g_atomic_int_add(&item->data_lock_count, 1);
     g_async_queue_push(log->queue, item);
 }
 
index 44233a1..a9e4182 100644 (file)
@@ -95,8 +95,8 @@ void bluesky_serialize_inode(GString *out, BlueSkyInode *inode)
             BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
             BlueSkyCloudID id;
             memset(&id, 0, sizeof(id));
-            if (b->cloudref != NULL)
-                id = b->cloudref->id;
+            if (b->ref != NULL)
+                id = b->ref->id;
             g_string_append_len(out, (const char *)&id, sizeof(id));
         }
         break;