1 /* Blue Sky: File Systems in the Cloud
3 * Copyright (C) 2009 The Regents of the University of California
4 * Written by Michael Vrable <mvrable@cs.ucsd.edu>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the University nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 #include "bluesky-private.h"
38 /* Core filesystem: handling of regular files and caching of file data. */
40 /* Mark a given block dirty and make sure that data is faulted in so that it
43 * If preserve is set to false, this is a hint that the block is about to be
44 * entirely overwritten. In this case, a dirty block is made available but any
45 * prior contents might be lost. A value of preserve = TRUE is always safe. */
46 void bluesky_block_touch(BlueSkyInode *inode, uint64_t i, gboolean preserve)
48 g_return_if_fail(i < inode->blocks->len);
49 BlueSkyBlock *block = &g_array_index(inode->blocks, BlueSkyBlock, i);
52 if (i < inode->blocks->len - 1) {
53 block_len = BLUESKY_BLOCK_SIZE;
55 block_len = inode->size - i * BLUESKY_BLOCK_SIZE;
58 switch (block->type) {
59 case BLUESKY_BLOCK_ZERO:
60 block->dirty = bluesky_string_new(g_malloc0(block_len), block_len);
62 case BLUESKY_BLOCK_REF:
64 // FIXME: locking on the cloudlog?
65 bluesky_block_fetch(inode, block, NULL);
66 bluesky_string_ref(block->ref->data);
67 block->dirty = bluesky_string_dup(block->ref->data);
69 block->dirty = bluesky_string_new(g_malloc0(block_len), block_len);
72 case BLUESKY_BLOCK_DIRTY:
73 block->dirty = bluesky_string_dup(block->dirty);
77 if (block->type != BLUESKY_BLOCK_DIRTY)
78 g_atomic_int_add(&inode->fs->cache_dirty, 1);
80 block->type = BLUESKY_BLOCK_DIRTY;
81 bluesky_cloudlog_unref(block->ref);
85 /* Set the size of a file. This will truncate or extend the file as needed.
86 * Newly-allocated bytes are zeroed. */
88 void bluesky_file_truncate(BlueSkyInode *inode, uint64_t size)
90 g_return_if_fail(size <= BLUESKY_MAX_FILE_SIZE);
92 if (size == inode->size)
95 if (bluesky_verbose) {
96 g_log("bluesky/file", G_LOG_LEVEL_DEBUG,
97 "Truncating file to %"PRIi64" bytes", size);
100 uint64_t blocks = (size + BLUESKY_BLOCK_SIZE - 1) / BLUESKY_BLOCK_SIZE;
102 /* Calculate number of bytes in the last block of the file */
103 int lastblock_old, lastblock_new;
104 lastblock_old = inode->size % BLUESKY_BLOCK_SIZE;
105 if (lastblock_old == 0 && inode->size > 0)
106 lastblock_old = BLUESKY_BLOCK_SIZE;
107 lastblock_new = size % BLUESKY_BLOCK_SIZE;
108 if (lastblock_new == 0 && size > 0)
109 lastblock_new = BLUESKY_BLOCK_SIZE;
111 if (blocks > inode->blocks->len) {
112 /* Need to add new blocks to the end of a file. New block structures
113 * are automatically zeroed, which initializes them to be pointers to
114 * zero blocks so we don't need to do any more work. If the
115 * previously-last block in the file is smaller than
116 * BLUESKY_BLOCK_SIZE, extend it to full size. */
117 if (inode->blocks->len > 0) {
118 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
119 inode->blocks->len - 1);
121 if (b->type != BLUESKY_BLOCK_ZERO
122 && lastblock_old < BLUESKY_BLOCK_SIZE) {
123 bluesky_block_touch(inode, inode->blocks->len - 1, TRUE);
124 gsize old_size = b->dirty->len;
125 if (lastblock_old != old_size) {
127 "Warning: last block size = %zd, expected %d\n",
128 old_size, lastblock_old);
130 bluesky_string_resize(b->dirty, BLUESKY_BLOCK_SIZE);
131 memset(&b->dirty->data[old_size], 0,
132 BLUESKY_BLOCK_SIZE - old_size);
136 g_array_set_size(inode->blocks, blocks);
137 } else if (blocks < inode->blocks->len) {
138 /* Delete blocks from a file. Must reclaim memory. */
139 for (guint i = blocks; i < inode->blocks->len; i++) {
140 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
141 if (b->type == BLUESKY_BLOCK_DIRTY)
142 g_atomic_int_add(&inode->fs->cache_dirty, -1);
143 bluesky_string_unref(b->dirty);
144 bluesky_cloudlog_unref(b->ref);
146 g_array_set_size(inode->blocks, blocks);
149 /* Ensure the new last block of the file is properly sized. If the block
150 * is extended, newly-added bytes must be zeroed. */
152 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
155 gboolean need_resize = TRUE;
156 if (b->type == BLUESKY_BLOCK_ZERO)
158 else if (size < inode->size && lastblock_new == BLUESKY_BLOCK_SIZE)
162 bluesky_block_touch(inode, blocks - 1, TRUE);
163 gsize old_size = b->dirty->len;
164 gsize new_size = size - (blocks - 1) * BLUESKY_BLOCK_SIZE;
166 bluesky_string_resize(b->dirty, new_size);
168 if (new_size > old_size) {
169 memset(&b->dirty->data[old_size], 0, new_size - old_size);
175 bluesky_inode_update_ctime(inode, 1);
178 void bluesky_file_write(BlueSkyInode *inode, uint64_t offset,
179 const char *data, gint len)
181 g_return_if_fail(inode->type == BLUESKY_REGULAR);
182 g_return_if_fail(offset < inode->size);
183 g_return_if_fail(len <= inode->size - offset);
189 uint64_t block_num = offset / BLUESKY_BLOCK_SIZE;
190 gint block_offset = offset % BLUESKY_BLOCK_SIZE;
191 gint bytes = MIN(BLUESKY_BLOCK_SIZE - block_offset, len);
193 gboolean preserve = TRUE;
194 gsize block_size = BLUESKY_BLOCK_SIZE;
195 if (block_num == inode->blocks->len - 1) {
196 block_size = inode->size - block_num * BLUESKY_BLOCK_SIZE;
198 if (block_offset == 0 && bytes == block_size) {
201 bluesky_block_touch(inode, block_num, preserve);
202 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
204 memcpy(&b->dirty->data[block_offset], data, bytes);
211 bluesky_inode_update_ctime(inode, 1);
214 void bluesky_file_read(BlueSkyInode *inode, uint64_t offset,
217 if (len == 0 && offset <= inode->size)
220 g_return_if_fail(inode->type == BLUESKY_REGULAR);
221 g_return_if_fail(offset < inode->size);
222 g_return_if_fail(len <= inode->size - offset);
224 BlueSkyProfile *profile = bluesky_profile_get();
226 bluesky_profile_add_event(profile,
227 g_strdup_printf("Start file read prefetch"));
228 uint64_t start_block, end_block;
229 start_block = offset / BLUESKY_BLOCK_SIZE;
230 end_block = (offset + len - 1) / BLUESKY_BLOCK_SIZE;
231 for (uint64_t i = start_block; i <= end_block; i++) {
232 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
234 if (b->type == BLUESKY_BLOCK_REF)
235 bluesky_cloudlog_prefetch(b->ref);
238 bluesky_profile_add_event(profile,
239 g_strdup_printf("End file read prefetch"));
242 uint64_t block_num = offset / BLUESKY_BLOCK_SIZE;
243 gint block_offset = offset % BLUESKY_BLOCK_SIZE;
244 gint bytes = MIN(BLUESKY_BLOCK_SIZE - block_offset, len);
246 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
248 if (b->type == BLUESKY_BLOCK_ZERO) {
249 memset(buf, 0, bytes);
251 BlueSkyRCStr *data = NULL;
252 if (b->type == BLUESKY_BLOCK_REF) {
253 bluesky_block_fetch(inode, b, NULL);
255 } else if (b->type == BLUESKY_BLOCK_DIRTY) {
258 memcpy(buf, &data->data[block_offset], bytes);
266 bluesky_profile_add_event(profile,
267 g_strdup_printf("BlueSky read complete"));
270 void bluesky_block_fetch(BlueSkyInode *inode, BlueSkyBlock *block,
271 BlueSkyStoreAsync *barrier)
273 if (block->type != BLUESKY_BLOCK_REF)
276 g_mutex_lock(block->ref->lock);
277 bluesky_cloudlog_fetch(block->ref);
278 g_mutex_unlock(block->ref->lock);
279 block->type = BLUESKY_BLOCK_REF;
282 /* Write the given block to cloud-backed storage and mark it clean. */
283 void bluesky_block_flush(BlueSkyInode *inode, BlueSkyBlock *block,
286 BlueSkyFS *fs = inode->fs;
288 if (block->type != BLUESKY_BLOCK_DIRTY)
291 g_assert(block->ref == NULL);
293 BlueSkyCloudLog *cloudlog = bluesky_cloudlog_new(fs, NULL);
294 cloudlog->type = LOGTYPE_DATA;
295 cloudlog->inum = inode->inum;
296 cloudlog->data = block->dirty; // String ownership is transferred
297 bluesky_cloudlog_stats_update(cloudlog, 1);
298 bluesky_cloudlog_sync(cloudlog);
299 bluesky_cloudlog_ref(cloudlog); // Reference for log_items list
300 *log_items = g_list_prepend(*log_items, cloudlog);
301 bluesky_cloudlog_insert(cloudlog);
303 block->ref = cloudlog; // Uses initial reference from _new()
305 block->type = BLUESKY_BLOCK_REF;
307 g_atomic_int_add(&fs->cache_dirty, -1);
310 /* Flush all blocks in a file to stable storage. */
311 void bluesky_file_flush(BlueSkyInode *inode, GList **log_items)
313 g_return_if_fail(inode->type == BLUESKY_REGULAR);
315 for (int i = 0; i < inode->blocks->len; i++) {
316 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
317 bluesky_block_flush(inode, b, log_items);
321 /* Drop clean data blocks for a file from cache. */
322 void bluesky_file_drop_cached(BlueSkyInode *inode)
324 g_return_if_fail(inode->type == BLUESKY_REGULAR);
326 for (int i = 0; i < inode->blocks->len; i++) {
327 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
328 if (b->type == BLUESKY_BLOCK_REF) {
329 g_mutex_lock(b->ref->lock);
330 if (b->ref->data != NULL
331 && g_atomic_int_get(&b->ref->data_lock_count) == 0
332 && (b->ref->location_flags != 0))
334 bluesky_cloudlog_stats_update(b->ref, -1);
335 bluesky_string_unref(b->ref->data);
337 bluesky_cloudlog_stats_update(b->ref, 1);
339 g_mutex_unlock(b->ref->lock);