/* Blue Sky: File Systems in the Cloud
 *
 * Copyright (C) 2009  The Regents of the University of California
 * Written by Michael Vrable <mvrable@cs.ucsd.edu>
 */
14 #include "bluesky-private.h"
/* Core filesystem: handling of regular files and caching of file data. */
18 /* Mark a given block dirty and make sure that data is faulted in so that it
21 * If preserve is set to false, this is a hint that the block is about to be
22 * entirely overwritten. In this case, a dirty block is made available but any
23 * prior contents might be lost. A value of preserve = TRUE is always safe. */
24 void bluesky_block_touch(BlueSkyInode *inode, uint64_t i, gboolean preserve)
26 g_return_if_fail(i < inode->blocks->len);
27 BlueSkyBlock *block = &g_array_index(inode->blocks, BlueSkyBlock, i);
30 if (i < inode->blocks->len - 1) {
31 block_len = BLUESKY_BLOCK_SIZE;
33 block_len = inode->size - i * BLUESKY_BLOCK_SIZE;
36 switch (block->type) {
37 case BLUESKY_BLOCK_ZERO:
38 block->dirty = bluesky_string_new(g_malloc0(block_len), block_len);
40 case BLUESKY_BLOCK_REF:
42 // FIXME: locking on the cloudlog?
43 bluesky_block_fetch(inode, block, NULL);
44 bluesky_string_ref(block->ref->data);
45 block->dirty = bluesky_string_dup(block->ref->data);
47 block->dirty = bluesky_string_new(g_malloc0(block_len), block_len);
50 case BLUESKY_BLOCK_DIRTY:
51 block->dirty = bluesky_string_dup(block->dirty);
55 if (block->type != BLUESKY_BLOCK_DIRTY)
56 g_atomic_int_add(&inode->fs->cache_dirty, 1);
58 block->type = BLUESKY_BLOCK_DIRTY;
59 bluesky_cloudlog_unref(block->ref);
63 /* Set the size of a file. This will truncate or extend the file as needed.
64 * Newly-allocated bytes are zeroed. */
66 void bluesky_file_truncate(BlueSkyInode *inode, uint64_t size)
68 g_return_if_fail(size <= BLUESKY_MAX_FILE_SIZE);
70 if (size == inode->size)
73 if (bluesky_verbose) {
74 g_log("bluesky/file", G_LOG_LEVEL_DEBUG,
75 "Truncating file to %"PRIi64" bytes", size);
78 uint64_t blocks = (size + BLUESKY_BLOCK_SIZE - 1) / BLUESKY_BLOCK_SIZE;
80 /* Calculate number of bytes in the last block of the file */
81 int lastblock_old, lastblock_new;
82 lastblock_old = inode->size % BLUESKY_BLOCK_SIZE;
83 if (lastblock_old == 0 && inode->size > 0)
84 lastblock_old = BLUESKY_BLOCK_SIZE;
85 lastblock_new = size % BLUESKY_BLOCK_SIZE;
86 if (lastblock_new == 0 && size > 0)
87 lastblock_new = BLUESKY_BLOCK_SIZE;
89 if (blocks > inode->blocks->len) {
90 /* Need to add new blocks to the end of a file. New block structures
91 * are automatically zeroed, which initializes them to be pointers to
92 * zero blocks so we don't need to do any more work. If the
93 * previously-last block in the file is smaller than
94 * BLUESKY_BLOCK_SIZE, extend it to full size. */
95 if (inode->blocks->len > 0) {
96 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
97 inode->blocks->len - 1);
99 if (b->type != BLUESKY_BLOCK_ZERO
100 && lastblock_old < BLUESKY_BLOCK_SIZE) {
101 bluesky_block_touch(inode, inode->blocks->len - 1, TRUE);
102 gsize old_size = b->dirty->len;
103 if (lastblock_old != old_size) {
105 "Warning: last block size = %zd, expected %d\n",
106 old_size, lastblock_old);
108 bluesky_string_resize(b->dirty, BLUESKY_BLOCK_SIZE);
109 memset(&b->dirty->data[old_size], 0,
110 BLUESKY_BLOCK_SIZE - old_size);
114 g_array_set_size(inode->blocks, blocks);
115 } else if (blocks < inode->blocks->len) {
116 /* Delete blocks from a file. Must reclaim memory. */
117 for (guint i = blocks; i < inode->blocks->len; i++) {
118 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
119 if (b->type == BLUESKY_BLOCK_DIRTY)
120 g_atomic_int_add(&inode->fs->cache_dirty, -1);
121 bluesky_string_unref(b->dirty);
122 bluesky_cloudlog_unref(b->ref);
124 g_array_set_size(inode->blocks, blocks);
127 /* Ensure the new last block of the file is properly sized. If the block
128 * is extended, newly-added bytes must be zeroed. */
130 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
133 gboolean need_resize = TRUE;
134 if (b->type == BLUESKY_BLOCK_ZERO)
136 else if (size < inode->size && lastblock_new == BLUESKY_BLOCK_SIZE)
140 bluesky_block_touch(inode, blocks - 1, TRUE);
141 gsize old_size = b->dirty->len;
142 gsize new_size = size - (blocks - 1) * BLUESKY_BLOCK_SIZE;
144 bluesky_string_resize(b->dirty, new_size);
146 if (new_size > old_size) {
147 memset(&b->dirty->data[old_size], 0, new_size - old_size);
153 bluesky_inode_update_ctime(inode, 1);
156 void bluesky_file_write(BlueSkyInode *inode, uint64_t offset,
157 const char *data, gint len)
159 g_return_if_fail(inode->type == BLUESKY_REGULAR);
160 g_return_if_fail(offset < inode->size);
161 g_return_if_fail(len <= inode->size - offset);
167 uint64_t block_num = offset / BLUESKY_BLOCK_SIZE;
168 gint block_offset = offset % BLUESKY_BLOCK_SIZE;
169 gint bytes = MIN(BLUESKY_BLOCK_SIZE - block_offset, len);
171 gboolean preserve = TRUE;
172 gsize block_size = BLUESKY_BLOCK_SIZE;
173 if (block_num == inode->blocks->len - 1) {
174 block_size = inode->size - block_num * BLUESKY_BLOCK_SIZE;
176 if (block_offset == 0 && bytes == block_size) {
179 bluesky_block_touch(inode, block_num, preserve);
180 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
182 memcpy(&b->dirty->data[block_offset], data, bytes);
189 bluesky_inode_update_ctime(inode, 1);
192 void bluesky_file_read(BlueSkyInode *inode, uint64_t offset,
195 if (len == 0 && offset <= inode->size)
198 g_return_if_fail(inode->type == BLUESKY_REGULAR);
199 g_return_if_fail(offset < inode->size);
200 g_return_if_fail(len <= inode->size - offset);
202 BlueSkyProfile *profile = bluesky_profile_get();
204 bluesky_profile_add_event(profile,
205 g_strdup_printf("Start file read prefetch"));
206 uint64_t start_block, end_block;
207 start_block = offset / BLUESKY_BLOCK_SIZE;
208 end_block = (offset + len - 1) / BLUESKY_BLOCK_SIZE;
209 for (uint64_t i = start_block; i <= end_block; i++) {
210 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
212 if (b->type == BLUESKY_BLOCK_REF)
213 bluesky_cloudlog_prefetch(b->ref);
216 bluesky_profile_add_event(profile,
217 g_strdup_printf("End file read prefetch"));
220 uint64_t block_num = offset / BLUESKY_BLOCK_SIZE;
221 gint block_offset = offset % BLUESKY_BLOCK_SIZE;
222 gint bytes = MIN(BLUESKY_BLOCK_SIZE - block_offset, len);
224 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock,
226 if (b->type == BLUESKY_BLOCK_ZERO) {
227 memset(buf, 0, bytes);
229 BlueSkyRCStr *data = NULL;
230 if (b->type == BLUESKY_BLOCK_REF) {
231 bluesky_block_fetch(inode, b, NULL);
233 } else if (b->type == BLUESKY_BLOCK_DIRTY) {
236 memcpy(buf, &data->data[block_offset], bytes);
244 bluesky_profile_add_event(profile,
245 g_strdup_printf("BlueSky read complete"));
248 void bluesky_block_fetch(BlueSkyInode *inode, BlueSkyBlock *block,
249 BlueSkyStoreAsync *barrier)
251 if (block->type != BLUESKY_BLOCK_REF)
254 g_mutex_lock(block->ref->lock);
255 bluesky_cloudlog_fetch(block->ref);
256 g_mutex_unlock(block->ref->lock);
257 block->type = BLUESKY_BLOCK_REF;
260 /* Write the given block to cloud-backed storage and mark it clean. */
261 void bluesky_block_flush(BlueSkyInode *inode, BlueSkyBlock *block,
264 BlueSkyFS *fs = inode->fs;
266 if (block->type != BLUESKY_BLOCK_DIRTY)
269 g_assert(block->ref == NULL);
271 BlueSkyCloudLog *cloudlog = bluesky_cloudlog_new(fs, NULL);
272 cloudlog->type = LOGTYPE_DATA;
273 cloudlog->inum = inode->inum;
274 cloudlog->data = block->dirty; // String ownership is transferred
275 bluesky_cloudlog_stats_update(cloudlog, 1);
276 bluesky_cloudlog_sync(cloudlog);
277 bluesky_cloudlog_ref(cloudlog); // Reference for log_items list
278 *log_items = g_list_prepend(*log_items, cloudlog);
279 bluesky_cloudlog_insert(cloudlog);
281 block->ref = cloudlog; // Uses initial reference from _new()
283 block->type = BLUESKY_BLOCK_REF;
285 g_atomic_int_add(&fs->cache_dirty, -1);
288 /* Flush all blocks in a file to stable storage. */
289 void bluesky_file_flush(BlueSkyInode *inode, GList **log_items)
291 g_return_if_fail(inode->type == BLUESKY_REGULAR);
293 for (int i = 0; i < inode->blocks->len; i++) {
294 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
295 bluesky_block_flush(inode, b, log_items);
299 /* Drop clean data blocks for a file from cache. */
300 void bluesky_file_drop_cached(BlueSkyInode *inode)
302 g_return_if_fail(inode->type == BLUESKY_REGULAR);
304 for (int i = 0; i < inode->blocks->len; i++) {
305 BlueSkyBlock *b = &g_array_index(inode->blocks, BlueSkyBlock, i);
306 if (b->type == BLUESKY_BLOCK_REF) {
307 g_mutex_lock(b->ref->lock);
308 if (b->ref->data != NULL
309 && g_atomic_int_get(&b->ref->data_lock_count) == 0
310 && (b->ref->location_flags != 0))
312 bluesky_cloudlog_stats_update(b->ref, -1);
313 bluesky_string_unref(b->ref->data);
315 bluesky_cloudlog_stats_update(b->ref, 1);
317 g_mutex_unlock(b->ref->lock);