1 /* Blue Sky: File Systems in the Cloud
3 * Copyright (C) 2009 The Regents of the University of California
4 * Written by Michael Vrable <mvrable@cs.ucsd.edu>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the University nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 #include <sys/syscall.h>
39 #include <sys/types.h>
41 #include "bluesky-private.h"
43 /* Miscellaneous useful functions that don't really fit anywhere else. */
45 bluesky_time_hires bluesky_now_hires()
49 if (clock_gettime(CLOCK_REALTIME, &time) != 0) {
50 perror("clock_gettime");
54 return (int64_t)(time.tv_sec) * 1000000000 + time.tv_nsec;
57 /* Convert a UTF-8 string to lowercase. This can be used to implement
58 * case-insensitive lookups and comparisons, by normalizing all values to
59 * lowercase first. Returns a newly-allocated string as a result. */
60 gchar *bluesky_lowercase(const gchar *s)
62 /* TODO: Unicode handling; for now just do ASCII. */
63 return g_ascii_strdown(s, -1);
66 gboolean bluesky_inode_is_ready(BlueSkyInode *inode)
71 g_mutex_lock(inode->lock);
72 gboolean valid = (inode->type != BLUESKY_PENDING
73 && inode->type != BLUESKY_INVALID);
75 g_mutex_unlock(inode->lock);
80 /**** Reference-counted strings. ****/
82 /* Create and return a new reference-counted string. The reference count is
83 * initially one. The newly-returned string takes ownership of the memory
84 * pointed at by data, and will call g_free on it when the reference count
86 BlueSkyRCStr *bluesky_string_new(gpointer data, gsize len)
88 BlueSkyRCStr *string = g_new(BlueSkyRCStr, 1);
92 g_atomic_int_set(&string->refcount, 1);
96 /* Create a new BlueSkyRCStr from a GString. The GString is destroyed. */
97 BlueSkyRCStr *bluesky_string_new_from_gstring(GString *s)
100 return bluesky_string_new(g_string_free(s, FALSE), len);
103 /* Create a new BlueSkyRCStr from a memory-mapped buffer. */
104 BlueSkyRCStr *bluesky_string_new_from_mmap(BlueSkyCacheFile *mmap,
105 int offset, gsize len)
107 g_assert(offset + len <= mmap->len);
108 g_assert(mmap->addr != NULL);
110 BlueSkyRCStr *string = g_new(BlueSkyRCStr, 1);
112 g_atomic_int_inc(&mmap->mapcount);
113 string->data = (char *)mmap->addr + offset;
115 g_atomic_int_set(&string->refcount, 1);
119 void bluesky_string_ref(BlueSkyRCStr *string)
124 g_atomic_int_inc(&string->refcount);
127 void bluesky_string_unref(BlueSkyRCStr *string)
132 if (g_atomic_int_dec_and_test(&string->refcount)) {
133 if (string->mmap == NULL) {
134 g_free(string->data);
136 bluesky_mmap_unref(string->mmap);
142 /* Duplicate and return a new reference-counted string, containing a copy of
143 * the original data, with a reference count of 1. As an optimization, if the
144 * passed-in string already has a reference count of 1, the original is
145 * returned. Can be used to make a mutable copy of a shared string. For this
146 * to truly be safe, it is probably needed that there be some type of lock
147 * protecting access to the string. */
148 BlueSkyRCStr *bluesky_string_dup(BlueSkyRCStr *string)
153 if (string->mmap != NULL) {
155 s = bluesky_string_new(g_memdup(string->data, string->len),
157 bluesky_string_unref(string);
161 if (g_atomic_int_dec_and_test(&string->refcount)) {
162 /* There are no other shared copies, so return this one. */
163 g_atomic_int_inc(&string->refcount);
166 return bluesky_string_new(g_memdup(string->data, string->len),
171 /* Resize the data block used by a BlueSkyRCStr. The data pointer might change
172 * after making this call, so it should not be cached across calls to this
173 * function. To avoid confusing any other users, the caller probably ought to
174 * hold the only reference to the string (by calling bluesky_string_dup first
176 void bluesky_string_resize(BlueSkyRCStr *string, gsize len)
178 g_assert(string->mmap == NULL);
180 if (string->len == len)
183 g_warn_if_fail(string->refcount == 1);
185 string->data = g_realloc(string->data, len);
189 /* Cache LRU list management functions. These manage the doubly-linked list of
190 * inodes sorted by accessed/modified time. The FS lock should be held while
193 * _remove will unlink an inode from the linked list.
195 * _prepend and _append insert an inode at the head or tail of the linked list,
196 * and return a pointer to the linked list element (which should be stored in
197 * the inode); the inode should not already be in the list.
199 * _head and _tail simply return the first or last item inode in the list. */
200 void bluesky_list_unlink(GList *head, GList *item)
205 if (head->prev == item)
206 head->prev = item->prev;
207 head->next = g_list_delete_link(head->next, item);
210 GList *bluesky_list_prepend(GList *head, BlueSkyInode *inode)
212 head->next = g_list_prepend(head->next, inode);
213 if (head->prev == NULL)
214 head->prev = g_list_last(head->next);
218 GList *bluesky_list_append(GList *head, BlueSkyInode *inode)
220 if (head->next == NULL)
221 return bluesky_list_prepend(head, inode);
223 g_assert(head->prev != NULL && head->prev->next == NULL);
225 GList *link = g_list_alloc();
228 link->prev = head->prev;
229 head->prev->next = link;
234 BlueSkyInode *bluesky_list_head(GList *head)
236 if (head->next == NULL)
239 return (BlueSkyInode *)head->next->data;
242 BlueSkyInode *bluesky_list_tail(GList *head)
244 if (head->prev == NULL)
247 return (BlueSkyInode *)head->prev->data;
250 /**** Range sets. ****/
252 /* These are a data structure which can track a set of discontiguous integer
253 * ranges--such as the partitioning of the inode number space or the bytes in a
254 * log file into objects. This current prototype implementation just tracks
255 * the starting offset with a hash table and doesn't track the length, but
256 * should be extended later to track properly. */
258 struct BlueSkyRangeset {
262 static gint compare64(gconstpointer a, gconstpointer b, gpointer user_data)
264 uint64_t x = *(uint64_t *)a;
265 uint64_t y = *(uint64_t *)b;
275 BlueSkyRangeset *bluesky_rangeset_new()
277 BlueSkyRangeset *rangeset = g_new(BlueSkyRangeset, 1);
278 rangeset->seq = g_sequence_new(g_free);
282 void bluesky_rangeset_free(BlueSkyRangeset *rangeset)
284 g_sequence_free(rangeset->seq);
288 gboolean bluesky_rangeset_insert(BlueSkyRangeset *rangeset,
289 uint64_t start, uint64_t length,
293 i = g_sequence_search(rangeset->seq, &start, compare64, NULL);
294 i = g_sequence_iter_prev(i);
296 /* TODO: Checks that no other item overlaps. */
298 BlueSkyRangesetItem *item = g_new(BlueSkyRangesetItem, 1);
300 item->length = length;
302 g_sequence_insert_sorted(rangeset->seq, item, compare64, NULL);
307 const BlueSkyRangesetItem *bluesky_rangeset_lookup(BlueSkyRangeset *rangeset,
311 i = g_sequence_search(rangeset->seq, &offset, compare64, NULL);
312 i = g_sequence_iter_prev(i);
313 if (g_sequence_iter_is_end(i))
316 BlueSkyRangesetItem *item = (BlueSkyRangesetItem *)g_sequence_get(i);
317 if (offset >= item->start && offset < item->start + item->length)
323 /* Look up the first rangeset item starting at or following the given address.
324 * Can be used to iterate through a rangeset. */
325 const BlueSkyRangesetItem *bluesky_rangeset_lookup_next(BlueSkyRangeset *rangeset, uint64_t offset)
328 i = g_sequence_search(rangeset->seq, &offset, compare64, NULL);
329 i = g_sequence_iter_prev(i);
330 if (g_sequence_iter_is_end(i))
332 BlueSkyRangesetItem *item = (BlueSkyRangesetItem *)g_sequence_get(i);
333 if (item->start < offset) {
334 i = g_sequence_iter_next(i);
335 if (g_sequence_iter_is_end(i))
338 item = (BlueSkyRangesetItem *)g_sequence_get(i);
343 /* Determine the full extent of a rangeset--the smallest offset covered and the
344 * length needed to extend to the end of the last item. */
345 void bluesky_rangeset_get_extents(BlueSkyRangeset *rangeset,
346 uint64_t *start, uint64_t *length)
349 BlueSkyRangesetItem *item;
351 i = g_sequence_get_begin_iter(rangeset->seq);
352 if (g_sequence_iter_is_end(i)) {
358 item = (BlueSkyRangesetItem *)g_sequence_get(i);
359 *start = item->start;
361 i = g_sequence_get_end_iter(rangeset->seq);
362 i = g_sequence_iter_prev(i);
363 item = (BlueSkyRangesetItem *)g_sequence_get(i);
364 *length = (item->start + item->length) - *start;
367 /**** Request response-time tracking. ****/
371 bluesky_time_hires timestamp;
375 /* To catch attempts to access to invalid profile structures. */
376 #define PROFILE_MAGIC 0x439929d8
378 static FILE *profiling_file = NULL;
380 BlueSkyProfile *bluesky_profile_new()
382 BlueSkyProfile *profile = g_new0(BlueSkyProfile, 1);
383 profile->lock = g_mutex_new();
384 profile->magic = PROFILE_MAGIC;
388 void bluesky_profile_free(BlueSkyProfile *profile)
390 if (profile->magic != PROFILE_MAGIC) {
391 g_warning("Access to invalid BlueSkyProfile object!");
394 while (profile->events != NULL) {
395 RTEvent *event = (RTEvent *)profile->events->data;
396 g_free(event->message);
398 profile->events = g_list_delete_link(profile->events, profile->events);
401 g_mutex_free(profile->lock);
402 g_free(profile->description);
406 void bluesky_profile_add_event(BlueSkyProfile *profile, char *message)
408 if (profiling_file == NULL)
411 g_return_if_fail(profile != NULL);
413 if (profile->magic != PROFILE_MAGIC) {
414 g_warning("Access to invalid BlueSkyProfile object!");
417 g_mutex_lock(profile->lock);
418 RTEvent *event = g_new(RTEvent, 1);
419 event->timestamp = bluesky_now_hires();
420 /* FIXME: Non-portable */
421 event->tid = syscall(SYS_gettid);
422 event->message = message;
423 profile->events = g_list_prepend(profile->events, event);
424 g_mutex_unlock(profile->lock);
427 static GStaticMutex profiling_print_lock = G_STATIC_MUTEX_INIT;
429 void bluesky_profile_set_output(FILE *stream)
431 profiling_file = stream;
434 void bluesky_profile_print(BlueSkyProfile *profile)
436 FILE *stream = profiling_file;
440 g_return_if_fail(profile != NULL);
442 if (profile->magic != PROFILE_MAGIC) {
443 g_warning("Access to invalid BlueSkyProfile object!");
447 g_mutex_lock(profile->lock);
448 g_static_mutex_lock(&profiling_print_lock);
449 fprintf(stream, "Event Timeline: %s\n", profile->description);
450 GList *link = g_list_last(profile->events);
451 bluesky_time_hires last_time = 0;
452 while (link != NULL) {
453 RTEvent *event = (RTEvent *)link->data;
454 fprintf(stream, " [%d] [%"PRIi64" ns]: %s\n",
455 event->tid, event->timestamp - last_time, event->message);
456 last_time = event->timestamp;
459 g_static_mutex_unlock(&profiling_print_lock);
460 g_mutex_unlock(profile->lock);
463 static GStaticPrivate per_thread_profile = G_STATIC_PRIVATE_INIT;
465 BlueSkyProfile *bluesky_profile_get()
467 return (BlueSkyProfile *)g_static_private_get(&per_thread_profile);
470 void bluesky_profile_set(BlueSkyProfile *profile)
472 g_static_private_set(&per_thread_profile, profile, NULL);