1 /* Blue Sky: File Systems in the Cloud
3 * Copyright (C) 2010 The Regents of the University of California
4 * Written by Michael Vrable <mvrable@cs.ucsd.edu>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the University nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 /* A simple tool for benchmarking various logging strategies.
33 * We want to log a series of key/value pairs. Approaches that we try include:
34 * - Data written directly into the filesystem.
35 * - Data is written to a Berkeley DB.
36 * - Data is appended to a log file.
37 * In all cases we want to ensure that data is persistent on disk so it could
38 * be used for crash recovery. We measure how many log records we can write
39 * per second to gauge performance. */
42 #define _ATFILE_SOURCE
49 #include <sys/types.h>
63 int queue_capacity = 1024;
66 int opt_batchsize = 1;
67 int opt_writes = (1 << 12);
68 int opt_bdb_async = FALSE;
73 GCond *cond_empty, *cond_full;
78 clock_gettime(CLOCK_MONOTONIC, &ts);
80 return ts.tv_sec * 1000000000LL + ts.tv_nsec;
83 struct item *get_item()
85 return (struct item *)g_async_queue_pop(queue);
88 void finish_item(struct item *item)
97 g_cond_signal(cond_empty);
98 if (outstanding < queue_capacity)
99 g_cond_signal(cond_full);
100 g_mutex_unlock(lock);
/* Write all len bytes starting at buf to the file descriptor fd.
 *
 * write() may accept fewer bytes than requested, so it is called repeatedly
 * until nothing remains.  A call interrupted by a signal (EINTR) is simply
 * retried; any other failure trips the assertion. */
void writebuf(int fd, const char *buf, size_t len)
{
    while (len > 0) {
        ssize_t written = write(fd, buf, len);

        /* Skip the bookkeeping only for an interrupted call, which is
         * retried with the same arguments on the next pass. */
        if (written >= 0 || errno != EINTR) {
            g_assert(written >= 0);
            buf += written;
            len -= written;
        }
    }
}
116 /************************ Direct-to-filesystem logging ***********************/
117 static int dir_fd = -1;
119 gpointer fslog_thread(gpointer d)
122 struct item *item = get_item();
124 int fd = openat(dir_fd, item->key, O_CREAT|O_WRONLY|O_TRUNC, 0666);
127 writebuf(fd, item->data, item->len);
141 dir_fd = open(".", O_DIRECTORY);
142 g_assert(dir_fd >= 0);
144 for (int i = 0; i < 1; i++)
145 g_thread_create(fslog_thread, NULL, FALSE, NULL);
148 /****************************** Single-File Log ******************************/
149 gpointer flatlog_thread(gpointer d)
151 int fd = open("logfile", O_CREAT|O_WRONLY|O_TRUNC, 0666);
157 struct item *item = get_item();
159 writebuf(fd, item->key, strlen(item->key) + 1);
160 writebuf(fd, (char *)&item->len, sizeof(item->len));
161 writebuf(fd, item->data, item->len);
164 if (count % opt_batchsize == 0)
173 void launch_flatlog()
175 g_thread_create(flatlog_thread, NULL, FALSE, NULL);
178 /************************* Transactional Berkeley DB *************************/
179 gpointer bdb_thread(gpointer d)
187 res = db_env_create(&env, 0);
190 res = env->open(env, ".",
191 DB_CREATE | DB_RECOVER | DB_INIT_LOCK | DB_INIT_LOG
192 | DB_INIT_MPOOL | DB_INIT_TXN | DB_THREAD, 0644);
196 res = env->set_flags(env, DB_TXN_WRITE_NOSYNC, 1);
200 res = db_create(&db, env, 0);
203 res = db->open(db, NULL, "log.db", "log", DB_BTREE,
204 DB_CREATE | DB_THREAD | DB_AUTO_COMMIT, 0644);
208 if (txn == NULL && !opt_bdb_async) {
209 res = env->txn_begin(env, NULL, &txn, 0);
213 struct item *item = get_item();
216 memset(&key, 0, sizeof(key));
217 memset(&value, 0, sizeof(value));
219 key.data = item->key;
220 key.size = strlen(item->key);
222 value.data = item->data;
223 value.size = item->len;
225 res = db->put(db, opt_bdb_async ? NULL : txn, &key, &value, 0);
229 if (count % opt_batchsize == 0) {
231 env->txn_checkpoint(env, 0, 0, 0);
246 g_thread_create(bdb_thread, NULL, FALSE, NULL);
249 int main(int argc, char *argv[])
251 int64_t time_start, time_end;
254 queue = g_async_queue_new();
255 lock = g_mutex_new();
256 cond_empty = g_cond_new();
257 cond_full = g_cond_new();
261 while ((opt = getopt(argc, argv, "at:s:b:n:BFD")) != -1) {
264 // Make BDB log writes more asynchronous
265 opt_bdb_async = TRUE;
268 // Set number of log worker threads
269 opt_threads = atoi(optarg);
272 // Set item size (in bytes)
273 item_size = atoi(optarg);
277 opt_batchsize = atoi(optarg);
281 opt_writes = atoi(optarg);
284 // Select BDB backend
288 // Select flat file backend
292 // Select file system directory backend
296 fprintf(stderr, "Usage: %s [-t threads] {-B|-F|-D}\n",
313 fprintf(stderr, "Backend not selected!\n");
317 time_start = get_ns();
318 for (int i = 0; i < opt_writes; i++) {
319 struct item *item = g_new(struct item, 1);
320 item->key = g_strdup_printf("item-%06d", i);
321 item->data = g_malloc(item_size);
322 item->len = item_size;
325 g_async_queue_push(queue, item);
327 if (outstanding == opt_batchsize)
328 g_cond_wait(cond_empty, lock);
329 g_mutex_unlock(lock);
333 while (outstanding > 0)
334 g_cond_wait(cond_empty, lock);
335 g_mutex_unlock(lock);
338 double elapsed = (time_end - time_start) / 1e9;
339 printf("Elapsed: %f s\nThroughput: %f txn/s, %f MiB/s\n",
340 elapsed, opt_writes / elapsed,
341 opt_writes / elapsed * item_size / (1 << 20));
343 if (backend == 'b' && opt_bdb_async)
346 FILE *f = fopen("../logbench.data", "a");
348 fprintf(f, "%c\t%d\t%d\t%d\t%f\t%f\t%f\n",
349 backend, item_size, opt_writes, opt_batchsize,
350 elapsed, opt_writes / elapsed,
351 opt_writes / elapsed * item_size / (1 << 20));