#include <fcntl.h>
#include <time.h>
+#include <algorithm>
#include <list>
+#include <map>
#include <set>
#include <string>
#include <iostream>
#include "store.h"
#include "ref.h"
+using std::max;
using std::list;
+using std::map;
using std::set;
using std::string;
-static char *const filter_program[] = {"bzip2", "-c", NULL};
+/* Default filter program is bzip2 */
+const char *filter_program = "bzip2 -c";
+const char *filter_extension = ".bz2";
static void cloexec(int fd)
{
: size(0),
segment_name(segment)
{
- real_fd = open(path.c_str(), O_WRONLY | O_CREAT, 0600);
+ real_fd = open(path.c_str(), O_WRONLY | O_CREAT, 0666);
if (real_fd < 0)
throw IOException("Error opening output file");
filter_fd = spawn_filter(real_fd);
if (tar_fdopen(&t, filter_fd, (char *)path.c_str(), NULL,
- O_WRONLY | O_CREAT, 0600, TAR_VERBOSE | TAR_GNU) == -1)
+ O_WRONLY | O_CREAT, 0666, TAR_VERBOSE | TAR_GNU) == -1)
throw IOException("Error opening Tarfile");
}
close(fd_out);
/* Exec the filter program. */
- execvp(filter_program[0], filter_program);
+ execlp("/bin/sh", "/bin/sh", "-c", filter_program, NULL);
/* Should not reach here except for error cases. */
fprintf(stderr, "Could not exec filter: %m\n");
}
/* Estimate the size based on the size of the actual output file on disk.
- * However, the filter may not have written all data yet, and in the event that
- * it is buffering data to a large extent, also use */
+ * However, the filter program may be buffering all of its data, and might
+ * not write a single byte until we have closed our end of the pipe. If we
+ * waited to see data written to disk before closing the pipe, we could wait
+ * indefinitely. So, arbitrarily pick an upper bound on the compression ratio
+ * that the filter will achieve (128:1), and return a size estimate which is
+ * the larger of a) bytes actually seen written to disk, and b) input
+ * bytes/128. */
size_t Tarfile::size_estimate()
{
struct stat statbuf;
- if (fstat(real_fd, &statbuf) == 0) {
- size_t disk_size = statbuf.st_size;
-
- if (disk_size >= size / 128)
- return disk_size;
- }
+ if (fstat(real_fd, &statbuf) == 0)
+ return max((int64_t)statbuf.st_size, (int64_t)(size / 128));
+ /* fstat() on real_fd failed, so there is no on-disk size to compare
+ * against: fall back to size, the number of bytes before compression.
+ * (When fstat() succeeds, the larger of the on-disk size and size / 128
+ * has already been returned above.) */
return size;
}
static const size_t SEGMENT_SIZE = 4 * 1024 * 1024;
+static map<string, int64_t> group_sizes;
+
ObjectReference TarSegmentStore::write_object(const char *data, size_t len,
const std::string &group)
{
segment->name = generate_uuid();
- string filename = path + "/" + segment->name + ".tar.bz2";
+ string filename = path + "/" + segment->name + ".tar";
+ filename += filter_extension;
segment->file = new Tarfile(filename, segment->name);
segment->count = 0;
segment->file->write_object(id, data, len);
segment->count++;
+ group_sizes[group] += len;
+
ObjectReference ref(segment->name, id_buf);
// If this segment meets or exceeds the size target, close it so that
close_segment(segments.begin()->first);
}
+/* Print, to standard output, the per-group byte totals accumulated in
+ * group_sizes (each write_object() call adds its len to its group's entry). */
+void TarSegmentStore::dump_stats()
+{
+    printf("Data written:\n");
+    for (map<string, int64_t>::iterator i = group_sizes.begin();
+         i != group_sizes.end(); ++i) {
+        /* int64_t is not guaranteed to be long long (it is long on LP64
+         * platforms), so cast explicitly to match the %lld conversion. */
+        printf(" %s: %lld\n", i->first.c_str(), (long long)i->second);
+    }
+}
+
void TarSegmentStore::close_segment(const string &group)
{
struct segment_info *segment = segments[group];