From 7680a5bda8a87e1f5d02ea07ef767878e3bb143c Mon Sep 17 00:00:00 2001 From: Michael Vrable Date: Thu, 11 Jan 2007 14:47:41 -0800 Subject: [PATCH] Return the names for allocated objects, and link file metadata to data. Extend the interface for the new_object methods so that they can return the segment UUID and object index for created objects. Then, use this to write out links from the file metadata to the data blocks making up the file, via an indirect block. --- scandir.cc | 74 ++++++++++++++++++++++++++++++++++++++++++------------ store.cc | 28 ++++++++++++++++++--- store.h | 16 +++++++----- 3 files changed, 92 insertions(+), 26 deletions(-) diff --git a/scandir.cc b/scandir.cc index 043d35e..c2a093f 100644 --- a/scandir.cc +++ b/scandir.cc @@ -25,6 +25,10 @@ static OutputStream *info_dump = NULL; static SegmentPartitioner *index_segment, *data_segment; +/* Buffer for holding a single block of data read from a file. */ +static const int LBS_BLOCK_SIZE = 1024 * 1024; +static char *block_buf; + void scandir(const string& path); /* Converts time to microseconds since the epoch. */ @@ -33,39 +37,75 @@ int64_t encode_time(time_t time) return (int64_t)time * 1000000; } +/* Read data from a file descriptor and return the amount of data read. A + * short read (less than the requested size) will only occur if end-of-file is + * hit. */ +size_t file_read(int fd, char *buf, size_t maxlen) +{ + size_t bytes_read = 0; + + while (true) { + ssize_t res = read(fd, buf, maxlen); + if (res < 0) { + if (errno == EINTR) + continue; + throw IOException("file_read: error reading"); + } else if (res == 0) { + break; + } else { + bytes_read += res; + buf += res; + maxlen -= res; + } + } + + return bytes_read; +} + +/* Read the contents of a file (specified by an open file descriptor) and copy + * the data to the store. 
*/ void dumpfile(int fd, dictionary &file_info) { struct stat stat_buf; fstat(fd, &stat_buf); int64_t size = 0; - char buf[4096]; - if ((stat_buf.st_mode & S_IFMT) != S_IFREG) { printf("file is no longer a regular file!\n"); return; } + /* The index data consists of a sequence of pointers to the data blocks + * that actually comprise the file data. This level of indirection is used + * so that the same data block can be used in multiple files, or multiple + * versions of the same file. */ + struct uuid segment_uuid; + int object_id; + OutputStream *index_data = index_segment->new_object(&segment_uuid, + &object_id); + SHA1Checksum hash; while (true) { - ssize_t res = read(fd, buf, sizeof(buf)); - if (res < 0) { - if (errno == EINTR) - continue; - printf("Error while reading: %m\n"); - return; - } else if (res == 0) { + struct uuid block_segment_uuid; + int block_object_id; + + size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE); + if (bytes == 0) break; - } else { - hash.process(buf, res); - OutputStream *block = data_segment->new_object(); - block->write(buf, res); - size += res; - } + + hash.process(block_buf, bytes); + OutputStream *block = data_segment->new_object(&block_segment_uuid, + &block_object_id); + block->write(block_buf, bytes); + index_data->write_uuid(block_segment_uuid); + index_data->write_u32(block_object_id); + + size += bytes; } file_info["sha1"] = string((const char *)hash.checksum(), hash.checksum_size()); + file_info["data"] = encode_objref(segment_uuid, object_id); } void scanfile(const string& path) @@ -206,9 +246,11 @@ void scandir(const string& path) int main(int argc, char *argv[]) { + block_buf = new char[LBS_BLOCK_SIZE]; + segment_store = new SegmentStore("."); SegmentWriter *sw = segment_store->new_segment(); - info_dump = sw->new_object(); + info_dump = sw->new_object(NULL); index_segment = new SegmentPartitioner(segment_store); data_segment = new SegmentPartitioner(segment_store); diff --git a/store.cc b/store.cc index 
06e9453..e7c373c 100644 --- a/store.cc +++ b/store.cc @@ -77,6 +77,11 @@ void OutputStream::write_varint(uint64_t val) } while (val); } +void OutputStream::write_uuid(const struct uuid &u) +{ + write(u.bytes, 16); +} + /* Write an arbitrary string by first writing out the length, followed by the * data itself. */ void OutputStream::write_string(const string &s) @@ -183,6 +188,14 @@ string encode_u64(uint64_t val) return s.contents(); } +string encode_objref(const struct uuid &segment, uint32_t object) +{ + StringOutputStream s; + s.write_uuid(segment); + s.write_u32(object); + return s.contents(); +} + SegmentWriter::SegmentWriter(OutputStream *output, struct uuid u) : raw_out(output), id(u), @@ -195,7 +208,7 @@ SegmentWriter::SegmentWriter(OutputStream *output, struct uuid u) /* Write out the segment header first. */ static const char signature[] = "LBSSEG0\n"; out->write(signature, strlen(signature)); - out->write(id.bytes, sizeof(struct uuid)); + out->write_uuid(id); } SegmentWriter::~SegmentWriter() @@ -230,7 +243,7 @@ SegmentWriter::~SegmentWriter() delete raw_out; } -OutputStream *SegmentWriter::new_object() +OutputStream *SegmentWriter::new_object(int *id) { if (object_stream) finish_object(); @@ -238,6 +251,10 @@ OutputStream *SegmentWriter::new_object() object_start_offset = out->get_pos(); object_stream = new WrapperOutputStream(*out); + if (id != NULL) { + *id = objects.size(); + } + return object_stream; } @@ -304,7 +321,7 @@ SegmentPartitioner::~SegmentPartitioner() delete segment; } -OutputStream *SegmentPartitioner::new_object() +OutputStream *SegmentPartitioner::new_object(struct uuid *uuid, int *id) { if (segment != NULL && segment->get_size() > target_size) { delete segment; @@ -314,5 +331,8 @@ OutputStream *SegmentPartitioner::new_object() if (segment == NULL) segment = store->new_segment(); - return segment->new_object(); + if (uuid != NULL) + *uuid = segment->get_uuid(); + + return segment->new_object(id); } diff --git a/store.h b/store.h index 
7629673..e1244f0 100644 --- a/store.h +++ b/store.h @@ -22,6 +22,12 @@ * metadata. Currently implemented as map. */ typedef std::map dictionary; +/* In-memory representation of a UUID (Universally-Unique Identifier), which is + * used to name a segment. */ +struct uuid { + uint8_t bytes[16]; +}; + /* IOException will be thrown if an error occurs while reading or writing in * one of the I/O wrappers. Depending upon the context; this may be fatal or * not--typically, errors reading/writing the store will be serious, but errors @@ -61,6 +67,7 @@ public: void write_varint(uint64_t val); + void write_uuid(const struct uuid &u); void write_string(const std::string &s); void write_dictionary(const dictionary &d); @@ -139,10 +146,7 @@ private: std::string encode_u16(uint16_t val); std::string encode_u32(uint32_t val); std::string encode_u64(uint64_t val); - -struct uuid { - uint8_t bytes[16]; -}; +std::string encode_objref(const struct uuid &segment, uint32_t object); /* A class which is used to pack multiple objects into a single segment, with a * lookup table to quickly locate each object. Call new_object() to get an @@ -158,7 +162,7 @@ public: struct uuid get_uuid() const { return id; } // Start writing out a new object to this segment. - OutputStream *new_object(); + OutputStream *new_object(int *id); void finish_object(); // Determine size of segment data written out so far. @@ -205,7 +209,7 @@ public: explicit SegmentPartitioner(SegmentStore *s); ~SegmentPartitioner(); - OutputStream *new_object(); + OutputStream *new_object(struct uuid *uuid, int *id); private: size_t target_size; -- 2.20.1