Return the names for allocated objects, and link file metadata to data.

author Michael Vrable <mvrable@cs.ucsd.edu>

Thu, 11 Jan 2007 22:47:41 +0000 (14:47 -0800)

committer Michael Vrable <mvrable@beleg.ucsd.edu>

Thu, 11 Jan 2007 22:47:41 +0000 (14:47 -0800)
author Michael Vrable <mvrable@cs.ucsd.edu>
Thu, 11 Jan 2007 22:47:41 +0000 (14:47 -0800)
committer Michael Vrable <mvrable@beleg.ucsd.edu>
Thu, 11 Jan 2007 22:47:41 +0000 (14:47 -0800)
diff --git a/scandir.cc b/scandir.cc

index 043d35e..c2a093f 100644 (file)
--- a/scandir.cc
+++ b/scandir.cc
@@ -25,6 +25,10 @@ static OutputStream *info_dump = NULL;
  
  static SegmentPartitioner *index_segment, *data_segment;
  
+/* Buffer for holding a single block of data read from a file. */
+static const int LBS_BLOCK_SIZE = 1024 * 1024;
+static char *block_buf;
+
  void scandir(const string& path);
  
  /* Converts time to microseconds since the epoch. */
@@ -33,39 +37,75 @@ int64_t encode_time(time_t time)
      return (int64_t)time * 1000000;
  }
  
+/* Read data from a file descriptor and return the amount of data read.  A
+ * short read (less than the requested size) will only occur if end-of-file is
+ * hit. */
+size_t file_read(int fd, char *buf, size_t maxlen)
+{
+    size_t bytes_read = 0;
+
+    while (true) {
+        ssize_t res = read(fd, buf, maxlen);
+        if (res < 0) {
+            if (errno == EINTR)
+                continue;
+            throw IOException("file_read: error reading");
+        } else if (res == 0) {
+            break;
+        } else {
+            bytes_read += res;
+            buf += res;
+            maxlen -= res;
+        }
+    }
+
+    return bytes_read;
+}
+
+/* Read the contents of a file (specified by an open file descriptor) and copy
+ * the data to the store. */
  void dumpfile(int fd, dictionary &file_info)
  {
      struct stat stat_buf;
      fstat(fd, &stat_buf);
      int64_t size = 0;
  
-    char buf[4096];
-
      if ((stat_buf.st_mode & S_IFMT) != S_IFREG) {
          printf("file is no longer a regular file!\n");
          return;
      }
  
+    /* The index data consists of a sequence of pointers to the data blocks
+     * that actually comprise the file data.  This level of indirection is used
+     * so that the same data block can be used in multiple files, or multiple
+     * versions of the same file. */
+    struct uuid segment_uuid;
+    int object_id;
+    OutputStream *index_data = index_segment->new_object(&segment_uuid,
+                                                         &object_id);
+
      SHA1Checksum hash;
      while (true) {
-        ssize_t res = read(fd, buf, sizeof(buf));
-        if (res < 0) {
-            if (errno == EINTR)
-                continue;
-            printf("Error while reading: %m\n");
-            return;
-        } else if (res == 0) {
+        struct uuid block_segment_uuid;
+        int block_object_id;
+
+        size_t bytes = file_read(fd, block_buf, LBS_BLOCK_SIZE);
+        if (bytes == 0)
              break;
-        } else {
-            hash.process(buf, res);
-            OutputStream *block = data_segment->new_object();
-            block->write(buf, res);
-            size += res;
-        }
+
+        hash.process(block_buf, bytes);
+        OutputStream *block = data_segment->new_object(&block_segment_uuid,
+                                                       &block_object_id);
+        block->write(block_buf, bytes);
+        index_data->write_uuid(block_segment_uuid);
+        index_data->write_u32(block_object_id);
+
+        size += bytes;
      }
  
      file_info["sha1"] = string((const char *)hash.checksum(),
                                 hash.checksum_size());
+    file_info["data"] = encode_objref(segment_uuid, object_id);
  }
  
  void scanfile(const string& path)
@@ -206,9 +246,11 @@ void scandir(const string& path)
  
  int main(int argc, char *argv[])
  {
+    block_buf = new char[LBS_BLOCK_SIZE];
+
      segment_store = new SegmentStore(".");
      SegmentWriter *sw = segment_store->new_segment();
-    info_dump = sw->new_object();
+    info_dump = sw->new_object(NULL);
  
      index_segment = new SegmentPartitioner(segment_store);
      data_segment = new SegmentPartitioner(segment_store);
diff --git a/store.cc b/store.cc

index 06e9453..e7c373c 100644 (file)
--- a/store.cc
+++ b/store.cc
@@ -77,6 +77,11 @@ void OutputStream::write_varint(uint64_t val)
      } while (val);
  }
  
+void OutputStream::write_uuid(const struct uuid &u)
+{
+    write(u.bytes, 16);
+}
+
  /* Write an arbitrary string by first writing out the length, followed by the
   * data itself. */
  void OutputStream::write_string(const string &s)
@@ -183,6 +188,14 @@ string encode_u64(uint64_t val)
      return s.contents();
  }
  
+string encode_objref(const struct uuid &segment, uint32_t object)
+{
+    StringOutputStream s;
+    s.write_uuid(segment);
+    s.write_u32(object);
+    return s.contents();
+}
+
  SegmentWriter::SegmentWriter(OutputStream *output, struct uuid u)
      : raw_out(output),
        id(u),
@@ -195,7 +208,7 @@ SegmentWriter::SegmentWriter(OutputStream *output, struct uuid u)
      /* Write out the segment header first. */
      static const char signature[] = "LBSSEG0\n";
      out->write(signature, strlen(signature));
-    out->write(id.bytes, sizeof(struct uuid));
+    out->write_uuid(id);
  }
  
  SegmentWriter::~SegmentWriter()
@@ -230,7 +243,7 @@ SegmentWriter::~SegmentWriter()
      delete raw_out;
  }
  
-OutputStream *SegmentWriter::new_object()
+OutputStream *SegmentWriter::new_object(int *id)
  {
      if (object_stream)
          finish_object();
@@ -238,6 +251,10 @@ OutputStream *SegmentWriter::new_object()
      object_start_offset = out->get_pos();
      object_stream = new WrapperOutputStream(*out);
  
+    if (id != NULL) {
+        *id = objects.size();
+    }
+
      return object_stream;
  }
  
@@ -304,7 +321,7 @@ SegmentPartitioner::~SegmentPartitioner()
          delete segment;
  }
  
-OutputStream *SegmentPartitioner::new_object()
+OutputStream *SegmentPartitioner::new_object(struct uuid *uuid, int *id)
  {
      if (segment != NULL && segment->get_size() > target_size) {
          delete segment;
@@ -314,5 +331,8 @@ OutputStream *SegmentPartitioner::new_object()
      if (segment == NULL)
          segment = store->new_segment();
  
-    return segment->new_object();
+    if (uuid != NULL)
+        *uuid = segment->get_uuid();
+
+    return segment->new_object(id);
  }
diff --git a/store.h b/store.h

index 7629673..e1244f0 100644 (file)
--- a/store.h
+++ b/store.h
@@ -22,6 +22,12 @@
   * metadata.  Currently implemented as map<string, string>. */
  typedef std::map<std::string, std::string> dictionary;
  
+/* In-memory representation of a UUID (Universally-Unique Identifier), which is
+ * used to name a segment. */
+struct uuid {
+    uint8_t bytes[16];
+};
+
  /* IOException will be thrown if an error occurs while reading or writing in
   * one of the I/O wrappers.  Depending upon the context; this may be fatal or
   * not--typically, errors reading/writing the store will be serious, but errors
@@ -61,6 +67,7 @@ public:
  
      void write_varint(uint64_t val);
  
+    void write_uuid(const struct uuid &u);
      void write_string(const std::string &s);
      void write_dictionary(const dictionary &d);
  
@@ -139,10 +146,7 @@ private:
  std::string encode_u16(uint16_t val);
  std::string encode_u32(uint32_t val);
  std::string encode_u64(uint64_t val);
-
-struct uuid {
-    uint8_t bytes[16];
-};
+std::string encode_objref(const struct uuid &segment, uint32_t object);
  
  /* A class which is used to pack multiple objects into a single segment, with a
   * lookup table to quickly locate each object.  Call new_object() to get an
@@ -158,7 +162,7 @@ public:
      struct uuid get_uuid() const { return id; }
  
      // Start writing out a new object to this segment.
-    OutputStream *new_object();
+    OutputStream *new_object(int *id);
      void finish_object();
  
      // Determine size of segment data written out so far.
@@ -205,7 +209,7 @@ public:
      explicit SegmentPartitioner(SegmentStore *s);
      ~SegmentPartitioner();
  
-    OutputStream *new_object();
+    OutputStream *new_object(struct uuid *uuid, int *id);
  
  private:
      size_t target_size;
author	Michael Vrable <mvrable@cs.ucsd.edu>
	Thu, 11 Jan 2007 22:47:41 +0000 (14:47 -0800)
committer	Michael Vrable <mvrable@beleg.ucsd.edu>
	Thu, 11 Jan 2007 22:47:41 +0000 (14:47 -0800)
scandir.cc		patch \| blob \| history
store.cc		patch \| blob \| history
store.h		patch \| blob \| history