Clean up used-data tracking in localdb.

[cumulus.git] / schema.sql
diff --git a/schema.sql b/schema.sql

index a37b501..465dde2 100644 (file)
--- a/schema.sql
+++ b/schema.sql
@@ -3,23 +3,41 @@
  --
  -- The index is stored in an SQLite3 database.  This is its schema.
  
--- List of snapshots which have been created.
+-- Versioning information, describing the revision for which the table schema
+-- was set up.  The row inserted below records schema revision 0.11.
+create table schema_version(
+    version text,               -- Program version, dotted decimal string
+    major integer,              -- Major version number (0 for '0.11')
+    minor integer               -- Minor version number (11 for '0.11')
+);
+insert into schema_version (version, major, minor) values ('0.11', 0, 11);
+
+-- List of snapshots which have been created and which we are still tracking.
+-- There may be more snapshots than this actually stored at the remote server,
+-- but the reverse should not ever be true: Cumulus may depend on data stored
+-- in these snapshots when writing a new snapshot.
  create table snapshots (
      snapshotid integer primary key,
      name text not null,
-    scheme text,
-    timestamp real
+    scheme text not null,
+    timestamp real,
+    intent real                 -- TODO: deprecated, should be removed
  );
  
  -- List of segments which have been created.
  create table segments (
      segmentid integer primary key,
      segment text unique not null,
+    mtime real,                 -- timestamp when segment was created
      path text,
-    checksum text
+    checksum text,
+    data_size integer,          -- sum of bytes in all objects in the segment
+    disk_size integer,          -- size of segment on disk, after compression
+    type text
  );
  
--- Index of all blocks which have been stored in a snapshot, by checksum.
+-- Index of all data blocks in stored segments.  This is indexed by content
+-- hash to allow for coarse block-level data deduplication.
  create table block_index (
      blockid integer primary key,
      segmentid integer not null,
@@ -32,28 +50,35 @@ create table block_index (
  create index block_content_index on block_index(checksum);
  create unique index block_name_index on block_index(segmentid, object);
  
--- Index tracking which blocks are used by which snapshots.
-create table snapshot_contents (
-    blockid integer,
-    snapshotid integer
+-- Checksums for the decomposition of blocks into even smaller chunks
+-- (variable-sized, but generally ~4 kB, and maximum 64 kB).  Chunk boundaries
+-- are determined based on the contents using Rabin fingerprints.  These
+-- checksums can be used for computing sub-file incrementals.
+--
+-- Each block stored in block_index may have an entry in the
+-- subblock_signatures table.  The signatures field is a binary blob consisting
+-- of a packed sequence of (chunk length [16-bit unsigned, big-endian],
+-- checksum [20 bytes if SHA-1]) tuples that should cover the entire block.
+--
+-- algorithm specifies the method used for computing break points as well as
+-- the hash function used, so that signatures can be discarded if the algorithm
+-- changes.  The current algorithm used is 'lbfs-4096/sha1', which specifies a
+-- target 4 kB block size with parameters set to match LBFS, and SHA-1 as the
+-- hash algorithm.
+create table subblock_signatures (
+    blockid integer primary key,
+    algorithm text not null,
+    signatures blob not null
  );
-create unique index snapshot_contents_unique
-    on snapshot_contents(blockid, snapshotid);
  
--- Summary statistics for each segment.
-create view segment_info as select * from
-    (select segmentid, max(timestamp) as mtime,
-            sum(size) as size, count(*) as objects
-       from block_index natural join segments group by segmentid)
-natural join
-    (select segmentid, sum(size) as used, count(*) as objects_used
-       from block_index where blockid in
-            (select blockid from snapshot_contents) group by segmentid);
+-- Summary of segment utilization for each snapshot.
+create table segment_utilization (
+    snapshotid integer not null,
+    segmentid integer not null,
  
--- Ranking of segments to be cleaned, using a benefit function of
--- (fraction free space)*(age of youngest object).
-create view cleaning_order as select *, (1-u)*age/(u+0.1) as benefit from
-    (select segmentid,
-            cast(used as real) / size as u, julianday('now') - mtime as age
-        from segment_info)
-where benefit > 0;
+    -- Estimate for the number of live bytes in data objects: this is capped at
+    -- segments.data_size if all data in the segment is referenced.
+    bytes_referenced integer
+);
+create unique index segment_utilization_index
+    on segment_utilization(snapshotid, segmentid);