From: Michael Vrable Date: Sat, 23 Dec 2006 20:46:19 +0000 (-0800) Subject: Compute SHA-1 checksums of regular files to be stored with index data. X-Git-Url: https://git.vrable.net/?a=commitdiff_plain;h=948051f02c9bc3285ce1ea187dbae27120dbf7aa;p=cumulus.git Compute SHA-1 checksums of regular files to be stored with index data. --- diff --git a/Makefile b/Makefile index 45fb135..1822adc 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CXXFLAGS=-O -Wall -D_FILE_OFFSET_BITS=64 -g LDFLAGS=-g -OBJS=scandir.o store.o +OBJS=scandir.o sha1.o store.o scandir : $(OBJS) $(CXX) $(LDFLAGS) -o $@ $^ diff --git a/scandir.cc b/scandir.cc index 3e004fd..81ddd4b 100644 --- a/scandir.cc +++ b/scandir.cc @@ -15,6 +15,7 @@ #include #include "store.h" +#include "sha1.h" using std::string; using std::vector; @@ -29,7 +30,7 @@ int64_t encode_time(time_t time) return (int64_t)time * 1000000; } -void dumpfile(int fd) +void dumpfile(int fd, dictionary &file_info) { struct stat stat_buf; fstat(fd, &stat_buf); @@ -42,6 +43,7 @@ void dumpfile(int fd) return; } + SHA1Checksum hash; while (true) { ssize_t res = read(fd, buf, sizeof(buf)); if (res < 0) { @@ -52,11 +54,13 @@ void dumpfile(int fd) } else if (res == 0) { break; } else { + hash.process(buf, res); size += res; } } - printf(" bytes=%Ld\n", size); + file_info["sha1"] = string((const char *)hash.checksum(), + hash.checksum_size()); } void scanfile(const string& path) @@ -137,8 +141,8 @@ void scanfile(const string& path) flags = fcntl(fd, F_GETFL); fcntl(fd, F_SETFL, flags & ~O_NONBLOCK); - //dumpfile(fd); file_info["size"] = encode_u64(stat_buf.st_size); + dumpfile(fd, file_info); close(fd); break; diff --git a/sha1.cc b/sha1.cc new file mode 100644 index 0000000..1fc4ad7 --- /dev/null +++ b/sha1.cc @@ -0,0 +1,351 @@ +/* sha1.cc - Functions to compute SHA1 message digest of data streams + according to the NIST specification FIPS-180-1. + + Copyright (C) 2000, 2001, 2003, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2006 Michael Vrable + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* Written by Scott G. Miller + Credits: + Robert Klep -- Expansion function fix + + Modified by Michael Vrable to simplify the interface + and add an object-oriented wrapper. Original code (in C) taken from GNU + coreutils (Debian package 5.97-5). +*/ + +#include "sha1.h" + +#include +#include +#include + +/* SWAP does an endian swap on architectures that are little-endian, + as SHA1 needs some data in a big-endian form. */ +#define SWAP(n) htonl(n) + +#define BLOCKSIZE 4096 +#if BLOCKSIZE % 64 != 0 +# error "invalid BLOCKSIZE" +#endif + +/* This array contains the bytes used to pad the buffer to the next + 64-byte boundary. (RFC 1321, 3.1: Step 1) */ +static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ... */ }; + + +/* + Takes a pointer to a 160 bit block of data (five 32 bit ints) and + intializes it to the start constants of the SHA1 algorithm. This + must be called before using hash in the call to sha1_hash. +*/ +void +sha1_init_ctx (struct sha1_ctx *ctx) +{ + ctx->A = 0x67452301; + ctx->B = 0xefcdab89; + ctx->C = 0x98badcfe; + ctx->D = 0x10325476; + ctx->E = 0xc3d2e1f0; + + ctx->total[0] = ctx->total[1] = 0; + ctx->buflen = 0; +} + +/* Put result from CTX in first 20 bytes following RESBUF. The result + must be in little endian byte order. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32 bits value. */ +void * +sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf) +{ + ((md5_uint32 *) resbuf)[0] = SWAP (ctx->A); + ((md5_uint32 *) resbuf)[1] = SWAP (ctx->B); + ((md5_uint32 *) resbuf)[2] = SWAP (ctx->C); + ((md5_uint32 *) resbuf)[3] = SWAP (ctx->D); + ((md5_uint32 *) resbuf)[4] = SWAP (ctx->E); + + return resbuf; +} + +/* Process the remaining bytes in the internal buffer and the usual + prolog according to the standard and write the result to RESBUF. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32 bits value. */ +void * +sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf) +{ + /* Take yet unprocessed bytes into account. */ + md5_uint32 bytes = ctx->buflen; + size_t pad; + + /* Now count remaining bytes. */ + ctx->total[0] += bytes; + if (ctx->total[0] < bytes) + ++ctx->total[1]; + + pad = bytes >= 56 ? 64 + 56 - bytes : 56 - bytes; + memcpy (&ctx->buffer[bytes], fillbuf, pad); + + /* Put the 64-bit file length in *bits* at the end of the buffer. */ + *(md5_uint32 *) &ctx->buffer[bytes + pad + 4] = SWAP (ctx->total[0] << 3); + *(md5_uint32 *) &ctx->buffer[bytes + pad] = SWAP ((ctx->total[1] << 3) | + (ctx->total[0] >> 29)); + + /* Process last bytes. */ + sha1_process_block (ctx->buffer, bytes + pad + 8, ctx); + + return sha1_read_ctx (ctx, resbuf); +} + +void +sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx) +{ + /* When we already have some bits in our internal buffer concatenate + both inputs first. */ + if (ctx->buflen != 0) + { + size_t left_over = ctx->buflen; + size_t add = 128 - left_over > len ? len : 128 - left_over; + + memcpy (&ctx->buffer[left_over], buffer, add); + ctx->buflen += add; + + if (ctx->buflen > 64) + { + sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx); + + ctx->buflen &= 63; + /* The regions in the following copy operation cannot overlap. */ + memcpy (ctx->buffer, &ctx->buffer[(left_over + add) & ~63], + ctx->buflen); + } + + buffer = (const char *) buffer + add; + len -= add; + } + + /* Process available complete blocks. */ + if (len >= 64) + { +#if !_STRING_ARCH_unaligned +# define alignof(type) offsetof (struct { char c; type x; }, x) +# define UNALIGNED_P(p) (((size_t) p) % alignof (md5_uint32) != 0) + if (UNALIGNED_P (buffer)) + while (len > 64) + { + sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx); + buffer = (const char *) buffer + 64; + len -= 64; + } + else +#endif + { + sha1_process_block (buffer, len & ~63, ctx); + buffer = (const char *) buffer + (len & ~63); + len &= 63; + } + } + + /* Move remaining bytes in internal buffer. */ + if (len > 0) + { + size_t left_over = ctx->buflen; + + memcpy (&ctx->buffer[left_over], buffer, len); + left_over += len; + if (left_over >= 64) + { + sha1_process_block (ctx->buffer, 64, ctx); + left_over -= 64; + memcpy (ctx->buffer, &ctx->buffer[64], left_over); + } + ctx->buflen = left_over; + } +} + +/* --- Code below is the primary difference between md5.c and sha1.c --- */ + +/* SHA1 round constants */ +#define K1 0x5a827999L +#define K2 0x6ed9eba1L +#define K3 0x8f1bbcdcL +#define K4 0xca62c1d6L + +/* Round functions. Note that F2 is the same as F4. */ +#define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) ) +#define F2(B,C,D) (B ^ C ^ D) +#define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) ) +#define F4(B,C,D) (B ^ C ^ D) + +/* Process LEN bytes of BUFFER, accumulating context into CTX. + It is assumed that LEN % 64 == 0. + Most of this code comes from GnuPG's cipher/sha1.c. */ + +void +sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx) +{ + const md5_uint32 *words = (const md5_uint32 *)buffer; + size_t nwords = len / sizeof (md5_uint32); + const md5_uint32 *endp = words + nwords; + md5_uint32 x[16]; + md5_uint32 a = ctx->A; + md5_uint32 b = ctx->B; + md5_uint32 c = ctx->C; + md5_uint32 d = ctx->D; + md5_uint32 e = ctx->E; + + /* First increment the byte count. RFC 1321 specifies the possible + length of the file up to 2^64 bits. Here we only compute the + number of bytes. Do a double word increment. */ + ctx->total[0] += len; + if (ctx->total[0] < len) + ++ctx->total[1]; + +#define rol(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) + +#define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \ + ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \ + , (x[I&0x0f] = rol(tm, 1)) ) + +#define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \ + + F( B, C, D ) \ + + K \ + + M; \ + B = rol( B, 30 ); \ + } while(0) + + while (words < endp) + { + md5_uint32 tm; + int t; + for (t = 0; t < 16; t++) + { + x[t] = SWAP (*words); + words++; + } + + R( a, b, c, d, e, F1, K1, x[ 0] ); + R( e, a, b, c, d, F1, K1, x[ 1] ); + R( d, e, a, b, c, F1, K1, x[ 2] ); + R( c, d, e, a, b, F1, K1, x[ 3] ); + R( b, c, d, e, a, F1, K1, x[ 4] ); + R( a, b, c, d, e, F1, K1, x[ 5] ); + R( e, a, b, c, d, F1, K1, x[ 6] ); + R( d, e, a, b, c, F1, K1, x[ 7] ); + R( c, d, e, a, b, F1, K1, x[ 8] ); + R( b, c, d, e, a, F1, K1, x[ 9] ); + R( a, b, c, d, e, F1, K1, x[10] ); + R( e, a, b, c, d, F1, K1, x[11] ); + R( d, e, a, b, c, F1, K1, x[12] ); + R( c, d, e, a, b, F1, K1, x[13] ); + R( b, c, d, e, a, F1, K1, x[14] ); + R( a, b, c, d, e, F1, K1, x[15] ); + R( e, a, b, c, d, F1, K1, M(16) ); + R( d, e, a, b, c, F1, K1, M(17) ); + R( c, d, e, a, b, F1, K1, M(18) ); + R( b, c, d, e, a, F1, K1, M(19) ); + R( a, b, c, d, e, F2, K2, M(20) ); + R( e, a, b, c, d, F2, K2, M(21) ); + R( d, e, a, b, c, F2, K2, M(22) ); + R( c, d, e, a, b, F2, K2, M(23) ); + R( b, c, d, e, a, F2, K2, M(24) ); + R( a, b, c, d, e, F2, K2, M(25) ); + R( e, a, b, c, d, F2, K2, M(26) ); + R( d, e, a, b, c, F2, K2, M(27) ); + R( c, d, e, a, b, F2, K2, M(28) ); + R( b, c, d, e, a, F2, K2, M(29) ); + R( a, b, c, d, e, F2, K2, M(30) ); + R( e, a, b, c, d, F2, K2, M(31) ); + R( d, e, a, b, c, F2, K2, M(32) ); + R( c, d, e, a, b, F2, K2, M(33) ); + R( b, c, d, e, a, F2, K2, M(34) ); + R( a, b, c, d, e, F2, K2, M(35) ); + R( e, a, b, c, d, F2, K2, M(36) ); + R( d, e, a, b, c, F2, K2, M(37) ); + R( c, d, e, a, b, F2, K2, M(38) ); + R( b, c, d, e, a, F2, K2, M(39) ); + R( a, b, c, d, e, F3, K3, M(40) ); + R( e, a, b, c, d, F3, K3, M(41) ); + R( d, e, a, b, c, F3, K3, M(42) ); + R( c, d, e, a, b, F3, K3, M(43) ); + R( b, c, d, e, a, F3, K3, M(44) ); + R( a, b, c, d, e, F3, K3, M(45) ); + R( e, a, b, c, d, F3, K3, M(46) ); + R( d, e, a, b, c, F3, K3, M(47) ); + R( c, d, e, a, b, F3, K3, M(48) ); + R( b, c, d, e, a, F3, K3, M(49) ); + R( a, b, c, d, e, F3, K3, M(50) ); + R( e, a, b, c, d, F3, K3, M(51) ); + R( d, e, a, b, c, F3, K3, M(52) ); + R( c, d, e, a, b, F3, K3, M(53) ); + R( b, c, d, e, a, F3, K3, M(54) ); + R( a, b, c, d, e, F3, K3, M(55) ); + R( e, a, b, c, d, F3, K3, M(56) ); + R( d, e, a, b, c, F3, K3, M(57) ); + R( c, d, e, a, b, F3, K3, M(58) ); + R( b, c, d, e, a, F3, K3, M(59) ); + R( a, b, c, d, e, F4, K4, M(60) ); + R( e, a, b, c, d, F4, K4, M(61) ); + R( d, e, a, b, c, F4, K4, M(62) ); + R( c, d, e, a, b, F4, K4, M(63) ); + R( b, c, d, e, a, F4, K4, M(64) ); + R( a, b, c, d, e, F4, K4, M(65) ); + R( e, a, b, c, d, F4, K4, M(66) ); + R( d, e, a, b, c, F4, K4, M(67) ); + R( c, d, e, a, b, F4, K4, M(68) ); + R( b, c, d, e, a, F4, K4, M(69) ); + R( a, b, c, d, e, F4, K4, M(70) ); + R( e, a, b, c, d, F4, K4, M(71) ); + R( d, e, a, b, c, F4, K4, M(72) ); + R( c, d, e, a, b, F4, K4, M(73) ); + R( b, c, d, e, a, F4, K4, M(74) ); + R( a, b, c, d, e, F4, K4, M(75) ); + R( e, a, b, c, d, F4, K4, M(76) ); + R( d, e, a, b, c, F4, K4, M(77) ); + R( c, d, e, a, b, F4, K4, M(78) ); + R( b, c, d, e, a, F4, K4, M(79) ); + + a = ctx->A += a; + b = ctx->B += b; + c = ctx->C += c; + d = ctx->D += d; + e = ctx->E += e; + } +} + +/* ---- Object-Oriented Wrapper */ +SHA1Checksum::SHA1Checksum() +{ + sha1_init_ctx(&ctx); +} + +SHA1Checksum::~SHA1Checksum() +{ +} + +void SHA1Checksum::process(void *data, size_t len) +{ + sha1_process_bytes(data, len, &ctx); +} + +const uint8_t *SHA1Checksum::checksum() +{ + sha1_finish_ctx(&ctx, resbuf); + return (const uint8_t *)resbuf; +} diff --git a/sha1.h b/sha1.h new file mode 100644 index 0000000..597f39a --- /dev/null +++ b/sha1.h @@ -0,0 +1,93 @@ +/* Declarations of functions and data types used for SHA1 sum + library functions. + Copyright (C) 2000, 2001, 2003, 2005 Free Software Foundation, Inc. + Copyright (C) 2006 Michael Vrable + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#ifndef SHA1_H +# define SHA1_H 1 + +# include +# include + +typedef uint32_t md5_uint32; + +/* Structure to save state of computation between the single steps. */ +struct sha1_ctx +{ + md5_uint32 A; + md5_uint32 B; + md5_uint32 C; + md5_uint32 D; + md5_uint32 E; + + md5_uint32 total[2]; + md5_uint32 buflen; + char buffer[128] __attribute__ ((__aligned__ (__alignof__ (md5_uint32)))); +}; + + +/* Initialize structure containing state of computation. */ +extern void sha1_init_ctx (struct sha1_ctx *ctx); + +/* Starting with the result of former calls of this function (or the + initialization function update the context for the next LEN bytes + starting at BUFFER. + It is necessary that LEN is a multiple of 64!!! */ +extern void sha1_process_block (const void *buffer, size_t len, + struct sha1_ctx *ctx); + +/* Starting with the result of former calls of this function (or the + initialization function update the context for the next LEN bytes + starting at BUFFER. + It is NOT required that LEN is a multiple of 64. */ +extern void sha1_process_bytes (const void *buffer, size_t len, + struct sha1_ctx *ctx); + +/* Process the remaining bytes in the buffer and put result from CTX + in first 20 bytes following RESBUF. The result is always in little + endian byte order, so that a byte-wise output yields to the wanted + ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF be correctly + aligned for a 32 bits value. */ +extern void *sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf); + + +/* Put result from CTX in first 20 bytes following RESBUF. The result is + always in little endian byte order, so that a byte-wise output yields + to the wanted ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32 bits value. */ +extern void *sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf); + +/* An object-oriented wrapper around checksumming functionality. */ +class SHA1Checksum { +private: + struct sha1_ctx ctx; + char resbuf[20] __attribute__ ((__aligned__ (__alignof__ (md5_uint32)))); + +public: + SHA1Checksum(); + ~SHA1Checksum(); + + void process(void *data, size_t len); + const uint8_t *checksum(); + size_t checksum_size() const { return 20; } +}; + +#endif diff --git a/store.h b/store.h index d484780..7950297 100644 --- a/store.h +++ b/store.h @@ -5,6 +5,9 @@ * into segments for storage purposes. This file provides interfaces for * reading and writing objects and segments. */ +#ifndef _LBS_STORE_H +#define _LBS_STORE_H + #include #include @@ -67,3 +70,5 @@ public: std::string encode_u16(uint16_t val); std::string encode_u32(uint32_t val); std::string encode_u64(uint64_t val); + +#endif // _LBS_STORE_H