From 9d32f47c84c4b8aae2ec6fd63d1a4f008228e82f Mon Sep 17 00:00:00 2001
From: Michael Vrable
Date: Tue, 4 May 2010 12:19:42 -0700
Subject: [PATCH] Work on more tools for automating cloud storage performance
 measurement.

---
 parsetrace/split-trace.py | 68 ++++++++++++++++++++++++++++++++++++
 s3bench/cloudtest.py      | 73 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100755 parsetrace/split-trace.py
 create mode 100755 s3bench/cloudtest.py

diff --git a/parsetrace/split-trace.py b/parsetrace/split-trace.py
new file mode 100755
index 0000000..e1ff68a
--- /dev/null
+++ b/parsetrace/split-trace.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python
+#
+# Split a tcpdump trace apart into multiple files, each containing a single TCP
+# flow.
+
+import impacket, itertools, pcapy, re, socket, subprocess, sys
+import impacket.ImpactDecoder, impacket.ImpactPacket
+
+# Domain names for cloud service providers, whose traces we want to pull out.
+DOMAINS = ['.amazon.com', '.core.windows.net']
+
+# The collection of flows we've seen.  The value associated with each flow is a
+# sequence number indicating in what order we saw the flows in the trace.
+flows = {}
+
+# Step 1: Parse the input file and extract a listing of all the flows that we
+# care about.
+def handler(header, data):
+    pkt = decoder.decode(data)
+    ip = pkt.child()
+    tcp = ip.child()
+    src = (ip.get_ip_src(), tcp.get_th_sport())
+    dst = (ip.get_ip_dst(), tcp.get_th_dport())
+    flow = tuple(sorted([src, dst],
+                        cmp=lambda x, y: cmp(x[1], y[1]) or cmp(x[0], y[0])))
+    if flow not in flows:
+        flows[flow] = max(itertools.chain(flows.values(), [0])) + 1
+
+def scan(filename):
+    global decoder
+    p = pcapy.open_offline(filename)
+    p.setfilter(r"ip proto \tcp")
+    assert p.datalink() == pcapy.DLT_EN10MB
+    decoder = impacket.ImpactDecoder.EthDecoder()
+    p.loop(0, handler)
+
+for file in sys.argv[1:]:
+    print "Scanning %s..." % (file,)
+    scan(file)
+
+    filters = {}
+    for (((dst, dport), (src, sport)), seq) in flows.items():
+        # Select only the relevant flows.  For now that means flows to port
+        # 80, since both S3 and Azure use it as the service port for the
+        # unencrypted access that we measure.  We probably ought to also
+        # filter on IP address, in case any other HTTP flows happened to be
+        # active during the trace capture.
+        if dport != 80: continue
+        try:
+            name = socket.gethostbyaddr(dst)[0]
+        except socket.error:
+            name = dst
+        matches = False
+        for d in DOMAINS:
+            if name.endswith(d): matches = True
+        if not matches: continue
+
+        filter = "tcp and (host %s and host %s) and (port %d and port %d)" \
+            % (src, dst, sport, dport)
+        filters[seq] = (filter, name)
+
+    n = 0
+    for (_, (filter, name)) in sorted(filters.items()):
+        print "%d: %s" % (n, filter)
+        subprocess.check_call(['tcpdump', '-s0', '-r', file, '-w',
+                               'trace-%03d-%s' % (n, name),
+                               filter])
+        n += 1
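
A note on the flow-key trick in handler(): sorting the two (host, port)
endpoint pairs by port first collapses both directions of a connection
onto a single dictionary key, and places the low-numbered server
endpoint first; the ((dst, dport), (src, sport)) unpacking in the
extraction loop relies on that ordering.  A minimal standalone sketch of
the same normalization (Python 2, like the script; the normalize()
helper and the endpoints are made up for illustration):

    src = ('10.0.0.2', 34567)   # hypothetical client endpoint
    dst = ('72.21.202.1', 80)   # hypothetical server endpoint

    def normalize(a, b):
        # Same comparison as in handler(): port first, then address.
        return tuple(sorted([a, b],
                            cmp=lambda x, y: cmp(x[1], y[1]) or cmp(x[0], y[0])))

    assert normalize(src, dst) == normalize(dst, src)
    assert normalize(src, dst)[0] == dst  # server endpoint sorts first

The intended invocation is ./split-trace.py <trace file>..., which
writes one trace-NNN-<hostname> file per matching flow via tcpdump.
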
diff --git a/s3bench/cloudtest.py b/s3bench/cloudtest.py
new file mode 100755
index 0000000..0755688
--- /dev/null
+++ b/s3bench/cloudtest.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python
+#
+# Run a series of simple test requests against cloud storage services (S3 and
+# Azure) to gather some basic performance numbers.
+
+import boto, time
+from boto.s3.connection import SubdomainCallingFormat
+from boto.s3.key import Key
+import azure
+
+BUCKET_NAME = 'mvrable-benchmark'
+SIZES = [64, 4096, 32 << 10, 256 << 10, 1 << 20, 4 << 20, 32 << 20]
+
+class S3TestConnection:
+    def __init__(self):
+        self.conn = boto.connect_s3(is_secure=False,
+                                    calling_format=SubdomainCallingFormat())
+        self.bucket = self.conn.get_bucket(BUCKET_NAME)
+
+    def put_object(self, name, size):
+        buf = 'A' * size
+        k = Key(self.bucket, name)
+        start_time = time.time()
+        k.set_contents_from_string(buf)
+        print "%s: %f" % (name, time.time() - start_time)
+
+    def get_object(self, name, size):
+        k = Key(self.bucket, name)
+        start_time = time.time()
+        buf = k.get_contents_as_string()
+        print "%s: %f" % (name, time.time() - start_time)
+
+class AzureTestConnection:
+    def __init__(self):
+        self.conn = azure.Connection()
+
+    def put_object(self, name, size):
+        buf = 'A' * size
+        start_time = time.time()
+        self.conn.make_request('/benchmark/' + name, 'PUT', buf,
+                               {'x-ms-blob-type': 'BlockBlob'})
+        print "%s: %f" % (name, time.time() - start_time)
+
+    def get_object(self, name, size):
+        start_time = time.time()
+        self.conn.make_request('/benchmark/' + name, 'GET')
+        print "%s: %f" % (name, time.time() - start_time)
+
+def run_test():
+    print "==== S3 ===="
+    c = S3TestConnection()
+    for repeat in range(4):
+        for size in SIZES:
+            c.put_object('file-%d-%d' % (size, repeat), size)
+
+    c = S3TestConnection()
+    for repeat in range(4):
+        for size in SIZES:
+            c.get_object('file-%d-%d' % (size, repeat), size)
+
+    print "==== AZURE ===="
+    c = AzureTestConnection()
+    for repeat in range(4):
+        for size in SIZES:
+            c.put_object('file-%d-%d' % (size, repeat), size)
+
+    c = AzureTestConnection()
+    for repeat in range(4):
+        for size in SIZES:
+            c.get_object('file-%d-%d' % (size, repeat), size)
+
+if __name__ == '__main__':
+    run_test()
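
(The azure module imported above is presumably the local request helper
that sits alongside this script, not a packaged SDK; the boto calls are
the standard S3 API.)

A note on the output: run_test() prints one "file-<size>-<repeat>:
<seconds>" line per request, and the format does not distinguish PUTs
from GETs, so the two phases must be separated, or deliberately averaged
together, when post-processing.  A hypothetical reduction script, not
part of this patch, that averages the captured timings per object size:

    import sys

    totals = {}
    for line in sys.stdin:
        # Keep only the per-request timing lines; skip banners like "==== S3 ====".
        if not line.startswith('file-'):
            continue
        name, t = line.split(':')
        size = int(name.split('-')[1])
        totals.setdefault(size, []).append(float(t))
    for size in sorted(totals):
        times = totals[size]
        print "%d bytes: %f s average over %d requests" \
            % (size, sum(times) / len(times), len(times))

-- 
2.20.1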