split-trace.py

   1 #!/usr/bin/python
   2 #
   3 # Split a tcpdump trace apart into multiple files, each containing a single TCP
   4 # flow.
   5
   6 import impacket, itertools, pcapy, re, socket, subprocess, sys
   7 import impacket.ImpactDecoder, impacket.ImpactPacket
   8
   9 # Domain names for cloud service providers, whose traces we want to pull out.
  10 DOMAINS = ['.amazon.com', '.core.windows.net']
  11
  12 # The collection of flows we've seen.  The value associated with each flow is a
  13 # sequence number indicating in what order we saw the flows in the trace.
  14 flows = {}
  15
  16 # Step 1: Parse the input file and extract a listing of all the flows that we
  17 # care about.
  18 def handler(header, data):
  19     pkt = decoder.decode(data)
  20     ip = pkt.child()
  21     tcp = ip.child()
  22     src = (ip.get_ip_src(), tcp.get_th_sport())
  23     dst = (ip.get_ip_dst(), tcp.get_th_dport())
  24     flow = tuple(sorted([src, dst],
  25                         cmp=lambda x, y: cmp(x[1], y[1]) or cmp(x[0], y[0])))
  26     if flow not in flows:
  27         flows[flow] = max(itertools.chain(flows.values(), [0])) + 1
  28
  29 def scan(filename):
  30     global decoder
  31     p = pcapy.open_offline(filename)
  32     p.setfilter(r"ip proto \tcp")
  33     assert p.datalink() == pcapy.DLT_EN10MB
  34     decoder = impacket.ImpactDecoder.EthDecoder()
  35     p.loop(0, handler)
  36
  37 for file in sys.argv[1:]:
  38     print "Scanning %s..." % (file,)
  39     scan(file)
  40
  41     filters = {}
  42     for (((dst, dport), (src, sport)), seq) in flows.items():
  43         # Filter out to find just the relevant flows.  Right now we want only
  44         # flows to port 80 (since both S3/Azure use that as the service port
  45         # when unencrypted which is what we use).  We probably ought to apply
  46         # another filter on IP address in case there happened to be any other
  47         # HTTP flows during the trace capture.
  48         if dport != 80: continue
  49         try:
  50             name = socket.gethostbyaddr(dst)[0]
  51         except:
  52             name = dst
  53         matches = False
  54         for d in DOMAINS:
  55             if name.endswith(d): matches = True
  56         if not matches: continue
  57
  58         filter = "tcp and (host %s and host %s) and (port %d and port %d)" \
  59             % (src, dst, sport, dport)
  60         filters[seq] = (filter, name)
  61
  62     n = 0
  63     for (_, (filter, name)) in sorted(filters.items()):
  64         print "%d: %s" % (n, filter)
  65         subprocess.check_call(['tcpdump', '-s0', '-r', file, '-w',
  66                                'trace-%03d-%s' % (n, name),
  67                                filter])
  68         n += 1