pcap_to_parquet.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. #!/usr/bin/env python
  2. """
  3. Convert PCAP output to undirected graph and save in Parquet format.
  4. """
  5. from __future__ import print_function
  6. import re
  7. import socket
  8. import struct
  9. import sys
  10. import fastparquet as fp
  11. import numpy as np
  12. import pandas as pd
  13. def ip_to_integer(s):
  14. return struct.unpack("!I", socket.inet_aton(s))[0]
  15. def get_ip_protocol(s):
  16. if "tcp" in s:
  17. return "tcp"
  18. if "UDP" in s:
  19. return "udp"
  20. if "EIGRP" in s:
  21. return "eigrp"
  22. if "ICMP" in s:
  23. return "icmp"
  24. return None
  25. def to_parquet(filename, prefix="maccdc2012"):
  26. with open(filename) as f:
  27. traffic = {}
  28. nodes = set()
  29. for line in f.readlines():
  30. if "unreachable" in line:
  31. continue
  32. fields = line.split()
  33. if not fields:
  34. continue
  35. if fields[1] != "IP":
  36. continue
  37. protocol = get_ip_protocol(line)
  38. if protocol not in ("tcp", "udp", "eigrp", "icmp"):
  39. continue
  40. try:
  41. addresses = []
  42. # Extract source IP address and convert to integer
  43. m = re.match(r'(?P<address>\d+\.\d+\.\d+\.\d+)', fields[2])
  44. if not m:
  45. continue
  46. addresses.append(ip_to_integer(m.group('address')))
  47. # Extract target IP address and convert to integer
  48. m = re.match(r'(?P<address>\d+\.\d+\.\d+\.\d+)', fields[4])
  49. if not m:
  50. continue
  51. addresses.append(ip_to_integer(m.group('address')))
  52. nodes = nodes.union(addresses)
  53. src, dst = sorted(addresses)
  54. key = (protocol, src, dst)
  55. # Extract packet size
  56. nbytes = int(fields[-1])
  57. if key in traffic:
  58. traffic[key] += nbytes
  59. else:
  60. traffic[key] = nbytes
  61. except:
  62. pass
  63. nodes = dict([(node, i) for i, node in enumerate(sorted(nodes))])
  64. edges = []
  65. for key in traffic:
  66. edge = [nodes[key[1]], nodes[key[2]], key[0], traffic[key]]
  67. edges.append(edge)
  68. nodes_df = pd.DataFrame(np.arange(len(nodes)), columns=['id'])
  69. nodes_df = nodes_df.set_index('id')
  70. edges_df = pd.DataFrame(np.array(edges), columns=['source', 'target', 'protocol', 'weight'])
  71. edges_df['source'] = pd.to_numeric(edges_df['source'])
  72. edges_df['target'] = pd.to_numeric(edges_df['target'])
  73. edges_df['weight'] = pd.to_numeric(edges_df['weight'])
  74. edges_df['protocol'] = edges_df['protocol'].astype('category')
  75. fp.write('{}_nodes.parq'.format(prefix), nodes_df)
  76. fp.write('{}_edges.parq'.format(prefix), edges_df)
  77. if __name__ == '__main__':
  78. if len(sys.argv) > 2:
  79. to_parquet(sys.argv[1], prefix=sys.argv[2])
  80. else:
  81. to_parquet(sys.argv[1])