domainhunter.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. #!/usr/bin/env python
  2. ## Title: domainhunter.py
  3. ## Author: Joe Vest and Andrew Chiles
  4. ## Description: Checks expired domains, bluecoat categorization, and Archive.org history to determine
  5. ## good candidates for phishing and C2 domain names
  6. # To-do:
# Add reputation categorizations to identify desirable vs. undesirable domains
  8. # Code cleanup/optimization
  9. # Read in list of desired domain names
  10. # Add Authenticated "Members-Only" option to download CSV/txt (https://member.expireddomains.net/domains/expiredcom/)
  11. import time
  12. import random
  13. import argparse
  14. import json
  15. ## Functions
  16. def checkBluecoat(domain):
  17. try:
  18. url = 'https://sitereview.bluecoat.com/rest/categorization'
  19. postData = {"url":domain} # HTTP POST Parameters
  20. headers = {'User-Agent':useragent,
  21. 'X-Requested-With':'XMLHttpRequest',
  22. 'Referer':'https://sitereview.bluecoat.com/sitereview.jsp'}
  23. print('[*] BlueCoat Check: {}'.format(domain))
  24. response = s.post(url,headers=headers,data=postData,verify=False)
  25. responseJson = json.loads(response.text)
  26. if 'errorType' in responseJson:
  27. a = responseJson['errorType']
  28. else:
  29. soupA = BeautifulSoup(responseJson['categorization'], 'lxml')
  30. a = soupA.find("a").text
  31. return a
  32. except:
  33. print('[-] Error retrieving Bluecoat reputation!')
  34. return "-"
  35. def checkIBMxForce(domain):
  36. try:
  37. url = 'https://exchange.xforce.ibmcloud.com/url/{}'.format(domain)
  38. headers = {'User-Agent':useragent,
  39. 'Accept':'application/json, text/plain, */*',
  40. 'x-ui':'XFE',
  41. 'Origin':url,
  42. 'Referer':url}
  43. print('[*] IBM xForce Check: {}'.format(domain))
  44. url = 'https://api.xforce.ibmcloud.com/url/{}'.format(domain)
  45. response = s.get(url,headers=headers,verify=False)
  46. responseJson = json.loads(response.text)
  47. if 'error' in responseJson:
  48. a = responseJson['error']
  49. else:
  50. a = responseJson["result"]['cats']
  51. return a
  52. except:
  53. print('[-] Error retrieving IBM x-Force reputation!')
  54. return "-"
  55. def downloadMalwareDomains():
  56. url = malwaredomains
  57. response = s.get(url,headers=headers,verify=False)
  58. responseText = response.text
  59. if response.status_code == 200:
  60. return responseText
  61. else:
  62. print("Error reaching:{} Status: {}").format(url, response.status_code)
  63. ## MAIN
  64. if __name__ == "__main__":
  65. try:
  66. import requests
  67. from bs4 import BeautifulSoup
  68. from texttable import Texttable
  69. except Exception as e:
  70. print "Expired Domains Reputation Check"
  71. print "[-] Missing dependencies: {}".format(str(e))
  72. print "[*] Install required dependencies by running `pip install -r requirements.txt`"
  73. quit(0)
  74. parser = argparse.ArgumentParser(description='Checks expired domains, bluecoat categorization, and Archive.org history to determine good candidates for C2 and phishing domains')
  75. parser.add_argument('-q','--query', help='Optional keyword used to refine search results', required=False, type=str)
  76. parser.add_argument('-c','--check', help='Perform slow reputation checks', required=False, default=False, action='store_true')
  77. parser.add_argument('-r','--maxresults', help='Number of results to return when querying latest expired/deleted domains (min. 100)', required=False, type=int, default=100)
  78. parser.add_argument('-w','--maxwidth', help='Width of text table', required=False, type=int, default=400)
  79. parser.add_argument('-f','--file', help='Input file containing potential domain names to check (1 per line)', required=False, type=str)
  80. args = parser.parse_args()
  81. ## Variables
  82. query = False
  83. if args.query:
  84. query = args.query
  85. check = args.check
  86. maxresults = args.maxresults
  87. if maxresults < 100:
  88. maxresults = 100
  89. maxwidth=args.maxwidth
  90. inputfile = False
  91. if args.file:
  92. inputfile = args.file
  93. t = Texttable(max_width=maxwidth)
  94. malwaredomains = 'http://mirror1.malwaredomains.com/files/justdomains'
  95. expireddomainsqueryurl = 'https://www.expireddomains.net/domain-name-search'
  96. timestamp = time.strftime("%Y%m%d_%H%M%S")
  97. useragent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
  98. headers = {'User-Agent':useragent}
  99. requests.packages.urllib3.disable_warnings()
  100. # HTTP Session container, used to manage cookies, session tokens and other session information
  101. s = requests.Session()
  102. data = []
  103. title = '''
  104. ____ ___ __ __ _ ___ _ _ _ _ _ _ _ _ _____ _____ ____
  105. | _ \ / _ \| \/ | / \ |_ _| \ | | | | | | | | | \ | |_ _| ____| _ \
  106. | | | | | | | |\/| | / _ \ | || \| | | |_| | | | | \| | | | | _| | |_) |
  107. | |_| | |_| | | | |/ ___ \ | || |\ | | _ | |_| | |\ | | | | |___| _ <
  108. |____/ \___/|_| |_/_/ \_\___|_| \_| |_| |_|\___/|_| \_| |_| |_____|_| \_\ '''
  109. print(title)
  110. print("")
  111. print("Expired Domains Reptutation Checker")
  112. print("")
  113. print("DISCLAIMER:")
  114. print("This is for educational purposes only!")
  115. disclaimer = '''It is designed to promote education and the improvement of computer/cyber security.
  116. The authors or employers are not liable for any illegal act or misuse performed by any user of this tool.
  117. If you plan to use this content for illegal purpose, don't. Have a nice day :)'''
  118. print(disclaimer)
  119. print("")
  120. print("********************************************")
  121. print("Start Time: {}").format(timestamp)
  122. print("TextTable Column Width: {}").format(str(maxwidth))
  123. print("Checking Reputation: {}").format(str(check))
  124. print("Number Domains Checked: {}").format(maxresults)
  125. print("********************************************")
  126. runtime = 0
  127. if check:
  128. runtime = (maxresults * 20) / 60
  129. else:
  130. runtime = maxresults * .15 / 60
  131. print("Estimated Max Run Time: {} minutes").format(int(runtime))
  132. print("")
  133. print('[*] Downloading malware domain list from {}'.format(malwaredomains))
  134. maldomains = downloadMalwareDomains()
  135. maldomains_list = maldomains.split("\n")
  136. # Use the keyword string to narrow domain search if provided
  137. # Need to modify this to pull more than the first 25 results
  138. if query:
  139. print('[*] Fetching expired or deleted domains containing "{}"...').format(query)
  140. for i in range (0,maxresults,25):
  141. if i == 0:
  142. url = "{}/?q={}".format(expireddomainsqueryurl,query)
  143. headers['Referer'] ='https://www.expireddomains.net/domain-name-search/?q={}&searchinit=1'.format(query)
  144. else:
  145. url = "{}/?start={}&q={}".format(expireddomainsqueryurl,i,query)
  146. headers['Referer'] ='https://www.expireddomains.net/domain-name-search/?start={}&q={}'.format((i-25),query)
  147. print("[*] {}".format(url))
  148. # Annoyingly when querying specific keywords the expireddomains.net site requires additional cookies which
  149. # are set in JavaScript and not recognized by Requests so we add them here manually
  150. jar = requests.cookies.RequestsCookieJar()
  151. jar.set('_pk_id.10.dd0a', '*', domain='expireddomains.net', path='/')
  152. jar.set('_pk_ses.10.dd0a', '*', domain='expireddomains.net', path='/')
  153. domains = s.get(url,headers=headers,verify=False,cookies=jar).text
  154. # Turn the HTML into a Beautiful Soup object
  155. soup = BeautifulSoup(domains, 'lxml')
  156. table = soup.find("table")
  157. try:
  158. for row in table.findAll('tr')[1:]:
  159. # Alternative way to extract domain name
  160. # domain = row.find('td').find('a').text
  161. cells = row.findAll("td")
  162. if len(cells) >= 1:
  163. output = ""
  164. c0 = row.find('td').find('a').text # domain
  165. c1 = cells[1].find(text=True) # bl
  166. c2 = cells[2].find(text=True) # domainpop
  167. c3 = cells[3].find(text=True) # birth
  168. c4 = cells[4].find(text=True) # entries
  169. c5 = cells[5].find(text=True) # similarweb
  170. c6 = cells[6].find(text=True) # similarweb country code
  171. c7 = cells[7].find(text=True) # moz
  172. c8 = cells[8].find(text=True) # status com
  173. c9 = cells[9].find(text=True) # status net
  174. c10 = cells[10].find(text=True) # status org
  175. c11 = cells[11].find(text=True) # status de
  176. c12 = cells[12].find(text=True) # tld registered
  177. c13 = cells[13].find(text=True) # monthly searches
  178. c14 = cells[14].find(text=True) # adwords competition
  179. c15 = cells[15].find(text=True) # list
  180. c16 = cells[16].find(text=True) # status
  181. c17 = cells[17].find(text=True) # related links
  182. available = ''
  183. if c8 == "available":
  184. available += ".com "
  185. if c9 == "available":
  186. available += ".net "
  187. if c10 == "available":
  188. available += ".org "
  189. if c11 == "available":
  190. available += ".de "
  191. # Skip additional reputation checks if this domain is already categorized as malicious
  192. if c0 in maldomains_list:
  193. print("[-] Skipping {} - Identified as known malware domain").format(c0)
  194. else:
  195. bluecoat = ''
  196. ibmxforce = ''
  197. if c3 == '-':
  198. bluecoat = 'ignored'
  199. ibmxforce = 'ignored'
  200. elif check == True:
  201. bluecoat = checkBluecoat(c0)
  202. print "[+] {} is categorized as: {}".format(c0, bluecoat)
  203. ibmxforce = checkIBMxForce(c0)
  204. print "[+] {} is categorized as: {}".format(c0, ibmxforce)
  205. # Sleep to avoid captchas
  206. time.sleep(random.randrange(10,20))
  207. else:
  208. bluecoat = "skipped"
  209. ibmxforce = "skipped"
  210. # Append parsed domain data to list
  211. data.append([c0,c3,c4,available,bluecoat,ibmxforce])
  212. except:
  213. print "[-] Error: No results found on this page!"
  214. # Retrieve the most recent expired/deleted domain results
  215. elif inputfile:
  216. print('[*] Fetching domain reputation from file: {}').format(inputfile)
  217. # read in file contents to list
  218. try:
  219. domains = [line.rstrip('\r\n') for line in open(inputfile, "r")]
  220. except IOError:
  221. print '[-] Error: {} does not appear to exist.'.format(inputfile)
  222. exit()
  223. print('[*] Domains loaded: {}').format(len(domains))
  224. for domain in domains:
  225. if domain in maldomains_list:
  226. print("[-] Skipping {} - Identified as known malware domain").format(domain)
  227. else:
  228. bluecoat = ''
  229. ibmxforce = ''
  230. bluecoat = checkBluecoat(domain)
  231. print "[+] {} is categorized as: {}".format(domain, bluecoat)
  232. ibmxforce = checkIBMxForce(domain)
  233. print "[+] {} is categorized as: {}".format(domain, ibmxforce)
  234. # Sleep to avoid captchas
  235. time.sleep(random.randrange(10,20))
  236. data.append([domain,'-','-','-',bluecoat,ibmxforce])
  237. else:
  238. print('[*] Fetching {} expired or deleted domains...').format(query)
  239. # Generate list of URLs to query for expired/deleted domains, queries return 25 results per page
  240. urls = []
  241. for i in range (0,(maxresults/4),25):
  242. urls.append('https://www.expireddomains.net/backorder-expired-domains?start={}'.format(i))
  243. urls.append('https://www.expireddomains.net/deleted-com-domains/?start={}o=changes&r=d'.format(i))
  244. urls.append('https://www.expireddomains.net/deleted-net-domains/?start={}o=changes&r=d'.format(i))
  245. urls.append('https://www.expireddomains.net/deleted-org-domains/?start={}o=changes&r=d'.format(i))
  246. for url in urls:
  247. print("[*] {}".format(url))
  248. expireddomains = s.get(url,headers=headers,verify=False).text
  249. # Turn the HTML into a Beautiful Soup object
  250. soup = BeautifulSoup(expireddomains, 'lxml')
  251. table = soup.find("table")
  252. try:
  253. for row in table.findAll('tr')[1:]:
  254. cells = row.findAll("td")
  255. if len(cells) >= 1:
  256. output = ""
  257. c0 = cells[0].find(text=True) # domain
  258. c1 = cells[1].find(text=True) # bl
  259. c2 = cells[2].find(text=True) # domainpop
  260. c3 = cells[3].find(text=True) # birth
  261. c4 = cells[4].find(text=True) # entries
  262. c5 = cells[5].find(text=True) # similarweb
  263. c6 = cells[6].find(text=True) # similarweb country code
  264. c7 = cells[7].find(text=True) # moz
  265. c8 = cells[8].find(text=True) # status com
  266. c9 = cells[9].find(text=True) # status net
  267. c10 = cells[10].find(text=True) # status org
  268. c11 = cells[11].find(text=True) # status de
  269. c12 = cells[12].find(text=True) # tld registered
  270. c13 = cells[13].find(text=True) # changes
  271. c14 = cells[14].find(text=True) # whois
  272. # Expired Domains results have an additional 'Availability' column that breaks parsing "deleted" domains
  273. #c15 = cells[15].find(text=True) # related links
  274. available = ''
  275. if c8 == "available":
  276. available += ".com "
  277. if c9 == "available":
  278. available += ".net "
  279. if c10 == "available":
  280. available += ".org "
  281. if c11 == "available":
  282. available += ".de "
  283. # Skip additional reputation checks if this domain is already categorized as malicious
  284. if c0 in maldomains_list:
  285. print("[-] Skipping {} - Identified as known malware domain").format(c0)
  286. else:
  287. bluecoat = ''
  288. ibmxforce = ''
  289. if c3 == '-':
  290. bluecoat = 'ignored'
  291. ibmxforce = 'ignored'
  292. elif check == True:
  293. bluecoat = checkBluecoat(c0)
  294. print "[+] {} is categorized as: {}".format(c0, bluecoat)
  295. ibmxforce = checkIBMxForce(c0)
  296. print "[+] {} is categorized as: {}".format(c0, ibmxforce)
  297. # Sleep to avoid captchas
  298. time.sleep(random.randrange(10,20))
  299. else:
  300. bluecoat = "skipped"
  301. ibmxforce = "skipped"
  302. # Append parsed domain data to list
  303. data.append([c0,c3,c4,available,bluecoat,ibmxforce])
  304. except:
  305. print "[-] Error: No results found on this page!"
  306. # Sort domain list by column 2 (Birth Year)
  307. sortedData = sorted(data, key=lambda x: x[1], reverse=True)
  308. t.add_rows(sortedData)
  309. header = ['Domain', 'Birth', '#', 'TLDs', 'BC', 'IBM']
  310. t.header(header)
  311. # Build HTML Table
  312. html = ''
  313. htmlHeader = '<html><head><title>Expired Domain List</title></head>'
  314. htmlBody = '<body><p>The following available domains report was generated at {}</p>'.format(timestamp)
  315. htmlTableHeader = '''
  316. <table border="1" align="center">
  317. <th>Domain</th>
  318. <th>Birth</th>
  319. <th>Entries</th>
  320. <th>TLDs Available</th>
  321. <th>Bluecoat</th>
  322. <th>Categorization</th>
  323. <th>IBM-xForce</th>
  324. <th>Categorization</th>
  325. <th>WatchGuard</th>
  326. <th>Namecheap</th>
  327. <th>Archive.org</th>
  328. '''
  329. htmlTableBody = ''
  330. htmlTableFooter = '</table>'
  331. htmlFooter = '</body></html>'
  332. # Build HTML table contents
  333. for i in sortedData:
  334. htmlTableBody += '<tr>'
  335. htmlTableBody += '<td>{}</td>'.format(i[0]) # Domain
  336. htmlTableBody += '<td>{}</td>'.format(i[1]) # Birth
  337. htmlTableBody += '<td>{}</td>'.format(i[2]) # Entries
  338. htmlTableBody += '<td>{}</td>'.format(i[3]) # TLDs
  339. htmlTableBody += '<td><a href="https://sitereview.bluecoat.com/sitereview.jsp#/?search={}" target="_blank">Bluecoat</a></td>'.format(i[0]) # Bluecoat
  340. htmlTableBody += '<td>{}</td>'.format(i[4]) # Bluecoat Categorization
  341. htmlTableBody += '<td><a href="https://exchange.xforce.ibmcloud.com/url/{}" target="_blank">IBM-xForce</a></td>'.format(i[0]) # IBM xForce
  342. htmlTableBody += '<td>{}</td>'.format(i[5]) # IBM x-Force Categorization
  343. htmlTableBody += '<td><a href="http://www.borderware.com/domain_lookup.php?ip={}" target="_blank">WatchGuard</a></td>'.format(i[0]) # Borderware WatchGuard
  344. htmlTableBody += '<td><a href="https://www.namecheap.com/domains/registration/results.aspx?domain={}" target="_blank">Namecheap</a></td>'.format(i[0]) # Namecheap
  345. htmlTableBody += '<td><a href="http://web.archive.org/web/*/{}" target="_blank">Archive.org</a></td>'.format(i[0]) # Archive.org
  346. htmlTableBody += '</tr>'
  347. html = htmlHeader + htmlBody + htmlTableHeader + htmlTableBody + htmlTableFooter + htmlFooter
  348. logfilename = "{}_domainreport.html".format(timestamp)
  349. log = open(logfilename,'w')
  350. log.write(html)
  351. log.close
  352. print("\n[*] Search complete")
  353. print("[*] Log written to {}\n").format(logfilename)
  354. print(t.draw())