#!/usr/bin/env python
## Title:       domainhunter.py
## Author:      Joe Vest and Andrew Chiles
## Description: Checks expired domains, Bluecoat categorization, and Archive.org history to determine
##              good candidates for phishing and C2 domain names

# To-do:
#   Python 3 support
#   Add reputation categorizations to identify desirable vs. undesirable domains
#   Code cleanup/optimization
#   Add authenticated "Members-Only" option to download CSV/txt (https://member.expireddomains.net/domains/expiredcom/)
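
# Typical invocations (a sketch based on the argparse options defined below;
# assumes the dependencies from requirements.txt are installed):
#   python domainhunter.py                  # list the latest expired/deleted domains
#   python domainhunter.py -q <keyword> -c  # keyword search with slow reputation checks
#   python domainhunter.py -f domains.txt   # check domains from a file (reputation checks always run)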
import time
import random
import argparse
import json
import sys

## Functions
def checkBluecoat(domain):
    """Submit a domain to Blue Coat Site Review and return its category name.

    Returns the error type string on an API error, 'captcha' when Blue Coat is
    blocking automated lookups, or '-' if the request fails entirely.
    """
    try:
        url = 'https://sitereview.bluecoat.com/rest/categorization'
        postData = {'url': domain}  # HTTP POST parameters
        headers = {'User-Agent': useragent,
                   'X-Requested-With': 'XMLHttpRequest',
                   'Referer': 'https://sitereview.bluecoat.com/sitereview.jsp'}

        print('[*] BlueCoat Check: {}'.format(domain))
        response = s.post(url, headers=headers, data=postData, verify=False)
        responseJson = json.loads(response.text)

        if 'errorType' in responseJson:
            a = responseJson['errorType']
        else:
            # The categorization comes back as an HTML fragment; extract the link text
            soupA = BeautifulSoup(responseJson['categorization'], 'lxml')
            a = soupA.find('a').text

        # Print notice if CAPTCHAs are blocking accurate results
        if a == 'captcha':
            print('[-] Error: Blue Coat CAPTCHA received. Change your IP or manually solve a CAPTCHA at "https://sitereview.bluecoat.com/sitereview.jsp"')
            #raw_input('[*] Press Enter to continue...')
        return a
    except Exception:
        print('[-] Error retrieving Bluecoat reputation!')
        return '-'
def checkIBMxForce(domain):
    """Query the IBM X-Force Exchange URL report for a domain.

    Returns the categorization on success, the error message on an API error,
    or '-' if the request fails entirely.
    """
    try:
        url = 'https://exchange.xforce.ibmcloud.com/url/{}'.format(domain)
        headers = {'User-Agent': useragent,
                   'Accept': 'application/json, text/plain, */*',
                   'x-ui': 'XFE',
                   'Origin': url,
                   'Referer': url}

        print('[*] IBM xForce Check: {}'.format(domain))
        url = 'https://api.xforce.ibmcloud.com/url/{}'.format(domain)
        response = s.get(url, headers=headers, verify=False)
        responseJson = json.loads(response.text)

        if 'error' in responseJson:
            a = responseJson['error']
        else:
            a = responseJson['result']['cats']
        return a
    except Exception:
        print('[-] Error retrieving IBM x-Force reputation!')
        return '-'
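
# Note: checkIBMxForce() piggybacks on the X-Force Exchange web UI headers above.
# The documented X-Force API alternatively accepts an API key and password over
# HTTP Basic auth, which is less likely to be throttled; a minimal sketch, with
# hypothetical credential names that are not part of this script:
#
#   response = s.get(url, auth=(XFORCE_API_KEY, XFORCE_API_PASSWORD), verify=False)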
def downloadMalwareDomains():
    """Download the malwaredomains.com blocklist (plain text, one domain per line)."""
    url = malwaredomains
    response = s.get(url, headers=headers, verify=False)
    if response.status_code == 200:
        return response.text
    else:
        print("Error reaching: {} Status: {}".format(url, response.status_code))
        return ''  # An empty body keeps the caller's split() from failing on None
## MAIN
if __name__ == "__main__":
    try:
        import requests
        from bs4 import BeautifulSoup
        from texttable import Texttable
    except Exception as e:
        print("Expired Domains Reputation Check")
        print("[-] Missing dependencies: {}".format(str(e)))
        print("[*] Install required dependencies by running `pip install -r requirements.txt`")
        sys.exit(1)
    parser = argparse.ArgumentParser(description='Checks expired domains, bluecoat categorization, and Archive.org history to determine good candidates for C2 and phishing domains')
    parser.add_argument('-q', '--query', help='Optional keyword used to refine search results', required=False, type=str)
    parser.add_argument('-c', '--check', help='Perform slow reputation checks', required=False, default=False, action='store_true')
    parser.add_argument('-r', '--maxresults', help='Number of results to return when querying latest expired/deleted domains (min. 100)', required=False, type=int, default=100)
    parser.add_argument('-w', '--maxwidth', help='Width of text table', required=False, type=int, default=400)
    parser.add_argument('-f', '--file', help='Input file containing potential domain names to check (1 per line)', required=False, type=str)
    args = parser.parse_args()
    ## Variables
    query = False
    if args.query:
        query = args.query

    check = args.check

    maxresults = args.maxresults
    if maxresults < 100:
        maxresults = 100

    maxwidth = args.maxwidth

    inputfile = False
    if args.file:
        inputfile = args.file

    t = Texttable(max_width=maxwidth)
    malwaredomains = 'http://mirror1.malwaredomains.com/files/justdomains'
    expireddomainsqueryurl = 'https://www.expireddomains.net/domain-name-search'

    timestamp = time.strftime("%Y%m%d_%H%M%S")

    useragent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
    headers = {'User-Agent': useragent}
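    # Every request in this script passes verify=False, which would otherwise emit
    # an urllib3 InsecureRequestWarning per call; silence those warnings up front.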
    requests.packages.urllib3.disable_warnings()

    # HTTP Session container, used to manage cookies, session tokens and other session information
    s = requests.Session()

    data = []
    title = r'''
 ____   ___  __  __    _    ___ _   _   _   _ _   _ _   _ _____ _____ ____
|  _ \ / _ \|  \/  |  / \  |_ _| \ | | | | | | | | | \ | |_   _| ____|  _ \
| | | | | | | |\/| | / _ \  | || \| | | |_| | | | |  \| | | | |  _| | |_) |
| |_| | |_| | |  | |/ ___ \ | || |\  | |  _  | |_| | |\  | | | | |___|  _ <
|____/ \___/|_|  |_/_/   \_\___|_| \_| |_| |_|\___/|_| \_| |_| |_____|_| \_\ '''

    print(title)
    print("")
    print("Expired Domains Reputation Checker")
    print("")
    print("DISCLAIMER:")
    print("This is for educational purposes only!")
    disclaimer = '''It is designed to promote education and the improvement of computer/cyber security.
The authors or employers are not liable for any illegal act or misuse performed by any user of this tool.
If you plan to use this content for illegal purpose, don't. Have a nice day :)'''
    print(disclaimer)
    print("")
    print("********************************************")
    print("Start Time: {}".format(timestamp))
    print("TextTable Column Width: {}".format(str(maxwidth)))
    print("Checking Reputation: {}".format(str(check)))
    print("Number Domains Checked: {}".format(maxresults))
    print("********************************************")
    runtime = 0
    if check:
        # ~20 seconds per domain: two reputation lookups plus a 10-20 second anti-CAPTCHA sleep
        runtime = (maxresults * 20) / 60
    else:
        # ~0.15 seconds per domain when only fetching and parsing result pages
        runtime = maxresults * .15 / 60

    print("Estimated Max Run Time: {} minutes".format(int(runtime)))
    print("")
    print('[*] Downloading malware domain list from {}'.format(malwaredomains))
    maldomains = downloadMalwareDomains()
    maldomains_list = maldomains.split("\n")
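    # The feed is plain text with one domain per line. For a large feed, a set would
    # make the repeated membership tests below O(1), e.g.:
    #   maldomains_list = set(maldomains.split("\n"))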
    # Use the keyword string to narrow domain search if provided
    # Need to modify this to pull more than the first 25 results
    if query:
        print('[*] Fetching expired or deleted domains containing "{}"...'.format(query))
        for i in range(0, maxresults, 25):
            if i == 0:
                url = "{}/?q={}".format(expireddomainsqueryurl, query)
                headers['Referer'] = 'https://www.expireddomains.net/domain-name-search/?q={}&searchinit=1'.format(query)
            else:
                url = "{}/?start={}&q={}".format(expireddomainsqueryurl, i, query)
                headers['Referer'] = 'https://www.expireddomains.net/domain-name-search/?start={}&q={}'.format((i - 25), query)

            print("[*] {}".format(url))

            # Annoyingly, when querying specific keywords the expireddomains.net site requires additional
            # cookies which are set in JavaScript and not recognized by Requests, so we add them here manually
            jar = requests.cookies.RequestsCookieJar()
            jar.set('_pk_id.10.dd0a', '*', domain='expireddomains.net', path='/')
            jar.set('_pk_ses.10.dd0a', '*', domain='expireddomains.net', path='/')

            domains = s.get(url, headers=headers, verify=False, cookies=jar).text

            # Turn the HTML into a Beautiful Soup object
            soup = BeautifulSoup(domains, 'lxml')
            table = soup.find("table")
            try:
                for row in table.findAll('tr')[1:]:
                    # Alternative way to extract domain name
                    # domain = row.find('td').find('a').text

                    cells = row.findAll("td")
                    if len(cells) >= 1:
                        output = ""
                        c0 = row.find('td').find('a').text  # domain
                        c1 = cells[1].find(text=True)   # bl
                        c2 = cells[2].find(text=True)   # domainpop
                        c3 = cells[3].find(text=True)   # birth
                        c4 = cells[4].find(text=True)   # entries
                        c5 = cells[5].find(text=True)   # similarweb
                        c6 = cells[6].find(text=True)   # similarweb country code
                        c7 = cells[7].find(text=True)   # moz
                        c8 = cells[8].find(text=True)   # status com
                        c9 = cells[9].find(text=True)   # status net
                        c10 = cells[10].find(text=True) # status org
                        c11 = cells[11].find(text=True) # status de
                        c12 = cells[12].find(text=True) # tld registered
                        c13 = cells[13].find(text=True) # monthly searches
                        c14 = cells[14].find(text=True) # adwords competition
                        c15 = cells[15].find(text=True) # list
                        c16 = cells[16].find(text=True) # status
                        c17 = cells[17].find(text=True) # related links

                        available = ''
                        if c8 == "available":
                            available += ".com "
                        if c9 == "available":
                            available += ".net "
                        if c10 == "available":
                            available += ".org "
                        if c11 == "available":
                            available += ".de "

                        # Skip additional reputation checks if this domain is already categorized as malicious
                        if c0 in maldomains_list:
                            print("[-] Skipping {} - Identified as known malware domain".format(c0))
                        else:
                            bluecoat = ''
                            ibmxforce = ''
                            if c3 == '-':
                                bluecoat = 'ignored'
                                ibmxforce = 'ignored'
                            elif check:
                                bluecoat = checkBluecoat(c0)
                                print("[+] {} is categorized as: {}".format(c0, bluecoat))
                                ibmxforce = checkIBMxForce(c0)
                                print("[+] {} is categorized as: {}".format(c0, ibmxforce))
                                # Sleep to avoid CAPTCHAs
                                time.sleep(random.randrange(10, 20))
                            else:
                                bluecoat = "skipped"
                                ibmxforce = "skipped"
                            # Append parsed domain data to list
                            data.append([c0, c3, c4, available, bluecoat, ibmxforce])
            except Exception:
                print("[-] Error: No results found on this page!")
    elif inputfile:
        print('[*] Fetching domain reputation from file: {}'.format(inputfile))
        # Read in file contents to a list, one domain per line
        try:
            domains = [line.rstrip('\r\n') for line in open(inputfile, "r")]
        except IOError:
            print('[-] Error: "{}" does not appear to exist.'.format(inputfile))
            sys.exit(1)

        print('[*] Domains loaded: {}'.format(len(domains)))
        for domain in domains:
            if domain in maldomains_list:
                print("[-] Skipping {} - Identified as known malware domain".format(domain))
            else:
                bluecoat = ''
                ibmxforce = ''
                bluecoat = checkBluecoat(domain)
                print("[+] {} is categorized as: {}".format(domain, bluecoat))
                ibmxforce = checkIBMxForce(domain)
                print("[+] {} is categorized as: {}".format(domain, ibmxforce))
                # Sleep to avoid CAPTCHAs
                time.sleep(random.randrange(10, 20))
                data.append([domain, '-', '-', '-', bluecoat, ibmxforce])
    # Retrieve the most recent expired/deleted domain results
    else:
        print('[*] Fetching {} expired or deleted domains...'.format(maxresults))

        # Generate list of URLs to query for expired/deleted domains; queries return 25 results per page.
        # Four sources are queried per step, so divide maxresults by 4 to keep the total roughly on target.
        urls = []
        for i in range(0, maxresults // 4, 25):
            urls.append('https://www.expireddomains.net/backorder-expired-domains?start={}&o=changed&r=a'.format(i))
            urls.append('https://www.expireddomains.net/deleted-com-domains/?start={}&o=changed&r=a'.format(i))
            urls.append('https://www.expireddomains.net/deleted-net-domains/?start={}&o=changed&r=a'.format(i))
            urls.append('https://www.expireddomains.net/deleted-org-domains/?start={}&o=changed&r=a'.format(i))

        for url in urls:
            print("[*] {}".format(url))
            expireddomains = s.get(url, headers=headers, verify=False).text

            # Turn the HTML into a Beautiful Soup object
            soup = BeautifulSoup(expireddomains, 'lxml')
            table = soup.find("table")
            try:
                for row in table.findAll('tr')[1:]:
                    cells = row.findAll("td")
                    if len(cells) >= 1:
                        output = ""
                        c0 = cells[0].find(text=True)   # domain
                        c1 = cells[1].find(text=True)   # bl
                        c2 = cells[2].find(text=True)   # domainpop
                        c3 = cells[3].find(text=True)   # birth
                        c4 = cells[4].find(text=True)   # entries
                        c5 = cells[5].find(text=True)   # similarweb
                        c6 = cells[6].find(text=True)   # similarweb country code
                        c7 = cells[7].find(text=True)   # moz
                        c8 = cells[8].find(text=True)   # status com
                        c9 = cells[9].find(text=True)   # status net
                        c10 = cells[10].find(text=True) # status org
                        c11 = cells[11].find(text=True) # status de
                        c12 = cells[12].find(text=True) # tld registered
                        c13 = cells[13].find(text=True) # changes
                        c14 = cells[14].find(text=True) # whois
                        # Expired Domains results have an additional 'Availability' column that breaks parsing "deleted" domains
                        #c15 = cells[15].find(text=True) # related links

                        available = ''
                        if c8 == "available":
                            available += ".com "
                        if c9 == "available":
                            available += ".net "
                        if c10 == "available":
                            available += ".org "
                        if c11 == "available":
                            available += ".de "

                        # Skip additional reputation checks if this domain is already categorized as malicious
                        if c0 in maldomains_list:
                            print("[-] Skipping {} - Identified as known malware domain".format(c0))
                        else:
                            bluecoat = ''
                            ibmxforce = ''
                            if c3 == '-':
                                bluecoat = 'ignored'
                                ibmxforce = 'ignored'
                            elif check:
                                bluecoat = checkBluecoat(c0)
                                print("[+] {} is categorized as: {}".format(c0, bluecoat))
                                ibmxforce = checkIBMxForce(c0)
                                print("[+] {} is categorized as: {}".format(c0, ibmxforce))
                                # Sleep to avoid CAPTCHAs
                                time.sleep(random.randrange(10, 20))
                            else:
                                bluecoat = "skipped"
                                ibmxforce = "skipped"
                            # Append parsed domain data to list
                            data.append([c0, c3, c4, available, bluecoat, ibmxforce])
            except Exception:
                print("[-] Error: No results found on this page!")
    # Sort domain list by column 2 (Birth Year)
    sortedData = sorted(data, key=lambda x: x[1], reverse=True)
    t.add_rows(sortedData)
    header = ['Domain', 'Birth', '#', 'TLDs', 'BC', 'IBM']
    t.header(header)
    # Build HTML Table
    html = ''
    htmlHeader = '<html><head><title>Expired Domain List</title></head>'
    htmlBody = '<body><p>The following available domains report was generated at {}</p>'.format(timestamp)
    htmlTableHeader = '''
        <table border="1" align="center">
        <tr>
        <th>Domain</th>
        <th>Birth</th>
        <th>Entries</th>
        <th>TLDs Available</th>
        <th>Bluecoat</th>
        <th>Categorization</th>
        <th>IBM-xForce</th>
        <th>Categorization</th>
        <th>WatchGuard</th>
        <th>Namecheap</th>
        <th>Archive.org</th>
        </tr>
        '''

    htmlTableBody = ''
    htmlTableFooter = '</table>'
    htmlFooter = '</body></html>'
    # Build HTML table contents
    for i in sortedData:
        htmlTableBody += '<tr>'
        htmlTableBody += '<td>{}</td>'.format(i[0])  # Domain
        htmlTableBody += '<td>{}</td>'.format(i[1])  # Birth
        htmlTableBody += '<td>{}</td>'.format(i[2])  # Entries
        htmlTableBody += '<td>{}</td>'.format(i[3])  # TLDs
        htmlTableBody += '<td><a href="https://sitereview.bluecoat.com/sitereview.jsp#/?search={}" target="_blank">Bluecoat</a></td>'.format(i[0])  # Bluecoat
        htmlTableBody += '<td>{}</td>'.format(i[4])  # Bluecoat Categorization
        htmlTableBody += '<td><a href="https://exchange.xforce.ibmcloud.com/url/{}" target="_blank">IBM-xForce</a></td>'.format(i[0])  # IBM xForce
        htmlTableBody += '<td>{}</td>'.format(i[5])  # IBM x-Force Categorization
        htmlTableBody += '<td><a href="http://www.borderware.com/domain_lookup.php?ip={}" target="_blank">WatchGuard</a></td>'.format(i[0])  # Borderware WatchGuard
        htmlTableBody += '<td><a href="https://www.namecheap.com/domains/registration/results.aspx?domain={}" target="_blank">Namecheap</a></td>'.format(i[0])  # Namecheap
        htmlTableBody += '<td><a href="http://web.archive.org/web/*/{}" target="_blank">Archive.org</a></td>'.format(i[0])  # Archive.org
        htmlTableBody += '</tr>'

    html = htmlHeader + htmlBody + htmlTableHeader + htmlTableBody + htmlTableFooter + htmlFooter
    logfilename = "{}_domainreport.html".format(timestamp)
    with open(logfilename, 'w') as log:
        log.write(html)

    print("\n[*] Search complete")
    print("[*] Log written to {}\n".format(logfilename))
    print(t.draw())