Selaa lähdekoodia

Fix parsing for keyword search feature. Update HTML and text table output. Add reputation check filtering based on PR from @christruncer

Andrew Chiles 7 vuotta sitten
vanhempi
commit
e3985c2c37
2 muutettua tiedostoa jossa 115 lisäystä ja 86 poistoa
  1. 7 2
      README.md
  2. 108 84
      domainhunter.py

+ 7 - 2
README.md

@@ -8,6 +8,11 @@ This Python based tool was written to quickly query the Expireddomains.net searc
 
 ## Changes
 
+- 6 May 2018
+    + Fixed expired domains parsing when performing a keyword search
+    + Minor HTML and text table output updates
+    + Filtered reputation checks to only execute for .COM, .ORG, and .NET domains and removed check for Archive.org records when performing a default or keyword search. Credit to @christruncer for the original PR and idea.
+
 - 11 April 2018
     + Added OCR support for CAPTCHA solving with tesseract. Thanks to t94j0 for the idea in [AIRMASTER](https://github.com/t94j0/AIRMASTER)  
     + Added support for input file list of potential domains (-f/--filename)
@@ -85,11 +90,11 @@ List DomainHunter options
 
 Use defaults to check for most recent 100 domains and check reputation
     
-    python ./domainhunter.py
+    python3 ./domainhunter.py
 
 Search for 1000 most recently expired/deleted domains, but don't check reputation
 
-    python ./domainhunter.py -r 1000
+    python3 ./domainhunter.py -r 1000
 
 Perform all reputation checks for a single domain
 

+ 108 - 84
domainhunter.py

@@ -4,17 +4,15 @@
 ## Author:      @joevest and @andrewchiles
 ## Description: Checks expired domains, reputation/categorization, and Archive.org history to determine 
 ##              good candidates for phishing and C2 domain names
-# Add OCR support for BlueCoat/SiteReview CAPTCHA using tesseract
-# Add support for input file list of potential domains
-# Add additional error checking for ExpiredDomains.net parsing
-# Changed -q/--query switch to -k/--keyword to better match its purpose
+
 import time 
 import random
 import argparse
 import json
 import base64
+import os
 
-__version__ = "20180411"
+__version__ = "20180506"
 
 ## Functions
 
@@ -34,7 +32,7 @@ def doSleep(timing):
 def checkBluecoat(domain):
     try:
         url = 'https://sitereview.bluecoat.com/resource/lookup'
-        postData = {'url':domain,'captcha':''}   # HTTP POST Parameters
+        postData = {'url':domain,'captcha':''}
         headers = {'User-Agent':useragent,
                     'Content-Type':'application/json; charset=UTF-8',
                     'Referer':'https://sitereview.bluecoat.com/lookup'}
@@ -51,16 +49,14 @@ def checkBluecoat(domain):
         # Print notice if CAPTCHAs are blocking accurate results and attempt to solve if --ocr
         if a == 'captcha':
             if ocr:
-                # This request is performed in a browser, but is not needed for our purposes
+                # This request is also performed by a browser, but is not needed for our purposes
                 #captcharequestURL = 'https://sitereview.bluecoat.com/resource/captcha-request'
-                #print('[*] Requesting CAPTCHA')
-                #response = s.get(url=captcharequestURL,headers=headers,cookies=cookies,verify=False)
 
                 print('[*] Received CAPTCHA challenge!')
                 captcha = solveCaptcha('https://sitereview.bluecoat.com/resource/captcha.jpg',s)
                 
                 if captcha:
-                    b64captcha = base64.b64encode(captcha.encode('utf-8')).decode('utf-8')
+                    b64captcha = base64.urlsafe_b64encode(captcha.encode('utf-8')).decode('utf-8')
                    
                    # Send CAPTCHA solution via GET since inclusion with the domain categorization request doesn't work anymore
                     captchasolutionURL = 'https://sitereview.bluecoat.com/resource/captcha-request/{0}'.format(b64captcha)
@@ -124,7 +120,7 @@ def checkIBMXForce(domain):
         return "-"
 
 def checkTalos(domain):
-    url = "https://www.talosintelligence.com/sb_api/query_lookup?query=%2Fapi%2Fv2%2Fdetails%2Fdomain%2F&query_entry={0}&offset=0&order=ip+asc".format(domain)
+    url = 'https://www.talosintelligence.com/sb_api/query_lookup?query=%2Fapi%2Fv2%2Fdetails%2Fdomain%2F&query_entry={0}&offset=0&order=ip+asc'.format(domain)
     headers = {'User-Agent':useragent,
                'Referer':url}
 
@@ -222,10 +218,7 @@ def checkDomain(domain):
 
     if domain in maldomainsList:
         print("[!] {}: Identified as known malware domain (malwaredomains.com)".format(domain))
-    
-    mxtoolbox = checkMXToolbox(domain)
-    print("[+] {}: {}".format(domain, mxtoolbox))
-    
+      
     bluecoat = checkBluecoat(domain)
     print("[+] {}: {}".format(domain, bluecoat))
     
@@ -234,13 +227,21 @@ def checkDomain(domain):
 
     ciscotalos = checkTalos(domain)
     print("[+] {}: {}".format(domain, ciscotalos))
+
+    mxtoolbox = checkMXToolbox(domain)
+    print("[+] {}: {}".format(domain, mxtoolbox))
+
     print("")
-    return
+    
+    results = [domain,bluecoat,ibmxforce,ciscotalos,mxtoolbox]
+    return results
 
 def solveCaptcha(url,session):  
     # Downloads CAPTCHA image and saves to current directory for OCR with tesseract
    # Returns CAPTCHA string or False if error occurred
+    
     jpeg = 'captcha.jpg'
+    
     try:
         response = session.get(url=url,headers=headers,verify=False, stream=True)
         if response.status_code == 200:
@@ -251,13 +252,32 @@ def solveCaptcha(url,session):
             print('[-] Error downloading CAPTCHA file!')
             return False
 
+        # Perform basic OCR without additional image enhancement
         text = pytesseract.image_to_string(Image.open(jpeg))
         text = text.replace(" ", "")
+        
+        # Remove CAPTCHA file
+        try:
+            os.remove(jpeg)
+        except OSError:
+            pass
+
         return text
+
     except Exception as e:
         print("[-] Error solving CAPTCHA - {0}".format(e))
+        
         return False
 
+def drawTable(header,data):
+    
+    data.insert(0,header)
+    t = Texttable(max_width=maxwidth)
+    t.add_rows(data)
+    t.header(header)
+    
+    return(t.draw())
+
 ## MAIN
 if __name__ == "__main__":
 
@@ -265,7 +285,7 @@ if __name__ == "__main__":
     parser.add_argument('-k','--keyword', help='Keyword used to refine search results', required=False, default=False, type=str, dest='keyword')
     parser.add_argument('-c','--check', help='Perform domain reputation checks', required=False, default=False, action='store_true', dest='check')
     parser.add_argument('-f','--filename', help='Specify input file of line delimited domain names to check', required=False, default=False, type=str, dest='filename')
-    parser.add_argument('--ocr', help='Perform OCR on CAPTCHAs when present', required=False, default=False, action='store_true')
+    parser.add_argument('--ocr', help='Perform OCR on CAPTCHAs when challenged', required=False, default=False, action='store_true')
     parser.add_argument('-r','--maxresults', help='Number of results to return when querying latest expired/deleted domains', required=False, default=100, type=int, dest='maxresults')
     parser.add_argument('-s','--single', help='Performs detailed reputation checks against a single domain name/IP.', required=False, default=False, dest='single')
     parser.add_argument('-t','--timing', help='Modifies request timing to avoid CAPTCHAs. Slowest(0) = 90-120 seconds, Default(3) = 10-20 seconds, Fastest(5) = no delay', required=False, default=3, type=int, choices=range(0,6), dest='timing')
@@ -294,9 +314,9 @@ if __name__ == "__main__":
         except Exception as e:
             print("Expired Domains Reputation Check")
             print("[-] Missing OCR dependencies: {}".format(str(e)))
-            print("[*] Install required Python dependencies by running `pip3 install -r requirements.txt`")
-            print("[*] Ubuntu\Debian - Install tesseract by running `apt-get install tesseract-ocr python3-imaging`")
-            print("[*] MAC OSX - Install tesseract with homebrew by running `brew install tesseract`")
+            print("[*] Install required Python dependencies by running: pip3 install -r requirements.txt")
+            print("[*] Ubuntu\Debian - Install tesseract by running: apt-get install tesseract-ocr python3-imaging")
+            print("[*] macOS - Install tesseract with homebrew by running: brew install tesseract")
             quit(0)
 
 ## Variables
@@ -314,14 +334,16 @@ if __name__ == "__main__":
 
     maxwidth = args.maxwidth
     
-    malwaredomainsURL = 'http://mirror1.malwaredomains.com/files/justdomains'
-    expireddomainsqueryURL = 'https://www.expireddomains.net/domain-name-search'
-    
     ocr = args.ocr
+    
+    malwaredomainsURL = 'http://mirror1.malwaredomains.com/files/justdomains'
+
+    expireddomainsqueryURL = 'https://www.expireddomains.net/domain-name-search'  
 
     timestamp = time.strftime("%Y%m%d_%H%M%S")
             
     useragent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'
+   
     headers = {'User-Agent':useragent}
 
     requests.packages.urllib3.disable_warnings()
@@ -329,8 +351,6 @@ if __name__ == "__main__":
     # HTTP Session container, used to manage cookies, session tokens and other session information
     s = requests.Session()
 
-    data = []
-
     title = '''
  ____   ___  __  __    _    ___ _   _   _   _ _   _ _   _ _____ _____ ____  
 |  _ \ / _ \|  \/  |  / \  |_ _| \ | | | | | | | | | \ | |_   _| ____|  _ \ 
@@ -357,22 +377,29 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
     # Retrieve reputation for a single choosen domain (Quick Mode)
     if single:
         checkDomain(single)
-        quit(0)
+        exit(0)
 
-    # Perform detailed domain reputation checks against input file
+    # Perform detailed domain reputation checks against input file, print table, and quit
     if filename:
+        # Initialize our list with an empty row for the header
+        data = []
         try:
             with open(filename, 'r') as domainsList:
                 for line in domainsList.read().splitlines():
-                    checkDomain(line)
+                    data.append(checkDomain(line))
                     doSleep(timing)
+
+                # Print results table
+                header = ['Domain', 'BlueCoat', 'IBM X-Force', 'Cisco Talos', 'MXToolbox']
+                print(drawTable(header,data))
+
         except KeyboardInterrupt:
             print('Caught keyboard interrupt. Exiting!')
-            quit(0)
+            exit(0)
         except Exception as e:
-            print('[-] {}'.format(e))
-            quit(1)
-        quit(0)
+            print('[-] Error: {}'.format(e))
+            exit(1)
+        exit(0)
      
     # Generic Proxy support 
     # TODO: add as a parameter 
@@ -389,8 +416,9 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
 
     # Generate list of URLs to query for expired/deleted domains
     urls = []
+    domain_list = []
 
-    # Use the keyword string to narrow domain search if provided
+    # Use the keyword string to narrow domain search if provided. This generates a list of URLs to query
     if keyword:
         print('[*] Fetching expired or deleted domains containing "{}"'.format(keyword))
         for i in range (0,maxresults,25):
@@ -404,14 +432,12 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
     # If no keyword provided, retrieve list of recently expired domains in batches of 25 results.
     else:
         print('[*] Fetching expired or deleted domains...')
-        # Caculate number of URLs to request since we're performing a request for four different resources instead of one
-        numresults = int(maxresults / 4)
+        # Calculate number of URLs to request since we're performing a request for two different resources instead of one
+        numresults = int(maxresults / 2)
         for i in range (0,(numresults),25):
             urls.append('https://www.expireddomains.net/backorder-expired-domains?start={}&o=changed&r=a'.format(i))
             urls.append('https://www.expireddomains.net/deleted-com-domains/?start={}&o=changed&r=a'.format(i))
-            urls.append('https://www.expireddomains.net/deleted-net-domains/?start={}&o=changed&r=a'.format(i))
-            urls.append('https://www.expireddomains.net/deleted-org-domains/?start={}&o=changed&r=a'.format(i))
-    
+ 
     for url in urls:
 
         print("[*]  {}".format(url))
@@ -423,7 +449,6 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
 
         r1 = random.randint(100000,999999)
 
-
         # Known good example _pk_id.10.dd0a cookie: 5abbbc772cbacfb1.1496760705.2.1496760705.1496760705
         pk_str = '5abbbc772cbacfb1' + '.1496' + str(r1) + '.2.1496' + str(r1) + '.1496' + str(r1)
 
@@ -435,10 +460,10 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
         #domainrequest = s.get(url,headers=headers,verify=False,cookies=jar,proxies=proxies)
 
         domains = domainrequest.text
-
+   
         # Turn the HTML into a Beautiful Soup object
-        soup = BeautifulSoup(domains, 'lxml')
-        
+        soup = BeautifulSoup(domains, 'lxml')    
+        #print(soup)
         try:
             table = soup.find("table")
             for row in table.findAll('tr')[1:]:
@@ -449,8 +474,6 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                 cells = row.findAll("td")
 
                 if len(cells) >= 1:
-                    output = ""
-
                     if keyword:
 
                         c0 = row.find('td').find('a').text   # domain
@@ -466,10 +489,9 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                         c10 = cells[10].find(text=True) # status org
                         c11 = cells[11].find(text=True) # status de
                         c12 = cells[12].find(text=True) # tld registered
-                        c13 = cells[13].find(text=True) # Related Domains
-                        c14 = cells[14].find(text=True) # Domain list
-                        c15 = cells[15].find(text=True) # status
-                        c16 = cells[16].find(text=True) # related links
+                        c13 = cells[13].find(text=True) # Source List
+                        c14 = cells[14].find(text=True) # Domain Status
+                        c15 = ""                        # Related Domains
 
                     else:
                         c0 = cells[0].find(text=True)   # domain
@@ -487,12 +509,6 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                         c12 = cells[12].find(text=True) # tld registered
                         c13 = cells[13].find(text=True) # changes
                         c14 = cells[14].find(text=True) # whois
-                        c15 = ""                        # not used
-                        c16 = ""                        # not used
-                        c17 = ""                        # not used
-
-                        # Expired Domains results have an additional 'Availability' column that breaks parsing "deleted" domains
-                        #c15 = cells[15].find(text=True) # related links
 
                     available = ''
                     if c8 == "available":
@@ -507,42 +523,49 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                     if c11 == "available":
                         available += ".de "
 
+                    # Only grab status for keyword searches since it doesn't exist otherwise
                     status = ""
-                    if c15:
-                        status = c15
+                    if keyword:
+                        status = c14
+                    
+                    bluecoat = ''
+                    ibmxforce = ''
+                    ciscotalos = ''
+
+                    if check == True:
+                        # Only perform reputation checks if domain is a .com .net. .org and not in maldomains list
+                        if (c0.lower().endswith(".com") or c0.lower().endswith(".net") or c0.lower().endswith(".org")) and (c0 not in maldomainsList):
 
-                    # Skip additional reputation checks if this domain is already categorized as malicious 
-                    if c0 in maldomainsList:
-                        print("[-] Skipping {} - Identified as known malware domain").format(c0)
-                    else:
-                        bluecoat = ''
-                        ibmxforce = ''
-                        if c3 == '-':
-                            bluecoat = 'ignored'
-                            ibmxforce = 'ignored'
-                        elif check == True:
                             bluecoat = checkBluecoat(c0)
                             print("[+] {}: {}".format(c0, bluecoat))
                             ibmxforce = checkIBMXForce(c0)
                             print("[+] {}: {}".format(c0, ibmxforce))
+                            ciscotalos = checkTalos(c0)
+                            print("[+] {}: {}".format(c0, ciscotalos))
                             # Sleep to avoid captchas
                             doSleep(timing)
                         else:
-                            bluecoat = "skipped"
-                            ibmxforce = "skipped"
-                        # Append parsed domain data to list
-                        data.append([c0,c3,c4,available,status,bluecoat,ibmxforce])
+                            bluecoat = 'skipped'
+                            ibmxforce = 'skipped'
+                            ciscotalos = 'skipped'
+
+                    # Append parsed domain data to list
+                    domain_list.append([c0,c3,c4,available,status,bluecoat,ibmxforce,ciscotalos])               
+
         except Exception as e: 
-            #print(e)
+        #    print(e)
             pass
 
+        # Add additional sleep on requests to ExpiredDomains.net to avoid errors
+        time.sleep(5)
+
     # Check for valid results before continuing
-    if not(data):
-        print("[-] No results found for keyword: {0}".format(keyword))
-        quit(0)
+    if len(domain_list) == 0:
+        print("[-] No domain results found")
+        exit(0)
 
     # Sort domain list by column 2 (Birth Year)
-    sortedData = sorted(data, key=lambda x: x[1], reverse=True) 
+    sortedDomains = sorted(domain_list, key=lambda x: x[1], reverse=True) 
 
     # Build HTML Table
     html = ''
@@ -556,9 +579,11 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
                     <th>Entries</th>
                     <th>TLDs Available</th>
                     <th>Status</th>
-                    <th>Symantec</th>
+                    <th>BlueCoat</th>
                     <th>Categorization</th>
-                    <th>IBM-xForce</th>
+                    <th>IBM X-Force</th>
+                    <th>Categorization</th>
+                    <th>Cisco Talos</th>
                     <th>Categorization</th>
                     <th>WatchGuard</th>
                     <th>Namecheap</th>
@@ -570,7 +595,7 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
     htmlFooter = '</body></html>'
 
     # Build HTML table contents
-    for i in sortedData:
+    for i in sortedDomains:
         htmlTableBody += '<tr>'
         htmlTableBody += '<td>{}</td>'.format(i[0]) # Domain
         htmlTableBody += '<td>{}</td>'.format(i[1]) # Birth
@@ -578,10 +603,12 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
         htmlTableBody += '<td>{}</td>'.format(i[3]) # TLDs
         htmlTableBody += '<td>{}</td>'.format(i[4]) # Status
 
-        htmlTableBody += '<td><a href="https://sitereview.bluecoat.com/sitereview#/?search={}" target="_blank">Bluecoat</a></td>'.format(i[0]) # Bluecoat
+        htmlTableBody += '<td><a href="https://sitereview.bluecoat.com/" target="_blank">Bluecoat</a></td>'.format(i[0]) # Bluecoat
         htmlTableBody += '<td>{}</td>'.format(i[5]) # Bluecoat Categorization
         htmlTableBody += '<td><a href="https://exchange.xforce.ibmcloud.com/url/{}" target="_blank">IBM-xForce</a></td>'.format(i[0]) # IBM xForce
         htmlTableBody += '<td>{}</td>'.format(i[6]) # IBM x-Force Categorization
+        htmlTableBody += '<td><a href="https://www.talosintelligence.com/reputation_center/lookup?search={0}" target="_blank">Cisco Talos</a></td>'.format(i[0]) # Cisco Talos
+        htmlTableBody += '<td>{}</td>'.format(i[7]) # Cisco Talos
         htmlTableBody += '<td><a href="http://www.borderware.com/domain_lookup.php?ip={}" target="_blank">WatchGuard</a></td>'.format(i[0]) # Borderware WatchGuard
         htmlTableBody += '<td><a href="https://www.namecheap.com/domains/registration/results.aspx?domain={}" target="_blank">Namecheap</a></td>'.format(i[0]) # Namecheap
         htmlTableBody += '<td><a href="http://web.archive.org/web/*/{}" target="_blank">Archive.org</a></td>'.format(i[0]) # Archive.org
@@ -598,8 +625,5 @@ If you plan to use this content for illegal purpose, don't.  Have a nice day :)'
     print("[*] Log written to {}\n".format(logfilename))
     
     # Print Text Table
-    t = Texttable(max_width=maxwidth)
-    t.add_rows(sortedData)
-    header = ['Domain', 'Birth', '#', 'TLDs', 'Status', 'Symantec', 'IBM']
-    t.header(header)
-    print(t.draw())
+    header = ['Domain', 'Birth', '#', 'TLDs', 'Status', 'BlueCoat', 'IBM', 'Cisco Talos']
+    print(drawTable(header,sortedDomains))