Commit
Merge pull request #106 from sethsec/issue-102-extract-urls
Merge Issue 102 extract urls with dev
Showing 11 changed files with 350 additions and 234 deletions.
@@ -0,0 +1,80 @@
#from urlextract import URLExtract
import re
import lib.db
import lib.utils
import urlparse


# TODO: Add this when I move the project to Python 3
# def extract_urls_urlextractor(tool_output):
#     extractor = URLExtract()
#     urls = extractor.find_urls(tool_output)
#     return urls


def extract_urls_regex(tool_output):
    """Pull http/https URLs out of raw tool output, skipping static-asset extensions."""
    interesting_urls = []
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tool_output)
    not_interesting_extensions = [".png", ".ico", ".js", ".css", ".woff2", ".ttf", ".jpg", ".jpeg", ".svg", ".eot", ".woff", ".gif"]
    for url in urls:
        if not url.endswith(tuple(not_interesting_extensions)):
            interesting_urls.append(url)
    return interesting_urls


def extract_urls(tool_output):
    # TODO: Uncomment these three lines and comment out the fourth when I move the project to Python 3
    #a = extract_urls_urlextractor(tool_output)
    #b = extract_urls_regex(tool_output)
    #urls = list(set().union(a, b))
    urls = extract_urls_regex(tool_output)
    return urls


def is_url_in_scope(url):
    """Return (True, vhost, port, url, workspace) if the URL's vhost is already in the
    current workspace's database, otherwise (False, None, None, None, None)."""
    workspace = lib.db.get_current_workspace()[0][0]
    try:
        parsed_url = urlparse.urlparse(url)
        scheme = parsed_url[0]
        if ":" in parsed_url[1]:
            vhost, port = parsed_url[1].split(':')
        else:
            vhost = parsed_url[1]
            port = 443 if scheme == "https" else 80
        path = parsed_url[2].replace("//", "/")  # normalized path (not stored yet)
    except Exception:
        print("Error parsing url: " + str(url))
        return False, None, None, None, None
    in_scope = lib.db.is_vhost_in_db(vhost, workspace)
    if in_scope:
        return True, vhost, port, url.rstrip("/"), workspace
    else:
        return False, None, None, None, None


def insert_url_into_db(vhost, port, url, workspace):
    db_path = (vhost, port, url, 0, "", workspace)
    lib.db.insert_new_path(db_path)
    print("Found Url: " + str(url))


def extract_in_scope_urls_from_task_output(tool_output):
    """Extract every URL from a task's output and record the in-scope ones."""
    urls = extract_urls(tool_output)
    for url in urls:
        in_scope, vhost, port, url, workspace = is_url_in_scope(url)
        if in_scope:
            insert_url_into_db(vhost, port, url, workspace)
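For context, the heart of this change is the regex-based extractor: it matches http/https URLs anywhere in a tool's output and drops anything ending in a static-asset extension before the scope check. The standalone sketch below inlines that same regex and filter so it can be run outside the repository; the function name, constants, and sample scanner output are made up for illustration, and it should behave the same on Python 2 or 3.

import re

# Same pattern and extension filter used by extract_urls_regex above.
URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
SKIP_EXTENSIONS = (".png", ".ico", ".js", ".css", ".woff2", ".ttf",
                   ".jpg", ".jpeg", ".svg", ".eot", ".woff", ".gif")

def extract_interesting_urls(tool_output):
    # Find every URL in the output, then discard static assets.
    return [u for u in URL_RE.findall(tool_output) if not u.endswith(SKIP_EXTENSIONS)]

# Hypothetical scanner output, just for illustration.
sample_output = """
+ https://target.example.com/admin/ (Status: 301)
+ https://target.example.com/static/logo.png (Status: 200)
+ http://target.example.com:8080/api/v1/users (Status: 200)
"""

print(extract_interesting_urls(sample_output))
# ['https://target.example.com/admin/', 'http://target.example.com:8080/api/v1/users']

In the actual module, each surviving URL would then go through is_url_in_scope, which only records it if its vhost already exists in the current workspace's database.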