forked from Haax9/WaybackPDF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
waybackPDF.py
executable file
·141 lines (108 loc) · 4.3 KB
/
waybackPDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python3
# coding: utf-8
import json
import os
import re
import requests
import argparse
class PC:
    """PC (Print Color)

    Holds ANSI colour escape sequences and the coloured bracket tags that
    prefix the status messages this script prints.
    """
    # Raw escape sequences; `endc` terminates a coloured span.
    green = '\033[92m'
    blue = '\033[94m'
    orange = '\033[93m'
    endc = '\033[0m'

    # Ready-to-use message prefixes: colour + tag + reset.
    ok_box = '{}[*] {}'.format(blue, endc)
    note_box = '{}[+] {}'.format(green, endc)
    warn_box = '{}[!] {}'.format(orange, endc)
def parse_arguments(argv=None):
    """Parse the command-line options of the downloader.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process arguments, which is
            what every existing caller gets.

    Returns:
        argparse.Namespace with the attributes:
            domain: required target domain (str).
            output: output directory (str), or None when not given.
            http: None when the switch is absent, True for a bare
                ``--http``, or the string supplied after it.
            resume: number of files to skip (int), or None when not given.
    """
    desc = ('OSINT tool to download archived PDF files from archive.org for'
            ' a given website.')
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-d', '--domain', type=str, action='store',
                        required=True,
                        help='The target domain you are looking for files')
    parser.add_argument('-o', '--output', type=str, action='store',
                        required=False,
                        help='Optional output directory (Default is the domain name)')
    # nargs='?' lets `--http` act as the plain switch its help text
    # describes (previously a bare `--http` aborted with "expected one
    # argument"), while `--http VALUE` keeps working for old invocations.
    parser.add_argument('--http', type=str, action='store', nargs='?',
                        const=True, default=None, required=False,
                        help='Use HTTP instead of HTTPS for the target domain.'
                             ' The default behavior uses HTTPS')
    parser.add_argument('-r', '--resume', type=int, action='store', required=False,
                        help='Start downloading at a given index and skip X previous'
                             ' files')
    return parser.parse_args(argv)
def getPDFlist(domain, protocol, timeout=120):
    """Ask the Wayback Machine timemap for every archived PDF under a domain.

    Args:
        domain: Target domain, used as a URL prefix in the timemap query.
        protocol: Truthy to query ``http://<domain>``; otherwise the query
            targets ``https://<domain>``.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the script forever.

    Returns:
        A list of dicts, one per capture whose ``mimetype`` is
        ``application/pdf``. Each dict maps the header row of the response
        to the capture's values and additionally carries:
            wayback: direct download URL of the capture.
            name: URL leaf (extension stripped) + '-' + capture timestamp.
        The list is empty when the timemap returns no rows.

    Raises:
        requests.RequestException: on a network failure or an HTTP error
            status from archive.org.
    """
    print("\n" + PC.ok_box + "Requesting PDF list...")

    # Default target is HTTPS; a truthy `protocol` switches to HTTP
    scheme = "http" if protocol else "https"
    targetDomain = "{}://{}".format(scheme, domain)

    # Building URL
    baseURL = "https://web.archive.org/web/timemap/"
    payload = {'url': targetDomain, 'matchType': 'prefix', 'collapse': 'urlkey',
               'output': 'json', 'fl': 'original,mimetype,timestamp,endtimestamp'
               ',groupcount,uniqcount', 'filter': '!statuscode:[45]..',
               # NOTE(review): '_' looks like a cache-busting value copied
               # from a browser request -- confirm it is still needed.
               'limit': '100000', '_': '1587473968806'}

    # HTTP request to get PDF list; fail loudly on an error status instead
    # of trying to decode an error page as JSON.
    response = requests.get(baseURL, params=payload, timeout=timeout)
    response.raise_for_status()
    raw = response.json()

    # First row holds the column names, the remaining rows the captures.
    # An empty response (no captures at all) simply yields no PDFs.
    pdfs = []
    if raw:
        headers = raw[0]
        for item in raw[1:]:
            entry = dict(zip(headers, item))
            if entry.get('mimetype') == 'application/pdf':
                pdfs.append(entry)

    for pdf in pdfs:
        # Create direct URL for each PDF
        pdf['wayback'] = ('https://web.archive.org/web/' + pdf['timestamp']
                          + 'if_/' + pdf['original'])
        # Base the file name on the last URL segment. Only strip an
        # extension when one is present; otherwise keep the whole segment
        # (it used to be truncated to its first character).
        leaf = pdf['original'].rsplit('/', 1)[-1]
        if ".pdf" in leaf:
            leaf = leaf.rsplit(".", 1)[0]
        # URLs ending in '/' have an empty leaf; fall back to a fixed stem
        pdf['name'] = (leaf or 'unnamed') + '-' + pdf['timestamp']

    print(PC.note_box + "{} PDFs found".format(len(pdfs)))
    return pdfs
def downloadFiles(domain, pdfs, output, resume, timeout=60):
    """Download every listed PDF capture into the output directory.

    Args:
        domain: Target domain. Not used by this function; kept so existing
            callers keep working.
        pdfs: List of dicts carrying at least ``wayback`` (download URL)
            and ``name`` (file name stem).
        output: Directory to write into; created when missing.
        resume: Number of leading entries to skip. ``None`` or ``0``
            downloads everything; a negative value is ignored with a
            warning.
        timeout: Seconds handed to ``requests.get`` as its timeout.

    A capture that cannot be fetched (network error or HTTP error status)
    is reported and skipped, so one bad capture neither aborts the batch
    nor leaves an empty or bogus ``.pdf`` file behind.
    """
    # If needed, create directory
    os.makedirs(output, exist_ok=True)

    print("\n" + PC.ok_box + "Downloading Files...")

    total = len(pdfs)
    skipped = 0
    # Checking if resume switch is on
    if resume:
        if resume < 0:
            # A negative slice start would silently keep only the tail
            print(PC.warn_box + "Ignoring negative resume index {}".format(resume))
        else:
            print("\n" + PC.ok_box + "Resume switch on, skipping the first {} file(s)".format(resume))
            skipped = resume

    # Downloading and saving files. The counter is the absolute position in
    # the full list, so the number shown can be reused as the resume value.
    for index, pdf in enumerate(pdfs[skipped:], start=skipped + 1):
        # Stem is capped at 250 characters -- presumably to stay within
        # common file name length limits once '.pdf' is appended.
        fileName = pdf['name'][:250] + '.pdf'
        filePath = os.path.join(output, fileName)

        # Fetch first, write second: the file is only created on success
        try:
            response = requests.get(pdf['wayback'], timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as err:
            print(PC.warn_box + "({}/{}) Failed {}.pdf: {}".format(
                index, total, pdf['name'], err))
            continue

        with open(filePath, 'wb') as handle:
            handle.write(response.content)
        print(PC.note_box + "({}/{}) Saved {}.pdf".format(index, total, pdf['name']))
def main():
    """Script entry point: parse options, list the PDFs, download them."""
    args = parse_arguments()

    # Without an explicit output directory, files go into one named after
    # the target domain.
    outputDir = args.output or args.domain

    banner = (
        "\n" + PC.note_box + "Web Archive PDF Downloader ",
        PC.note_box + "Target domain : " + args.domain,
        PC.note_box + "Output directory : {}/".format(outputDir),
    )
    for line in banner:
        print(line)

    # Fetch the list of archived PDFs, then download each of them
    pdfList = getPDFlist(args.domain, args.http)
    downloadFiles(args.domain, pdfList, outputDir, args.resume)

    for line in ("\n" + PC.ok_box + "Everything's done !",
                 PC.ok_box + "Happy analysis !\n"):
        print(line)
# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()