-
Notifications
You must be signed in to change notification settings - Fork 1
/
standard.txt
101 lines (81 loc) · 3.33 KB
/
standard.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import shutil
import tempfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup
import gradio as gr
def download_file(url, session, timeout=10):
    """Download a file and return its content as bytes.

    Parameters:
        url: absolute URL of the resource to fetch.
        session: a requests.Session used for the GET (shares cookies and
            connection pooling with the caller).
        timeout: seconds to wait for connect/read before giving up; the
            original code had no timeout, so one dead server could hang
            the whole request forever.

    Returns:
        The response body on success, or None on any request failure
        (connection error, timeout, or non-2xx status) after printing
        the error.
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        # Best-effort: callers treat None as "skip this asset".
        print(f"Error downloading {url}: {e}")
        return None
def save_webpage_as_zip(url):
    """Fetch a webpage plus its img/link/script assets and return an in-memory ZIP.

    The ZIP contains ``index.html`` (the raw page bytes) and each asset
    that downloaded successfully, stored under its URL path.

    Parameters:
        url: the page URL to archive.

    Returns:
        A BytesIO holding the ZIP archive, seeked to position 0.

    Raises:
        requests.exceptions.HTTPError: if fetching the main page fails.
    """
    session = requests.Session()
    response = session.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Unique scratch directory per call: the original hard-coded
    # 'temp_webpage' name collided when two requests ran concurrently.
    temp_dir = tempfile.mkdtemp(prefix='temp_webpage_')
    try:
        with open(os.path.join(temp_dir, 'index.html'), 'wb') as f:
            f.write(response.content)

        # Collect URLs referenced by <img src>, <link href>, <script src>.
        assets = []
        for tag in soup.find_all(['img', 'link', 'script']):
            attr = 'href' if tag.name == 'link' else 'src'
            value = tag.get(attr)
            if value:
                assets.append(value)

        for asset in assets:
            asset_url = urljoin(url, asset)
            asset_path = urlparse(asset_url).path.lstrip('/')
            # Skip entries with no usable filename component
            # (empty path, or a path that names a directory).
            if not asset_path or asset_path.endswith('/'):
                print(f"Skipping directory-like asset {asset_url}")
                continue
            asset_full_path = os.path.normpath(os.path.join(temp_dir, asset_path))
            # The path comes from untrusted HTML: reject anything that
            # escapes the scratch directory via '..' traversal.
            if not asset_full_path.startswith(temp_dir + os.sep):
                print(f"Skipping unsafe asset path {asset_path}")
                continue
            content = download_file(asset_url, session)
            if content is None:
                continue
            if os.path.isdir(asset_full_path):
                print(f"Skipping directory {asset_full_path}")
                continue
            # Create parent dirs only once we actually have content to write.
            os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
            with open(asset_full_path, 'wb') as f:
                f.write(content)

        # Zip everything under the scratch dir, with paths relative to it.
        zip_buffer = BytesIO()
        with ZipFile(zip_buffer, 'w') as zipf:
            for root, _, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, os.path.relpath(file_path, temp_dir))
    finally:
        # Always remove the scratch dir, even if a download or the zip
        # step raised — the original cleanup loop leaked files on error.
        shutil.rmtree(temp_dir, ignore_errors=True)

    zip_buffer.seek(0)
    return zip_buffer
def generate_zip_file(url):
    """Archive a webpage to a ZIP on disk and return the file path.

    Parameters:
        url: the page URL to archive.

    Returns:
        Path (str) to a freshly written ``.zip`` file. A per-call
        temporary file is used instead of the original fixed
        ``webpage.zip`` name, which concurrent Gradio requests would
        overwrite while another user was still downloading it.
    """
    zip_buffer = save_webpage_as_zip(url)
    with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
        tmp.write(zip_buffer.getvalue())
        return tmp.name
# --- Gradio UI: single-page app that turns a URL into a downloadable ZIP ---
with gr.Blocks(theme="bethecloud/storj_theme") as demo:
    gr.Markdown("## Webpage to ZIP Downloader 🔗")
    gr.Markdown("Enter a URL to download the webpage and its assets as a ZIP file.")
    # Input box for the target page, the trigger button, and the File
    # component that surfaces the generated ZIP for download.
    url_input = gr.Textbox(label="Website URL", placeholder="Enter a URL (e.g., https://www.example.com)")
    download_button = gr.Button("Download as ZIP")
    output_file = gr.File(label="Download")
    def set_example_url(url):
        # NOTE(review): this helper is never wired to any event in the
        # visible code, and assigning .value after construction does not
        # update a rendered Gradio component — looks like dead code;
        # confirm before removing.
        url_input.value = url
    # Button click runs the scrape-and-zip pipeline; the returned file
    # path is handed to the File component.
    download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)
demo.launch()