forked from Data-drone/ANZ_LLM_Bootcamp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0_lab_setup.py
152 lines (112 loc) · 4.95 KB
/
0_lab_setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Databricks notebook source
# MAGIC %md
# MAGIC This notebook sets up the datasets and preloads the large models used for exploring LLM RAG.
# MAGIC You only need to run it yourself if you are setting up locally after the workshop.
# MAGIC In a workshop, the instructor should run this notebook to set everything up.
# COMMAND ----------
import os
import requests
# COMMAND ----------
# DBTITLE 1,Setup dbfs folder paths
# MAGIC %run ./utils
# COMMAND ----------
# DBTITLE 1,Config Params
# We will set up a folder to store the files.
# User-Agent header value sent with each download request.
user_agent = "me-me-me"
# Set to True to delete the shared folders before they are recreated.
reset_home = False

# If running this on your own in a multiuser environment then use this.
# NOTE(review): the class-library assignment at the end of this cell
# overwrites this value; remove that assignment to keep the per-user
# folder - confirm which one is intended.
library_folder = dbfs_source_docs

# When teaching a class
class_lib = '/bootcamp_data/pdf_data'

# Only wipe the existing class library when a reset was requested.
# NOTE(review): assumes only the rm call is conditional and the folder is
# always (re)created afterwards - confirm against the original notebook.
if reset_home:
    dbutils.fs.rm(class_lib, True)

dbutils.fs.mkdirs(class_lib)
# Same folder addressed through the /dbfs prefix, for use with local file
# APIs such as open().
library_folder = f'/dbfs{class_lib}'
# COMMAND ----------
def load_file(file_uri, file_name, library_folder, timeout=60):
    """Download one file over HTTP and save it inside ``library_folder``.

    The request carries the module-level ``user_agent`` as its User-Agent
    header. A failed request or a non-200 response is reported with
    ``print`` instead of being raised; errors from writing the file are
    not caught.

    Args:
        file_uri: URL to download.
        file_name: Name the file is saved under inside ``library_folder``.
        library_folder: Local directory the file is written into.
        timeout: Seconds to wait on the server before giving up; passed to
            ``requests.get`` so a stalled connection cannot hang the setup.
    """
    local_file_path = os.path.join(library_folder, file_name)

    try:
        # Send the custom User-Agent header with the request.
        response = requests.get(
            file_uri,
            headers={"User-Agent": user_agent},
            timeout=timeout,
        )
    except requests.RequestException as e:
        print("Error occurred during the request:", e)
        return

    if response.status_code != 200:
        print(f"Failed to download PDF. Status code: {response.status_code}")
        return

    with open(local_file_path, "wb") as pdf_file:
        pdf_file.write(response.content)
    print("PDF downloaded successfully.")
# COMMAND ----------
# Papers to download, keyed by the file name each one is saved under.
# Each file name appears once; repeated keys in a dict literal silently
# collapse into a single entry.
pdfs = {
    '2203.02155.pdf': 'https://arxiv.org/pdf/2203.02155.pdf',
    '2302.09419.pdf': 'https://arxiv.org/pdf/2302.09419.pdf',
    'Brooks_InstructPix2Pix_Learning_To_Follow_Image_Editing_Instructions_CVPR_2023_paper.pdf': 'https://openaccess.thecvf.com/content/CVPR2023/papers/Brooks_InstructPix2Pix_Learning_To_Follow_Image_Editing_Instructions_CVPR_2023_paper.pdf',
    '2303.10130.pdf': 'https://arxiv.org/pdf/2303.10130.pdf',
    '2302.06476.pdf': 'https://arxiv.org/pdf/2302.06476.pdf',
    '2303.04671.pdf': 'https://arxiv.org/pdf/2303.04671.pdf',
    '2209.07753.pdf': 'https://arxiv.org/pdf/2209.07753.pdf',
    '2302.07842.pdf': 'https://arxiv.org/pdf/2302.07842.pdf',
    '2204.01691.pdf': 'https://arxiv.org/pdf/2204.01691.pdf',
}

# Download every paper into the configured library folder.
for file_name, file_uri in pdfs.items():
    load_file(file_uri, file_name, library_folder)
# COMMAND ----------
# List the contents of the class library folder (presumably to check that
# the downloads landed there).
dbutils.fs.ls(class_lib)
# COMMAND ----------
# MAGIC %md
# MAGIC Setting up Hugging Face \
# MAGIC Let's load the models that we need ahead of time \
# MAGIC That saves students from having to wait for downloads or worry about Hugging Face (HF) tokens
# COMMAND ----------
import os

# Root of the shared Hugging Face cache, plus its two sub-folders.
hf_home = '/bootcamp_data/hf_cache'
transformers_cache = f'{hf_home}/transformers'
download_dir = f'{hf_home}/downloads'

# Only wipe the existing cache when a reset was requested.
# NOTE(review): assumes only the rm call is conditional; the folders and
# paths below are needed either way - confirm against the original notebook.
if reset_home:
    dbutils.fs.rm(hf_home, True)

dbutils.fs.mkdirs(hf_home)
dbutils.fs.mkdirs(transformers_cache)
dbutils.fs.mkdirs(download_dir)

# The same folders addressed through the /dbfs prefix, for libraries that
# use local file APIs.
dbfs_hf_home = f'/dbfs{hf_home}'
dbfs_transformers_home = f'/dbfs{transformers_cache}'
dbfs_downloads_home = f'/dbfs{download_dir}'

# Set the cache-location environment variables to the shared folders.
os.environ['TRANSFORMERS_CACHE'] = dbfs_transformers_home
os.environ['HF_HOME'] = dbfs_hf_home
# COMMAND ----------
# MAGIC %sh export TRANSFORMERS_CACHE=$dbfs_transformers_home
# COMMAND ----------
# A Hugging Face login is needed to download Llama 2.
# Create a Hugging Face account first, then follow the access-token
# instructions at https://huggingface.co/docs/hub/security-tokens
import huggingface_hub

# Option 1: log in interactively while running the lab setup.
# huggingface_hub.notebook_login()

# Option 2: log in with a Hugging Face key stored in a secret scope.
huggingface_key = dbutils.secrets.get(key='hf-key', scope='brian-hf')
huggingface_hub.login(token=huggingface_key)
# COMMAND ----------
# Download each repo file by file so that unwanted files can be skipped.
from huggingface_hub import hf_hub_download, list_repo_files

# Local sub-folder name -> Hugging Face repo id.
repo_list = {
    'llama_2_gpu': 'meta-llama/Llama-2-7b-chat-hf',
    'llama_2_cpu': 'TheBloke/Llama-2-7B-chat-GGUF',
}

for lib_name, repo_id in repo_list.items():
    lib_dir = os.path.join(dbfs_downloads_home, lib_name)
    for name in list_repo_files(repo_id):
        # The safetensors files are not used and take a long time to download.
        if "safetensors" in name:
            continue
        # Skip files that already exist in the target folder.
        if os.path.exists(os.path.join(lib_dir, name)):
            continue
        print(f"Downloading {name}")
        hf_hub_download(
            repo_id,
            filename=name,
            local_dir=lib_dir,
            local_dir_use_symlinks=False,
        )