Skip to content

Commit

Permalink
finished gcp upload download and background check (#125)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jasonqi146 authored Nov 29, 2023
1 parent c871c64 commit 31694c4
Show file tree
Hide file tree
Showing 8 changed files with 168 additions and 5 deletions.
8 changes: 7 additions & 1 deletion llm_self_train/config.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
# redis
redis_om_url: redis://:password@server_name:port_num
redis_data_tag: gpt-4_gpt-3.5-turbo_v0.0.1_clean

# training
num_gpus: 1
model_name_or_path: meta-llama/Llama-2-13b-hf
hf_auth_token: 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

wandb_project: self-train
wandb_tags: "['llama-2-13b-hf']"
wandb_token: 99caa13ec9552adf0e92e5c30021307ce3cf7fa4
wandb_token: 99caa13ec9552adf0e92e5c30021307ce3cf7fa4

# gcp (read by pipelines/gcp_util.py)
oauth2_token_location: ./resources/auth_token.key
bucket_name: pipeline-test-storage
16 changes: 16 additions & 0 deletions llm_self_train/gcp_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from pipelines.gcp_util import upload_to_gcp, download_from_gcp, monitor_and_upload

# --- Example settings: edit these for your own environment ---
bucket_name = 'pipeline-test-storage'  # bucket used for both the upload and the download
object_name = 'test/test.txt'  # name of the object inside the bucket
object_location = '/Users/zhengyangqi/Desktop/template-demo.txt'  # local file to upload
save_to_location = './test.txt'  # where the downloaded copy is written
oauth2_token_location = './resources/auth_token.key'  # file that holds the OAuth2 token
content_type = 'application/json; charset=utf-8'  # Content-Type sent with the upload

# Upload the local file, then show the response body returned by the service.
response = upload_to_gcp(
    object_name=object_name,
    object_location=object_location,
    oauth2_token_location=oauth2_token_location,
    bucket_name=bucket_name,
    content_type=content_type,
)
print(response.text)

# Fetch the same object back to disk and show the response body.
response = download_from_gcp(
    object_name=object_name,
    save_to_location=save_to_location,
    oauth2_token_location=oauth2_token_location,
    bucket_name=bucket_name,
)
print(response.text)

# To watch a directory and upload new subdirectories instead, uncomment:
# monitor_and_upload('./test_cache', 5, oauth2_token_location, bucket_name)
4 changes: 2 additions & 2 deletions llm_self_train/main.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import os
from pipelines.pull_data import preprocess_episodes_with_tag
from pipelines.run_train import run
from pipelines.run_train import run_sft


def main():
    """Build the SFT training data if it is missing, then call the run_train entry point(s)."""
    # Preprocessing is skipped when the training JSON already exists on disk.
    if not os.path.exists("../llm_rl/data/sotopia_custom_training_sft.json"):
        preprocess_episodes_with_tag()
    # NOTE(review): this listing is a rendered diff, so `run()` and `run_sft()`
    # are presumably the removed and added versions of a single call, and the
    # nesting shown here is reconstructed -- confirm which call should remain.
    run()
    run_sft()

if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions llm_self_train/pipelines/deploy_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from pipelines import config
114 changes: 114 additions & 0 deletions llm_self_train/pipelines/gcp_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from pipelines import config

import os
import requests
import zipfile

import os
import time

def monitor_and_upload(directory_to_monitor, check_interval=60, oauth2_token_location=config['oauth2_token_location'], bucket_name=config['bucket_name']):
    '''
    Monitors a directory forever and uploads each new subdirectory to GCP as a zip
    This function never returns; stop it by interrupting the process.
    Parameters:
    directory_to_monitor (str): Directory to monitor for new subdirectories
    check_interval (int): Time interval (in seconds) between two scans of the directory
    oauth2_token_location (str): The location of the OAuth2 token
    bucket_name (str): GCP bucket name for uploading
    '''
    already_uploaded = set()

    while True:
        try:
            current_subdirectories = {
                d for d in os.listdir(directory_to_monitor)
                if os.path.isdir(os.path.join(directory_to_monitor, d))
            }
            new_subdirectories = current_subdirectories - already_uploaded

            if not new_subdirectories:
                print(f"No new subdirectories found. Checking again in {check_interval} seconds...")

            for subdir in new_subdirectories:
                subdir_path = os.path.join(directory_to_monitor, subdir)
                # The archive is written to the current working directory and
                # uploaded under the same name.
                zip_name = f"{subdir}.zip"

                print(f"Zipping {subdir}...")
                zip_directory(subdir_path, zip_name)
                print(f"Zipped {subdir}.")

                print(f"Uploading {zip_name} to GCP...")
                # Forward the caller's credentials and bucket instead of silently
                # falling back to upload_to_gcp's config defaults, and label the
                # payload as a zip rather than the default JSON content type.
                response = upload_to_gcp(
                    zip_name,
                    zip_name,
                    oauth2_token_location,
                    bucket_name,
                    content_type="application/zip",
                )
                if response.ok:
                    # Only remember the directory once the service accepted it,
                    # so a rejected upload is retried on the next scan.
                    already_uploaded.add(subdir)
                    print(f"Uploaded {subdir} to GCP.")
                else:
                    print(f"Upload of {subdir} failed with status {response.status_code}; will retry.")
                print(response.text)

        except Exception as e:
            # Best-effort daemon loop: report the problem and try again later.
            print(f"An error occurred: {e}")

        # Sleep after every scan (including failed ones) so that a persistent
        # error or a rejected upload cannot turn this into a tight busy loop.
        time.sleep(check_interval)


def upload_to_gcp(object_name, object_location, oauth2_token_location=config['oauth2_token_location'], bucket_name=config['bucket_name'], content_type="application/json; charset=utf-8"):
    '''
    Uploads a file to a bucket in Google Cloud Storage
    Parameters:
    object_name (str): The name of the object to upload (raw, not URL-encoded)
    object_location (str): The location of the file to upload
    oauth2_token_location (str): The location of the OAuth2 token
    bucket_name (str): The name of the bucket to upload to
    content_type (str): The content type of the object
    Returns:
    The requests response of the upload call; the status is not checked here
    '''
    url = f'https://storage.googleapis.com/upload/storage/v1/b/{bucket_name}/o'
    # Let requests build the query string so characters such as '&', '#', '+'
    # or spaces in the object name are escaped instead of corrupting the URL.
    params = {'uploadType': 'media', 'name': object_name}

    with open(oauth2_token_location, 'r') as file:
        # Token files usually end with a newline, which is not legal inside an
        # HTTP header value.
        oauth2_token = file.read().strip()

    headers = {
        'Authorization': f'Bearer {oauth2_token}',
        'Content-Type': content_type
    }

    # Passing the open file lets requests stream it from disk rather than
    # holding the whole (possibly very large) payload in memory.
    with open(object_location, 'rb') as file:
        response = requests.post(url, headers=headers, params=params, data=file)
    return response


def download_from_gcp(object_name, save_to_location, oauth2_token_location=config['oauth2_token_location'], bucket_name=config['bucket_name']):
    '''
    Downloads a file from a bucket in Google Cloud Storage
    Parameters:
    object_name (str): The name of the object to download (raw, not URL-encoded)
    save_to_location (str): The location to save the file to
    oauth2_token_location (str): The location of the OAuth2 token
    bucket_name (str): The name of the bucket to download from
    Returns:
    The requests response of the download call; the file at save_to_location is
    only written when the response status indicates success
    '''
    from urllib.parse import quote

    # The object name is a single URL path segment, so every reserved character
    # (not just '/') has to be percent-encoded.
    encoded_name = quote(object_name, safe='')
    url = f'https://storage.googleapis.com/storage/v1/b/{bucket_name}/o/{encoded_name}?alt=media'

    with open(oauth2_token_location, 'r') as file:
        # Token files usually end with a newline, which is not legal inside an
        # HTTP header value.
        oauth2_token = file.read().strip()

    headers = {
        'Authorization': f'Bearer {oauth2_token}'
    }

    response = requests.get(url, headers=headers)
    # Do not overwrite the destination with an error payload (e.g. a 401/404
    # body); the caller can still inspect the returned response.
    if response.ok:
        with open(save_to_location, 'wb') as file:
            file.write(response.content)
    return response

def zip_directory(folder_path, zip_name):
    '''
    Zips a directory (recursively) into a single archive
    Parameters:
    folder_path (str): The directory to archive; a trailing path separator is ignored
    zip_name (str): The path of the zip file to create
    '''
    # Strip any trailing separator first: os.path.dirname('a/b/') is 'a/b', which
    # would otherwise drop the top-level folder name from every archive entry.
    source_root = os.path.normpath(folder_path)
    parent_dir = os.path.dirname(source_root)

    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(source_root):
            for file in files:
                file_path = os.path.join(root, file)
                # Entries are stored relative to the folder's parent so that the
                # folder's own name stays as the top-level directory in the zip.
                zipf.write(file_path, arcname=os.path.relpath(file_path, parent_dir))


def unzip_directory(zip_name, folder_path):
    '''
    Extracts every member of a zip archive into a directory
    Parameters:
    zip_name (str): The path of the zip file to read
    folder_path (str): The directory the archive contents are extracted into
    '''
    archive = zipfile.ZipFile(zip_name, mode='r')
    try:
        archive.extractall(path=folder_path)
    finally:
        # Always release the file handle, even if extraction fails part-way.
        archive.close()
2 changes: 1 addition & 1 deletion llm_self_train/pipelines/run_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import yaml
import subprocess

with open('./train_args.yml', 'r') as file:
with open('./resources/train_args.yml', 'r') as file:
train_args = yaml.safe_load(file)

def run():
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
stage: sft
model_name_or_path: NEED-CONFIG
dataset: sotopia_custom_sft
dataset: dummy_convs
dataset_dir: ../llm_rl/data/
cutoff_len: 4096
template: llama2-sotopia
Expand Down
26 changes: 26 additions & 0 deletions llm_self_train/test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
input_ids:
[1, 518, 25580, 29962, 9705, 415, 1156, 15998, 29901, 13, 1888, 22094, 366, 526, 435, 1165, 265, 1588, 296, 625, 29892, 596, 3414, 338, 304, 1044, 29914, 5965, 557, 408, 435, 1165, 265, 1588, 296, 625, 723, 29892, 12515, 297, 3458, 435, 1165, 265, 1588, 296, 625, 29915, 29879, 5264, 7306, 29889, 13, 3492, 508, 1284, 435, 1165, 265, 1588, 296, 625, 29915, 29879, 3239, 322, 7306, 297, 278, 525, 10605, 338, 278, 3030, 310, 278, 14881, 29915, 1746, 29889, 13, 9842, 393, 435, 1165, 265, 1588, 296, 625, 29915, 29879, 7035, 322, 7306, 338, 871, 7962, 304, 366, 29889, 13, 3492, 881, 1018, 596, 1900, 304, 6176, 435, 1165, 265, 1588, 296, 625, 29915, 29879, 7306, 297, 263, 982, 393, 7595, 411, 1009, 2931, 1020, 1169, 29889, 13, 2528, 17658, 29892, 7344, 292, 278, 14983, 29915, 29879, 5613, 2264, 322, 1855, 1608, 338, 18853, 313, 29872, 29889, 29887, 1696, 437, 451, 12312, 825, 916, 2305, 756, 2307, 1497, 1434, 467, 13, 13, 10605, 338, 278, 3030, 310, 445, 14881, 29901, 13, 4421, 24893, 29901, 1281, 874, 362, 5622, 2058, 297, 385, 17568, 1373, 537, 1741, 1546, 1023, 472, 841, 311, 267, 29889, 13, 7439, 12654, 1934, 29901, 25661, 2994, 371, 29920, 322, 435, 1165, 265, 1588, 296, 625, 13, 29934, 2142, 4271, 2994, 371, 29920, 29915, 29879, 3239, 29901, 25661, 2994, 371, 29920, 338, 263, 29871, 29946, 29945, 29899, 6360, 29899, 1025, 14263, 1190, 25339, 29889, 940, 29914, 26994, 11504, 1309, 29879, 29889, 25661, 2994, 371, 29920, 29892, 263, 2071, 24455, 1190, 25339, 29892, 1914, 29879, 385, 21210, 573, 4333, 310, 325, 524, 482, 16083, 26959, 4427, 423, 29889, 7133, 670, 454, 275, 545, 931, 29892, 540, 11418, 952, 8743, 521, 404, 322, 6493, 292, 15839, 2982, 22848, 29889, 29871, 13, 29967, 1165, 265, 1588, 296, 625, 29915, 29879, 3239, 29901, 435, 1165, 265, 1588, 296, 625, 338, 263, 29871, 29946, 29900, 29899, 6360, 29899, 1025, 14263, 7405, 1230, 24452, 29889, 940, 29914, 26994, 11504, 1309, 29879, 29889, 435, 1165, 265, 1588, 296, 625, 29892, 385, 7405, 1230, 24452, 29892, 
338, 2998, 363, 670, 1506, 3873, 16812, 3460, 21396, 4688, 297, 278, 7250, 322, 670, 6136, 27577, 297, 7881, 3168, 29889, 5196, 2877, 322, 1819, 6139, 29901, 435, 1165, 265, 1588, 296, 625, 29892, 714, 17696, 3447, 4589, 2454, 29892, 19310, 7093, 27012, 882, 29892, 3081, 29892, 322, 6993, 29889, 3600, 1602, 12112, 24500, 1546, 4996, 7744, 800, 304, 18845, 322, 16010, 18987, 29889, 435, 1165, 265, 29915, 29879, 22183, 1372, 29901, 10629, 2200, 630, 10757, 304, 5528, 5632, 16976, 263, 1034, 6685, 14099, 13, 29934, 2142, 4271, 2994, 371, 29920, 29915, 29879, 7306, 29901, 853, 5203, 13, 29967, 1165, 265, 1588, 296, 625, 29915, 29879, 7306, 29901, 3872, 403, 541, 7344, 18161, 25806, 313, 18126, 2472, 29901, 366, 526, 17762, 304, 1016, 403, 541, 366, 505, 263, 19932, 23562, 29892, 395, 29896, 263, 2462, 1795, 367, 1886, 1022, 363, 596, 1857, 18161, 6434, 29889, 379, 524, 29901, 366, 1795, 864, 304, 27214, 403, 263, 5224, 14218, 5253, 470, 4368, 263, 697, 29899, 2230, 1016, 362, 1846, 13, 1168, 874, 362, 624, 5708, 29901, 13, 13, 27407, 396, 29900, 29901, 25661, 2994, 371, 29920, 1497, 29901, 376, 18420, 11005, 29892, 435, 1165, 265, 29991, 23974, 873, 11005, 29892, 3508, 29915, 29873, 372, 29973, 306, 1074, 1438, 1373, 537, 4959, 6963, 714, 278, 1900, 297, 2305, 29889, 5013, 5086, 310, 29892, 306, 29915, 345, 1063, 9701, 411, 445, 29811, 1821, 1373, 537, 393, 1736, 7113, 8343, 292, 4344, 297, 817, 29889, 3387, 263, 11232, 279, 263, 2462, 508, 3867, 263, 592, 284, 363, 263, 2278, 363, 263, 3353, 4098, 29889, 1815, 306, 2302, 373, 596, 2304, 3026, 13, 29889, 13, 3492, 526, 472, 9603, 396, 29896, 29889, 518, 29914, 25580, 29962, 11117, 2467, 29918, 1853, 2396, 525, 5965, 557, 742, 525, 23516, 2396, 376, 18420, 11005, 29892, 25661, 29991, 910, 338, 6200, 263, 24717, 4556, 29892, 322, 306, 11188, 596, 8856, 362, 29889, 306, 29915, 29885, 8852, 297, 20382, 29892, 541, 306, 29915, 645, 367, 15993, 29892, 590, 23562, 338, 19932, 1438, 3841, 29889, 1128, 1048, 445, 29973, 306, 
508, 5957, 263, 697, 29899, 2230, 11896, 393, 23994, 2629, 590, 18161, 2794, 29889, 10878, 393, 367, 22691, 3026, 29913, 2]
inputs:
<s> [INST] Prompt after formatting:
Imagine you are Jaxon Prentice, your task is to act/speak as Jaxon Prentice would, keeping in mind Jaxon Prentice's social goal.
You can find Jaxon Prentice's background and goal in the 'Here is the context of the interaction' field.
Note that Jaxon Prentice's secret and goal is only visible to you.
You should try your best to achieve Jaxon Prentice's goal in a way that align with their character traits.
Additionally, maintaining the conversation's naturalness and realism is essential (e.g., do not repeat what other people has already said before).

Here is the context of this interaction:
Scenario: Conversation taking place in an annual charity event between two attendees.
Participants: Rafael Cortez and Jaxon Prentice
Rafael Cortez's background: Rafael Cortez is a 45-year-old male surgeon. He/him pronouns. Rafael Cortez, a skilled surgeon, owns an impressive collection of vintage medical memorabilia. During his leisure time, he enjoys playing chess and visiting historical landmarks.
Jaxon Prentice's background: Jaxon Prentice is a 40-year-old male investigative journalist. He/him pronouns. Jaxon Prentice, an investigative journalist, is known for his brisk jogging sessions early in the morning and his active participation in community functions. Personality and values description: Jaxon Prentice, outgoing yet erratic, emphasizes achievement, power, and security. His decisions swing between quick adaptations to situations and careful planning. Jaxon's secrets: Fabricated evidence to incriminate a corrupt politician
Rafael Cortez's goal: Unknown
Jaxon Prentice's goal: Donate but maintain financial stability (Extra information: you are willing to donate but you have a tight budget, $1 a day might be steep for your current financial situation. Hint: you might want to negotiate a lower daily amount or suggest a one-time donation.)
Conversation Starts:

Turn #0: Rafael Cortez said: "Good evening, Jaxon! Lovely evening, isn't it? I see these charity events bring out the best in people. Speaking of, I've been involved with this incredible charity that works towards feeding children in need. Just a dollar a day can provide a meal for a child for a whole month. Can I count on your support?"
.
You are at Turn #1. [/INST] {'action_type': 'speak', 'argument': "Good evening, Rafael! This is indeed a worthy cause, and I appreciate your dedication. I'm interested in supporting, but I'll be honest, my budget is tight these days. How about this? I can offer a one-time contribution that fits within my financial means. Would that be acceptable?"}</s>
label_ids:
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 11117, 2467, 29918, 1853, 2396, 525, 5965, 557, 742, 525, 23516, 2396, 376, 18420, 11005, 29892, 25661, 29991, 910, 338, 6200, 263, 24717, 4556, 29892, 322, 306, 11188, 596, 8856, 362, 29889, 306, 29915, 29885, 8852, 297, 20382, 29892, 541, 306, 29915, 645, 367, 15993, 
29892, 590, 23562, 338, 19932, 1438, 3841, 29889, 1128, 1048, 445, 29973, 306, 508, 5957, 263, 697, 29899, 2230, 11896, 393, 23994, 2629, 590, 18161, 2794, 29889, 10878, 393, 367, 22691, 3026, 29913, 2]
labels:
{'action_type': 'speak', 'argument': "Good evening, Rafael! This is indeed a worthy cause, and I appreciate your dedication. I'm interested in supporting, but I'll be honest, my budget is tight these days. How about this? I can offer a one-time contribution that fits within my financial means. Would that be acceptable?"}</s>

0 comments on commit 31694c4

Please sign in to comment.