
Merge branch 'main' into feature/add-vllm-deploy
lwaekfjlk authored Oct 24, 2023
2 parents f5d22c4 + 1e44e74 commit 16328e7
Showing 29 changed files with 2,151 additions and 28 deletions.
24 changes: 24 additions & 0 deletions .github/ISSUE_TEMPLATE/exp_record.yml
@@ -0,0 +1,24 @@
name: 🧪Experiment Record
description: Describe experiment setting and results here
title: "[EXP]: "
labels: ["experiment"]
assignees: [""]
body:
  - type: markdown
    attributes:
      value: |
        Please make sure this experiment request hasn't already been submitted by looking through other open/closed issues
  - type: textarea
    id: description
    attributes:
      label: Description
      description: Give us a brief description of the experimental setting and results you would like
    validations:
      required: true

  - type: textarea
    id: additional-information
    attributes:
      label: Additional Information
      description: Give us some additional information on the experimental setting and results, such as learning rate, data selection, etc.
24 changes: 24 additions & 0 deletions .github/ISSUE_TEMPLATE/writing_task.yml
@@ -0,0 +1,24 @@
name: 🖊️Writing Task
description: Describe writing task here
title: "[WRT]: "
labels: ["writing"]
assignees: [""]
body:
  - type: markdown
    attributes:
      value: |
        Please make sure this writing task request hasn't already been submitted by looking through other open/closed issues
  - type: textarea
    id: description
    attributes:
      label: Description
      description: Give us a brief description of the writing task you would like
    validations:
      required: true

  - type: textarea
    id: additional-information
    attributes:
      label: Additional Information
      description: Give us some additional information on the writing task, such as expected length, main content, etc.
3 changes: 0 additions & 3 deletions .github/workflows/mypy.yml
@@ -17,13 +17,10 @@ jobs:
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt
          pip install -e .[dev]
      - name: Type-checking package with mypy
        run: |
          # Manually install mypy in the standard way.
          pip --quiet install -U mypy
          # Log this mypy version for debuggability.
          mypy --version
          # Run this mypy instance against our main package.
          mypy --install-types --non-interactive sotopia
          mypy --strict .
1 change: 0 additions & 1 deletion .github/workflows/pre-commit.yml
@@ -14,4 +14,3 @@ jobs:
        uses: actions/setup-python@v4
        with:
          python-version: 3.11.2
      - uses: pre-commit/action@v3.0.0
8 changes: 0 additions & 8 deletions .github/workflows/tests.yml
@@ -17,11 +17,3 @@ jobs:
        run: |
          pip install --upgrade pip
          pip install -r requirements.txt
          pip install -e .[dev]
      - name: Test with pytest
        env: # Or as an environment variable
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          REDIS_OM_URL: ${{ secrets.REDIS_OM_URL }}
          TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
        run: |
          pytest
4 changes: 2 additions & 2 deletions .gitignore
@@ -10,9 +10,9 @@ dist
*.log.*
*.json
llm_ft/checkpoints/*
llm_ft/*_checkpoints/*
!**/dummy_conversation.json
!playground/deepspeed_config_s2.json
!playground/deepspeed_config_s3.json
!llm_ft/deepspeed_config_s2.json

# Editor
.idea
27 changes: 27 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,27 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v3.0.1 # Use the sha / tag you want to point at
    hooks:
      - id: prettier
        types_or: [html]
  - repo: https://github.com/psf/black
    rev: 22.12.0
    hooks:
      - id: black
        args: [--line-length=79]
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        args: ["--profile", "black", --line-length=72]
  - repo: https://github.com/kynan/nbstripout
    rev: 0.6.0
    hooks:
      - id: nbstripout
20 changes: 20 additions & 0 deletions data_process/data/fastchat_data/fastchat_data_preprocess.py
@@ -0,0 +1,20 @@
import json
import os

sotopia_data_dir = "/Users/pamela/Documents/capstone/sotopia-ft-data/ft-data-gpt4-gpt4-easy-2-side-partial"

ft_data_list = []
count = 0
for file in os.listdir(sotopia_data_dir):
    with open(os.path.join(sotopia_data_dir, file), 'r') as f:
        file_dict = json.load(f)
        fastchat_dict = {"id": f"identity_{count}", "conversations": []}
        fastchat_dict["conversations"].append(
            {"from": "human", "value": file_dict["prompt"]})
        fastchat_dict["conversations"].append(
            {"from": "gpt", "value": file_dict["result"]})
        ft_data_list.append(fastchat_dict)
        count += 1

with open("fastchat-ft-gp4-gpt4-easy-2-side-partial.json", "w") as f:
    f.write(json.dumps(ft_data_list, indent=4))
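Each record the script above emits is a two-turn FastChat conversation: the sotopia prompt as the "human" turn and the model response as the "gpt" turn. A minimal sanity-check sketch (not part of the commit; it only assumes the output file written above sits in the working directory):

```python
import json

# Load the file written by the preprocessing script above.
with open("fastchat-ft-gp4-gpt4-easy-2-side-partial.json") as f:
    records = json.load(f)

# Every record should pair exactly one "human" prompt with one "gpt" response.
sample = records[0]
assert set(sample) == {"id", "conversations"}
assert [turn["from"] for turn in sample["conversations"]] == ["human", "gpt"]
print(len(records), sample["id"])
```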
4 changes: 2 additions & 2 deletions data_process/data/multiturn_data/multiturn_data_preprocess.py
@@ -68,8 +68,8 @@ def preprocess_data(sotopia_data_dir, file_list, data_type):


def split_by_difficulty(sotopia_data_dir):
    hard_env_set = set(['01H7VFHNV13MHN97GAH73E3KM8', '01H7VFHN5WVC5HKKVBHZBA553R', '01H7VFHNN7XTR99319DS8KZCQM', '01H7VFHN9W0WAFZCBT09PKJJNK', '01H7VFHPDZVVCDZR3AARA547CY', '01H7VFHPQQQY6H4DNC6NBQ8XTG', '01H7VFHPQQQY6H4DNC6NBQ8XTG', '01H7VFHN7WJK7VWVRZZTQ6DX9T', '01H7VFHN7A1ZX5KSMT2YN9RXC4', '01H7VFHPS5WJW2694R1MNC8JFY',
                        '01H7VFHPS5WJW2694R1MNC8JFY', '01H7VFHNN7XTR99319DS8KZCQM', '01H7VFHQ11NAMZS4A2RDGDB01V', '01H7VFHQ11NAMZS4A2RDGDB01V', '01H7VFHPSWGDGEYRP63H2DJKV0', '01H7VFHPSWGDGEYRP63H2DJKV0', '01H7VFHNF4G18PC9JHGRC8A1R6', '01H7VFHNNYH3W0VRWVY178K2TK', '01H7VFHP8AN5643B0NR0NP00VE', '01H7VFHN7A1ZX5KSMT2YN9RXC4'])
    hard_env_set = set(["01H7VFHNV13MHN97GAH73E3KM8", "01H7VFHN5WVC5HKKVBHZBA553R", "01H7VFHN9W0WAFZCBT09PKJJNK", "01H7VFHPDZVVCDZR3AARA547CY", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "01H7VFHPS5WJW2694R1MNC8JFY",
                        "01H7VFHNN7XTR99319DS8KZCQM", "01H7VFHQ11NAMZS4A2RDGDB01V", "01H7VFHPSWGDGEYRP63H2DJKV0", "01H7VFHNF4G18PC9JHGRC8A1R6", "01H7VFHNNYH3W0VRWVY178K2TK", "01H7VFHP8AN5643B0NR0NP00VE", "01H7VFHN7A1ZX5KSMT2YN9RXC4"])

    hard_file_list, easy_file_list = [], []
    for conv_file in os.listdir(sotopia_data_dir):
17 changes: 17 additions & 0 deletions data_process/data/together_data/together_data_preprocess.py
@@ -0,0 +1,17 @@
import json
import os

sotopia_data_dir = "ft-data-gpt4-gpt4-easy-2-side-partial/"

ft_data_list = []
for file in os.listdir(sotopia_data_dir):
    with open(os.path.join(sotopia_data_dir, file), 'r') as f:  # 2510
        file_dict = json.load(f)
        output = file_dict["prompt"] + " " + file_dict["result"]
        ft_data_list.append(output)


with open("human-bot-train-gpt4-gpt4-easy-2-side-partial.jsonl", 'w') as f:
    for data in ft_data_list:
        f.write(json.dumps({"text": data}))
        f.write('\n')
175 changes: 175 additions & 0 deletions data_process/redis_data_filtering/prompt_reverse_engineering.py
@@ -0,0 +1,175 @@
import argparse
import os
from collections import defaultdict
from typing import Any, Dict, List, Tuple, Union, cast

import pandas as pd
import rich
from rich.console import Console
from rich.terminal_theme import MONOKAI

from sotopia.database.logs import EpisodeLog
from sotopia.messages.message_classes import ActionType
import numpy as np
import json
import enum

#PROMPT_PREFIX = "Prompt after formatting:\n"

PROMPT_TEMPLATE="""Prompt after formatting:\nImagine you are {agent}, your task is to act/speak as {agent} would, keeping in mind {agent}'s social goal.
You can find {agent}'s background and goal in the 'Here is the context of the interaction' field.
Note that {agent}'s secret and goal is only visible to you.
You should try your best to achieve {agent}'s goal in a way that align with their character traits.
Additionally, maintaining the conversation's naturalness and realism is essential (e.g., do not repeat what other people has already said before).
{history}.
You are at Turn #{turn_number}. Your available action types are
{action_list}.
Note: You can "leave" this conversation if 1. you have achieved your social goals, 2. this conversation makes you uncomfortable, 3. you find it uninteresting/you lose your patience, 4. or for other reasons you want to leave.
Please only generate a JSON string including the action type and the argument.
Your action should follow the given format:
{format_instructions}
"""

#PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema_str)
FORMAT_TEMPLATE = """\nAs an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}
the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.
\nHere is the output schema:\n```\n{\"description\": \"An interface for messages.\\nThere is only one required method: to_natural_language\", \"properties\": {\"action_type\": {\"title\": \"Action Type\", \"description\": \"whether to speak at this turn or choose to not do anything\", \"enum\": [\"none\", \"speak\", \"non-verbal communication\", \"action\", \"leave\"], \"type\": \"string\"}, \"argument\": {\"title\": \"Argument\", \"description\": \"the utterance if choose to speak, the expression or gesture if choose non-verbal communication, or the physical action if choose action\", \"type\": \"string\"}}, \"required\": [\"action_type\", \"argument\"]}\n```\u001b[0m"""


# static
ACTION_LIST = "none action speak non-verbal communication leave" #" ".join(ActionType)

ACTION_REVERSE_MAP = {"left ": "leave", 'did n': 'none', 'said:': 'speak'}


def to_natural_language(self) -> str:
    match self.action_type:
        case "none":
            return "did nothing"
        case "speak":
            return f'said: "{self.argument}"'
        case "non-verbal communication":
            return f"[{self.action_type}] {self.argument}"
        case "action":
            return f"[{self.action_type}] {self.argument}"
        case "leave":
            return "left the conversation"


SELECTED_TAG = ["gpt-4_gpt-4_v0.0.1_clean"]
def get_clean_episodes(selected_tags=SELECTED_TAG):
    selected_episodes = {}
    for tag in selected_tags:
        tag_epis = EpisodeLog.find(EpisodeLog.tag == tag).all()
        if len(tag_epis) > 0:
            selected_episodes[tag] = tag_epis

    return selected_episodes

def detect_action(msg):
    # first detect what the action type is, defaulting to "none"
    if msg.startswith("said:"):
        action = "speak"
    elif msg.startswith("left"):
        action = "leave"
    elif msg.startswith("[non-verbal communication]"):
        action = "non-verbal communication"
    elif msg.startswith("[action]"):
        action = "action"
    else:
        action = "none"

    return action

def generate_result(msg):
    action = detect_action(msg)
    result = {}
    result["action_type"] = action
    result["argument"] = ""
    # now format the argument based on the action type
    match action:
        case "speak":
            # NOTE: this assumes the speech is wrapped in quotes rather than ending without punctuation
            result["argument"] = msg.replace("said: ", "")[1:-1]
        case "action":
            result["argument"] = msg
        case "non-verbal communication":
            result["argument"] = msg

    str_result = str(result)

    return str_result

def reverse_episode_log(epilog, later_speak=False):
    episode_msg = epilog.messages
    # per episode
    agent_model = epilog.models[1]

    if len(episode_msg) > 0:
        init_loop = episode_msg[0]
        # figure out who speaks later; we must use the 2nd player's data, otherwise turn 0 has nothing to predict from
        if later_speak:
            speaker = init_loop[-1][0]  # this would be the agent as well
            turn_div = 1
        # figure out who speaks first
        else:
            speaker = init_loop[-2][0]
            turn_div = 0

    prompt_result_instances = []
    dial_history = ""

    for i in range(0, len(episode_msg)):
        msg = episode_msg[i]
        if (len(msg) != 4) and i < (len(episode_msg) - 1):
            continue
        turn_dic = {"model": agent_model}
        for tpl in msg:
            if (tpl[0] == 'Environment' and (tpl[1] == speaker)):
                if i > 0:
                    dial_history += "\n" + tpl[2]
                else:
                    # for the first context, we don't need \n
                    dial_history += tpl[2]

            if tpl[0] == speaker:  # if the speaker is the agent, use what they said as the result
                str_result = generate_result(tpl[2])
                # check if this is the end
                if i % 2 == turn_div:
                    # take alternating turns since we always want to predict one agent, not both
                    next_turn = i
                    prompt = PROMPT_TEMPLATE.format(
                        agent=speaker, history=dial_history, turn_number=next_turn,
                        action_list=ACTION_LIST, format_instructions=FORMAT_TEMPLATE)
                    turn_dic["prompt"] = prompt
                    turn_dic['result'] = str_result
                    prompt_result_instances.append(turn_dic)

    return prompt_result_instances

def parse_prompt_to_json(episode, dir, init_speak):
    prompt_result_instances = reverse_episode_log(episode, init_speak)

    if not os.path.exists(dir):
        os.makedirs(dir)

    for i in range(len(prompt_result_instances)):
        instance = prompt_result_instances[i]
        todump = json.dumps(instance, indent=4)
        with open(dir + "/{}-{}.json".format(episode.pk, i), "w") as f:
            f.write(todump)

def run_all_tag_reverse(filter_env_dic, dir):
    # tag_episodes = get_clean_episodes(selected_tags=[tag])[tag]
    for k, v in filter_env_dic.items():
        cutoff = len(v) // 2
        for i in range(len(v)):
            episode = v[i]
            if i < cutoff:
                parse_prompt_to_json(episode, dir, False)
            else:
                parse_prompt_to_json(episode, dir, True)
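Taken together, the helpers in this file pull logged episodes out of Redis by tag, rebuild the per-turn agent prompt from the message history, and write one prompt/result JSON per predicted turn, reconstructing half of each episode list from the first speaker's perspective and half from the second's. A minimal driver sketch (not part of the commit; it assumes REDIS_OM_URL already points at a populated Sotopia database, and the output directory name is illustrative):

```python
# Hypothetical driver for the helpers defined above.
episodes_by_tag = get_clean_episodes(selected_tags=SELECTED_TAG)

# Half of each episode list is reconstructed with init_speak=False (predict the
# first speaker) and half with init_speak=True (predict the second speaker).
run_all_tag_reverse(episodes_by_tag, "ft-data-gpt4-gpt4-clean")
```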



(Diffs for the remaining changed files did not load.)
