# exploit.py

import os
import wandb
from omegaconf import OmegaConf
from tqdm import tqdm
import logging
import subprocess
import openai
import pandas as pd
import hydra
import re
from zero_hero.core import (
    gpt_call,
    wrap_system_message,
    wrap_user_message,
    ZEROHERO_ROOT_DIR,
    RewardNode,
    SkillDatabase,
    TaskDatabase,
)
from evolution.utils.extract_task_code import file_to_string


class Divider:
    """Split a mission into an ordered list of subtasks by prompting a chat model.

    The system prompt is the ``skill/initial_sys.txt`` template formatted with
    the rendered skill database; the user prompt is ``skill/initial_user.txt``
    formatted with the mission text. Both are read once at construction time.
    """

    # Optional "Task" label (plus optional colon) in front of a subtask text.
    # A regex is used instead of ``str.lstrip("Task")`` because ``lstrip``
    # removes a *set of characters*, which would turn "Take the cup" into
    # "e the cup".
    _TASK_LABEL = re.compile(r"^\s*(?:Task\b)?\s*:?\s*")

    def __init__(self, model="gpt-3.5-turbo-0125", root_dir=None) -> None:
        """Load the prompt templates.

        Args:
            model: Model name forwarded to ``gpt_call``.
            root_dir: Project root containing ``evolution/utils/prompts``;
                falls back to ``ZEROHERO_ROOT_DIR`` when ``None``.
        """
        self.model = model
        self.root_dir = root_dir if root_dir is not None else ZEROHERO_ROOT_DIR
        self.prompt_dir = f"{self.root_dir}/evolution/utils/prompts"
        self.initial_sys = file_to_string(f"{self.prompt_dir}/skill/initial_sys.txt")
        self.initial_user = file_to_string(f"{self.prompt_dir}/skill/initial_user.txt")

    def run(self, mission: str, skill_database: "SkillDatabase"):
        """Ask the model for a subtask proposal and parse it into a DataFrame.

        Args:
            mission: Natural-language description of the overall task.
            skill_database: Provides ``render()`` for the prompt and
                ``get_variant(index=...)`` for reused skills.

        Returns:
            A ``pandas.DataFrame`` with columns ``subtask``, ``method``,
            ``variants`` and ``status`` (always ``"todo"``), one row per
            proposed subtask.

        Raises:
            ValueError: If a numbered proposal line cannot be parsed.
        """
        messages = [
            wrap_system_message(
                self.initial_sys.format(skills=skill_database.render())
            ),
            wrap_user_message(self.initial_user.format(task=mission)),
        ]
        resp = gpt_call(messages=messages, model=self.model, n_samples=1, temperature=0)
        subtasks_proposal = resp[0]["message"]["content"]
        logging.info(subtasks_proposal)
        return self._parse_proposal(subtasks_proposal, skill_database)

    @staticmethod
    def _parse_proposal(subtasks_proposal, skill_database):
        """Turn the raw proposal text into the subtask DataFrame.

        Only lines of the form ``(N). <<<method>>> [Task:] text`` are
        considered; any other text in the proposal is ignored.
        """
        subtasks, methods, variants = [], [], []
        for item in re.findall(r"\(\d+\)\.\s(.*)", subtasks_proposal):
            match = re.search(r"<<<(\w+)>>>*(.*)", item)
            if match is None:
                raise ValueError(
                    f"Subtask line has no <<<method>>> marker: {item!r}"
                )
            method = match.group(1).lower()
            subtask = Divider._TASK_LABEL.sub("", match.group(2), count=1).strip()
            if "reuse" in method:
                # The subtask text names the skill by its 1-based number.
                index_match = re.search(r"skill.*?(\d+).*", subtask)
                if index_match is None:
                    raise ValueError(
                        f"Reused subtask does not reference a skill index: {subtask!r}"
                    )
                variant = skill_database.get_variant(
                    index=int(index_match.group(1)) - 1
                )
            else:
                variant = ""
            subtasks.append(subtask)
            methods.append(method)  # introduce, reuse
            variants.append(variant)
        return pd.DataFrame(
            {
                "subtask": subtasks,
                "method": methods,
                "variants": variants,
                "status": "todo",
            }
        )


class Conquerer:
    """Work through a subtask plan in order, then play back the final result.

    ``run()`` walks the plan: ``introduce`` rows launch ``learn.py`` in a
    subprocess and read the outcome from the task database, ``reuse`` rows
    take their variants straight from the plan. ``play()`` builds the log
    dictionary (tables and, when available, a playback video) for wandb.
    """

    # Interpreter used to launch learn.py.
    # NOTE(review): machine-specific path kept as the default so behavior is
    # unchanged; override this attribute on other hosts.
    PYTHON_BIN = "/data/xufeng/miniconda3/envs/zerohero/bin/python"
    # Seed used both for the task-database index and the learn.py call.
    SEED = 99

    def __init__(self, subtasks, **learn_kwargs) -> None:
        """Store the plan and open the task database.

        Args:
            subtasks: DataFrame with at least ``subtask``, ``method`` and
                ``variants`` columns.
            **learn_kwargs: Forwarded to ``learn.py`` as ``key=value``
                arguments and to ``RewardNode`` as keyword arguments.
        """
        self.df = subtasks
        self.env_name = "franka_table"
        self.tdb = TaskDatabase(
            env_name=self.env_name,
            env_idx=f"E{self.SEED:02d}",
        )
        self.learn_kwargs = learn_kwargs
        self.reset()

    def reset(self):
        """Clear all per-run state so ``play()`` can detect a missing run."""
        self.chain = {}
        self.succ = False
        self.subtask = None
        self.subtasks = None
        self.homework = None
        self.last_reward_node_idx = None

    @staticmethod
    def _pop_last_node(precedents):
        """Split off the most recently added precedent.

        Returns ``(last_key, remaining)`` where ``remaining`` is a copy of
        ``precedents`` without its last-inserted entry, or ``(None, {})``
        when ``precedents`` is empty. The input mapping is not modified.
        """
        chain = dict(precedents)
        if not chain:
            return None, chain
        last_idx, _ = chain.popitem()  # dicts pop in LIFO insertion order
        return last_idx, chain

    def run(self):
        """Execute every subtask of the plan, stopping at the first failure.

        On return ``self.subtasks`` and ``self.homework`` describe the
        progress made, ``self.succ`` tells whether the whole plan went
        through, and on success ``self.last_reward_node_idx`` /
        ``self.chain`` hold the final node and the precedents before it.
        """
        extended_command = [f"{k}={v}" for k, v in self.learn_kwargs.items()]
        precedents = self.chain
        df = self.df
        subtasks = df["subtask"].values
        homework = [""] * len(subtasks)
        # Publish progress holders up front so an early return (failure)
        # leaves consistent state behind.
        self.subtasks = subtasks
        self.homework = homework
        self.succ = False
        self.last_reward_node_idx = None
        # Enumerate for a positional index: the DataFrame label is not
        # guaranteed to be 0..n-1.
        for pos, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
            subtask = row.subtask
            method = row["method"]
            logging.info(f"[{method}] On subtask: {subtask} ...")
            if method == "introduce":
                # learn and update
                command = [
                    self.PYTHON_BIN,
                    f"{ZEROHERO_ROOT_DIR}/learn.py",
                    f'task="{subtask}"',
                    f"seed={self.SEED}",
                ]
                if precedents:
                    command.append(f'precedents="{precedents}"')
                command.extend(extended_command)
                logging.info(f"Command to run:\n {command}")
                completed = subprocess.run(
                    command,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
                if completed.returncode != 0:
                    # Surface learner crashes instead of discarding stderr.
                    logging.warning(
                        "learn.py exited with code %s:\n%s",
                        completed.returncode,
                        completed.stderr.decode(errors="replace"),
                    )
                self.tdb.load()
                # NOTE(review): `get_attr` is called on `tdb.df`; confirm this
                # matches the TaskDatabase API.
                subtask_homework = self.tdb.df.get_attr(attr="status", command=subtask)[0]
                homework[pos] = subtask_homework
                if subtask_homework == "failed":
                    logging.info(f"Failed to learn! Break at {subtask}.")
                    return
                if subtask_homework == "compromised":
                    logging.warning(f"Compromised at subtask: {subtask}")
                # NOTE(review): this lookup passes the argv list as `task`,
                # while the status lookup above filters by `command=subtask`,
                # and the result is used as a dict key (must be hashable).
                # Left unchanged pending confirmation against TaskDatabase.
                variants = {self.tdb.df.get_attr(attr="variants", task=command): subtask}
            elif method == "reuse":
                variants = row["variants"]
            else:
                raise NotImplementedError
            precedents = {**precedents, **variants}
        # The last precedent added is the node to play; the rest is its chain.
        last_idx, chain = self._pop_last_node(precedents)
        if last_idx is None:
            logging.warning("No subtasks were executed; nothing to play back.")
        self.last_reward_node_idx = last_idx
        self.chain = chain
        self.succ = last_idx is not None
        return

    def play(self):
        """Build the wandb log for the last run.

        Returns:
            A dict with ``success`` and a ``subtasks`` table. When the run
            produced a final reward node, the dict also contains a
            ``precedents`` table and the playback video under
            ``"<reward_idx>_all"``.

        Raises:
            RuntimeError: If ``run()`` has not been called since the last
                ``reset()``.
        """
        subtasks, homework, succ = self.subtasks, self.homework, self.succ
        if homework is None or subtasks is None:
            raise RuntimeError("Conquerer.play() called before Conquerer.run()")
        log = {
            "success": succ,
            "subtasks": wandb.Table(
                columns=list(range(len(subtasks))),
                data=[list(subtasks), list(homework)],
            ),
        }
        idx = self.last_reward_node_idx
        if idx is None:
            # Failed or empty run: there is no reward node to replay, but the
            # per-subtask outcome is still worth logging.
            logging.warning("Run did not complete; skipping playback.")
            return log
        precedents = self.chain
        rnode = RewardNode(
            idx=idx,
            precedents=precedents,
            **self.learn_kwargs,
        ).init()
        playbacks = rnode.play(suffix="_all")
        logging.info(
            f"Playback for running reward node {idx} with precedents {precedents} is:\n{playbacks}"
        )
        v_idx, v_path = playbacks["reward_idx"], playbacks["video_path"]
        # wandb.Table expects `data` as a list of rows, so the precedent
        # mapping is laid out as one row of keys and one row of values.
        log["precedents"] = wandb.Table(
            columns=list(range(len(precedents))),
            data=[list(precedents.keys()), list(precedents.values())],
        )
        log[f"{v_idx}_all"] = wandb.Video(v_path, fps=30, format="mp4")
        return log


@hydra.main(config_path="cfg", config_name="config", version_base="1.1")
def main(cfg):
    """Divide the configured mission into subtasks, execute them and log the result.

    Reads ``cfg.env.env_name``, ``cfg.seed`` and the ``cfg.exploit`` section
    (``model``, ``mission``, ``learn``, ``use_wandb``, ``wandb_project``).

    Raises:
        ValueError: If ``cfg.exploit.mission`` is missing or empty.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")
    my_cfg = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
    logging.info(cfg)

    # Validate before the mission is sent to the model, not afterwards.
    mission = cfg.exploit.mission
    if mission is None or len(mission) == 0:
        raise ValueError("cfg.exploit.mission must be a non-empty string")

    env_name = cfg.env.env_name.lower()
    env_idx = f"E{cfg.seed:02d}"

    sdb = SkillDatabase(env_name=env_name, env_idx=env_idx)
    # sdb.absorb(tdb)
    divider = Divider(model=cfg.exploit.model)
    subtasks_df = divider.run(mission, skill_database=sdb)
    conquerer = Conquerer(subtasks=subtasks_df, **cfg.exploit.learn)
    conquerer.run()
    play_log = conquerer.play()
    if cfg.exploit.use_wandb:
        wandbrun = wandb.init(
            project=cfg.exploit.wandb_project,
            config=my_cfg,
        )
        wandbrun.log(play_log)


# Script entry point: `main` is wrapped by `hydra.main`, which supplies `cfg`.
if __name__ == "__main__":
    main()