
Commit

Merge remote-tracking branch 'origin/main'
HowieHwong committed Apr 19, 2024
2 parents a4764f1 + f517193 commit af9f322
Showing 17 changed files with 28,950 additions and 58 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -178,3 +178,4 @@ pyrightconfig.json

# End of https://www.toptal.com/developers/gitignore/api/python
test.ipynb
test.py
966 changes: 966 additions & 0 deletions generation_results/llama3-70b/robustness/ood_detection.json

Large diffs are not rendered by default.

4,202 changes: 4,202 additions & 0 deletions saved_embeddings/embeddings.json

Large diffs are not rendered by default.

20 changes: 16 additions & 4 deletions test.py
@@ -6,9 +6,21 @@



config.openai_key= "1f462c580d06407eb49954553ab22ff7"
from trustllm.generation.generation import LLMGeneration
from trustllm.utils import file_process
from trustllm import config

import concurrent.futures

config.replicate_api= "r8_EBJDSqZ6KDR2EmN0NWRodPva3018ceS0nd6BF"

evaluator = truthfulness.TruthfulnessEval()
internal= file_process.load_json('/Users/admin/Downloads/mixtral-8x7B/truthfulness/internal.json')
def run_task(test_type):
run_robu=LLMGeneration(test_type=test_type,
online_model=True,
use_replicate=True,
model_path='meta/meta-llama-3-8b-instruct',
data_path='/Users/admin/Documents/dataset')
run_robu.generation_results()

evaluator.eval_internal_squad(internal)
with concurrent.futures.ThreadPoolExecutor() as executor:
executor.map(run_task,['safety','fairness','truthfulness','privacy','robustness','ethics'])
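
For reference, a cleaned-up sketch of the driver pattern this commit lands in test.py, with the committed credentials and local paths replaced by placeholders (the diff above interleaves removed and added lines):

import concurrent.futures

from trustllm import config
from trustllm.generation.generation import LLMGeneration

config.replicate_api = "<your-replicate-token>"  # placeholder, not the committed key

def run_task(test_type):
    # One generation run per trust dimension, served via Replicate.
    runner = LLMGeneration(
        test_type=test_type,
        online_model=True,
        use_replicate=True,
        model_path="meta/meta-llama-3-8b-instruct",
        data_path="/path/to/dataset",  # placeholder dataset root
    )
    runner.generation_results()

with concurrent.futures.ThreadPoolExecutor() as executor:
    executor.map(run_task, ["safety", "fairness", "truthfulness",
                            "privacy", "robustness", "ethics"])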
6,002 changes: 6,002 additions & 0 deletions trustllm_pkg/generation_results/llama3-70b/fairness/stereotype_agreement.json

Large diffs are not rendered by default.


9,122 changes: 9,122 additions & 0 deletions trustllm_pkg/generation_results/llama3-70b/robustness/AdvGLUE.json

Large diffs are not rendered by default.

4,202 changes: 4,202 additions & 0 deletions trustllm_pkg/generation_results/llama3-70b/robustness/AdvInstruction.json

Large diffs are not rendered by default.


3,002 changes: 3,002 additions & 0 deletions trustllm_pkg/generation_results/llama3-70b/robustness/ood_generalization.json

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion trustllm_pkg/trustllm/config.py
@@ -14,9 +14,10 @@

max_worker = 1

##only for auto evaluation
##only support azure api for auto evaluation
azure_openai = True
azure_engine = "TrustLLM-GPT-4"
azure_embedding_engine='TrustLLM-embedding'
azure_api_version = "2023-08-01-preview"
azure_api_base = "https://trustllm-gpt-4.openai.azure.com/"
azure_api_key=''
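
A hedged sketch of wiring up the Azure-only auto-evaluation settings introduced above; the endpoint and key values are placeholders for your own deployment:

from trustllm import config

# Placeholders below; substitute your own Azure OpenAI deployment values.
config.azure_openai = True
config.azure_engine = "TrustLLM-GPT-4"                # chat deployment name
config.azure_embedding_engine = "TrustLLM-embedding"  # embedding deployment name
config.azure_api_version = "2023-08-01-preview"
config.azure_api_base = "https://<your-resource>.openai.azure.com/"
config.azure_api_key = "<your-azure-key>"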
41 changes: 16 additions & 25 deletions trustllm_pkg/trustllm/generation/generation.py
@@ -38,11 +38,12 @@ def __init__(self,
self.num_gpus = num_gpus
self.max_new_tokens = max_new_tokens
self.debug = debug
self.online_model_dict = get_models()[1]
self.online_model_list = get_models()[1]
self.model_mapping = get_models()[0]
self.device = device
self.use_replicate = use_replicate
self.use_deepinfra = use_deepinfra
self.model_name = model_mapping.get(self.model_path, "")

def _generation_hf(self, prompt, tokenizer, model, temperature):
"""
@@ -55,7 +56,7 @@ def _generation_hf(self, prompt, tokenizer, model, temperature):
:return: The generated text as a string.
"""

prompt = self._prompt2conversation(prompt)
prompt = prompt2conversation(prompt)
inputs = tokenizer([prompt])
inputs = {k: torch.tensor(v).to(self.device) for k, v in inputs.items()}
output_ids = model.generate(
@@ -74,14 +75,7 @@ def _generation_hf(self, prompt, tokenizer, model, temperature):
)
return outputs

def _prompt2conversation(self, prompt):
msg = prompt
conv = get_conversation_template(self.model_path)
conv.set_system_message('')
conv.append_message(conv.roles[0], msg)
conv.append_message(conv.roles[1], None)
conversation = conv.get_prompt()
return conversation


def generation(self, model_name, prompt, tokenizer, model, temperature=None):
"""
@@ -96,7 +90,7 @@ def generation(self, model_name, prompt, tokenizer, model, temperature=None):
"""

try:
if (model_name in self.online_model_dict) and ((self.online_model and self.use_replicate) or (self.online_model and self.use_deepinfra)):
if (model_name in self.online_model_list) and ((self.online_model and self.use_replicate) or (self.online_model and self.use_deepinfra)):
ans = gen_online(model_name, prompt, temperature, replicate=self.use_replicate, deepinfra=self.use_deepinfra)
else:
ans = self._generation_hf(prompt, tokenizer, model, temperature)
@@ -172,7 +166,7 @@ def process_file(self, data_path, save_path, model_name, tokenizer, model, file_
t.join()
file_process.save_json(saved_data, f"{save_path}")

def run_task(self, model_name, model, tokenizer, base_dir, file_config, key_name='prompt'):
def _run_task(self, model_name, model, tokenizer, base_dir, file_config, key_name='prompt'):
"""
Runs a specific evaluation task based on provided parameters.
@@ -205,7 +199,7 @@ def run_ethics(self, model_name, model, tokenizer):
"implicit_ETHICS.json": 0.0,
"implicit_SocialChemistry101.json": 0.0
}
self.run_task(model_name, model, tokenizer, base_dir, file_config)
self._run_task(model_name, model, tokenizer, base_dir, file_config)

def run_privacy(self, model_name, model, tokenizer):
base_dir = os.path.join(self.data_path, 'privacy')
@@ -214,7 +208,7 @@ def run_privacy(self, model_name, model, tokenizer):
'privacy_awareness_query.json': 1.0,
'privacy_leakage.json': 1.0,
}
self.run_task(model_name, model, tokenizer, base_dir, file_config)
self._run_task(model_name, model, tokenizer, base_dir, file_config)

def run_fairness(self, model_name, model, tokenizer):
base_dir = os.path.join(self.data_path, 'fairness')
@@ -225,7 +219,7 @@ def run_fairness(self, model_name, model, tokenizer):
'stereotype_query_test.json': 1.0,
'stereotype_recognition.json': 0.0,
}
self.run_task(model_name, model, tokenizer, base_dir, file_config)
self._run_task(model_name, model, tokenizer, base_dir, file_config)

def run_truthfulness(self, model_name, model, tokenizer):
base_dir = os.path.join(self.data_path, 'truthfulness')
@@ -236,7 +230,7 @@ def run_truthfulness(self, model_name, model, tokenizer):
"internal.json": 1.0,
"sycophancy.json": 1.0
}
self.run_task(model_name, model, tokenizer, base_dir, file_config)
self._run_task(model_name, model, tokenizer, base_dir, file_config)

def run_robustness(self, model_name, model, tokenizer):
base_dir = os.path.join(self.data_path, 'robustness')
@@ -246,7 +240,7 @@ def run_robustness(self, model_name, model, tokenizer):
'AdvGLUE.json': 0.0,
'AdvInstruction.json': 1.0,
}
self.run_task(model_name, model, tokenizer, base_dir, file_config)
self._run_task(model_name, model, tokenizer, base_dir, file_config)

def run_safety(self, model_name, model, tokenizer):
base_dir = os.path.join(self.data_path, 'safety')
@@ -256,9 +250,9 @@ def run_safety(self, model_name, model, tokenizer):
'misuse.json': 1.0,

}
self.run_task(model_name, model, tokenizer, base_dir, file_config)
self._run_task(model_name, model, tokenizer, base_dir, file_config)

def run_single_test(self):
def _run_single_test(self):
"""
Executes a single test based on specified parameters.
@@ -268,7 +262,7 @@ def run_single_test(self):
model_name = self.model_name
print(f"Beginning generation with {self.test_type} evaluation at temperature {self.temperature}.")
print(f"Evaluation target model: {model_name}")
if (model_name in self.online_model_dict) and ((self.online_model and self.use_replicate) or (self.online_model and self.use_deepinfra)):
if (model_name in self.online_model_list) and ((self.online_model and self.use_replicate) or (self.online_model and self.use_deepinfra)):
model, tokenizer = (None, None)
else:
model, tokenizer = load_model(
@@ -308,13 +302,10 @@ def generation_results(self, max_retries=10, retry_interval=3):
print(f"Dataset path {self.data_path} does not exist.")
return None

if self.use_replicate:
self.model_name = self.model_path
else:
self.model_name = model_mapping.get(self.model_path, "")

for attempt in range(max_retries):
try:
state = self.run_single_test()
state = self._run_single_test()
if state:
print(f"Test function successful on attempt {attempt + 1}")
return state
34 changes: 34 additions & 0 deletions trustllm_pkg/trustllm/task/pipeline.py
@@ -1,3 +1,6 @@
import os
import difflib
import inspect
from trustllm.task import ethics, fairness, privacy, robustness, safety, truthfulness
from trustllm.utils import file_process

@@ -233,13 +236,44 @@ def run_truthfulness(
}



def find_best_match(keyword, file_list):
"""Find the best match for a keyword in a list of filenames."""
matches = difflib.get_close_matches(keyword, file_list, n=1, cutoff=0.1)
return matches[0] if matches else None

def auto_assign_paths(all_folder_path, param_names):
"""Automatically assign paths based on parameter names and files in the given folder."""
files = os.listdir(all_folder_path)
paths = {}
for name in param_names:
# Convert parameter name to expected file name pattern
key = name.replace('_path', '')
expected_filename = f"{key}.json"
matched_file = find_best_match(expected_filename, files)
if matched_file:
paths[name] = os.path.join(all_folder_path, matched_file)
return paths

def run_fairness(
all_folder_path=None,
stereotype_recognition_path=None,
stereotype_agreement_path=None,
stereotype_query_test_path=None,
disparagement_path=None,
preference_path=None,
):
# param_info = inspect.signature(run_fairness).parameters
# paths = {param: None for param in param_info if param != 'all_folder_path'}

# # Auto-assign paths if all_folder_path is provided
# if all_folder_path is not None:
# auto_paths = auto_assign_paths(all_folder_path, paths.keys())
# paths.update((k, v) for k, v in auto_paths.items() if v is not None)

# # Update paths with explicitly provided paths
# local_vars = locals()
# paths.update((k, local_vars[k]) for k in paths if local_vars[k] is not None)
evaluator = fairness.FairnessEval()

(
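
A hypothetical use of the two path helpers added to pipeline.py above; the folder and parameter names are illustrative only, not from this commit:

from trustllm.task.pipeline import auto_assign_paths

# Fuzzy-match each *_path parameter to a file in a results folder
# (folder contents are hypothetical).
params = ["stereotype_recognition_path", "stereotype_agreement_path"]
paths = auto_assign_paths("generation_results/llama3-70b/fairness", params)
# e.g. {"stereotype_recognition_path": ".../stereotype_recognition.json", ...}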
45 changes: 27 additions & 18 deletions trustllm_pkg/trustllm/utils/embedder.py
@@ -1,4 +1,4 @@
import openai
from openai import OpenAI,AzureOpenAI
import os
import logging
from tqdm import tqdm
@@ -24,26 +24,35 @@ def __init__(self, save_dir='saved_embeddings'):
# Create the directory if it does not exist
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
openai.api_key = trustllm.config.openai_key


@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(6))
def get_embeddings(self, string):
"""
Retrieve embeddings for a given string.
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(6))
def get_embeddings(self,string, embedding_model='text-embedding-ada-002',):

if trustllm.config.azure_openai:
azure_endpoint = trustllm.config.azure_api_base
api_key = trustllm.config.azure_api_key
api_version = trustllm.config.azure_api_version
model = trustllm.config.azure_embedding_engine
client = AzureOpenAI(
azure_endpoint=azure_endpoint,
api_key=api_key,
api_version=api_version,
)
response = client.embeddings.create(
model=model,
input=string
)
else:
api_key = trustllm.config.openai_key
client = OpenAI(api_key=api_key,)
response = client.embeddings.create(
model=embedding_model,
input=string
)
return response.data[0].embedding

Args:
string (str): The text to embed.

Returns:
list: The embedding vector.
"""
if string is None:
string = ""
response = openai.Embedding.create(
model='text-embedding-ada-002', # Example model
input=string
)
return response["data"][0]["embedding"]

def save_embeddings(self, embeddings, filename):
"""
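
A minimal sketch of calling the rewritten embedding helper; the DataEmbedder class name is assumed from the surrounding file, and the key is a placeholder:

import trustllm.config
from trustllm.utils.embedder import DataEmbedder  # class name assumed

trustllm.config.azure_openai = False              # take the plain-OpenAI branch
trustllm.config.openai_key = "<your-openai-key>"  # placeholder

embedder = DataEmbedder(save_dir="saved_embeddings")
vector = embedder.get_embeddings("hello world")   # text-embedding-ada-002 by default
print(len(vector))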
1 change: 1 addition & 0 deletions trustllm_pkg/trustllm/utils/file_process.py
@@ -1,4 +1,5 @@
import json
import os


def load_json(file_path):
17 changes: 8 additions & 9 deletions trustllm_pkg/trustllm/utils/generation_utils.py
@@ -15,7 +15,7 @@
# deepinfra_model = trustllm.config.model_info['deepinfra_model']

model_info = trustllm.config.model_info
online_model = model_info['online_model']
online_model_list = model_info['online_model']
model_mapping = model_info['model_mapping']
rev_model_mapping = {value: key for key, value in model_mapping.items()}

@@ -51,7 +51,7 @@


def get_models():
return model_mapping, online_model
return model_mapping, online_model_list

def get_access_token():
url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={}&client_secret=".format(
@@ -108,19 +108,19 @@ def get_res_openai(string, model, temperature):
return response



def deepinfra_api(string, model, temperature):
api_token = trustllm.config.deepinfra_api


top_p = 1 if temperature <= 1e-5 else 0.9

client = OpenAI(api_key=api_token,api_base="https://api.deepinfra.com/v1/openai")
OpenAI(api_key=api_token,api_base="https://api.deepinfra.com/v1/openai")
stream = client.chat.completions.create(
model=rev_model_mapping[model],
messages=[{"role": "user", "content": string}],
max_tokens=5192,
max_tokens=5192,
temperature=temperature,
top_p=top_p,)
top_p=top_p,)
response = stream.choices[0].message.content
return response

@@ -133,8 +133,7 @@ def replicate_api(string, model, temperature):
else:
input["prompt"]=prompt2conversation(rev_model_mapping[model],string)
os.environ["REPLICATE_API_TOKEN"] = trustllm.config.replicate_api
res = replicate.run(
model,
res = replicate.run(rev_model_mapping[model],
input=input
)
res = "".join(res)
@@ -204,7 +203,7 @@ def zhipu_api(string, model, temperature):
return response.choices[0].message.content


@retry(wait=wait_random_exponential(min=1, max=3), stop=stop_after_attempt(3))
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def gen_online(model_name, prompt, temperature, replicate=False, deepinfra=False):
if model_name in model_info['wenxin_model']:
res = get_ernie_res(prompt, temperature=temperature)
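
In isolation, the retry-policy change on gen_online amounts to the tenacity decorator below: up to five attempts, sleeping a random exponential backoff of 1 to 10 seconds between them. The wrapped function here is only a stand-in:

import random
from tenacity import retry, stop_after_attempt, wait_random_exponential

@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def call_online_model():
    # Stand-in for a flaky network call such as gen_online(...).
    if random.random() < 0.5:
        raise ConnectionError("transient failure")
    return "ok"

print(call_online_model())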
2 changes: 1 addition & 1 deletion trustllm_pkg/trustllm/utils/gpt_auto_eval.py
@@ -80,7 +80,7 @@ def __init__(self, save_dir='saved_evaluations'):
self.max_worker = trustllm.config.max_worker
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
openai.api_key = trustllm.config.openai_key
#openai.api_key = trustllm.config.openai_key

def save_progress(self, data, filename='auto_eval.json'):
"""
