-
Notifications
You must be signed in to change notification settings - Fork 1
/
gen_semantic.py
152 lines (124 loc) · 4.81 KB
/
gen_semantic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from argparse import ArgumentParser
import os
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
from fastembed import TextEmbedding
from sklearn.metrics import pairwise_distances_argmin_min
import pandas as pd
import numpy as np
import openai
from utils import validate_template, load_project, init_qdrant, SEED
# Seed NumPy's global RNG so the random selections made in this script
# (np.random.choice) are reproducible for a given SEED.
np.random.seed(SEED)
# Module-level counters, all initialised to zero. Nothing in the visible code
# updates them -- presumably meant for LLM usage accounting; TODO confirm.
LLM_CALLS = 0
PROMPT_TOKENS = 0
COMPLETION_TOKENS = 0
def select_diverse_vectors(vectors, k=5):
    """Greedily pick up to ``k`` mutually distant rows of ``vectors``.

    The first index is drawn at random from NumPy's global RNG. Each later
    pick is the not-yet-selected row whose cosine distance to its nearest
    already-selected row is largest.

    Args:
        vectors: Array-like that supports ``len()`` and indexing with a list
            of row indices (e.g. a 2-D ``numpy.ndarray``).
        k: Number of indices requested. Values larger than ``len(vectors)``
            are clamped, since no more rows than exist can be selected.

    Returns:
        A list of at most ``k`` distinct row indices, in selection order.
        The list is empty when ``vectors`` is empty or ``k <= 0``.
    """
    n = len(vectors)
    # Clamp so the pool of remaining rows is never empty inside the loop;
    # an empty pool would make the distance computation fail.
    k = min(k, n)
    if k <= 0:
        return []

    diverse_indices = [int(np.random.choice(range(n)))]
    for _ in range(k - 1):
        chosen = set(diverse_indices)
        # Ascending order keeps np.argmax tie-breaking deterministic.
        remaining_indices = [i for i in range(n) if i not in chosen]
        # [1] is the distance from each remaining row to its closest
        # already-selected row.
        distances = pairwise_distances_argmin_min(
            vectors[remaining_indices], vectors[diverse_indices], metric="cosine"
        )[1]
        next_index = remaining_indices[int(np.argmax(distances))]
        diverse_indices.append(next_index)
    return diverse_indices
# System prompt sent as the "system" message of the chat-completion request
# built in `ask`. It is runtime text: editing it changes what the model is told.
instruction = """As an expert in system log analysis, your task is to generate a semantic variable name for the given log messages and their corresponding templates.
### Instructions:
1. Replace all instances of `<*>` in the templates with `{variable_name}` that accurately reflects the semantics of the variable.
2. If a template consists of only static text (i.e., no variables `<*>`), leave it unchanged.
3. Output only the updated semantic template.
Be specific and ensure that the variable names are descriptive and contextually appropriate."""
def ask(logs, template):
    """Request a semantic version of ``template`` from the chat model.

    Args:
        logs: Iterable of example log messages; each becomes one ``- `` bullet
            in the user prompt.
        template: The log template to be rewritten.

    Returns:
        The ``content`` of the first choice in the chat-completion response.
    """
    log_prompt = "\n".join(f"- {entry}" for entry in logs)
    # Echo the example logs that are about to be sent.
    print(log_prompt)

    system_message = {
        "role": "system",
        "content": instruction,
    }
    user_message = {
        "role": "user",
        "content": f"[logs]\n{log_prompt}\n\n[template]\n{template}",
    }
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        seed=SEED,
        temperature=0.2,
    )
    first_choice = response.choices[0]
    return first_choice.message.content
if __name__ == "__main__":
    # Script entry point: embed a project's logs into Qdrant, sample some
    # templates, ask the LLM for a semantic version of each one, store the
    # result on the matching points, then snapshot the collection.
    parser = ArgumentParser()
    parser.add_argument(
        "project",
        type=str,
        help="Mac,Android,Thunderbird,HealthApp,OpenStack,OpenSSH,Proxifier,HPC,Zookeeper,Hadoop,Linux,HDFS,BGL,Windows,Apache,Spark",
    )
    # --shot is used twice below: as the number of templates to sample and as
    # the number of example logs shown to the LLM per template.
    parser.add_argument(
        "--shot",
        type=int,
        help="Number of templates to see",
        default=4,
    )
    args = parser.parse_args()
    project = args.project

    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    client = QdrantClient("http://localhost:6333")
    embedding_model = TextEmbedding()
    client.set_model(embedding_model.model_name)

    # Look up the embedding dimension of the active model.
    ndim = next(
        model["dim"]
        for model in embedding_model.list_supported_models()
        if model["model"] == embedding_model.model_name
    )

    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would silently skip the initialisation call itself.
    if not init_qdrant(client, ndim, project, True):
        raise RuntimeError(
            f"Failed to initialise Qdrant collection for project {project!r}"
        )

    # Drop existing snapshots; a fresh one is created at the end of the run.
    for snapshot in client.list_snapshots(project):
        client.delete_snapshot(project, snapshot.name)

    df = load_project(project)
    embedding_generator = embedding_model.embed(df.log.values)
    client.upload_collection(
        project,
        vectors=embedding_generator,
        ids=df.index,
        payload=df.to_dict(orient="records"),
    )

    templates = df.template.unique()
    print(f"Total templates: {len(templates)}")
    # Sampling without replacement raises if more items are requested than
    # exist, so never ask for more templates than are available.
    sample_size = min(args.shot, len(templates))
    templates = np.random.choice(templates, sample_size, replace=False)
    print(f"Randomly selected templates: {len(templates)}")

    for template in templates:
        # One filter serves both the lookup and the payload update below.
        template_filter = models.Filter(
            must=[
                models.FieldCondition(
                    key="template", match=models.MatchValue(value=template)
                )
            ],
        )
        # NOTE(review): no explicit limit is passed, so only the client's
        # default page of records is considered per template -- confirm that
        # this is intended.
        records, _ = client.scroll(
            project,
            template_filter,
            with_vectors=True,
        )
        log_vectors = np.array([record.vector for record in records])

        if len(log_vectors) < args.shot:
            # Too few logs to choose from: use all of them.
            logs = [record.payload["log"] for record in records]
        else:
            indices = select_diverse_vectors(log_vectors, args.shot)
            logs = [records[i].payload["log"] for i in indices]

        semantic_template = ask(logs, template)
        print(template)
        print(semantic_template)

        # Save the semantic template and 'seen' flag
        client.set_payload(
            project,
            payload={"seen": True, "semantic_template": semantic_template},
            points=template_filter,
        )

    snapshot = client.create_snapshot(project)
    print(f"Snapshot created: {snapshot.name}")
# [using training data]
# 1. select 3 diverse logs for each template
# 2. ask the expert to generate the semantic template
# 3. save the semantic template
# 4. use the semantic template to prompt the model