create_agglomerative_clustering.py
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict
import time
import argparse
import dill as pickle
from memory_profiler import profile
print("USAGE: create_agglomerative_clustering.py -p <POINT_FILE> -v <VOCAB_FILE> -k <CLUSTERS> -o <OUTPUT_FOLDER>")
@profile
def agglomerative_cluster(points, vocab, K, output_path, ref=''):
"""
Uses the point.npy, vocab.npy files of a layer (generated using https://github.com/hsajjad/ConceptX/ library) to produce a clustering of <K> clusters at <output_path> named clusters-agg-{K}.txt
"""
print('Starting agglomerative clustering...')
clustering = AgglomerativeClustering(n_clusters=K,compute_distances=True).fit(points)
print('Finished clustering')
fn = f"{output_path}/model-{K}-agglomerative-clustering{ref}.pkl"
with open(fn, "wb") as fp:
pickle.dump(clustering,fp)
clusters = defaultdict(list)
for i,label in enumerate(clustering.labels_):
clusters[clustering.labels_[i]].append(vocab[i])
# Write Clusters in the format (Word|||WordID|||SentID|||TokenID|||ClusterID)
out = ""
for key in clusters.keys():
for word in clusters[key]:
out += word+"|||"+str(key)+"\n"
with open(f"{output_path}/clusters-agg-{K}{ref}.txt",'w') as f:
f.write(output)
return out
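
# For later analysis, the pickled model can be reloaded to inspect cluster
# labels or merge distances. A minimal sketch, assuming an illustrative
# output path and K:
#
#   with open("results/model-600-agglomerative-clustering.pkl", "rb") as fp:
#       clustering = pickle.load(fp)
#   print(clustering.n_clusters_, clustering.distances_.shape)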

parser = argparse.ArgumentParser()
parser.add_argument("--vocab-file", "-v", help="vocab file with complete path")
parser.add_argument("--point-file", "-p", help="point file with complete path")
parser.add_argument("--output-path", "-o", help="output path for the clustering model and result files")
parser.add_argument("--cluster", "-k", help="number of clusters (K)")
parser.add_argument("--count", "-c", help="fraction of points to use; -1 uses all points", default=-1)
args = parser.parse_args()

vocab_file = args.vocab_file
point_file = args.point_file
output_path = args.output_path
point_count_ratio = float(args.count)
K = int(args.cluster)

# Load the vocab once, then optionally truncate both vocab and points to the
# requested fraction. A slice stop of None keeps every element.
vocab = np.load(vocab_file)
original_count = len(vocab)
usable_count = int(point_count_ratio * original_count) if point_count_ratio > 0 else None
vocab = vocab[:usable_count]
points = np.load(point_file)[:usable_count, :]

start_time = time.time()
ref = '-' + str(point_count_ratio) if point_count_ratio > 0 else ''
output = agglomerative_cluster(points, vocab, K, output_path, ref)
end_time = time.time()
print(f"Runtime: {end_time - start_time:.2f} seconds")