-
Notifications
You must be signed in to change notification settings - Fork 2
/
a_creat_metrix.py
174 lines (133 loc) · 6.63 KB
/
a_creat_metrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import pandas as pd
import json
import os
import time
import sys
import pickle
import datetime
from scipy.sparse import csr_matrix
from scipy import sparse
import scipy
# Wall-clock start of the program; run() subtracts it from time.time() to report the elapsed time.
start_time = time.time()
#***************************************************************************************************
#*
#* This program searches for all malware-feature JSON files under the target folder,
#* then combines them into one large JSON file to be used as ML training data.
#* {apk name 01 : { activity01: 1, activity02: 1, activity03: 1 ...}, apk name 02: { activity01: 1, activity02: 1, activity03: 1 ...} ... }
#*
#*********************************************************************
#* Get a list of the paths of all JSON files under the dataset location
def get_all_json_list(path):
    """Collect the paths of all ``.json`` files found beneath ``path``.

    Args:
        path: Root directory of the dataset. Leading/trailing whitespace is
            ignored and a trailing ``/`` is appended when it is missing.

    Returns:
        A list of file paths (directory and file name joined with ``/``),
        in the order in which ``os.walk`` visits them.
    """
    # Only trim the surrounding whitespace. Removing *every* space would turn
    # a directory such as "my data" into the non-existent "mydata", and
    # os.walk would then silently yield nothing.
    path = path.strip()
    if not path.endswith('/'):
        path = path + '/'
    json_list = []
    for dirpath, _dirnames, filenames in os.walk(path):
        for f in filenames:
            if f.endswith(".json"):
                # Joined with "/" on purpose: the label/name helpers in this
                # file split paths on "/".
                json_list.append(dirpath + '/' + f)
    return json_list
#* Get the family label from each file path; normally it is the second-to-last path component (the file's parent folder)
def get_family_label(all_path):
    """Return the second-to-last "/"-separated component of every path.

    Args:
        all_path: Iterable of "/"-separated file paths.

    Returns:
        A list with one label per input path, in input order; repeated
        labels are kept.
    """
    # Index -2 of the split path is the folder that directly holds the file.
    return [one_path.split("/")[-2] for one_path in all_path]
#* Get the file name from a path: the last path component, truncated to its first 32 characters
def get_file_name(file):
    """Return the first 32 characters of the last "/"-separated path component.

    Args:
        file: A "/"-separated file path.

    Returns:
        The (at most 32 character long) prefix of the file's base name.
    """
    # NOTE(review): the 32-character cut presumably isolates a hash-like
    # sample identifier at the start of the file name -- confirm with the data.
    base_name = file.rsplit("/", 1)[-1]
    return base_name[:32]
#* Return a dict that maps each file name to its family label
def create_family_mapping(json_list, family_label):
    """Map the (shortened) name of every JSON file to its family label.

    Args:
        json_list: List of JSON file paths.
        family_label: List of labels; entry ``i`` belongs to ``json_list[i]``.

    Returns:
        A dict ``{file name: family label}``. When two paths produce the same
        file name, the later one wins.
    """
    name_to_family = {}
    for position, json_path in enumerate(json_list):
        sample_name = get_file_name(json_path)
        name_to_family[sample_name] = family_label[position]
    return name_to_family
#* Return a dict that maps each file name to its family label encoded as a number
def conver_result_2_number(family_mapping):
    """Replace every family label in ``family_mapping`` by an integer id.

    Ids are assigned in order of first appearance while iterating the
    mapping's values: the first distinct label gets 0, the next one 1, ...

    Args:
        family_mapping: Dict ``{file name: family label}``; it is not modified.

    Returns:
        A copy of ``family_mapping`` whose values are the integer ids.
    """
    # One pass builds label -> id, so each file needs a single O(1) lookup
    # instead of a scan over the whole list of distinct labels.
    family_index = {}
    for family in family_mapping.values():
        family_index.setdefault(family, len(family_index))

    family_mapping_number = family_mapping.copy()
    for file_name, family in family_mapping.items():
        family_mapping_number[file_name] = family_index[family]
    return family_mapping_number
def number_mapped_name(num, name):
    """Build the lookup from family number to family name.

    Args:
        num: Dict ``{key: family number}``.
        name: Dict ``{key: family name}``; must contain every key of ``num``.

    Returns:
        A dict ``{family number: family name}``. Keys sharing a number
        overwrite each other, so the last one iterated wins.
    """
    return {family_number: name[key] for key, family_number in num.items()}
#* Read a JSON file and return a dict
def read_json_file_as_dict(file):
    """Load one JSON file and wrap its content under the file's short name.

    Args:
        file: Path of the JSON file to read.

    Returns:
        A single-entry dict ``{get_file_name(file): parsed JSON content}``.
    """
    sample_name = get_file_name(file)
    with open(file) as handle:
        # NOTE(review): the content is presumably a dict of features --
        # confirm against the dataset.
        features = json.load(handle)
    return {sample_name: features}
#* Append {'Malware family': number} to the target dict
def adding_family_labels(malware_dict, family_dict):
    """Store the family number of a sample inside its feature dict.

    Only the first entry of ``malware_dict`` is considered. If its key is
    present in ``family_dict``, the entry's value (a dict) receives
    ``'Malware family': family_dict[key]``.

    Args:
        malware_dict: Dict ``{file name: feature dict}``; modified in place.
        family_dict: Dict ``{file name: family number}``.

    Returns:
        ``malware_dict`` itself (unchanged when it is empty or its first key
        has no entry in ``family_dict``).
    """
    if not malware_dict:
        return malware_dict
    # next(iter(...)) works on Python 2 and 3 alike; indexing .keys() or
    # .values() raises TypeError on Python 3 because they are views.
    first_name = next(iter(malware_dict))
    if first_name in family_dict:
        malware_dict[first_name]['Malware family'] = family_dict[first_name]
    return malware_dict
#* Merge dict_2 into dict_1
def combine_two_data(data_fdict_1, data_fdict_2):
    """Merge ``data_fdict_2`` into ``data_fdict_1`` in place.

    Entries of ``data_fdict_2`` overwrite entries of ``data_fdict_1`` that
    share the same key.

    Returns:
        ``data_fdict_1`` itself (the same object, now updated).
    """
    merged = data_fdict_1  # same object: the caller's dict is mutated
    merged.update(data_fdict_2)
    return merged
#* Save a dict as a JSON file in the saving location
def save_dict_result_in(saving_location, data_dict, name):
    """Serialise ``data_dict`` as JSON into ``saving_location``.

    The file is named ``<name>_<YYYY_MM_DD_HH_MM>.json`` using the current
    local time.

    Args:
        saving_location: Target directory. Leading/trailing whitespace is
            ignored and a trailing ``/`` is appended when it is missing.
        data_dict: JSON-serialisable object to write.
        name: Prefix of the output file name.

    Returns:
        The path of the file that was written.
    """
    time_stamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
    # Only trim surrounding whitespace; deleting inner spaces would redirect
    # the output to a different (usually non-existent) directory.
    saving_location = saving_location.strip()
    if not saving_location.endswith('/'):
        saving_location = saving_location + '/'
    final_location = saving_location + name + '_' + time_stamp + '.json'
    # The context manager closes the handle even when serialisation or the
    # write itself raises.
    with open(final_location, 'w') as jsonfile:
        print("Writting dict into json.............")
        jsonfile.write(json.dumps(data_dict))
    print("Writting dict into json.............Done")
    return final_location
def remove_key(name, dict_data):
    """Delete the entry ``name`` from every nested value of ``dict_data``.

    Args:
        name: Key to remove from the nested containers.
        dict_data: Dict whose values are containers (e.g. dicts); the nested
            containers are modified in place.

    Returns:
        ``dict_data`` itself.
    """
    for nested in dict_data.values():
        if name in nested:
            del nested[name]
    return dict_data
def _load_labelled_features(json_path, family_numbers):
    """Load one feature file, attach its family number and drop its 'sha256' entry."""
    sample_dict = read_json_file_as_dict(json_path)
    sample_dict = adding_family_labels(sample_dict, family_numbers)
    return remove_key('sha256', sample_dict)


def run(data_set_location, saving_location):
    """Merge every feature JSON file of a dataset into one labelled JSON file.

    Two files are written into ``saving_location``:

    * ``number_2_family_name_<timestamp>.json`` -- family number -> family name
    * ``malware_feature_metrix_<timestamp>.json`` -- file name -> features,
      each with a ``'Malware family'`` number and without ``'sha256'``

    Args:
        data_set_location: Root directory that is searched for ``.json`` files.
        saving_location: Directory that receives the two output files.

    Raises:
        IndexError: If no ``.json`` file is found under ``data_set_location``.
    """
    json_list = get_all_json_list(data_set_location)
    if not json_list:
        # Indexing the empty list used to raise IndexError further down, after
        # a useless mapping file had already been written. The exception type
        # is kept, but it is raised up front with an explanatory message.
        raise IndexError("no .json files found under %r" % (data_set_location,))

    # One label per path, repeats included, e.g. [family a, family b, family a, ...]
    family_label = get_family_label(json_list)
    # file name -> family name
    family_mapping = create_family_mapping(json_list, family_label)
    # file name -> family number
    conver_list = conver_result_2_number(family_mapping)
    # family number -> family name
    mapping_dict = number_mapped_name(conver_list, family_mapping)
    save_dict_result_in(saving_location, mapping_dict, 'number_2_family_name')

    # The first file seeds the combined dict; every other file is merged into it.
    parent_data_dict = _load_labelled_features(json_list[0], conver_list)
    for rounds, file in enumerate(json_list[1:], 1):
        current_data_dict = _load_labelled_features(file, conver_list)
        print("This is %d round "%rounds)
        parent_data_dict = combine_two_data(parent_data_dict, current_data_dict)

    save_dict_result_in(saving_location, parent_data_dict, 'malware_feature_metrix')
    # start_time is the module-level timestamp taken when the program started.
    print("This program takes %f second"%(time.time()-start_time))
if __name__ == "__main__":
    # Command line: <script> <data_set_location> <saving_location>
    if len(sys.argv) < 3:
        # Fail with a usage hint instead of a bare IndexError on sys.argv.
        sys.exit("usage: %s <data_set_location> <saving_location>" % sys.argv[0])
    data_set_location = sys.argv[1]
    #! './home/AMD_data'/
    saving_location = sys.argv[2]
    #! ' ./ '
    run(data_set_location, saving_location)