c_RandomForest.py

import json
import pandas as pd 
import numpy as np 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import datetime
import sys
import time
import os
from sklearn.tree import export_graphviz
from subprocess import call
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import graphviz 
import pickle
import copy

#******************************************************************************
#*  This program reads three JSON files.
#*  The first is malware_feature.json, which holds the training data for the ML model.
#*  The second is the behavior file: a dict mapping each malware family to its corresponding malicious behaviors.
#*  The third is the family_number file: a dict mapping each family number to its family name.
#*
#*  This program combines the three JSON files above into training data for a Random Forest model.
#*
#*
#* *******************************************************************************

#*  open family json file that is a dict {family number : malware family name....}
#*  print what malware families in this json file
#*  return a family dict for later using 
def get_family_dict_from(file):
    """Load the family-number -> family-name mapping and print it.

    The JSON file must contain a dict whose keys are family numbers stored
    as strings, e.g. {"0": "FamilyA", "1": "FamilyB"}.  The entries are
    printed in ascending numeric order.

    Returns the dict exactly as loaded (keys remain strings).
    Raises ValueError if a key is not an integer string.
    """
    # 'with' closes the handle even when json.load() raises.
    with open(file) as f_file:
        f_dict = json.load(f_file)

    # Numeric sort so that '10' comes after '9'.  sorted() over a generator
    # works on Python 2 and 3; indexing/sorting dict.keys() only works on 2.
    f_keys = sorted(int(key) for key in f_dict)

    print('Below are family data--------------------\n\n')
    for num in f_keys:
        print([num, f_dict[str(num)]])
    print('\n')
    return f_dict

#* open malware behavior labels json file
#* print all malware behaviors labels
#*  and creat a dict {label-number: number} for later using
def get_label_dict(file):
    """Load the behavior-label file and number its labels.

    The JSON file must contain an 'all' entry whose keys are the behavior
    label names.  The names are sorted alphabetically and printed together
    with their position.

    Returns a dict mapping the position as a string ('0', '1', ...) to the
    label name at that position.
    """
    # 'with' releases the handle; the original never closed it.
    with open(file) as la_file:
        la_dict = json.load(la_file)
    label_list = sorted(la_dict['all'])

    new_dict = {}
    print('Below are behavior labels --------------------\n\n')
    for num, label in enumerate(label_list):
        new_dict[str(num)] = label
        print([num, label])
    print('\n')
    return new_dict

#* create a list that specify what need to be removed
def set_up_removed_label_list(num, num2):
    """Build the list of "labels to remove" strings for a run.

    num  -- total number of behavior labels (labels are numbered 0..num-1).
    num2 -- selector string.
            * several comma separated numbers ('0,2'): one entry that lists
              every label NOT named in num2;
            * '77': one entry per label i, each listing every label except i;
            * '99': the literal ['99'];
            * any other single number k: only the k-th entry of the '77' list.

    Returns a list of comma separated strings.
    """
    tokens = num2.split(',')

    if len(tokens) != 1:
        chosen = [int(token) for token in tokens]
        others = [str(idx) for idx in range(num) if idx not in chosen]
        return [','.join(others)]

    selector = int(num2)
    # Entry i lists every label number except i itself.
    all_but_one = [
        ','.join(str(other) for other in range(num) if other != skipped)
        for skipped in range(num)
    ]

    if selector == 77:
        return all_but_one
    if selector == 99:
        return ['99']
    return [all_but_one[selector]]

#* create part of file name from removed behavior labels ex. if removed label list is [0,1,2,3], the part of file name will be label-0-1-2-3
def create_file_name_from_removed_bahavior(string, dic):
    """Return the file-name fragment describing the labels that stay in use.

    string -- comma separated label numbers that are removed, or '99'.
    dic    -- label dict; only its length (the number of labels) is used.

    '99' yields 'all-label'.  Otherwise every label number in
    0..len(dic)-1 that is not listed in `string` is appended to 'label'
    with a '-' in front, e.g. 'label-1-3'.
    """
    if string == '99':
        return 'all-label'

    # Joining and re-splitting keeps the original quirk that an empty dic
    # still produces one empty token ([''] rather than []).
    candidates = ','.join(str(idx) for idx in range(len(dic))).split(',')
    removed_tokens = string.split(',')
    remaining = [token for token in candidates if token not in removed_tokens]

    return ''.join(['label'] + ['-' + token for token in remaining])

def file_to_each_list(family_file):
    """Split the family numbers of a family file into fixed-size groups.

    The JSON file must contain a dict keyed by family numbers (as strings).
    The user is asked on stdin how many families each group should hold.

    Returns a list of comma separated strings, one per complete group, in
    ascending numeric order (e.g. ['0,1', '2,3']).  Families that do not
    fill a complete last group are left out.
    Raises ZeroDivisionError if the user enters 0.
    """
    # 'with' releases the handle; the original never closed it.
    with open(family_file) as family_json:
        data = json.load(family_json)
    family_numbers = sorted(int(key) for key in data)

    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    num = int(read_line('How many family you want to fetch from size %d family files for each running\n'%(len(family_numbers))))

    sample = []
    # Floor division keeps the Python 2 meaning of the original "/".
    for group_no in range(len(family_numbers) // num):
        group = family_numbers[group_no * num:(group_no + 1) * num]
        sample.append(','.join(str(number) for number in group))

    return sample

#* create part of file name from removed malware families ex. if removed family list is [0,1,2,3], the part of file name will be family-0-1-2-3
def create_file_name_from_removed_families(string_content): #*'1,2,3'
    """Return the file-name fragment for the removed families.

    '99' yields 'No-removed-family'; otherwise each comma separated token
    is appended to 'family' with a '-' in front ('1,2,3' -> 'family-1-2-3').
    """
    if string_content == '99':
        return 'No-removed-family'

    suffix = ''.join('-%s'%(token) for token in string_content.split(','))
    return 'family' + suffix

#* creat all  log file and record some basic information like file name, saving location an where are json files from.
def basic_question (time_stamp):
    """Create the run's log file and record where its inputs come from.

    Reads the module-level settings file_name_g, save_log_location_g,
    feature_file_g, behavors_file_g, family_file_g and f_dict.  The settings
    are echoed to stdout and written to the log, followed by one line per
    family (f_dict is read with keys '0' .. str(len(f_dict) - 1)).

    Returns (log, folder_loc) as produced by creat_log().
    """
    log, folder_loc = creat_log(save_log_location_g, file_name_g, time_stamp)

    console_lines = (
        '*****************File name = %s \n'%(file_name_g),
        '*****************Log is saved in %s \n'%(save_log_location_g),
        '*****************Get feature file from %s \n'%(feature_file_g),
        '*****************Get behavior file from %s \n'%(behavors_file_g),
        '*****************Get family file from %s \n'%(family_file_g),
    )
    for line in console_lines:
        print(line)

    log_lines = (
        "\nLog is saved in %s \n"%(save_log_location_g),
        "\nGet feature file from %s \n"%(feature_file_g),
        "\nGet behavior file from %s\n"%(behavors_file_g),
        "\nGet family file from %s\n"%(family_file_g),
        "\nThis family file include below families  >>\n",
    )
    for line in log_lines:
        write_log(log, line)

    for family_no in range(len(f_dict)):
        write_log(log, "%d, %s\n" %(family_no, f_dict[str(family_no)]))

    return log, folder_loc

def creat_log(location, file_name, time_stamp):
    """Create the result folder and open a fresh log file inside it.

    location   -- directory under which the result folder is created; it is
                  created as well when it does not exist yet.
    file_name  -- base name used for both the folder and the log file.
    time_stamp -- string appended to the base name.

    The folder is '<location>/<file_name>_<time_stamp>_result' and the log
    is '<file_name>_<time_stamp>.log' inside it.  A two-line header is
    written to the log.

    Returns (log, folder_loc): the open, writable log file object (the
    caller is responsible for closing it) and the folder path with a
    trailing '/'.
    """
    if not location.endswith('/'):
        location = location + '/'

    folder_loc = location + file_name + '_' + time_stamp + '_result'
    # makedirs instead of mkdir so a missing `location` no longer fails.
    # Attempt-then-check avoids the race of "exists()" followed by "mkdir()".
    try:
        os.makedirs(folder_loc)
    except OSError:
        if not os.path.isdir(folder_loc):
            raise

    log = open(folder_loc + '/' + file_name + '_' + time_stamp + '.log', 'w')
    log.write("\n")
    log.write("result of %s\n"%(file_name))

    return log, folder_loc + '/'

#* logged malware family names that would be removed  from training data
#* if the number is 99, then, no malware family have been removed
#* And return a list [ ] that inlcude what family would be chosen  in order to later using
def question_removed_family(log, csv_data):
    """Record which families are excluded from training.

    Reads the module-level removed_family_g (comma separated family
    numbers, '99' meaning "remove nothing") and f_dict.  Every family name
    is marked 'v' under csv_data['Family_data']['Training']; the removed
    ones are then moved to csv_data['Family_data']['Removed'].

    Returns (removed family numbers as a sorted list of strings, csv_data);
    the list is empty when nothing is removed.
    """
    removed_family_local = sorted(str(token) for token in removed_family_g.split(','))   #* '9,1,10' -> ['1', '10', '9']
    keep_all = removed_family_local[-1] == '99'

    if keep_all:
        removed = []
        print('This ML model will trained by all family\n')
        write_log(log,'\n This ML model will trained by all family \n')
    else:
        removed = [f_dict[number] for number in removed_family_local]
        print('Below families has been removed_family >>\n %s \n'%(removed))
        write_log(log,'\nBelow families has been removed_family >>\n %s \n'%(removed_family_local))
        write_log(log,'\nBelow families has been removed_family >>\n %s \n'%(removed))

    #!----------------------------------------------------
    for family_number in f_dict:
        csv_data['Family_data']['Training'].update({ f_dict[family_number] : 'v' })  #!  'v' can changed to j (int)

    if keep_all:
        csv_data['Family_data']['Removed'].update({ })
        return [ ], csv_data

    for family_name in removed:
        moved = { family_name : csv_data['Family_data']['Training'].pop(family_name) }
        csv_data['Family_data']['Removed'].update(moved)
    #!----------------------------------------------------
    return removed_family_local, csv_data

#* logged malware family names that would be put into testing data
#* if the number is 99, then, all malware families would be in testing data
#* And return a list [ ] that inlcude what family would be chosen  in order to later using
def question_tested_family(log, csv_data):
    """Record which families are used as test data.

    Reads the module-level tested_family_g (comma separated family numbers,
    '99' meaning "no explicit test family") and f_dict.  The chosen family
    names -- or every family name for '99' -- are marked 'v' under
    csv_data['Family_data']['Testing'].

    Returns (tested family numbers as a sorted list of strings, csv_data);
    the list is empty for '99'.
    """
    tested_family_local = sorted(str(token) for token in tested_family_g.split(','))   #* '9,1,10' -> ['1', '10', '9']

    if tested_family_local[-1] == '99':
        print('No family was selected to test\nThus, system will do self_testing')
        write_log(log,'No family was selected to test\nThus, system will do self_testing')
        marked_names = [f_dict[family_number] for family_number in f_dict]
        selected = [ ]
    else:
        marked_names = [f_dict[number] for number in tested_family_local]
        print('Below families would be put in test data >>\n %s \n'%(marked_names))
        write_log(log,'\nBelow families would be put in test data >>\n %s \n'%(tested_family_local))
        write_log(log,'\nBelow families  would be put in test data >>\n %s \n'%(marked_names))
        selected = tested_family_local

    #!----------------------------------------------------
    for family_name in marked_names:
        csv_data['Family_data']['Testing'].update({ family_name : 'v' })  #!  'v' can changed to j (int)
    #!----------------------------------------------------
    return selected, csv_data

#* Log whether the malware family label is kept in y_training and y_target.
#* include_family_g == 1 is reported as "with family labels"; any other value is reported as "without family labels" (the family label is left out when we want to test zero-day families).
def question_include_family(log):
    """Log and print whether the target keeps the family label.

    Driven by the module-level include_family_g: the value 1 means "with
    family labels", anything else "without".
    """
    if include_family_g == 1:
        write_log(log, "\nThis target with family labels\n")
        print('This target with family labels\n')
        return

    write_log(log, "\nThis target without family labels\n")
    print('This target without family labels\n')

def show_dict_col(file):
    """Return the behavior label names of a behavior file, sorted.

    The JSON file must contain an 'all' entry; its keys are the label names.
    """
    # 'with' releases the handle; the original never closed it.
    with open(file) as dic:
        data = json.load(dic)

    return sorted(data['all'])


#* logged malware behavior labels that would be removed  from training data (y_training or y_testing)
#* if the number is 99, then, no malware labels have been removed
#* And return a list [ ] that inlcude what label would be chosen in order to later using
def question_remove_label(log, csv_data):
    """Record which behavior labels are dropped from the target.

    Reads the module-level removed_label_g (comma separated label numbers,
    99 meaning "remove nothing") and the label names of behavors_file_g.
    csv_data['Labels_data'] is replaced by {'Used': ..., 'Removed': ...},
    where every label name maps to 'v'.

    Returns (names of the removed labels, csv_data); the list is empty when
    nothing is removed.
    """
    all_be = show_dict_col(behavors_file_g)  #* sorted list of all malicious label names
    removed_label = sorted(int(token) for token in removed_label_g.split(','))  #* '9,1,10' -> [1, 9, 10]

    if removed_label[-1] == 99:
        remove = [ ]
        write_log(log, "\nThis model with all behavior labels\n")
        print('This model with all behavior labels\n')
    else:
        remove = [all_be[number] for number in removed_label]
        print('removed_belable =>>\n%s\n'%(removed_label))
        print('remove =>>\n%s\n'%(remove))
        write_log(log, "\nThis model without below labels >> \n %s \n"%(removed_label))
        write_log(log, "\nThis model without below labels >> \n %s \n"%(remove))

    #!----------------------------------------------------
    csv_data.update({ 'Labels_data':{ } })
    labels_state = {'Used' : {},'Removed' : {} }

    for label_name in all_be:
        labels_state['Used'][label_name] = 'v'  #!  'v' can changed to j (int)
    for label_name in remove:
        moved = { label_name : labels_state['Used'].pop(label_name) }
        labels_state['Removed'].update(moved)
    csv_data['Labels_data'].update(labels_state)
    #!----------------------------------------------------
    return remove, csv_data

#* This is the setting for times, testsize and n_samplese
def parameter_setting(log):
    """Log and return the fixed run parameters.

    Returns (times, testsize, n_samples): 10 repetitions, a 0.3 test split
    and 50 samples drawn from X_test.  The values are hard-coded; the
    prompts that used to ask for them are disabled.
    """
    times, testsize, n_samples_g = 10, 0.3, 50

    write_log(log, "\nThis modle will run %d times\n"%(times))
    write_log(log,'\nSpliing X data into training and testin by szie %.3f\n'%(testsize))
    write_log(log,'\nGet %d samples from X_test\n'%(n_samples_g))

    return times, testsize, n_samples_g

#* Read behavior labels json file which  is used to be y_target 
def read_behavior_label_n_list_from_json():
    """Re-key the behavior file by family number.

    Reads the module-level behavors_file_g (family name -> behaviors, plus
    an 'all' entry that is skipped) and family_file_g (family number ->
    family name).

    Returns a dict mapping each family number (the key from the family
    file) to the behaviors entry of the family with that name.  Families
    without a behaviors entry are left out.
    """
    print("reading behavior labels.....%f" %(time.time()-start_time))

    # 'with' releases both handles; the original never closed them.
    with open(behavors_file_g) as behavors_json:
        behavors_data = json.load(behavors_json)
    with open(family_file_g) as family_json:
        family_data = json.load(family_json)

    new_behav = {}
    for item in behavors_data:
        if item == 'all':
            continue
        for ke in family_data:
            if item == family_data[ke]:
                new_behav[ke] = behavors_data[item]

    print("reading behavior labels.....Done %f" %(time.time()-start_time))
    return new_behav

#*  fetch application names and its coresponding family labels from malware feature to form a y_target  data
#* Then, concatenate the behavior labels with y_target data and remove selected behavior labels .
def assign_Y_data(log, data, behavior, removed_belable):
    """Build the target frame y from the feature data and behavior labels.

    data            -- frame with a 'Malware family' column.
    behavior        -- dict: family number (string) -> behaviors dict.
    removed_belable -- label names to drop from y (may be empty).

    Each row gets its family number plus the behaviors of that family;
    missing values become 0, everything is cast to int, columns are sorted
    and 'Malware family' is moved to the front.  The result is written to
    the log.

    Returns the y DataFrame.
    """
    print("Assigning Y data....  %f" %(time.time()-start_time))
    family_by_app = data['Malware family'].to_dict()
    merged = merge_family_behavior_dict_fomat(family_by_app, behavior)
    y = pd.DataFrame.from_dict(merged, orient='index').fillna(0).astype('int')
    y = give_first_position_to('Malware family', y[sorted(y.columns)])

    if len(removed_belable) == 0:
        print('Don\'t need to  remove any behavior labels\n')
        write_log(log,'This below Y has all labels\n')
    else:
        for label in removed_belable:
            if label in y.columns:
                y = y.drop(label, axis = 1)
        write_log(log,'\nThis below Y has been remove some labels\n')

    write_log(log,'%s\n'%(y))
    print("Assigning Y data.... Done %f" %(time.time()-start_time))
    return y

def merge_family_behavior_dict_fomat(y, behavior):
    """Expand each family number in y into a per-sample label dict.

    y        -- dict: sample name -> family number (modified in place).
    behavior -- dict: family number as string -> behaviors dict.

    Every value of y becomes {'Malware family': number} updated with the
    behaviors of that family.  Returns the same y object.
    """
    for sample_name, family_number in list(y.items()):
        record = {'Malware family': family_number}
        y[sample_name] = record
        record.update(behavior[ str(family_number) ])
    return y

#*  fetch application names and its coresponding malware features from malware feature to form a X training data 
def assign_X_data(log, data):
    """Build the feature frame X from the raw feature data.

    Missing values become 0, everything is cast to int, columns are sorted
    and 'Malware family' is moved to the front.  Size information and the
    frame itself are written to the log.

    Returns the X DataFrame.
    """
    print("Assigning X data.... %f" %(time.time()-start_time))
    cleaned = data.fillna(0).astype('int')
    X = give_first_position_to('Malware family', cleaned[sorted(cleaned.columns)])

    feature_count = len(X.columns)
    sample_count = len(X.index)
    write_log(log,"\nThere are %d features in whole X\n"%feature_count)
    write_log(log,"\nThere are %d samples in whole X\n"%sample_count)
    write_log(log,"\n%s\n\n"%X)
    write_log(log,"\nTop 4 col values >> \n\n%s\n\n"%X.iloc[:,0:4])

    print("Assigning X data.... Done %f" %(time.time()-start_time))
    return X

def give_first_position_to(name, df):
    """Return df with column `name` moved to the first position.

    The other columns keep their relative order.  When `name` is not a
    column, the frame is returned with its columns unchanged (the previous
    implementation returned a frame with no columns at all in that case).

    Returns a new DataFrame; df itself is not modified.
    """
    col_list = df.columns.tolist()
    if name not in col_list:
        return df[col_list]

    # Build the new order without mutating the list being iterated.
    new = [name] + [col for col in col_list if col != name]
    return df[new]

#* remove selected malware families from X, y data and split them in order to form X_training and y_training
#* use selected malware malware families from X, y data to form X_testing and y_testing
def Spliting_data(log, X, y, removed_family_list, tested_family_list, testsize):
    """Split X / y into training and test parts.

    removed_family_list -- family numbers whose rows are excluded from the
                           training data (empty: exclude nothing).
    tested_family_list  -- family numbers whose rows form the test data
                           (empty: hold out a random `testsize` fraction).
    testsize            -- test fraction passed to train_test_split.

    How the parts are chosen:
      * no removed families: train_test_split on the whole data;
      * removed families:    training data is everything except them;
      * tested families:     test data is exactly their rows, taken from the
                             whole X / y (this overrides any random hold-out);
      * removed but no tested families: a random `testsize` fraction of the
        remaining rows is held out.  Previously this combination raised
        UnboundLocalError because X_test / y_test were never assigned.

    The test data is sorted by 'Malware family'.  That column is always
    dropped from X; it is dropped from y only when the module-level
    include_family_g is 0.

    Returns (X_train, y_train, X_test, y_test, family_type), where
    family_type maps each family number in y_test to its number of rows.
    """
    print("Spliting  data .... %f" %(time.time()-start_time))
    if len(removed_family_list) != 0: #! ==0   >>> no values
        X_train = remove_columns_from(X, removed_family_list, 'Malware family') #! for unknow family
        y_train = remove_columns_from(y, removed_family_list, 'Malware family') #! for unknow family

        if len(tested_family_list) == 0:
            # Nothing was chosen for testing: hold out part of what is left so
            # that X_test / y_test are always defined.
            X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = testsize, random_state = 1)
    else:
        X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size = testsize, random_state = 1)

    if len(tested_family_list) != 0: #! ==0   >>> no values
        X_test = selected_columns_from(X, tested_family_list, 'Malware family')
        y_test = selected_columns_from(y, tested_family_list, 'Malware family')

    print('after sorted ')
    X_test = X_test.sort_values(by=['Malware family'])
    y_test = y_test.sort_values(by=['Malware family'])

    # Rows per family in the test target (read before the column is dropped).
    family_type = {}
    for item in y_test['Malware family'].values:
        family_type[item] = family_type.get(item, 0) + 1

    X_train = X_train.drop(['Malware family'], axis=1)
    X_test = X_test.drop(['Malware family'], axis=1)

    if include_family_g == 0:
        y_train = y_train.drop(['Malware family'], axis=1)
        y_test = y_test.drop(['Malware family'], axis=1)

    write_log(log,"\n--------------------------------below are X train------------------------------------\n\n%s\n\n%s\n\n"%(X_train, X_train.iloc[:,0:4] ) )
    write_log(log,"\n--------------------------------below are y train------------------------------------\n\n%s\n\n"%(y_train) )
    write_log(log,"\n--------------------------------below are X test------------------------------------\n\n%s\n\n"%X_test )
    write_log(log,"\n--------------------------------below are y test------------------------------------\n\n%s\n\n"%y_test )

    print("Spliting  data .... Done %f" %(time.time()-start_time))
    return X_train, y_train, X_test, y_test, family_type

def remove_columns_from(df, li, name):
    """Return df without the rows whose `name` column matches a family in li.

    li holds family numbers (anything int() accepts).  Rows are dropped by
    index label, one family at a time; df itself is not modified.
    """
    for family_number in li:
        matches = ( df[name] == int(family_number) ).values
        df = df.drop(df.index[matches], axis=0)

    return df

def selected_columns_from(df, li, name):
    """Return the rows of df whose `name` column matches any family in li.

    li holds family numbers (anything int() accepts) and must not be empty.
    Row order is that of df.
    """
    keep = ( df[name] == int(li[0]) ).values
    for family_number in li[1:]:
        # OR the boolean masks together, one family at a time.
        keep = keep | ( df[name] == int(family_number) ).values

    return df[keep]

def show_y_target_lables(y_test):
    """Return the column labels of y_test as a plain list, in column order."""
    return list(y_test.columns)

#*  this part is used to predict answer via trained random froest
def data_outcoming(log,malware_tree, X_test, y_test, n_samples) :
    """Predict a random sample of the test data and log the outcome.

    malware_tree -- fitted model exposing predict().
    n_samples    -- number of rows drawn from X_test with DataFrame.sample().

    The sample, its true labels, the predictions and the expected answers
    are written to the log.

    Returns (test, pre): the sampled rows and the integer predictions.  A
    column axis is added to the predictions when their shape differs from
    the shape of the matching y_test values.
    """
    print("Data outcoming... %f" %(time.time()-start_time))

    test = X_test.sample(n_samples)
    expected = y_test.loc[test.index,:]   #* true labels of the sampled rows

    write_log(log,"\nSample of X_test >>\n%s\n\n%s\n\n"%(test, test.iloc[:,0:4]))
    write_log(log,"\nSample of y_test >>\n%s\n" %(expected))

    pre = malware_tree.predict(test).astype('int')
    if pre.shape != expected.values.shape:
        pre = pre[:,np.newaxis]

    print("\n------below are predicted answer----------------\n")
    write_log(log,"\n------below are predicted answer----------------\n" )
    write_log(log,"%s" %(pre))

    write_log(log,"\n------below are answer----------------\n" )
    write_log(log,"%s" %(expected.values))

    print("Data outcoming... Done %f" %(time.time()-start_time))

    return test, pre

#*  by compare with y_test and predcited answer to calculate the precision, recall, f1 and accuracy for each run and overall model     
def calculate_accuracy (log, y_test, pre, test, family_type, csv_data ):
    """Compare predictions with the true labels and compute the run's metrics.

    log         -- open log file; every wrong cell is written to it.
    y_test      -- true labels; only the rows y_test.loc[test.index, :] are used.
    pre         -- predictions, indexed as pre[row][col] in the same layout.
    test        -- sampled X rows; only test.index is used.
    family_type -- dict: family number -> number of rows of that family.
    csv_data    -- nested dict; csv_data['All_parameter']['For_all'] receives
                   the overall TP / FP / FN counts.

    A cell counts as "positive" when the prediction is 1 (or '1').

    Returns a list of 13 items, in this order:
      all-label accuracy, family accuracy (column 0 only), sample accuracy
      (rows with no wrong cell), per-label accuracy, per-label hamming loss,
      per-label precision, per-label recall, per-label F1, per-family /
      per-label [P, R, F1] from process_all_ans(), overall precision,
      overall recall, overall F1, overall hamming loss.

    NOTE(review): rows are assigned to a family bucket purely by position
    (cumulative counts from family_type).  That presumably requires the rows
    of `test` to be ordered by family and to match the counts in
    family_type -- confirm against the caller, which passes a random sample.
    """
    print("Calculating_accuracy... %f" %(time.time()-start_time))
    labels = show_y_target_lables(y_test)
    presice_all = 0.0       # number of correctly predicted cells
    presice_family = 0.0    # number of rows whose column 0 is correct
    presice_sample = 0.0    # number of rows with at least one wrong cell
    TP = 0.0
    FP = 0.0
    FN = 0.0
    
    hamming_loss =[]
    precision_rate= []
    recall_rate=[]
    beha_accuracy =[]
    
    num_of_beha = len(y_test.loc[test.index,:].values[0])   # columns per row
    num_of_apps = len(y_test.loc[test.index,:].values)      # number of rows
    
    # Per-column accumulators.  precision_rate[col] = [TP, FP, result] and
    # recall_rate[col] = [TP, FN, result]; the result slot is filled below.
    for i in range(num_of_beha):
        beha_accuracy.append(0.0)
        hamming_loss.append(0.0)
        precision_rate.append([0.0, 0.0, 0.0])
        recall_rate.append([0.0, 0.0, 0.0])

    # all_ans[family][col] = [TP, TN, FP, FN]; all_ans_summay[family][col] = [P, R, F1]
    all_ans = [ [[0.0,0.0,0.0,0.0 ] for beha_num in range(num_of_beha) ] for family_num in range(len(family_type))]
    all_ans_summay =  [ [[0.0, 0.0,0.0 ] for beha_num in range(num_of_beha) ] for family_num in range(len(family_type))]
    
    # Turn the per-family row counts (ascending family number) into cumulative
    # row boundaries: bucket th_step covers rows below num_step_chage[th_step].
    num_step_chage = []
    for f in sorted(family_type.keys()):
        num_step_chage.append(  family_type[f] )

    for th_num in range(len(num_step_chage)):
        if th_num != 0:
            num_step_chage[th_num] = num_step_chage[th_num] + num_step_chage[th_num-1]

    th_step = 0   # index of the family bucket the current row belongs to

    write_log(log, "\n----below are wrong answer-------------\n")
    for row in range(num_of_apps):
        flag = 0   # becomes non-zero once this row has a wrong cell
        for col in range(num_of_beha):
            if pre[row][col] == y_test.loc[test.index,:].values[row][col]:
                presice_all = presice_all +1.0
                beha_accuracy[col] = beha_accuracy[col] + 1.0 
                    
                if  pre[row][col] =='1' or pre[row][col] ==1:  #! True Positive
                    TP = TP +1.0

                    precision_rate[col][0] = precision_rate[col][0] + 1.0
                    recall_rate[col][0] = recall_rate[col][0] + 1.0
                
                    # Count in the current bucket, or move to the next bucket
                    # when the row index has reached the boundary.
                    if  row < num_step_chage[th_step]:
                        all_ans[th_step][col][0] = all_ans[th_step][col][0] +1.0
                    elif row == num_step_chage[th_step]:
                        th_step = th_step +1 
                        if th_step < len(num_step_chage):
                            all_ans[th_step][col][0] = all_ans[th_step][col][0] +1.0
                else:                                                                                #! True negative
                        if  row < num_step_chage[th_step]:  
                            all_ans[th_step][col][1] = all_ans[th_step][col][1] +1.0
                        elif row == num_step_chage[th_step]:
                            th_step = th_step +1 
                            if th_step < len(num_step_chage):
                                all_ans[th_step][col][1] = all_ans[th_step][col][1] +1.0
            else:
                if pre[row][col]=='1' or pre[row][col]==1:   #! False Positive
                    precision_rate [col][1] = precision_rate [col][1] + 1.0
                    FP = FP + 1.0
                    if  row < num_step_chage[th_step]:
                        all_ans[th_step][col][2] = all_ans[th_step][col][2] +1.0
                    elif row == num_step_chage[th_step]:
                        th_step = th_step +1 
                        if th_step < len(num_step_chage):
                            all_ans[th_step][col][2] = all_ans[th_step][col][2] +1.0
                else:
                    recall_rate[col][1] = recall_rate[col][1] + 1.0   #! False Negative
                    FN = FN + 1.0

                    if  row < num_step_chage[th_step]:
                        all_ans[th_step][col][3] = all_ans[th_step][col][3] +1.0
                    elif row == num_step_chage[th_step]:
                        th_step = th_step +1 
                        if th_step < len(num_step_chage):
                            all_ans[th_step][col][3] = all_ans[th_step][col][3] +1.0

                hamming_loss[col]=hamming_loss[col]+1.0

                if flag == 0:                                                        #! count each wrong row only once
                    presice_sample = presice_sample+1.0
                    flag = flag+1.0 

                # Log the wrong cell; family numbers are translated to names via f_dict.
                write_log(log, " ( %d , %d  ) \n"%(row, col))
                if y_test.loc[test.index,:].columns[col] == 'Malware family':
                    write_log(log, " ( %s , %s  ) "%( y_test.loc[test.index,:].index[row],  y_test.loc[test.index,:].columns[col]) )
                    right_family = f_dict[ str(y_test.loc[test.index,:].values[row][col]) ]
                    wrong_family = f_dict[ str(pre[row][col]) ]
                    write_log(log, " ( Right one is %s , Wrong one is %s  ) "%(right_family, wrong_family ) )
                    write_log(log, " ( %s ,  %s  ) \n"%( y_test.loc[test.index,:].values[row][col],  pre[row][col]) )
                else:
                    write_log(log, " ( %s , %s  ) "%( y_test.loc[test.index,:].index[row],  y_test.loc[test.index,:].columns[col]) )
                    write_log(log, " ( Right one is %s , Wrong one is %s  ) \n"%( y_test.loc[test.index,:].values[row][col],  pre[row][col]) )
        # Column 0 is compared as the "family" answer of the row.
        if pre[row][0] == y_test.loc[test.index,:].values[row][0]:
            presice_family = presice_family+1.0
    
    presice_all_base = num_of_apps * num_of_beha
    presice_family_base = num_of_apps
    presice_sample_base = num_of_apps

    presice_all_answer = (presice_all / presice_all_base)
    presice_family_answer = (presice_family / presice_family_base)
    presice_sample_answer = ((presice_sample_base-presice_sample) / presice_sample_base)
    
    # Overall precision / recall / F1; each falls back to 0 on an empty denominator.
    if (TP+FP)>0:
        overall_precision = TP / (TP+FP)
    else:
        overall_precision=0
    
    if (TP+FN) > 0:
        overall_recall = TP / (TP+FN)
    else:
        overall_recall = 0
    
    if (overall_precision+overall_recall) > 0:
        overall_f1 = 2*overall_precision*overall_recall /(overall_precision + overall_recall)
    else:
        overall_f1= 0
    
    overall_hamming_loss = (FN+FP)/(num_of_beha*num_of_apps)
    #!----------------------------------------------------
    up_data_3_n_TP = { 'overall_TP' : TP }
    csv_data['All_parameter']['For_all'].update(up_data_3_n_TP)
    
    up_data_3_n_FP = { 'overall_FP' : FP }
    csv_data['All_parameter']['For_all'].update(up_data_3_n_FP)
    
    up_data_3_n_FN = { 'overall_FN' : FN }
    csv_data['All_parameter']['For_all'].update(up_data_3_n_FN)
    
    #!---------------------------------------------------------------

    print("all label accuracy : %.3f "%presice_all_answer)
    write_log(log, "\n Totoal labels are %.3f  and  corrected labels are %.3f\n"%(presice_all_base, presice_all))
    write_log(log, " All labels accuracy : %.3f \n" %(presice_all_answer))
    
    print("family accuracy : %.3f "%presice_family_answer)
    write_log(log, " \nTotoal family labels are %.3f  and  corrected labels are %.3f\n"%(presice_family_base, presice_family))
    write_log(log, " All labels accuracy : %.3f \n" %(presice_family_answer))

    
    print("Sample accuracy : %.3f "%presice_sample_answer)
    write_log(log, " \nTotoal sample are %.3f  and  wrong are %.3f\n"%(presice_sample_base, presice_sample))
    write_log(log, " All Sample accuracy : %.3f \n" %(presice_sample_answer))

    print("overll_precision : %.3f "%overall_precision)
    write_log(log, " \nTotoal TP are %d  and  FP are %d\n"%(TP, FP))
    write_log(log, " overall_precision : %.3f \n" %(overall_precision))

    print("overll_recall : %.3f "%overall_recall)
    write_log(log, " \nTotoal TP are %d  and  FN are %d\n"%(TP, FN))
    write_log(log, " overall_recall : %.3f \n" %(overall_recall))

    print("over_f1 : %.3f "%overall_f1)
    write_log(log, " overall_f1 : %.3f \n" %(overall_f1))


    print("over_hamming : %.3f "%overall_hamming_loss)
    write_log(log, " overall_hamming : %.3f \n" %(overall_hamming_loss))


    # list_divided() divides in place, so each_accuracy is the beha_accuracy list itself.
    each_accuracy = list_divided(beha_accuracy, num_of_apps)
    hamming_loss = list_divided(hamming_loss, num_of_apps)
    print("Each accuracy : %s "%each_accuracy)
    write_log(log, " \nEach accuracy : %s \n"%each_accuracy)

    print("Each hamming_loss : %s "%hamming_loss)
    write_log(log, " \nEach hamming_loss : %s \n"%hamming_loss)

    # Fill the result slot (index 2) of every per-column precision / recall entry.
    for th in range(len(precision_rate)):
        if (precision_rate[th][0]+precision_rate[th][1]) !=0:
            precision_rate[th][2] = precision_rate[th][0] / (precision_rate[th][0]+precision_rate[th][1])
        else:
            precision_rate[th][2] = 0
        
        if  (recall_rate[th][0] +recall_rate[th][1] ) != 0:
            recall_rate[th][2] = recall_rate[th][0] / (recall_rate[th][0] +recall_rate[th][1] )
        else:
            recall_rate[th][2] = 0

    # Collapse each [count, count, result] entry to its result and derive F1 per column.
    f1_list =[]
    for th in range(len(precision_rate)):
        precision_rate[th] = precision_rate[th][2]
        recall_rate[th] = recall_rate[th][2]
        if ( precision_rate[th] +  recall_rate[th] ) ==0:
            f1_list.append(0)
        else:        
            f1_list .append( recall_rate[th] * precision_rate[th] *2 /(recall_rate[th]+ precision_rate[th]) )

    
    print("Each presicion : %s "%(precision_rate))
    write_log(log, " \nEach presicion : %s \n"%precision_rate)

    print("Each recall : %s "%(recall_rate))
    write_log(log, " \nEach recall : %s \n"%recall_rate)
    
    print("Each F1 : %s"%(f1_list))
    write_log(log, " \nEach F1 : %s\n"%f1_list)
    

    new_all_ans = process_all_ans(all_ans, all_ans_summay, family_type, labels)
    print("Calculating_accuracy... Done %f" %(time.time()-start_time))

    return [presice_all_answer, presice_family_answer, presice_sample_answer, each_accuracy, hamming_loss, precision_rate, recall_rate, f1_list, new_all_ans, overall_precision, overall_recall, overall_f1, overall_hamming_loss]

def get_data_from_list (th_new_all_ans_list, num):
    """Pick the element at position ``num`` out of every entry of a list.

    Args:
        th_new_all_ans_list: iterable whose entries support ``entry[num]``.
        num: index (or key) looked up in each entry.

    Returns:
        A new list with ``entry[num]`` for each entry, in input order.
    """
    return [entry[num] for entry in th_new_all_ans_list]

def process_all_ans(all_ans, all_ans_summay ,family_type, labels):
    """Fill ``all_ans_summay`` with precision, recall and F1 derived from ``all_ans``.

    For every ``all_ans[f][be]`` entry, slot 0 is used as the numerator,
    slot 0 + slot 2 as the precision denominator and slot 0 + slot 3 as the
    recall denominator (presumably true positives, false positives and false
    negatives -- confirm against the code that fills ``all_ans``).
    When slot 0 is zero all three scores are stored as 0.

    Args:
        all_ans: nested list indexed as ``[family][behavior][slot]``.
        all_ans_summay: nested list with the same outer shape; slots 0, 1, 2
            of each entry are overwritten in place with P, R and F1.
        family_type: unused here.
        labels: unused here.

    Returns:
        The same ``all_ans_summay`` object that was passed in.
    """
    for fam_idx, behavior_rows in enumerate(all_ans):
        for beh_idx, counts in enumerate(behavior_rows):
            hits = counts[0]

            if hits == 0:
                # No hits at all: every score collapses to zero.
                all_ans_summay[fam_idx][beh_idx][0] = 0
                all_ans_summay[fam_idx][beh_idx][1] = 0
                all_ans_summay[fam_idx][beh_idx][2] = 0
                continue

            precision = hits / (hits + counts[2])
            recall = hits / (hits + counts[3])
            f1_score = 2 * precision * recall / (precision + recall)

            all_ans_summay[fam_idx][beh_idx][0] = precision
            all_ans_summay[fam_idx][beh_idx][1] = recall
            all_ans_summay[fam_idx][beh_idx][2] = f1_score

    return all_ans_summay


def list_divided(li, num):
    """Divide every element of ``li`` by ``num`` in place.

    Args:
        li: mutable sequence of numbers; it is modified, not copied.
        num: divisor applied to each element.

    Returns:
        The same ``li`` object, now holding the quotients.
    """
    for position, value in enumerate(li):
        li[position] = value / num
    return li

def sum_two_list(l1, l2):
    """Accumulate ``l2`` into ``l1`` element by element, in place.

    An empty ``l1`` is seeded with the elements of ``l2``.  When both lists
    have the same length each ``l1[i]`` becomes ``l1[i] + l2[i]``.  When the
    lengths differ (and ``l1`` is not empty) ``l1`` is left untouched.

    Args:
        l1: accumulator list; modified in place.
        l2: list of addends; never modified.

    Returns:
        The same ``l1`` object.
    """
    addend_count = len(l2)

    if len(l1) == 0:
        l1.extend(l2[pos] for pos in range(addend_count))
        return l1

    if len(l1) == addend_count:
        for pos in range(addend_count):
            l1[pos] = l1[pos] + l2[pos]

    return l1

def write_log(log, message):
    """Write ``message`` to ``log`` through ``log.write``; nothing is appended to it."""
    log.write(message)

def close_log(log):
    """Close the log object by calling ``log.close()``."""
    log.close()

#* Get the top n feature importances of a random forest model as a sorted list.
#* The list looks like [[importance01, corresponding feature01], [importance02, corresponding feature02], ...]
#*   e.g. [[0.1, 12], [0.08, 99], [0.05, 32], ...] where the second item is the feature's position
def top_30_feature_importances(imp_f, n):
    """Rank feature importances from highest to lowest and keep the first ``n``.

    Args:
        imp_f: iterable of importance values; position ``i`` belongs to feature ``i``.
        n: number of leading entries to return.

    Returns:
        A list of ``[importance, position]`` pairs sorted in descending order
        (ties are ordered by descending position), truncated to ``n`` entries.
    """
    ranked = [[importance, position] for position, importance in enumerate(list(imp_f))]
    ranked.sort(reverse=True)
    return ranked[:n]

#* Take the top n feature importance list and convert each feature position into the feature's name.
#* Then save every {feature name: importance} pair into the csv_data dict.
def top_30_feature_importances_withfeature(rank, X_train, csv_data):
    """Replace feature positions with column names and record them in ``csv_data``.

    Args:
        rank: list of ``[importance, position]`` pairs; it is deep-copied, not modified.
        X_train: object whose ``columns`` maps a position to a feature name.
        csv_data: dict; ``csv_data['Feature_importance']['Feature_importance']``
            receives one ``name -> importance`` entry per pair.

    Returns:
        A tuple ``(named_rank, csv_data)`` where ``named_rank`` holds
        ``[importance, feature name]`` pairs in the same order as ``rank``.
    """
    named_rank = copy.deepcopy(rank)
    for pair in named_rank:
        importance, position = pair[0], pair[1]
        feature_name = X_train.columns[position]
        pair[1] = feature_name
        csv_data['Feature_importance'][ 'Feature_importance'][feature_name] = importance

    return named_rank, csv_data

def analyisi_setting_from_name(file_path):
    """Extract the seven model settings encoded in a file name.

    The last ``/``-separated component of ``file_path`` is split on ``_`` and
    its first seven fields are returned.  In this file the names are built as
    removed families, removed behavior labels, min samples per leaf,
    max features, number of estimators, max depth and min samples per split.

    Args:
        file_path: path or bare file name; only the part after the last ``/`` is used.

    Returns:
        A 7-tuple of strings, in the order the fields appear in the name.

    Raises:
        IndexError: if the name has fewer than seven ``_``-separated fields.
    """
    fields = file_path.split('/')[-1].split('_')
    return tuple(fields[position] for position in range(7))

#* If there is a ML model with the same setting, this program will reuse it.
#* The settings include removed_family, removed_behavior_label, estimators, min_sample_leaf, max_depth, min_sample_split and max_features.
def if_there_is_same_tree_can_be_used(log, path):
    """Look under ``path`` for a saved model whose settings match the current run.

    Every ``.sav`` file found by walking ``path`` is keyed by the settings
    parsed from its name; the current settings come from the module-level
    ``file_name_g``.  The outcome is printed and written to ``log``.

    Args:
        log: log object handed to ``write_log``.
        path: root folder to search; spaces are stripped from it and a
            trailing ``/`` is added when missing.

    Returns:
        The path of the matching ``.sav`` file, or ``False`` when none matches.
        When several files share the same settings, the one visited last wins.
    """
    search_root = path.replace(" ","")
    if not search_root.endswith('/'):
        search_root = search_root + '/'

    saved_models = [
        folder + '/' + name
        for folder, _subfolders, names in os.walk(search_root)
        for name in names
        if name.endswith(".sav")
    ]

    model_by_setting = {}
    for model_path in saved_models:
        model_by_setting[analyisi_setting_from_name(model_path)] = model_path

    wanted_setting = analyisi_setting_from_name(file_name_g)

    if wanted_setting not in model_by_setting:
        print('-------------------This model will using neEEEEEEEEEEEEEEEEEEEEEEEEEw tree--------------------------')
        write_log(log, '-------------------This model will using neEEEEEEEEEEEEEEEEEEEEEEEEEw  tree-------------\n\n' )
        return False

    reusable_model = model_by_setting[wanted_setting]
    print('-------------------This model will using old tree--------------------------')
    write_log(log, '-------------------This model will using old tree-------------\n\nfrom   >>> %s\n\n'%(reusable_model))
    return reusable_model

    
def _load_or_train_tree(log, folder_loc, X_train, y_train, single_label):
    """Return ``(model, origin)`` for the current settings.

    ``origin`` is 'Old' when a matching pickled model was found under
    ``save_log_location_g`` and loaded, or 'New' when a model was trained,
    rendered (one sample tree) and pickled into ``folder_loc``.

    Args:
        log: log object handed to the lookup helper.
        folder_loc: output folder, already ending with '/'.
        X_train: training features (its ``columns`` name the features).
        y_train: training targets.
        single_label: True when there is only one target column, in which
            case the targets are flattened with ``values.ravel()`` before fitting.
    """
    #* if there is a ML model with same setting, this program will reuse it.
    #* The lookup runs once so the folder is walked and the log written only once.
    saved_model_path = if_there_is_same_tree_can_be_used(log, save_log_location_g)
    if saved_model_path:
        # NOTE(review): pickle.load can execute arbitrary code; only use save
        # locations whose .sav files are trusted.
        # NOTE(review): the file name does not encode tree_mode_g, so a model
        # saved in the other tree mode can be picked up here -- confirm intent.
        with open(saved_model_path, 'rb') as model_file:
            return pickle.load(model_file), 'Old'

    forest_settings = dict(
        n_estimators=esti_g,
        max_depth=max_de_g,
        n_jobs=-1,
        min_samples_split=min_sam_split_g,
        max_features=max_fea_g,
        min_samples_leaf=min_sam_leaf_g,
        random_state=1,
        warm_start=False,
    )
    if tree_mode_g == '2':
        classifier = RandomForestClassifier(oob_score=True, **forest_settings)
    else:
        #* here is the model for extra tree. To use it, set "tree_mode_g" to something other than '2'.
        classifier = ExtraTreesClassifier(**forest_settings)

    targets = y_train.values.ravel() if single_label else y_train
    malware_tree = classifier.fit(X_train, targets)

    #* export the graph of one tree from the forest. This is just for reference.
    #* The 11th tree is used when it exists, otherwise the last one.
    estimators = malware_tree.estimators_
    sample_tree = estimators[min(10, len(estimators) - 1)]
    dot_data = tree.export_graphviz(sample_tree, out_file=None, feature_names=X_train.columns, rounded=True)
    graphviz.Source(dot_data).render(folder_loc + file_name_g)

    # Saved next to the rendered tree so every artefact of a run shares one folder.
    with open(folder_loc + file_name_g + '_tree.sav', 'wb') as model_file:
        pickle.dump(malware_tree, model_file)

    return malware_tree, 'New'


def _label_table(labels, values):
    """Map ``labels[j]`` to ``values[j]`` for every position of ``values``."""
    return {labels[j]: values[j] for j in range(len(values))}


def run(data):
    """Train or reuse a forest for the current global settings, evaluate it and write reports.

    The function asks the interactive set-up questions, builds X/y, splits
    them, obtains a model, evaluates it over ``times`` rounds, and writes the
    log, one CSV per ``csv_data`` section and a JSON dump into the result folder.

    It relies on module-level globals set by the ``__main__`` block
    (``file_name_g``, ``tree_mode_g``, ``save_log_location_g``, ``start_time``
    and the ``*_g`` hyper-parameters).

    Args:
        data: feature matrix handed to ``assign_Y_data`` / ``assign_X_data``.

    Returns:
        None.
    """
    time_stamp = datetime.datetime.now().strftime('%m_%d_%H_%M_%S')
    #* this csv_data dict records all relevant information; each top-level key becomes one CSV file
    csv_data = {
        'All_parameter': {'For_all': {}},
        'Each_accuracy': {},
        'Each_hamming': {},
        'Each_precision': {},
        'Each_recall': {},
        'Each_f1': {},
        'Sum_accuracy': {},
        'Feature_importance': {'Feature_importance': {}},
        'Family_data': {'Training': {}, 'Testing': {}, 'Removed': {}},
    }

    log, folder_loc = basic_question(time_stamp)
    # Normalise once, up front, so every output path below is built the same way.
    if not folder_loc.endswith('/'):
        folder_loc = folder_loc + '/'

    removed_family_list, csv_data = question_removed_family(log, csv_data)  #! get a list like ['1', '5', '10' ]
    tested_family_list, csv_data = question_tested_family(log, csv_data)
    question_include_family(log)
    removed_belable, csv_data = question_remove_label(log, csv_data)
    times, testsize, default_n_samples = parameter_setting(log)

    behavior = read_behavior_label_n_list_from_json()  #* return a behavior dict

    y = assign_Y_data(log, data, behavior, removed_belable)
    X = assign_X_data(log, data)

    X_train, y_train, X_test, y_test, family_type = Spliting_data(log, X, y, removed_family_list, tested_family_list, testsize)
    labels = show_y_target_lables(y_test)

    print("Learning....  %f" %(time.time()-start_time))

    uses_random_forest = tree_mode_g == '2'
    malware_tree, tree_origin = _load_or_train_tree(log, folder_loc, X_train, y_train, len(y_test.columns) < 2)
    csv_data['All_parameter']['For_all'].update({'Tree': tree_origin})
    csv_data['All_parameter']['For_all'].update({'Tree type': 'Random' if uses_random_forest else 'Extra'})

    score = malware_tree.score(X_test, y_test)
    print("Learning.... Done %f" %(time.time()-start_time))

    #!------------------------------------------------------------------------
    csv_data['All_parameter']['For_all'].update({
        'file_name': file_name_g,
        'Feature_number': len(X.columns),
        'Sample_number': len(X.index),
        'X_train_num': len(X_train.index),
        'Estimators': esti_g,
        'min_sam_leaf': min_sam_leaf_g,
        'max_fea': max_fea_g,
        'max_depth': max_de_g,
        'min_sample_split': min_sam_split_g,
        'Score': score,
    })
    #!------------------------------------------------------------------------

    al = 0.0
    family = 0.0
    samp = 0.0
    each_beha = []
    each_hamming = []
    each_precision = []
    each_recall = []
    each_f1 = []
    precision = 0.0
    recall = 0.0
    f1 = 0.0
    hamming = 0.0

    #*  if the user has selected certain families for the testing data, all samples of those families are used and only one round is run
    #*  if the user has not selected any family, the configured number of rounds is run with the configured sample size
    if len(tested_family_list) != 0:
        print('---------------------------there is testing data inputed ------------------------------------')
        write_log(log, '--there is testing data inputed ----\n\n Thus, assign n_Samples = len(X_test) %d\n '%(len(X_test.index)))
        n_samples = len(X_test.index)
        times = 1
    else:
        n_samples = default_n_samples

    # csv_data section -> position of the matching per-label list in answer_ML
    round_sections = (
        ('Each_accuracy', 3),
        ('Each_hamming', 4),
        ('Each_precision', 5),
        ('Each_recall', 6),
        ('Each_f1', 7),
    )

    for i in range(times):
        print('\n--------------This is  round %d--------------------------\n' %(i+1))
        write_log(log,'\n--------------This is  round %d--------------------------\n' %(i+1))

        test, pre = data_outcoming(log, malware_tree, X_test, y_test, n_samples)
        answer_ML = calculate_accuracy(log, y_test, pre, test, family_type, csv_data)
        #*   answer_ML is [presice_all_answer, presice_family_answer, presice_sample_answer, each_accuracy, hamming_loss, precision_rate, recall_rate, f1_list, new_all_ans, overall_precision, overall_recall, overall_f1, overall_hamming_loss]

        #!------------------just update critical information into dict ----------------------------------
        round_name = 'Round_%d'%(i+1)
        for section, position in round_sections:
            csv_data[section].update({round_name: _label_table(labels, answer_ML[position])})
        #!----------------------------------------------------

        al = al + answer_ML[0]
        family = answer_ML[1] + family
        samp = answer_ML[2] + samp
        each_beha = sum_two_list(each_beha, answer_ML[3])
        each_hamming = sum_two_list(each_hamming, answer_ML[4])
        each_precision = sum_two_list(each_precision, answer_ML[5])
        each_recall = sum_two_list(each_recall, answer_ML[6])
        each_f1 = sum_two_list(each_f1, answer_ML[7])
        precision = answer_ML[9] + precision
        recall = answer_ML[10] + recall
        f1 = answer_ML[11] + f1
        hamming = answer_ML[12] + hamming

    #!-------------------------just update critical information into dict ---------------------------
    # `test` is the sample drawn in the last round.
    csv_data['All_parameter']['For_all'].update({
        'N_smaples': n_samples,
        'Times': times,
        'X.test_num': len(test.index),
        'X.test_feature': len(test.columns),
        'Label_number': len(y_test.columns),
    })
    #!----------------------------------------------------

    write_log(log, "\nThis model include behaviors labels are below >>\n")
    write_log(log, "%s\n"%(labels))

    print('\nThis model include behaviors labels are below\n')
    print('%s\n'%(labels))

    # Turn the per-label sums into per-round averages (in place).
    each_beha_accuracy = list_divided(each_beha, times)
    each_hamming = list_divided(each_hamming, times)
    each_precision = list_divided(each_precision, times)
    each_recall = list_divided(each_recall, times)
    each_f1 = list_divided(each_f1, times)

    avg_all = al/times
    avg_family = family/times
    avg_sample = samp/times
    avg_precision = precision/times
    avg_recall = recall/times
    avg_f1 = f1/times
    avg_hamming = hamming/times

    print("--------after %d times radom forest------------"%(times))
    print("al :  %.3f"%(avg_all))
    print("family :  %.3f"%(avg_family))
    print("sample :  %.3f"%(avg_sample))
    print("each accuracy :  \n%s\n"%each_beha_accuracy)
    print("each hamming :  \n%s\n"%each_hamming)
    print("each precision :  \n%s\n"%each_precision)
    print("each recall:  \n%s\n"%each_recall)
    print("each f1 :  \n%s\n"%each_f1)
    print("precision :  %.3f"%(avg_precision))
    print("recall :  %.3f"%(avg_recall))
    print("f1 :  %.3f"%(avg_f1))
    print("hamming :  %.3f"%(avg_hamming))

    write_log(log, "\n\n************af %d times rounds***************\n"%times)
    write_log(log, "The average score of all label is %.3f\n" %(avg_all))
    write_log(log,"The average score of family label is %.3f\n" %(avg_family))
    write_log(log,"The average score of sample is %.3f\n" %(avg_sample))
    write_log(log,"each accuracy :  \n%s\n"%each_beha_accuracy)
    write_log(log,"each hamming :  \n%s\n"%each_hamming)
    write_log(log,"each precision :  \n%s\n"%each_precision)
    write_log(log,"each recall :  \n%s\n"%each_recall)
    write_log(log,"each f1 :  \n%s\n"%each_f1)
    write_log(log,"precision :  \n%.3f\n"%(avg_precision))
    write_log(log,"recall :  \n%.3f\n"%(avg_recall))
    write_log(log,"f1 :  \n%.3f\n"%(avg_f1))
    write_log(log,"hamming :  \n%.3f\n"%(avg_hamming))

    if uses_random_forest:
        oob_value = malware_tree.oob_score_
    else:
        # The extra-trees model is built without oob_score, so the 'OOB' slot
        # holds its score on a random sample of the training data instead.
        extratree_x_train_test = X_train.sample(n_samples)
        extratree_y_train_test = y_train.loc[extratree_x_train_test.index, :]
        oob_value = malware_tree.score(extratree_x_train_test, extratree_y_train_test)
    csv_data['All_parameter']['For_all'].update({'OOB': oob_value})

    #!--------------------just update critical information into dict --------------------------------
    csv_data['All_parameter']['For_all'].update({
        'All_accuracy': avg_all,
        'Family_accuracy': avg_family,
        'Sample_accuracy': avg_sample,
        'All_precision': avg_precision,
        'All_recall': avg_recall,
        'All_f1': avg_f1,
        'All_hamming': avg_hamming,
    })

    csv_data['Sum_accuracy'].update({'After_%d_Round'%(times): _label_table(labels, each_beha_accuracy)})
    #!----------------------------------------------------

    top_30_f = top_30_feature_importances(malware_tree.feature_importances_, 30)
    top_30_f_string, csv_data = top_30_feature_importances_withfeature(top_30_f, X_train, csv_data)
    paramet = malware_tree.get_params()

    if uses_random_forest:
        write_log(log,'\n-------------oob score of this tree is %.3f---------------------------\n'%(malware_tree.oob_score_ ) )

    # Dump every attribute of the fitted model for later inspection.
    for key in malware_tree.__dict__.keys():
        write_log(log,'%s >>>>\n %s \n\n'%(key, malware_tree.__dict__[key]) )

    write_log(log,'\n--------------Below are tree parameters ------------------------\n')
    write_log(log,'\n%s\n'%(paramet))
    write_log(log,'\n--------------Below are feature importance ------------------------\n')
    write_log(log,'\n%s\n\n%s\n\n'%(top_30_f, top_30_f_string))

    # One CSV per csv_data section, also echoed to stdout and to the log.
    for key in csv_data.keys():
        df = pd.DataFrame.from_dict(csv_data[key])
        df = df.fillna('-')
        df.columns.name = key
        df.index.name = key
        df = df.sort_index()
        print(df)
        print('\n')
        write_log(log,'\n%s\n'%(df))
        df.to_csv(folder_loc+file_name_g+'_'+ key+'_'+time_stamp+'_.csv')

    write_log(log,'\n--------------Below are csv ------------------------\n')
    write_log(log,'\n%s\n'%(csv_data))

    with open(folder_loc+file_name_g+'_'+time_stamp+'_.json', 'w+') as jsonfile:
        jsonfile.write(json.dumps(csv_data))

    close_log(log)

# Script entry point: reads four command-line arguments, asks three interactive
# questions, then runs run() once per combination of the settings listed below.
# The loop variables ending in "_g" (and file_name_g) are module-level globals
# that run() and the helper functions read directly.
# NOTE: raw_input only exists on Python 2.
if __name__ == "__main__":
    
    start_time = time.time()
    save_log_location_g = sys.argv[1]  
    #!  example: './ '  (folder that is searched for reusable .sav models)
    
    feature_file_g = sys.argv[2]
    #!  example: './ex/malware_feature_matrix.json'
    
    behavors_file_g = sys.argv[3]
    #!  example: './ex/Behaviour_labels.json'

    family_file_g = sys.argv[4]
    #!  example: './ex/num_to_family.json'

    include_family_g = 0
    tree_mode_g = '2'   #* '2' selects RandomForestClassifier in run(); any other value selects ExtraTreesClassifier


    print("Reading feature metrix from json....%f" %(time.time()-start_time))
    original_data = pd.read_json (feature_file_g, orient='index')                                                   
    #* read the malware features json file
    
    print("Reading feature metrix from json....Done %f" %(time.time()-start_time))
    
    f_dict = get_family_dict_from(family_file_g)                          
    #*  Read a json that is a dict mapping family number to family name
    # *  ex {"0": "Obad", "1": "GingerMaster", "2": "Svpeng", "3": "FakeAngry", "4": "Jisut"}
    
    la_dict = get_label_dict(behavors_file_g)                                  
    #*  Read a json that is a dict mapping family name to its corresponding malicious behaviors
    # *  ex. { "Obad": {"Monetization-Ransom": 0, "Anti-Analysis-NP": 0, "Monetization-Bank": 0, "InfoStealing-PI": 1, "Privilege": 1, "Monetization-Advertising": 0, "Anti-Analysis-RN": 1, "InfoStealing-DI": 1, "Anti-Analysis-SE": 1, "Anti-Analysis-DL": 0, "Anti-Analysis-EDA": 1, "Monetization-Subscription": 1}, 
    # *         "Gorpo": {"Monetization-Ransom": 0, "Anti-Analysis-NP": 0, "Monetization-Bank": 0, "InfoStealing-PI": 0, "Privilege": 1, "Monetization-Advertising": 1, "Anti-Analysis-RN": 1, "InfoStealing-DI": 1, "Anti-Analysis-SE": 1, "Anti-Analysis-DL": 0, "Anti-Analysis-EDA": 1, "Monetization-Subscription": 0} }
        
    #*  select which families you want to remove from training (empty input falls back to '99')
    removed_f= str(raw_input('\nDo you want to removed any family from training \n ex: 0,1,2,3\n or input 99 to skip\n') or '99')
    family_list = [removed_f]
    #*  family_list = ['1,2,4,6,8,10,11,14,15,19,21,23,25,26,29,31,32,35,38,39']             >>  hard-code the families here if you test the same ones repeatedly, so you don't need to select them on every run.
    
    #*  select which families you want to put into the testing data (empty input falls back to '99')
    tested_family_g= str(raw_input('\nDo you want to put any family into testing \n ex: 0,1,2,3\n or input 99 to skip\n') or '99')
    #* tested_family_g ='1,10,11,15,19,23,26,32,38,39'             >>  hard-code the families here if you test the same ones repeatedly, so you don't need to select them on every run.

    #*  select which behavior labels you want to use (empty input falls back to '77')
    selected_label= str(raw_input("Do you want to selected any labels ?? input one values\n or input 77 for test all independently\n  or input 99 for test all labels\n") or  '77')
    #* selected_label = ['3,4,5,6,7,8,9,10', '3,4,6,7', '4,6,7', '3,4,6', '4,7', '4,6', '3,4', '4']           >>  hard-code the labels here if you test the same ones repeatedly, so you don't need to select them on every run.

        #*   -----------------------------------------------ML parameter setting -----------------------------------------------
    #*  Each list holds the values to try; run() is executed once for every combination.
    estimator_list =  [100] #* [200, 100]  #! default [100]
    max_fea =  ['auto'] #*[None, 'auto', 'log2'] #! default ['log2']
    max_dep =[None] #*[15,20,25,30,35,40,45,50] #! default [None]
    min_samples = [1] #*[1, 5,10,20] #! default [1]
    min_sam_split = [2] #*[2,3,5,10,15,20] #! default [1]

    for max_de_g in max_dep:    
        for min_sam_split_g in min_sam_split:
            for max_fea_g  in max_fea:
                for esti_g in estimator_list:
                    for min_sam_leaf_g in min_samples:
                        for removed_family_g  in family_list:
                            #for selected_label_2 in selected_label:
                            re_la = set_up_removed_label_list(len(la_dict), selected_label)
                            for removed_label_g in re_la:
                                all_name = create_file_name_from_removed_families(removed_family_g)
                                file_name_g= all_name+'_'+'%s'%(create_file_name_from_removed_bahavior(removed_label_g, la_dict) )+'_'+str(min_sam_leaf_g)+'_'+str(max_fea_g)+'_'+str(esti_g)+'_'+str(max_de_g)+'_'+str(min_sam_split_g)
                                #* the file name is formed from all settings, joined by '_' (analyisi_setting_from_name splits it back on '_')
                                print(file_name_g)
                                run(original_data)    #* execution of Random Forest