diff --git a/Taheri2020NCAA-labelflipping_Sourcecode/GAN_Based_defense.py b/Taheri2020NCAA-labelflipping_Sourcecode/GAN_Based_defense.py new file mode 100644 index 0000000000000000000000000000000000000000..a2e3885bf575e0ce4d0a321f07570a64a52c00c8 --- /dev/null +++ b/Taheri2020NCAA-labelflipping_Sourcecode/GAN_Based_defense.py @@ -0,0 +1,598 @@ +# -*- coding: utf-8 -*- + +""" +Created on Fri May 25 12:03:10 2018 + +@author: Rahim +#this approch use the distribution of Benign data to poison thet test data +""" +from __future__ import print_function +from sklearn.feature_selection import SelectFromModel +from sklearn.feature_selection import SelectKBest, f_regression +from sklearn.model_selection import KFold +from sklearn.model_selection import cross_val_score +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix +from sklearn import model_selection +from sklearn.feature_selection import RFE +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.ensemble import RandomForestRegressor +from scipy.sparse import csr_matrix, vstack, hstack +from scipy import sparse +import pandas as pd +import numpy as np +import random +import time +import argparse +import math +from numpy import * +import os.path as osp +import scipy.sparse as sp +import pickle +from sklearn import metrics +from sklearn.metrics import accuracy_score +#****************************************************************************** +CLASS = 'class' +CLASS_BEN = 'B' +CLASS_MAL = 'M' +DATA = 'data' +#********************************************Functions that will be used in this program******************************************************************************************* +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input-tables', nargs='*', dest='input_tables') + + args = parser.parse_args() + + return args +#****************************************************************************** +def read_table(table_file): + + table = dict() + + with open(table_file, 'rb') as handle: + while True: + try: + table = pickle.load(handle) + except EOFError: + break + + f_set=set() + + for k,v in table.items(): + for feature in v[DATA]: + f_set.add(feature) + + return table , f_set +#****************************************************************************** +def build_table(tables): + full_table = dict() + + file_set = set() + + for table in tables: + file_set.update(table.keys()) + for key, val in table.items(): + full_table[key] = val + + files = list(file_set) + return full_table, files +#****************************************************************************** +def convert_to_matrix(table, features, files): + mat = sp.lil.lil_matrix((len(files), len(features)), dtype=np.int8) + + print("Input Data Size = ", mat.get_shape()) + # the response vector + + cl = [0]*len(files) + + for key, val in table.items(): + k = files.index(key) + + if val[CLASS] is CLASS_BEN: + cl[k] = 1 + + for v in val[DATA]: + try: + idx = features.index(v) + mat[k, idx] = 1 + except Exception as e: + print(e) + pass + + return mat, cl +#****************************************************************************** +def delete_row_lil(mat, i): + if not isinstance(mat, sp.lil.lil_matrix): + raise ValueError("works only for LIL format -- use .tolil() first") + mat.rows = np.delete(mat.rows, i) + mat.data = np.delete(mat.data, i) + mat._shape = (mat._shape[0] - 1, mat._shape[1]) +#****************************************************************************** +def relevant_features(data, response_vector, features): + rel_features = list() + ranked_index=list() + + model =RandomForestRegressor() + rfe = RFE(model, 1) + fit = rfe.fit(data, response_vector) + old_features=features + + for i in fit.ranking_: + if i<len(features): + rel_features.append(features[i]) + ranked_index=[old_features.index(x) for x in rel_features if x in old_features] + + return rel_features ,ranked_index +#*****************************************************************Main Function********************************************************************************************************* +def main(): + args = parse_args() + + tables = [] + f_set = set() + + #read the data + for t_files in args.input_tables: + table, features = read_table(t_files) + f_set = f_set.union(features) + tables.append(table) + print(" ") + print(" ") + print("*****************************************************************************************") + print("********Using Benign Distribution + Random Forest Classifier + GAN countermeasure********") + print("*****************************************************************************************") + + #*build table from data and convert to matrix + full_table, files = build_table(tables) + files.sort() + features = list(f_set) + features.sort() + mat, cl = convert_to_matrix(full_table, features, files) + + #Doing feature Ranking on all of the Data + print("************************Doing feature Ranking on all of the Data*************************") + t0=time.time() + r_features,ranked_index = relevant_features(mat, cl, features) + t1=time.time() + print("Time of Feature Ranking=",t1-t0) + print("******************************************************************************************") + + original_selected=ranked_index[1:301] + data = sparse.lil_matrix(sparse.csr_matrix(mat)[:,original_selected]) + seed = 10 + test_size = 0.2 + X_train, X_test, Y_train, Y_test= train_test_split(data, cl, test_size= test_size, random_state=seed) + test_size = 0.25 + X_train, X_val, Y_train, Y_val= train_test_split(X_train, Y_train, test_size= test_size, random_state=seed) + #************************************************************************** + num_trees = 100 + max_features = 3 + t0=time.time() + kfold = KFold(n_splits=10, random_state=10) + model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features) + model.fit(X_train, Y_train) + t1=time.time() + print("Time for Clssification Algorithm is runing on 300 high-ranked features =",t1-t0) + print("************************************Result without attack *******************************************************************************************") + # compute Classification Accuracy in train and test and Validation + scoring = 'accuracy' + results = model_selection.cross_val_score(model, X_train,Y_train, cv=kfold, scoring=scoring) + print(("The accuracy of Classification in train: %.3f (%.3f)") % (results.mean(), results.std())) + #********************* compute Classification Accuracy in validation*************************** + scoring = 'accuracy' + results = model_selection.cross_val_score(model, X_val,Y_val, cv=kfold, scoring=scoring) + print(("The accuracy of Classification in validation: %.3f (%.3f)") % (results.mean(), results.std())) + #********************* compute Classification Accuracy in test********************************* + scoring = 'accuracy' + results = model_selection.cross_val_score(model, X_test,Y_test, cv=kfold, scoring=scoring) + print(("The accuracy of Classification in test: %.3f (%.3f)") % (results.mean(), results.std())) + #********************* compute Classification Accuracy in Validation*************************** + predictions_val = model.predict(X_val) + print("classification_report by validation:") + print(classification_report(Y_val, predictions_val)) + #********************* compute Classification Accuracy in train******************************** + predictions = model.predict(X_test) + print("classification_report by test:") + print(classification_report(Y_test, predictions)) + #********************* compute Logarithmic Loss in Train********************************* + scoring = 'neg_log_loss' + results = model_selection.cross_val_score(model, X_train,Y_train, cv=kfold, scoring=scoring) + print(("The Loss of Classification in train data: %.3f (%.3f)") % (results.mean(), results.std())) + #********************* compute Logarithmic Loss in validation**************************** + scoring = 'neg_log_loss' + results = model_selection.cross_val_score(model, X_val,Y_val, cv=kfold, scoring=scoring) + print(("The Loss of Classification in validation data:: %.3f (%.3f)") % (results.mean(), results.std())) + #********************* compute Logarithmic Loss in Test*********************************** + scoring = 'neg_log_loss' + results = model_selection.cross_val_score(model, X_test,Y_test, cv=kfold, scoring=scoring) + print(("The Loss of Classification in test data:: %.3f (%.3f)") % (results.mean(), results.std())) + #********************* compute Area Under ROC Curve in Train****************************** + scoring = 'roc_auc' + results = model_selection.cross_val_score(model, X_train,Y_train, cv=kfold, scoring=scoring) + print(("The Area Under ROC Curve in Train: %.3f (%.3f)") % (results.mean(), results.std())) + #********************* compute Area Under ROC Curve in Validation************************* + scoring = 'roc_auc' + results = model_selection.cross_val_score(model, X_val,Y_val, cv=kfold, scoring=scoring) + print(("The Area Under ROC Curve in Validation: %.3f (%.3f)") % (results.mean(), results.std())) + #********************* compute Area Under ROC Curve in Test******************************* + scoring = 'roc_auc' + results = model_selection.cross_val_score(model, X_test,Y_test, cv=kfold, scoring=scoring) + print(("The Area Under ROC Curve in test: %.3f (%.3f)") % (results.mean(), results.std())) + #*****************************Compute FPR and TPR in Validation************************** + cm=confusion_matrix(Y_test, predictions) + print("confusion_matrix=") + print(cm) + TP=cm[0][0] + print("TP=",TP) + FP=cm[0][1] + print("FP=",FP) + FN=cm[1][0] + print("FN=",FN) + TN=cm[1][1] + print("TN=",TN) + FPR=FP/(FP+TN) + print("The FPR result=", FPR) + TPR=TP/(TP+FN) + print("The TPR result=", TPR) + + TNR=TN/(TN+FP) + print("The TNR result=", TNR) + + FNR=FN/(FN+TP) + print("The FNR result=", FNR) + + AUC=1/(2*((TN/(TN+FP))+(TP/(TP+FP)))) + print("The AUC result=", AUC) + + ACC=(TP+TN)/(TP+TN+FP+FN) + print("The ACC result=", ACC) + + MCC=(TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)) + print("The Matthews correlation coefficient result=", MCC) + + print("*******************************End of Result without attack:*****************************************************************************************") + #************************************************************************************************************************************************************* + # finding Malware of test data + malware_test= sparse.lil_matrix(X_test) + cl_malware=list() + z_m=0 + count_m=0 + for i, j in enumerate(Y_test): + if j == 1: + delete_row_lil(malware_test, i-count_m) + count_m=count_m+1 + else: + cl_malware.insert(z_m, 1) + z_m=z_m+1 + + #************************** + #finding Benign of test data + benign_test = sparse.lil_matrix(X_test) + cl_benign=list() + z_b=0 + count_b=0 + for i, j in enumerate(Y_test): + if j == 0: + delete_row_lil(benign_test.tolil(), i-count_b) + count_b=count_b+1 + else: + cl_benign.insert(z_b, 1) + z_b=z_b+1 + #************************** + # finding Malware of Train data + malware_train= sparse.lil_matrix(X_train) + cl_malware=list() + z_m=0 + count_m=0 + for i, j in enumerate(Y_train): + if j == 1: + delete_row_lil(malware_train, i-count_m) + count_m=count_m+1 + else: + cl_malware.insert(z_m, 1) + z_m=z_m+1 + #*************************** + #Finding Benign of Train data + cl_X_train=list(Y_train) + benign_train=sparse.lil_matrix(X_train) + z_b=0 + count_b=0 + cl_benign_train=list() + for i, j in enumerate(cl_X_train): + if j == 0: + delete_row_lil(benign_train, i-count_b) + count_b=count_b+1 + else: + cl_benign_train.insert(z_b, 1) + z_b=z_b+1 + print("***********Size of Each Data Part:**********") + print("malware_train=", malware_train.get_shape()) + print("benign_train=", benign_train.get_shape()) + print("malware_test=", malware_test.get_shape()) + print("benign_test=", benign_test.get_shape()) + #*************************************************** + t0=time.time() + ranked_features_in_benign,ranked_index_of_benign = relevant_features(benign_train,cl_benign_train, features) + t1=time.time() + print("Time for Ranking benign_train to find important features =",t1-t0) + #*************************************************************************************************************************************************************** + numbers=list() + numbers=[3,6,9,12,15,18,21,24,27,30,60] + X_test = sp.lil.lil_matrix(X_test) + + for loop in range(10): + print("************************************************************************************************************************************************************************************") + print("Result related to loop number : ",loop) + + Malware_Test=sparse.lil_matrix(malware_test.copy()) + row_of_Malware,column_of_Malware=Malware_Test.get_shape() + index_of_row=list(range(row_of_Malware)) + random.shuffle(index_of_row) + + number_of_row_to_change=int(row_of_Malware/10) + selected_row=index_of_row[0:number_of_row_to_change] + + for i, v in enumerate(numbers): + print("*****************************************************************************************************************************************************") + print("*********************selected features :",int(v) ) + print("************************************Result after attack *************************") + max_index_of_column=int(v)+1 + t0=time.time() + rw_test,cl_test=X_test.get_shape() + poison_data=sp.lil.lil_matrix((0,cl_test),dtype=np.int8) + Malware_Test=sparse.lil_matrix(malware_test.copy()) + + counter_of_poisoned_point=0 + + for m,value in enumerate(selected_row): + flag=0 + for i, j in enumerate(ranked_index_of_benign[1:max_index_of_column]): + for k,l in enumerate(original_selected): + if j==l: + if Malware_Test[value,l]==0: + Malware_Test[value,l]=1 + flag=1 + if flag==1: + counter_of_poisoned_point=counter_of_poisoned_point+1 + + + Benign_Test=sparse.lil_matrix(benign_test.copy()) + poison_data = sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((Benign_Test, Malware_Test)))) + r,w=poison_data.get_shape() + Y_test=Y_test[0:r] + + t1=time.time() + print("Time related to applying attack in this number of Features= ",t1-t0) + + print("Number of poisoned Malware= ",counter_of_poisoned_point) + #********************* compute Classification Accuracy in test********************************* + scoring = 'accuracy' + results = model_selection.cross_val_score(model, poison_data,Y_test, cv=kfold, scoring=scoring) + print(("The accuracy of Classification in test: %.3f (%.3f)") % (results.mean(), results.std())) + + #********************* compute Classification Accuracy in train******************************** + predictions = model.predict(poison_data) + print("classification_report by test:") + print(classification_report(Y_test, predictions)) + + #********************* compute Logarithmic Loss in Test*********************************** + scoring = 'neg_log_loss' + results = model_selection.cross_val_score(model, poison_data,Y_test, cv=kfold, scoring=scoring) + print(("The Loss of Classification in test data:: %.3f (%.3f)") % (results.mean(), results.std())) + + #********************* compute Area Under ROC Curve in Test******************************* + scoring = 'roc_auc' + results = model_selection.cross_val_score(model, poison_data,Y_test, cv=kfold, scoring=scoring) + print(("The Area Under ROC Curve in test: %.3f (%.3f)") % (results.mean(), results.std())) + #*****************************Compute FPR and TPR in Validation************************** + cm=confusion_matrix(Y_test, predictions) + print("confusion_matrix=") + print(cm) + TP=cm[0][0] + print("TP=",TP) + FP=cm[0][1] + print("FP=",FP) + FN=cm[1][0] + print("FN=",FN) + TN=cm[1][1] + print("TN=",TN) + FPR=FP/(FP+TN) + print("The FPR result=", FPR) + + TPR=TP/(TP+FN) + print("The TPR result=", TPR) + + TNR=TN/(TN+FP) + print("The TNR result=", TNR) + + FNR=FN/(FN+TP) + print("The FNR result=", FNR) + + AUC=1/(2*((TN/(TN+FP))+(TP/(TP+FP)))) + print("The AUC result=", AUC) + + ACC=(TP+TN)/(TP+TN+FP+FN) + print("The ACC result=", ACC) + + MCC=(TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)) + print("The Matthews correlation coefficient result=", MCC) + + print("********************************************************************************") + print("*******************Result after applying GAN countermeasure**************") + t0=time.time() + + + model2 = ExtraTreesClassifier(n_estimators=250,random_state=0) + model2.fit(benign_train, cl_benign_train) + importances = model2.feature_importances_ + indices = np.argsort(importances)[::-1] + + + importance_of_Features_in_benign_train=list() + for f in range(60): + importance_of_Features_in_benign_train.append(indices[f]) + + #******************************Runing the Logistic Regression and finding Some Sampels Near to Hyperplain***************************** + poison_model = LogisticRegression() + poison_model.fit(X_train,Y_train) + print("Result related to Logistic Regression:") + scoring = 'accuracy' + poison_results = model_selection.cross_val_score(poison_model, X_train,Y_train, cv=kfold, scoring=scoring) + print(("The accuracy of Classification in train: %.3f (%.3f)") % (poison_results.mean(), poison_results.std())) + #********************* compute Logistic Regression Accuracy in validation without change *************************** + scoring = 'accuracy' + results = model_selection.cross_val_score(poison_model, X_val,Y_val, cv=kfold, scoring=scoring) + print(("The accuracy of Classification in validation: %.3f (%.3f)") % (poison_results.mean(), poison_results.std())) + #********************* compute Logistic Regression Accuracy in test without change ********************************* + scoring = 'accuracy' + results = model_selection.cross_val_score(poison_model, X_test,Y_test, cv=kfold, scoring=scoring) + print(("The accuracy of Classification in test: %.3f (%.3f)") % (poison_results.mean(), poison_results.std())) + #**********************Declration of Variables for finding desision value ************* + print("**************************************************************************************************") + temp=sparse.lil_matrix(X_train) + a,b=temp.get_shape() + decision_value=np.array([]) + selected_cl_malware_train=list() + selected_malware_train = sparse.lil_matrix(X_train) + #**********************finding malware_train and related desision value ********************************** + counter_of_malware_train=0 + count_deleted=0 + for j in range(a): + row=temp.getrow(j) + if Y_train[j]==0: + decision_value=np.append(decision_value,poison_model.decision_function(row)) + selected_cl_malware_train.insert(counter_of_malware_train, 0) + counter_of_malware_train=counter_of_malware_train+1 + else: + delete_row_lil(selected_malware_train.tolil(), j-count_deleted) + count_deleted=count_deleted+1 + #**********************sort the absolute value of decision value for malware_train************************* + decision_value=np.absolute(decision_value) + indices=decision_value.argsort() + + #************** Declration of Variables for selecting data************************************************* + number_of_row_malware_train,number_of_column_malware_train=malware_train.get_shape() + + number_of_row_selected_malware_train=int(number_of_row_malware_train/10) + + #****************Selecting index related to 10 percent of malware_train with minimum decision value******* + Selected_rows_as_less_likely=list() + Selected_rows_as_less_likely=indices[:number_of_row_selected_malware_train] + + Malware_less_likely=sp.lil.lil_matrix((0, number_of_column_malware_train), dtype=np.int8) + cl_less_likely=list() + counter_for_cl_less_likely=0 + for i,row_number in enumerate(Selected_rows_as_less_likely): + selected_row=malware_train.getrow(row_number) + Malware_less_likely= sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((selected_row, Malware_less_likely)))) + cl_less_likely.insert(counter_for_cl_less_likely,0) + counter_for_cl_less_likely=counter_for_cl_less_likely+1 + + number_of_row_in_Malware_less_likely,number_of_column_in_Malware_less_likely=Malware_less_likely.get_shape() + #****************finding Benign like samples******************************************************************************** + poisoned_data=sp.lil.lil_matrix((0, number_of_column_malware_train), dtype=np.int8) + c=0 + for counter_of_Malware_less_likely in range(number_of_row_in_Malware_less_likely): + selected_sample=Malware_less_likely.getrow(counter_of_Malware_less_likely) + + + c=0 + for S in range(number_of_column_in_Malware_less_likely): + index_for_change=random.randint(0,number_of_column_in_Malware_less_likely-1) + if selected_sample[0,index_for_change]==0: + selected_sample[0,index_for_change]=1 + label=model.predict(selected_sample) + if label==int(1): + poisoned_data= sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((selected_sample, poisoned_data)))) + c=c+1 + break + + Number_of_row_in_poisoned_data,Number_of_column_in_poison_data=poisoned_data.get_shape() + Y_poisoin=list() + for index in range(Number_of_row_in_poisoned_data): + Y_poisoin.append(0) + #*************************************************************************************************************************** + + poisoned_data_X=poisoned_data.copy() + poisoned_data_Y=Y_poisoin[:] + second_test_set=0.2 + X_poisoned_train, X_poisoned_test, Y_poisoned_train, Y_poisoned_test= train_test_split(poisoned_data_X, poisoned_data_Y, test_size= second_test_set, random_state=seed) + poison_data_for_retraining = sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((X_train, X_poisoned_train)))) + poison_Class_for_retraining = Y_train + Y_poisoned_train + + num_trees = 100 + max_features = 3 + kfold = KFold(n_splits=10, random_state=10) + model_for_counter_measure = RandomForestClassifier(n_estimators=num_trees, max_features=max_features) + model_for_counter_measure.fit(poison_data_for_retraining,poison_Class_for_retraining) + + + poison_data_for_test_after_retraining = sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((X_test, X_poisoned_test)))) + poison_Class_for_test_after_retraining= Y_test + Y_poisoned_test + + t1=time.time() + print("Time related to applying GAN countermeasure in this number of Features: ",t1-t0) + #********************* compute Classification Accuracy in test********************************* + scoring = 'accuracy' + results = model_selection.cross_val_score(model_for_counter_measure, poison_data_for_test_after_retraining,poison_Class_for_test_after_retraining, cv=kfold, scoring=scoring) + print(("The accuracy of Classification in test: %.3f (%.3f)") % (results.mean(), results.std())) + + #********************* compute Classification Accuracy in train******************************** + predictions = model.predict(poison_data_for_test_after_retraining) + print("classification_report by test:") + print(classification_report(poison_Class_for_test_after_retraining, predictions)) + + #********************* compute Logarithmic Loss in Test*********************************** + scoring = 'neg_log_loss' + results = model_selection.cross_val_score(model_for_counter_measure, poison_data_for_test_after_retraining , poison_Class_for_test_after_retraining, cv=kfold, scoring=scoring) + print(("The Loss of Classification in test data:: %.3f (%.3f)") % (results.mean(), results.std())) + + #********************* compute Area Under ROC Curve in Test******************************* + scoring = 'roc_auc' + results = model_selection.cross_val_score(model_for_counter_measure, poison_data_for_test_after_retraining , poison_Class_for_test_after_retraining, cv=kfold, scoring=scoring) + print(("The Area Under ROC Curve in test: %.3f (%.3f)") % (results.mean(), results.std())) + #*****************************Compute FPR and TPR in Validation************************** + cm=confusion_matrix(poison_Class_for_test_after_retraining, predictions) + print("confusion_matrix=") + print(cm) + TP=cm[0][0] + print("TP=",TP) + FP=cm[0][1] + print("FP=",FP) + FN=cm[1][0] + print("FN=",FN) + TN=cm[1][1] + print("TN=",TN) + FPR=FP/(FP+TN) + print("The FPR result=", FPR) + + TPR=TP/(TP+FN) + print("The TPR result=", TPR) + + TNR=TN/(TN+FP) + print("The TNR result=", TNR) + + FNR=FN/(FN+TP) + print("The FNR result=", FNR) + + AUC=1/(2*((TN/(TN+FP))+(TP/(TP+FP)))) + print("The AUC result=", AUC) + + ACC=(TP+TN)/(TP+TN+FP+FN) + print("The ACC result=", ACC) + + MCC=(TP*TN-FP*FN)/math.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)) + print("The Matthews correlation coefficient result=", MCC) + + print("Result Related to this Numbers of features is finished:",int(v)) + Malware_Test=sparse.lil_matrix(malware_test.copy()) + selected_row=index_of_row[0:number_of_row_to_change] + original_selected=ranked_index[1:301] + print("End of loop number : ",loop) + print("************************************************************************************************************************************************************************************") +#******************************************************************************************************************************************************************** +if __name__ == "__main__": + main() +#****************************************************************************** diff --git a/Taheri2020NCAA-labelflipping_Sourcecode/Label_Flipping_Paper_with_Feature_Selection(LSD_CSD_KDD).py b/Taheri2020NCAA-labelflipping_Sourcecode/Label_Flipping_Paper_with_Feature_Selection(LSD_CSD_KDD).py new file mode 100644 index 0000000000000000000000000000000000000000..a5397c5acf8201eb291eab2c0a077d74117e6bbd --- /dev/null +++ b/Taheri2020NCAA-labelflipping_Sourcecode/Label_Flipping_Paper_with_Feature_Selection(LSD_CSD_KDD).py @@ -0,0 +1,840 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 21 14:25:17 2019 + +@author: Rahim +""" +#*****************************************************************import Library***************************************************************************** +from __future__ import print_function +from sklearn.feature_selection import SelectFromModel +from sklearn.feature_selection import SelectKBest, f_regression +from sklearn.model_selection import KFold +from sklearn.model_selection import cross_val_score +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix +from sklearn import model_selection +from sklearn.feature_selection import RFE +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.ensemble import RandomForestRegressor +from scipy.sparse import csr_matrix, vstack, hstack +from scipy.sparse import coo_matrix +from keras.preprocessing.text import one_hot +from sklearn import metrics +from sklearn.metrics import silhouette_samples, silhouette_score +from sklearn.semi_supervised import LabelPropagation +from sklearn.semi_supervised import LabelSpreading +from sklearn.semi_supervised import label_propagation +from sklearn.metrics import roc_auc_score +from sklearn.metrics import f1_score +from sklearn.cluster import KMeans +import math +#import keras +from keras.models import Sequential +from keras.layers import Dense, Dropout, Activation , Flatten +from sklearn.metrics import log_loss +from keras.optimizers import SGD +from keras.layers.normalization import BatchNormalization +from keras.layers.convolutional import UpSampling2D +from keras.layers.convolutional import Conv2D, MaxPooling2D, MaxPooling1D +from keras.layers.embeddings import Embedding +from scipy import sparse +import pandas as pd +import numpy as np +#import random +import sklearn +from sklearn.metrics.pairwise import manhattan_distances +from keras.models import Model +from keras.layers import Conv1D, multiply, GlobalMaxPool1D, Input , Lambda +import time +import argparse +#import math +from numpy import * +import os.path as osp +import scipy.sparse as sp +import pickle +from sklearn.metrics import accuracy_score +from warnings import simplefilter +#********************************************************************************************************************************* +CLASS = 'class' +CLASS_BEN = 'B' +CLASS_MAL = 'M' +DATA = 'data' +#********************************************Functions that will be used in this program***************************************** +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input-tables', nargs='*', dest='input_tables') + + args = parser.parse_args() + + return args +#********************************************************************************************************************************* +def read_table(table_file): + + table = dict() + + with open(table_file, 'rb') as handle: + while True: + try: + table = pickle.load(handle) + except EOFError: + break + + f_set=set() + + for k,v in table.items(): + for feature in v[DATA]: + f_set.add(feature) + + return table , f_set +#****************************************************************************** +def relevant_features(data, response_vector, features): + rel_features = list() + ranked_index=list() + + model =RandomForestRegressor() + rfe = RFE(model, 1) + fit = rfe.fit(data, response_vector) + old_features=features + + for i in fit.ranking_: + if i<len(features): + rel_features.append(features[i]) + ranked_index=[old_features.index(x) for x in rel_features if x in old_features] + + return rel_features ,ranked_index +#********************************************************************************************************************************* +def build_table(tables): + full_table = dict() + + file_set = set() + + for table in tables: + file_set.update(table.keys()) + for key, val in table.items(): + full_table[key] = val + + files = list(file_set) + return full_table, files +#********************************************************************************************************************************* +def convert_to_matrix(table, features, files): + mat = sp.lil.lil_matrix((len(files), len(features)), dtype=np.int8) + + print("Input Data Size = ", mat.get_shape()) + # the response vector + + cl = [0]*len(files) + + for key, val in table.items(): + k = files.index(key) + + if val[CLASS] is CLASS_BEN: + cl[k] = 1 + + for v in val[DATA]: + try: + idx = features.index(v) + mat[k, idx] = 1 + except Exception as e: + print(e) + pass + + return mat, cl +#****************************************************************************** +def delete_row_lil(mat, i): + if not isinstance(mat, sp.lil.lil_matrix): + raise ValueError("works only for LIL format -- use .tolil() first") + mat.rows = np.delete(mat.rows, i) + mat.data = np.delete(mat.data, i) + mat._shape = (mat._shape[0] - 1, mat._shape[1]) +#*****************************************************************Main Function******************************************************* +def main(): + simplefilter(action='ignore', category=FutureWarning) + args = parse_args() + tables = [] + f_set = set() + #read the data + for t_files in args.input_tables: + table, features = read_table(t_files) + f_set = f_set.union(features) + tables.append(table) + #************************************build table from data and convert to matrix*************************************************** + full_table, files = build_table(tables) + files.sort() + features = list(f_set) + features.sort() + mat, cl = convert_to_matrix(full_table, features, files) + print("************************Doing feature Ranking on all of the Data*************************") + r_features,ranked_index = relevant_features(mat, cl, features) + original_selected=ranked_index[1:301] + data = sparse.lil_matrix(sparse.csr_matrix(mat)[:,original_selected]) + + #******************************************Split data to train , test and validation********************************************** + seed = 10 + test_size = 0.2 + X_train, X_test, Y_train, Y_test= train_test_split(data, cl, test_size= test_size, random_state=seed) + test_size = 0.25 + X_train, X_val, Y_train, Y_val= train_test_split(X_train, Y_train, test_size= test_size, random_state=seed) + #*********************************************************************************************************************************** + print(" ") + print(" ") + print("*********Semi-Supervised Deep Learning Based Approach Against Label Flipping Attack in Malware Detection System*****************") + print(" ") + + X_train=sparse.csr_matrix(X_train) + print("row_train,column_train=", X_train.get_shape()) + print(" ") + X_val=sparse.csr_matrix(X_val) + row_val,column_val=X_val.get_shape() + print("row_val,column_val=",X_val.get_shape()) + print(" ") + X_test=sparse.csr_matrix(X_test) + row_test,column_test=X_test.get_shape() + print("row_test,column_test=",X_test.get_shape()) + print(" ") + print("********************************************************************") + #**************************************************Model Definition***************************************************************** + X_train_NoAttack=X_train.copy() + Y_train_NoAttack=Y_train[:] + + X_val_NoAttack=X_val.copy() + Y_val_NoAttack=Y_val[:] + + row_train_NoAttack,column_train_NoAttack=X_train_NoAttack.get_shape() + model_main = Sequential() + model_main.add(Embedding(row_train_NoAttack, 8, input_length=column_train_NoAttack)) + model_main.add(Conv1D(16,2, strides=2, padding='same')) + model_main.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main.add(Conv1D(32,2, strides=2, padding='same')) + model_main.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main.add(Conv1D(64,2, strides=2, padding='same')) + model_main.add(Flatten()) + model_main.add(Dense(1, activation='sigmoid')) + model_main.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main.fit(X_train_NoAttack, Y_train_NoAttack, epochs=200, verbose=0) + + Y_CNN_NoAttack=model_main.predict(X_test, verbose=0) + Y_predict_NoAttack=[0]*len(Y_CNN_NoAttack) + + for i in range(len(Y_CNN_NoAttack)): + if Y_CNN_NoAttack[i]<0.5: + Y_CNN_NoAttack[i]=0 + else: + Y_CNN_NoAttack[i]=1 + + for i in range(len(Y_CNN_NoAttack)): + Y_predict_NoAttack[i]= int(Y_CNN_NoAttack[i]) + #*****************************************************Result of Model without attack on X_test***************************************** + print("********************************Result of Model without attack******************************************************************") + loss, accuracy = model_main.evaluate(X_train_NoAttack, Y_train_NoAttack, verbose=2) + print('Accuracy for Train set: %f' % (accuracy*100)) + print('Loss for Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main.evaluate(X_val_NoAttack, Y_val_NoAttack, verbose=2) + print('Accuracy for Validation set: %f' % (accuracy*100)) + print('Loss for Train Validation set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main.evaluate(X_test, Y_test, verbose=2) + print('Accuracy for Test set: %f' % (accuracy*100)) + print('Loss for Test set:: %f' % (loss)) + print(" ") + + TN_NoAttack, FP_NoAttack, FN_NoAttack, TP_NoAttack = confusion_matrix(Y_test, Y_predict_NoAttack).ravel() + print("TN_NoAttack=",TN_NoAttack) + print("FP_NoAttack=",FP_NoAttack) + print("FN_NoAttack=",FN_NoAttack) + print("TP_NoAttack=",TP_NoAttack) + print(" ") + + if (FP_NoAttack+TN_NoAttack)>0: + FPR_NoAttack=FP_NoAttack/(FP_NoAttack+TN_NoAttack) + print("The FPR_NoAttack result=", FPR_NoAttack) + + if (FP_NoAttack+TN_NoAttack)>0: + TPR_NoAttack=TP_NoAttack/(TP_NoAttack+FN_NoAttack) + print("The TPR_NoAttack result=", TPR_NoAttack) + + if (TN_NoAttack+FP_NoAttack)>0: + TNR_NoAttack=TN_NoAttack/(TN_NoAttack+FP_NoAttack) + print("The TNR_NoAttack result=", TNR_NoAttack) + + if (FN_NoAttack+TP_NoAttack)>0: + FNR_NoAttack=FN_NoAttack/(FN_NoAttack+TP_NoAttack) + print("The FNR_NoAttack result=", FNR_NoAttack) + + if ((TN_NoAttack/(TN_NoAttack+FP_NoAttack))+(TP_NoAttack/(TP_NoAttack+FP_NoAttack)))>0: + AUC_NoAttack=1/(2*((TN_NoAttack/(TN_NoAttack+FP_NoAttack))+(TP_NoAttack/(TP_NoAttack+FP_NoAttack)))) + print("The AUC_NoAttack result=", AUC_NoAttack) + + if (TP_NoAttack+TN_NoAttack+FP_NoAttack+FN_NoAttack)>0: + ACC_NoAttack=(TP_NoAttack+TN_NoAttack)/(TP_NoAttack+TN_NoAttack+FP_NoAttack+FN_NoAttack) + print("The ACC_NoAttack result=", ACC_NoAttack) + + if ((TP_NoAttack+FP_NoAttack)*(TP_NoAttack+FN_NoAttack)*(TN_NoAttack+FP_NoAttack)*(TN_NoAttack+FN_NoAttack))>0: + MCC_NoAttack=(TP_NoAttack*TN_NoAttack-FP_NoAttack*FN_NoAttack)/math.sqrt((TP_NoAttack+FP_NoAttack)*(TP_NoAttack+FN_NoAttack)*(TN_NoAttack+FP_NoAttack)*(TN_NoAttack+FN_NoAttack)) + print("The Matthews correlation coefficient result=", MCC_NoAttack) + print(" ") + print("*****************************************************End of Without Attack part************************************************") + print(" ") + print(" ") + print(" ") + print("*****************************************************Label Flipping Attack*****************************************************") + print(" ") + #************************** + # finding Malware of Train data + malware_train= sparse.lil_matrix(X_train) + cl_malware=list() + z_m=0 + count_m=0 + for i, j in enumerate(Y_train): + if j == 1: + delete_row_lil(malware_train, i-count_m) + count_m=count_m+1 + else: + cl_malware.insert(z_m, 1) + z_m=z_m+1 + #*************************** + #Finding Benign of Train data + cl_X_train=list(Y_train) + benign_train=sparse.lil_matrix(X_train) + z_b=0 + count_b=0 + cl_benign=list() + for i, j in enumerate(cl_X_train): + if j == 0: + delete_row_lil(benign_train, i-count_b) + count_b=count_b+1 + else: + cl_benign.insert(z_b, 1) + z_b=z_b+1 + print("***********Size of Each Data Part:**********") + print("malware_train=", malware_train.get_shape()) + print("benign_train=", benign_train.get_shape()) + #*************************************************** + row_malware_train,column_malware_train=malware_train.get_shape() + #Number_of_flipped_label=int(row_malware_train) + + X_train_LFA=X_train.copy() + Y_train_LFA=Y_train[:] + + row_train_LFA,column_train_LFA=X_train_LFA.get_shape() + clusterer = KMeans(n_clusters=2, random_state=10) + X=X_train_LFA.toarray() + t0=time.time() + cluster_labels = clusterer.fit_predict(X) + sample_silhouette_values = silhouette_samples(X, cluster_labels) + #print("sample_silhouette_values=",sample_silhouette_values) + + flipped_Y_train=list(Y_train_LFA) + counter=0 + for new_index in range(row_train_LFA): + if (sample_silhouette_values[new_index]<0.1): #and (flipped_Y_train[new_index]==0) + flipped_Y_train[new_index]=abs(flipped_Y_train[new_index]-1) #flipped_Y_train[new_index]=1 + counter=counter+1 + + print("Flipped counter=", counter) + t1=time.time() + print("Time for Label Flipping Attack =",t1-t0) + print(" ") + + #************************************************************************** + model_main_LFA_Final = Sequential() + model_main_LFA_Final.add(Embedding(row_train_LFA, 8, input_length=column_train_LFA)) + model_main_LFA_Final.add(Conv1D(16,2, strides=2, padding='same')) + model_main_LFA_Final.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_LFA_Final.add(Conv1D(32,2, strides=2, padding='same')) + model_main_LFA_Final.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_LFA_Final.add(Conv1D(64,2, strides=2, padding='same')) + model_main_LFA_Final.add(Flatten()) + model_main_LFA_Final.add(Dense(1, activation='sigmoid')) + model_main_LFA_Final.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main_LFA_Final.fit(X_train_LFA, flipped_Y_train, epochs=200, verbose=0) + + + Y_predict_LFA=model_main_LFA_Final.predict(X_test, verbose=0) + Y_predict_LFA_Final=[0]*len(Y_predict_LFA) + + for i in range(len(Y_predict_LFA)): + if Y_predict_LFA[i]<0.5: + Y_predict_LFA[i]=0 + else: + Y_predict_LFA[i]=1 + + for i in range(len(Y_predict_LFA)): + Y_predict_LFA_Final[i]= int(Y_predict_LFA[i]) + #*****************************************************Result of Model with LFA ****************************************************** + print("********************************Result of Model with LFA attack **************************************************************") + print(" ") + loss, accuracy = model_main_LFA_Final.evaluate(X_train_LFA, flipped_Y_train, verbose=2) + print('Accuracy for Train set: %f' % (accuracy*100)) + print('Loss for Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main_LFA_Final.evaluate(X_test, Y_test, verbose=2) + print('Accuracy for Test set: %f' % (accuracy*100)) + print('Loss for Test set:: %f' % (loss)) + print(" ") + + TN_LFA, FP_LFA, FN_LFA, TP_LFA = confusion_matrix(Y_test, Y_predict_LFA_Final).ravel() + print("TN_LFA=",TN_LFA) + print("FP_LFA=",FP_LFA) + print("FN_LFA=",FN_LFA) + print("TP_LFA=",TP_LFA) + print(" ") + + if (FP_LFA+TN_LFA)>0: + FPR_LFA=FP_LFA/(FP_LFA+TN_LFA) + print("The FPR_LFA result=", FPR_LFA) + + if (FP_LFA+TN_LFA)>0: + TPR_LFA=TP_LFA/(TP_LFA+FN_LFA) + print("The TPR_LFA result=", TPR_LFA) + + if (TN_LFA+FP_LFA)>0: + TNR_LFA=TN_LFA/(TN_LFA+FP_LFA) + print("The TNR_LFA result=", TNR_LFA) + + if (FN_LFA+TP_LFA)>0: + FNR_LFA=FN_LFA/(FN_LFA+TP_LFA) + print("The FNR_LFA result=", FNR_LFA) + + if ((TN_LFA/(TN_LFA+FP_LFA))+(TP_LFA/(TP_LFA+FP_LFA)))>0: + AUC_LFA=1/(2*((TN_LFA/(TN_LFA+FP_LFA))+(TP_LFA/(TP_LFA+FP_LFA)))) + print("The AUC_LFA result=", AUC_LFA) + + if (TP_LFA+TN_LFA+FP_LFA+FN_LFA)>0: + ACC_LFA=(TP_LFA+TN_LFA)/(TP_LFA+TN_LFA+FP_LFA+FN_LFA) + print("The ACC_LFAk result=", ACC_LFA) + + if ((TP_LFA+FP_LFA)*(TP_LFA+FN_LFA)*(TN_LFA+FP_LFA)*(TN_LFA+FN_LFA))>0: + MCC_LFA=(TP_LFA*TN_LFA-FP_LFA*FN_LFA)/math.sqrt((TP_LFA+FP_LFA)*(TP_LFA+FN_LFA)*(TN_LFA+FP_LFA)*(TN_LFA+FN_LFA)) + print("The Matthews correlation coefficient result=", MCC_LFA) + print(" ") + print("************************************************End of Label Flipping Attack part**********************************************") + print(" ") + print(" ") + print(" ") + print("*****************************************************KNN Based Semi-Supervised Defense(KSD)************************************") + print(" ") + + X_train_KNN=X_train.copy() + Y_train_KNN=flipped_Y_train[:] + + X_val_KNN=X_val.copy() + Y_val_KNN=Y_val[:] + + row_train_KNN,column_train_KNN=X_train_KNN.get_shape() + + Number_of_flipped_label=int(row_train_KNN/50) + Y_train_corrected_By_KNN=list(Y_train_KNN) + + c=0 + m=0 + t2=time.time() + + for i in list(range(Number_of_flipped_label)): + row_KNN=X_train_KNN.getrow(i) + distances = sklearn.metrics.pairwise.manhattan_distances(row_KNN,X_val_KNN) + indices = distances.argsort()[:10] + d=indices[0] + a=d[0:10] + + F=0 + for j in range(len(a)): + t=a[j] + F=F+Y_val_KNN[t] + fraction=F/10 + if fraction>=0.5: + Y_train_corrected_By_KNN[i]=1 + m=m+1 + else: + Y_train_corrected_By_KNN[i]=0 + c=c+1 + Y_train_corrected_By_KNN_Final=np.array(Y_train_corrected_By_KNN) + t3=time.time() + print("Time for KNN Based Semi-Supervised Defense(KSD) =",t3-t2) + print(" ") + + model_main_KNN = Sequential() + model_main_KNN.add(Embedding(row_train_NoAttack, 8, input_length=column_train_NoAttack)) + model_main_KNN.add(Conv1D(16,2, strides=2, padding='same')) + model_main_KNN.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_KNN.add(Conv1D(32,2, strides=2, padding='same')) + model_main_KNN.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_KNN.add(Conv1D(64,2, strides=2, padding='same')) + model_main_KNN.add(Flatten()) + model_main_KNN.add(Dense(1, activation='sigmoid')) + model_main_KNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main_KNN.fit(X_train_KNN,Y_train_corrected_By_KNN_Final, epochs=20, batch_size=32, verbose=0) + Y_predict_KNN=model_main_KNN.predict(X_test, verbose=0) + + Y_predict_KNN_Final=[0]*len(Y_predict_KNN) + for i in range(len(Y_predict_KNN)): + if Y_predict_KNN[i]<0.5: + Y_predict_KNN[i]=0 + else: + Y_predict_KNN[i]=1 + + for i in range(len(Y_predict_KNN)): + Y_predict_KNN_Final[i]= int(Y_predict_KNN[i]) + #*****************************************************Result of Model After KNN Based Defense***************************************** + print("************************Result After KNN_Based Defense************************************************************************") + print(" ") + + loss, accuracy = model_main_KNN.evaluate(X_train_KNN, Y_train_KNN, verbose=0) + print('Accuracy for Train set: %f' % (accuracy*100)) + print('Loss for Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main_KNN.evaluate(X_test, Y_test, batch_size=32, verbose=0) + print('Accuracy After KNN-Based Defense: %f' % (accuracy*100)) + print('Loss After KNN-Based Defense: %f' % (loss)) + print(" ") + + TN_KNN, FP_KNN, FN_KNN, TP_KNN = confusion_matrix(Y_test, Y_predict_KNN_Final).ravel() + print("TN_KNN=",TN_KNN) + print("FP_KNN=",FP_KNN) + print("FN_KNN=",FN_KNN) + print("TP_KNN=",TP_KNN) + print(" ") + + if (FP_KNN+TN_KNN)>0: + FPR_KNN=FP_KNN/(FP_KNN+TN_KNN) + print("The FPR_KNN result=", FPR_KNN) + + if (FP_KNN+TN_KNN)>0: + TPR_KNN=TP_KNN/(TP_KNN+FN_KNN) + print("The TPR_KNN result=", TPR_KNN) + + if (TN_KNN+FP_KNN)>0: + TNR_KNN=TN_KNN/(TN_KNN+FP_KNN) + print("The TNR_KNN result=", TNR_KNN) + + if (FN_KNN+TP_KNN)>0: + FNR_KNN=FN_KNN/(FN_KNN+TP_KNN) + print("The FNR_KNN result=", FNR_KNN) + + if ((TN_KNN/(TN_KNN+FP_KNN))+(TP_KNN/(TP_KNN+FP_KNN)))>0: + AUC_KNN=1/(2*((TN_KNN/(TN_KNN+FP_KNN))+(TP_KNN/(TP_KNN+FP_KNN)))) + print("The AUC_KNN result=", AUC_KNN) + + if (TP_KNN+TN_KNN+FP_KNN+FN_KNN)>0: + ACC_KNN=(TP_KNN+TN_KNN)/(TP_KNN+TN_KNN+FP_KNN+FN_KNN) + print("The ACC_KNN result=", ACC_KNN) + + if ((TP_KNN+FP_KNN)*(TP_KNN+FN_KNN)*(TN_KNN+FP_KNN)*(TN_KNN+FN_KNN))>0: + MCC_KNN=(TP_KNN*TN_KNN-FP_KNN*FN_KNN)/math.sqrt((TP_KNN+FP_KNN)*(TP_KNN+FN_KNN)*(TN_KNN+FP_KNN)*(TN_KNN+FN_KNN)) + print("The Matthews correlation coefficient result=", MCC_KNN) + print(" ") + print("************************************************End of KNN Based Semi-Supervised Defense(KSD) part*****************************") + print(" ") + print(" ") + print(" ") + print("*****************************************************Label Based Semi-supervised Defense(LSD)**********************************") + print(" ") + #***********************label Propagation and Label Spreading for Using in Label Based Semi-supervised Defense(LSD) ******************* + X_train_LSD=X_train.copy() + Y_train_LSD=flipped_Y_train[:] + + X_val_LSD=X_val.copy() + Y_val_LSD=Y_val[:] + row_val_LSD,column_val_LSD=X_val_LSD.get_shape() + row_train_LSD,column_train_LSD=X_train_LSD.get_shape() + + t4=time.time() + + labels = np.full(row_train_LSD, -1) + for i in range(row_val_LSD): + labels[i] = Y_val_LSD[i] + + X=X_train_LSD.toarray() + label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=0.8) + label_propa=label_propagation.LabelPropagation(kernel='knn', gamma=20, n_neighbors=7, max_iter=1000, tol=0.001, n_jobs=None) + label_spread.fit(X, labels) + label_propa.fit(X, labels) + output_labels_spread = label_spread.transduction_ + output_labels_propa = label_propa.transduction_ + #*******************Convolutional Neural Network for Using in Label Based Semi-supervised Defense(LSD) ****************************** + CNN_model_for_LSD = Sequential() + CNN_model_for_LSD.add(Embedding(row_train_LSD, 8, input_length=column_train_LSD)) + CNN_model_for_LSD.add(Conv1D(16,2, strides=2, padding='same')) + CNN_model_for_LSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + CNN_model_for_LSD.add(Conv1D(32,2, strides=2, padding='same')) + CNN_model_for_LSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + CNN_model_for_LSD.add(Conv1D(64,2, strides=2, padding='same')) + CNN_model_for_LSD.add(Flatten()) + + CNN_model_for_LSD.add(Dense(1, activation='sigmoid')) + CNN_model_for_LSD.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + CNN_model_for_LSD.fit(X_train_LSD, Y_train_LSD, epochs=200, verbose=0) + + Y_predict_CNN_for_LSD=CNN_model_for_LSD.predict(X_train_LSD, verbose=0) + + Y_predict_CNN_LSD_Final=[0]*len(Y_predict_CNN_for_LSD) + for i in range(len(Y_predict_CNN_for_LSD)): + if Y_predict_CNN_for_LSD[i]<0.5: + Y_predict_CNN_for_LSD[i]=0 + else: + Y_predict_CNN_for_LSD[i]=1 + + for i in range(len(Y_predict_CNN_for_LSD)): + Y_predict_CNN_LSD_Final[i]= int(Y_predict_CNN_for_LSD[i]) + #*******************************************Voting Between CNN , label Propagation and Label Spreading************************** + Y_predict_LSD_Final=[0]*len(Y_train) + for i in range(len(Y_train)): + c=Y_train_LSD[i]+Y_predict_CNN_LSD_Final[i]+output_labels_propa[i]+output_labels_spread[i] + if 2<=c: + Y_predict_LSD_Final[i]=1 + else: + Y_predict_LSD_Final[i]=0 + t5=time.time() + print("Time for Label Based Semi-supervised Defense =",t5-t4) + print(" ") + #********************************************************************************************************************************* + model_main_LSD = Sequential() + model_main_LSD.add(Embedding(row_train_LSD, 8, input_length=column_train_LSD)) + model_main_LSD.add(Conv1D(16,2, strides=2, padding='same')) + model_main_LSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_LSD.add(Conv1D(32,2, strides=2, padding='same')) + model_main_LSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_LSD.add(Conv1D(64,2, strides=2, padding='same')) + model_main_LSD.add(Flatten()) + model_main_LSD.add(Dense(1, activation='sigmoid')) + model_main_LSD.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main_LSD.fit(X_train_LSD, Y_predict_LSD_Final, epochs=200, verbose=0) + + Y_predict_LSD_Defense=model_main_LSD.predict(X_test, verbose=0) + Y_predict_LSD_Defense_Final=[0]*len(Y_predict_LSD_Defense) + + for i in range(len(Y_predict_LSD_Defense)): + if Y_predict_LSD_Defense[i]<0.5: + Y_predict_LSD_Defense[i]=0 + else: + Y_predict_LSD_Defense[i]=1 + + for i in range(len(Y_predict_LSD_Defense)): + Y_predict_LSD_Defense_Final[i]= int(Y_predict_LSD_Defense[i]) + #**************************************Result of Model after Label Based Semi-supervised Defense(LSD)********************************** + print("************************Result of Model after Label Based Semi-supervised Defense(LSD)*****************************************") + print(" ") + loss, accuracy = model_main.evaluate(X_train, Y_predict_LSD_Final, verbose=2) + print('Accuracy for Train set: %f' % (accuracy*100)) + print('Loss for Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main.evaluate(X_test, Y_test, verbose=2) + print('Accuracy for Test set: %f' % (accuracy*100)) + print('Loss for Test set:: %f' % (loss)) + print(" ") + + TN_LSD, FP_LSD, FN_LSD, TP_LSD = confusion_matrix(Y_test, Y_predict_LSD_Defense_Final).ravel() + print("TN_LSD=",TN_LSD) + print("FP_LSD=",FP_LSD) + print("FN_LSD=",FN_LSD) + print("TP_LSD=",TP_LSD) + print(" ") + + if (FP_LSD+TN_LSD)>0: + FPR_LSD=FP_LSD/(FP_LSD+TN_LSD) + print("The FPR_LSD result=", FPR_LSD) + + if (FP_LSD+TN_LSD)>0: + TPR_LSD=TP_LSD/(TP_LSD+FN_LSD) + print("The TPR_LSD result=", TPR_LSD) + + if (TN_LSD+FP_LSD)>0: + TNR_LSD=TN_LSD/(TN_LSD+FP_LSD) + print("The TNR_LSD result=", TNR_LSD) + + if (FN_LSD+TP_LSD)>0: + FNR_LSD=FN_LSD/(FN_LSD+TP_LSD) + print("The FNR_LSD result=", FNR_LSD) + + if ((TN_LSD/(TN_LSD+FP_LSD))+(TP_LSD/(TP_LSD+FP_LSD)))>0: + AUC_LSD=1/(2*((TN_LSD/(TN_LSD+FP_LSD))+(TP_LSD/(TP_LSD+FP_LSD)))) + print("The AUC result=", AUC_LSD) + + if (TP_LSD+TN_LSD+FP_LSD+FN_LSD)>0: + ACC_LSD=(TP_LSD+TN_LSD)/(TP_LSD+TN_LSD+FP_LSD+FN_LSD) + print("The ACC result=", ACC_LSD) + + if ((TP_LSD+FP_LSD)*(TP_LSD+FN_LSD)*(TN_LSD+FP_LSD)*(TN_LSD+FN_LSD))>0: + MCC_LSD=(TP_LSD*TN_LSD-FP_LSD*FN_LSD)/math.sqrt((TP_LSD+FP_LSD)*(TP_LSD+FN_LSD)*(TN_LSD+FP_LSD)*(TN_LSD+FN_LSD)) + print("The Matthews correlation coefficient result=", MCC_LSD) + print(" ") + print("*****************************************************End of Label Based Semi-supervised Defense(LSD)***************************") + print(" ") + print(" ") + print(" ") + print("*****************************************************Clustering Based Semi-supervised Defense(CSD)*****************************") + print(" ") + + X_train_CSD=X_train.copy() + Y_train_CSD=flipped_Y_train[:] + + X_val_CSD=X_val.copy() + Y_val_CSD=Y_val[:] + row_train_CSD,column_train_CSD=X_train_CSD.get_shape() + + t6=time.time() + + Y_predict_val_from_CNN_Model=model_main.predict(X_val_CSD, verbose=0) + + Y_predict_val_from_CNN_Model_Final=[0]*len(Y_predict_val_from_CNN_Model) + for i in range(len(Y_predict_val_from_CNN_Model)): + if Y_predict_val_from_CNN_Model[i]<0.5: + Y_predict_val_from_CNN_Model[i]=0 + else: + Y_predict_val_from_CNN_Model[i]=1 + for i in range(len(Y_predict_val_from_CNN_Model)): + Y_predict_val_from_CNN_Model_Final[i]= int(Y_predict_val_from_CNN_Model[i]) + + adjusted_rand_score_val=metrics.adjusted_rand_score(Y_val_CSD, Y_predict_val_from_CNN_Model_Final) + adjusted_mutual_info_score_val=metrics.adjusted_mutual_info_score(Y_val_CSD, Y_predict_val_from_CNN_Model_Final) + homogeneity_score_val=metrics.homogeneity_score(Y_val_CSD, Y_predict_val_from_CNN_Model_Final) + fowlkes_mallows_score_val=metrics.fowlkes_mallows_score(Y_val_CSD, Y_predict_val_from_CNN_Model_Final) + + for i in range(20): #row_train + Y_temp=Y_val_CSD.copy() + + row=X_train_CSD.getrow(i) + X_temp = sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((X_val_CSD, row)))) + Y_temp.append(Y_train_CSD[i]) + + Y_predict_CNN_compute_CSD=model_main.predict(X_temp, verbose=0) + + Y_predict_temp=[0]*len(Y_predict_CNN_compute_CSD) + + for n in range(len(Y_predict_CNN_compute_CSD)): + if Y_predict_CNN_compute_CSD[n]<0.5: + Y_predict_CNN_compute_CSD[n]=0 + else: + Y_predict_CNN_compute_CSD[n]=1 + + for m in range(len(Y_predict_CNN_compute_CSD)): + Y_predict_temp[m]= int(Y_predict_CNN_compute_CSD[m]) + + adjusted_rand_score_temp=metrics.adjusted_rand_score(Y_temp, Y_predict_temp) + adjusted_mutual_info_score_temp=metrics.adjusted_mutual_info_score(Y_temp, Y_predict_temp) + homogeneity_score_temp=metrics.homogeneity_score(Y_temp, Y_predict_temp) + fowlkes_mallows_score_temp=metrics.fowlkes_mallows_score(Y_temp, Y_predict_temp) + + landa1=abs(adjusted_rand_score_temp-adjusted_rand_score_val) + landa2=abs(adjusted_mutual_info_score_temp-adjusted_mutual_info_score_val) + landa3=abs(homogeneity_score_temp-homogeneity_score_val) + landa4=abs(fowlkes_mallows_score_temp-fowlkes_mallows_score_val) + + sum_of_diffrences=landa1+landa2+landa3+landa4 + + if sum_of_diffrences<0.1: + X_val_CSD = sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((X_val_CSD, row)))) + Y_val_CSD.append(Y_train_CSD[i]) + Y_predict_CNN_inside_CSD=model_main.predict(X_val_CSD, verbose=0) + + Y_predict_CNN_inside_CSD_Final=[0]*len(Y_predict_CNN_inside_CSD) #Y_predict_CNN_inside + for j in range(len(Y_predict_CNN_inside_CSD)): #Y_predict_CNN_inside + if Y_predict_CNN_inside_CSD[j]<0.5: + Y_predict_CNN_inside_CSD[j]=0 + else: + Y_predict_CNN_inside_CSD[j]=1 + + for k in range(len(Y_predict_CNN_inside_CSD)): #Y_predict_CNN_inside + Y_predict_CNN_inside_CSD_Final[k]= int(Y_predict_CNN_inside_CSD[k]) + + adjusted_rand_score_val=metrics.adjusted_rand_score(Y_val_CSD, Y_predict_CNN_inside_CSD_Final) + adjusted_mutual_info_score_val=metrics.adjusted_mutual_info_score(Y_val_CSD, Y_predict_CNN_inside_CSD_Final) + homogeneity_score_val=metrics.homogeneity_score(Y_val_CSD, Y_predict_CNN_inside_CSD_Final) + fowlkes_mallows_score_val=metrics.fowlkes_mallows_score(Y_val_CSD, Y_predict_CNN_inside_CSD_Final) + t7=time.time() + print("Time for Clustering Based Semi-supervised Defense =",t7-t6) + print(" ") + #**************************************************************************************** + X_train_Final_CSD= X_val_CSD.copy() + Y_train_Final_CSD=Y_val_CSD.copy() + row_train_CSD_Final,col_train_CSD_Final=X_train_Final_CSD.get_shape() + + model_main_CSD = Sequential() + model_main_CSD.add(Embedding(row_train_CSD_Final, 8, input_length=col_train_CSD_Final)) + model_main_CSD.add(Conv1D(16,2, strides=2, padding='same')) + model_main_CSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_CSD.add(Conv1D(32,2, strides=2, padding='same')) + model_main_CSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_CSD.add(Conv1D(64,2, strides=2, padding='same')) + model_main_CSD.add(Flatten()) + model_main_CSD.add(Dense(1, activation='sigmoid')) + model_main_CSD.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main_CSD.fit(X_train_Final_CSD, Y_train_Final_CSD, epochs=200, verbose=0) + + Y_test_predict_CSD=model_main_CSD.predict(X_test, verbose=0) + + Y_test_predict_CSD_Final=[0]*len(Y_test_predict_CSD) + for i in range(len(Y_test_predict_CSD)): + if Y_test_predict_CSD[i]<0.5: + Y_test_predict_CSD[i]=0 + else: + Y_test_predict_CSD[i]=1 + + for i in range(len(Y_test_predict_CSD)): + Y_test_predict_CSD_Final[i]= int(Y_test_predict_CSD[i]) + + #*****************************************************Result of Model after Clustering Based Semi-supervised Defense(CSD)************** + print("***********************Result of Model after Clustering Based Semi-supervised Defense(CSD)*************************************") + print(" ") + + loss, accuracy = model_main_CSD.evaluate(X_train_Final_CSD, Y_train_Final_CSD, verbose=2) + print('Accuracy for New Train set: %f' % (accuracy*100)) + print('Loss for New Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main_CSD.evaluate(X_test, Y_test, verbose=2) + print('Accuracy for Test set: %f' % (accuracy*100)) + print('Loss for Test set:: %f' % (loss)) + print(" ") + + TN_CSD, FP_CSD, FN_CSD, TP_CSD = confusion_matrix(Y_test, Y_test_predict_CSD_Final).ravel() + print("TN_CSD=",TN_CSD) + print("FP_CSD=",FP_CSD) + print("FN_CSD=",FN_CSD) + print("TP_CSD=",TP_CSD) + print(" ") + + if (FP_CSD+TN_CSD)>0: + FPR_CSD=FP_CSD/(FP_CSD+TN_CSD) + print("The FPR_CSD result=", FPR_CSD) + + if (FP_CSD+TN_CSD)>0: + TPR_CSD=TP_CSD/(TP_CSD+FN_CSD) + print("The TPR_CSD result=", TPR_CSD) + + if (TN_CSD+FP_CSD)>0: + TNR_CSD=TN_CSD/(TN_CSD+FP_CSD) + print("The TNR_CSD result=", TNR_CSD) + + if (FN_CSD+TP_CSD)>0: + FNR_CSD=FN_CSD/(FN_CSD+TP_CSD) + print("The FNR_CSD result=", FNR_CSD) + + if ((TN_CSD/(TN_CSD+FP_CSD))+(TP_CSD/(TP_CSD+FP_CSD)))>0: + AUC_CSD=1/(2*((TN_CSD/(TN_CSD+FP_CSD))+(TP_CSD/(TP_CSD+FP_CSD)))) + print("The AUC_CSD result=", AUC_CSD) + + if (TP_CSD+TN_CSD+FP_CSD+FN_CSD)>0: + ACC_CSD=(TP_CSD+TN_CSD)/(TP_CSD+TN_CSD+FP_CSD+FN_CSD) + print("The ACC_CSD result=", ACC_CSD) + + if ((TP_CSD+FP_CSD)*(TP_CSD+FN_CSD)*(TN_CSD+FP_CSD)*(TN_CSD+FN_CSD))>0: + MCC_CSD=(TP_CSD*TN_CSD-FP_CSD*FN_CSD)/math.sqrt((TP_CSD+FP_CSD)*(TP_CSD+FN_CSD)*(TN_CSD+FP_CSD)*(TN_CSD+FN_CSD)) + print("The Matthews correlation coefficient result=", MCC_CSD) + print(" ") + print("************************************************End of Clustering Based Semi-supervised Defense(LSD)***************************") + print(" ") + print(" ") + print(" ") +#****************************************************************************************************************************************** +if __name__ == "__main__": + main() +#****************************************************************************** \ No newline at end of file diff --git a/Taheri2020NCAA-labelflipping_Sourcecode/Label_Flipping_Paper_without_Feature_Selection(LSD_CSD_KDD).py b/Taheri2020NCAA-labelflipping_Sourcecode/Label_Flipping_Paper_without_Feature_Selection(LSD_CSD_KDD).py new file mode 100644 index 0000000000000000000000000000000000000000..baf5038c8542251e51c12edfdbae2473cab18d83 --- /dev/null +++ b/Taheri2020NCAA-labelflipping_Sourcecode/Label_Flipping_Paper_without_Feature_Selection(LSD_CSD_KDD).py @@ -0,0 +1,833 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 21 14:25:17 2019 + +@author: Rahim +""" +#*****************************************************************import Library***************************************************************************** +from __future__ import print_function +from sklearn.feature_selection import SelectFromModel +from sklearn.feature_selection import SelectKBest, f_regression +from sklearn.model_selection import KFold +from sklearn.model_selection import cross_val_score +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix +from sklearn import model_selection +from sklearn.feature_selection import RFE +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.ensemble import RandomForestRegressor +from scipy.sparse import csr_matrix, vstack, hstack +from scipy.sparse import coo_matrix +from keras.preprocessing.text import one_hot +from sklearn import metrics +from sklearn.metrics import silhouette_samples, silhouette_score +from sklearn.semi_supervised import LabelPropagation +from sklearn.semi_supervised import LabelSpreading +from sklearn.semi_supervised import label_propagation +from sklearn.metrics import roc_auc_score +from sklearn.metrics import f1_score +from sklearn.cluster import KMeans +import math +#import keras +from keras.models import Sequential +from keras.layers import Dense, Dropout, Activation , Flatten +from sklearn.metrics import log_loss +from keras.optimizers import SGD +from keras.layers.normalization import BatchNormalization +from keras.layers.convolutional import UpSampling2D +from keras.layers.convolutional import Conv2D, MaxPooling2D, MaxPooling1D +from keras.layers.embeddings import Embedding +from scipy import sparse +import pandas as pd +import numpy as np +#import random +import sklearn +from sklearn.metrics.pairwise import manhattan_distances +from keras.models import Model +from keras.layers import Conv1D, multiply, GlobalMaxPool1D, Input , Lambda +import time +import argparse +#import math +from numpy import * +import os.path as osp +import scipy.sparse as sp +import pickle +from sklearn.metrics import accuracy_score +#********************************************************************************************************************************* +CLASS = 'class' +CLASS_BEN = 'B' +CLASS_MAL = 'M' +DATA = 'data' +#********************************************Functions that will be used in this program***************************************** +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input-tables', nargs='*', dest='input_tables') + + args = parser.parse_args() + + return args +#********************************************************************************************************************************* +def read_table(table_file): + + table = dict() + + with open(table_file, 'rb') as handle: + while True: + try: + table = pickle.load(handle) + except EOFError: + break + + f_set=set() + + for k,v in table.items(): + for feature in v[DATA]: + f_set.add(feature) + + return table , f_set +#********************************************************************************************************************************* +def build_table(tables): + full_table = dict() + + file_set = set() + + for table in tables: + file_set.update(table.keys()) + for key, val in table.items(): + full_table[key] = val + + files = list(file_set) + return full_table, files +#********************************************************************************************************************************* +def convert_to_matrix(table, features, files): + mat = sp.lil.lil_matrix((len(files), len(features)), dtype=np.int8) + + print("Input Data Size = ", mat.get_shape()) + # the response vector + + cl = [0]*len(files) + + for key, val in table.items(): + k = files.index(key) + + if val[CLASS] is CLASS_BEN: + cl[k] = 1 + + for v in val[DATA]: + try: + idx = features.index(v) + mat[k, idx] = 1 + except Exception as e: + print(e) + pass + + return mat, cl +#****************************************************************************** +def delete_row_lil(mat, i): + if not isinstance(mat, sp.lil.lil_matrix): + raise ValueError("works only for LIL format -- use .tolil() first") + mat.rows = np.delete(mat.rows, i) + mat.data = np.delete(mat.data, i) + mat._shape = (mat._shape[0] - 1, mat._shape[1]) +#****************************************************************************** +def relevant_features(data, response_vector, features): + rel_features = list() + ranked_index=list() + + model =RandomForestRegressor() + rfe = RFE(model, 1) + fit = rfe.fit(data, response_vector) + old_features=features + + for i in fit.ranking_: + if i<len(features): + rel_features.append(features[i]) + ranked_index=[old_features.index(x) for x in rel_features if x in old_features] + + return rel_features ,ranked_index +#*****************************************************************Main Function******************************************************* +def main(): + args = parse_args() + tables = [] + f_set = set() + #read the data + for t_files in args.input_tables: + table, features = read_table(t_files) + f_set = f_set.union(features) + tables.append(table) + print(" ") + print(" ") + print("*********Semi-Supervised Deep Learning Based Approach Against Label Flipping Attack in Malware Detection System*****************") + print(" ") + #************************************build table from data and convert to matrix*************************************************** + full_table, files = build_table(tables) + files.sort() + features = list(f_set) + features.sort() + mat, cl = convert_to_matrix(full_table, features, files) + data = sparse.lil_matrix(sparse.csr_matrix(mat)) + #******************************************Split data to train , test and validation********************************************** + seed = 10 + test_size = 0.2 + X_train, X_test, Y_train, Y_test= train_test_split(data, cl, test_size= test_size, random_state=seed) + test_size = 0.25 + X_train, X_val, Y_train, Y_val= train_test_split(X_train, Y_train, test_size= test_size, random_state=seed) + #*********************************************************************************************************************************** + X_train=sparse.csr_matrix(X_train) + print("row_train,column_train=", X_train.get_shape()) + print(" ") + X_val=sparse.csr_matrix(X_val) + row_val,column_val=X_val.get_shape() + print("row_val,column_val=",X_val.get_shape()) + print(" ") + X_test=sparse.csr_matrix(X_test) + row_test,column_test=X_test.get_shape() + print("row_test,column_test=",X_test.get_shape()) + print(" ") + print("********************************************************************") + #**************************************************Model Definition***************************************************************** + X_train_NoAttack=X_train.copy() + Y_train_NoAttack=Y_train[:] + + X_val_NoAttack=X_val.copy() + Y_val_NoAttack=Y_val[:] + + row_train_NoAttack,column_train_NoAttack=X_train_NoAttack.get_shape() + model_main = Sequential() + model_main.add(Embedding(row_train_NoAttack, 8, input_length=column_train_NoAttack)) + model_main.add(Conv1D(16,2, strides=2, padding='same')) + model_main.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main.add(Conv1D(32,2, strides=2, padding='same')) + model_main.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main.add(Conv1D(64,2, strides=2, padding='same')) + model_main.add(Flatten()) + model_main.add(Dense(1, activation='sigmoid')) + model_main.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main.fit(X_train_NoAttack, Y_train_NoAttack, epochs=200, verbose=0) + + Y_CNN_NoAttack=model_main.predict(X_test, verbose=0) + Y_predict_NoAttack=[0]*len(Y_CNN_NoAttack) + + for i in range(len(Y_CNN_NoAttack)): + if Y_CNN_NoAttack[i]<0.5: + Y_CNN_NoAttack[i]=0 + else: + Y_CNN_NoAttack[i]=1 + + for i in range(len(Y_CNN_NoAttack)): + Y_predict_NoAttack[i]= int(Y_CNN_NoAttack[i]) + #*****************************************************Result of Model without attack on X_test***************************************** + print("********************************Result of Model without attack******************************************************************") + loss, accuracy = model_main.evaluate(X_train_NoAttack, Y_train_NoAttack, verbose=2) + print('Accuracy for Train set: %f' % (accuracy*100)) + print('Loss for Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main.evaluate(X_val_NoAttack, Y_val_NoAttack, verbose=2) + print('Accuracy for Validation set: %f' % (accuracy*100)) + print('Loss for Train Validation set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main.evaluate(X_test, Y_test, verbose=2) + print('Accuracy for Test set: %f' % (accuracy*100)) + print('Loss for Test set:: %f' % (loss)) + print(" ") + + TN_NoAttack, FP_NoAttack, FN_NoAttack, TP_NoAttack = confusion_matrix(Y_test, Y_predict_NoAttack).ravel() + print("TN_NoAttack=",TN_NoAttack) + print("FP_NoAttack=",FP_NoAttack) + print("FN_NoAttack=",FN_NoAttack) + print("TP_NoAttack=",TP_NoAttack) + print(" ") + + if (FP_NoAttack+TN_NoAttack)>0: + FPR_NoAttack=FP_NoAttack/(FP_NoAttack+TN_NoAttack) + print("The FPR_NoAttack result=", FPR_NoAttack) + + if (FP_NoAttack+TN_NoAttack)>0: + TPR_NoAttack=TP_NoAttack/(TP_NoAttack+FN_NoAttack) + print("The TPR_NoAttack result=", TPR_NoAttack) + + if (TN_NoAttack+FP_NoAttack)>0: + TNR_NoAttack=TN_NoAttack/(TN_NoAttack+FP_NoAttack) + print("The TNR_NoAttack result=", TNR_NoAttack) + + if (FN_NoAttack+TP_NoAttack)>0: + FNR_NoAttack=FN_NoAttack/(FN_NoAttack+TP_NoAttack) + print("The FNR_NoAttack result=", FNR_NoAttack) + + if ((TN_NoAttack/(TN_NoAttack+FP_NoAttack))+(TP_NoAttack/(TP_NoAttack+FP_NoAttack)))>0: + AUC_NoAttack=1/(2*((TN_NoAttack/(TN_NoAttack+FP_NoAttack))+(TP_NoAttack/(TP_NoAttack+FP_NoAttack)))) + print("The AUC_NoAttack result=", AUC_NoAttack) + + if (TP_NoAttack+TN_NoAttack+FP_NoAttack+FN_NoAttack)>0: + ACC_NoAttack=(TP_NoAttack+TN_NoAttack)/(TP_NoAttack+TN_NoAttack+FP_NoAttack+FN_NoAttack) + print("The ACC_NoAttack result=", ACC_NoAttack) + + if ((TP_NoAttack+FP_NoAttack)*(TP_NoAttack+FN_NoAttack)*(TN_NoAttack+FP_NoAttack)*(TN_NoAttack+FN_NoAttack))>0: + MCC_NoAttack=(TP_NoAttack*TN_NoAttack-FP_NoAttack*FN_NoAttack)/math.sqrt((TP_NoAttack+FP_NoAttack)*(TP_NoAttack+FN_NoAttack)*(TN_NoAttack+FP_NoAttack)*(TN_NoAttack+FN_NoAttack)) + print("The Matthews correlation coefficient result=", MCC_NoAttack) + print(" ") + print("*****************************************************End of Without Attack part************************************************") + print(" ") + print(" ") + print(" ") + print("*****************************************************Label Flipping Attack*****************************************************") + print(" ") + #************************** + # finding Malware of Train data + malware_train= sparse.lil_matrix(X_train) + cl_malware=list() + z_m=0 + count_m=0 + for i, j in enumerate(Y_train): + if j == 1: + delete_row_lil(malware_train, i-count_m) + count_m=count_m+1 + else: + cl_malware.insert(z_m, 1) + z_m=z_m+1 + #*************************** + #Finding Benign of Train data + cl_X_train=list(Y_train) + benign_train=sparse.lil_matrix(X_train) + z_b=0 + count_b=0 + cl_benign=list() + for i, j in enumerate(cl_X_train): + if j == 0: + delete_row_lil(benign_train, i-count_b) + count_b=count_b+1 + else: + cl_benign.insert(z_b, 1) + z_b=z_b+1 + print("***********Size of Each Data Part:**********") + print("malware_train=", malware_train.get_shape()) + print("benign_train=", benign_train.get_shape()) + #*************************************************** + row_malware_train,column_malware_train=malware_train.get_shape() + #Number_of_flipped_label=int(row_malware_train) + + X_train_LFA=X_train.copy() + Y_train_LFA=Y_train[:] + + row_train_LFA,column_train_LFA=X_train_LFA.get_shape() + clusterer = KMeans(n_clusters=2, random_state=10) + X=X_train_LFA.toarray() + t0=time.time() + cluster_labels = clusterer.fit_predict(X) + sample_silhouette_values = silhouette_samples(X, cluster_labels) + #print("sample_silhouette_values=",sample_silhouette_values) + + flipped_Y_train=list(Y_train_LFA) + counter=0 + for new_index in range(row_train_LFA): + if (sample_silhouette_values[new_index]<0.1): #and (flipped_Y_train[new_index]==0) + flipped_Y_train[new_index]=abs(flipped_Y_train[new_index]-1) #flipped_Y_train[new_index]=1 + counter=counter+1 + + print("Flipped counter=", counter) + t1=time.time() + print("Time for Label Flipping Attack =",t1-t0) + print(" ") + + #************************************************************************** + model_main_LFA_Final = Sequential() + model_main_LFA_Final.add(Embedding(row_train_LFA, 8, input_length=column_train_LFA)) + model_main_LFA_Final.add(Conv1D(16,2, strides=2, padding='same')) + model_main_LFA_Final.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_LFA_Final.add(Conv1D(32,2, strides=2, padding='same')) + model_main_LFA_Final.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_LFA_Final.add(Conv1D(64,2, strides=2, padding='same')) + model_main_LFA_Final.add(Flatten()) + model_main_LFA_Final.add(Dense(1, activation='sigmoid')) + model_main_LFA_Final.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main_LFA_Final.fit(X_train_LFA, flipped_Y_train, epochs=200, verbose=0) + + + Y_predict_LFA=model_main_LFA_Final.predict(X_test, verbose=0) + Y_predict_LFA_Final=[0]*len(Y_predict_LFA) + + for i in range(len(Y_predict_LFA)): + if Y_predict_LFA[i]<0.5: + Y_predict_LFA[i]=0 + else: + Y_predict_LFA[i]=1 + + for i in range(len(Y_predict_LFA)): + Y_predict_LFA_Final[i]= int(Y_predict_LFA[i]) + #*****************************************************Result of Model with LFA ****************************************************** + print("********************************Result of Model with LFA attack **************************************************************") + print(" ") + loss, accuracy = model_main_LFA_Final.evaluate(X_train_LFA, flipped_Y_train, verbose=2) + print('Accuracy for Train set: %f' % (accuracy*100)) + print('Loss for Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main_LFA_Final.evaluate(X_test, Y_test, verbose=2) + print('Accuracy for Test set: %f' % (accuracy*100)) + print('Loss for Test set:: %f' % (loss)) + print(" ") + + TN_LFA, FP_LFA, FN_LFA, TP_LFA = confusion_matrix(Y_test, Y_predict_LFA_Final).ravel() + print("TN_LFA=",TN_LFA) + print("FP_LFA=",FP_LFA) + print("FN_LFA=",FN_LFA) + print("TP_LFA=",TP_LFA) + print(" ") + + if (FP_LFA+TN_LFA)>0: + FPR_LFA=FP_LFA/(FP_LFA+TN_LFA) + print("The FPR_LFA result=", FPR_LFA) + + if (FP_LFA+TN_LFA)>0: + TPR_LFA=TP_LFA/(TP_LFA+FN_LFA) + print("The TPR_LFA result=", TPR_LFA) + + if (TN_LFA+FP_LFA)>0: + TNR_LFA=TN_LFA/(TN_LFA+FP_LFA) + print("The TNR_LFA result=", TNR_LFA) + + if (FN_LFA+TP_LFA)>0: + FNR_LFA=FN_LFA/(FN_LFA+TP_LFA) + print("The FNR_LFA result=", FNR_LFA) + + if ((TN_LFA/(TN_LFA+FP_LFA))+(TP_LFA/(TP_LFA+FP_LFA)))>0: + AUC_LFA=1/(2*((TN_LFA/(TN_LFA+FP_LFA))+(TP_LFA/(TP_LFA+FP_LFA)))) + print("The AUC_LFA result=", AUC_LFA) + + if (TP_LFA+TN_LFA+FP_LFA+FN_LFA)>0: + ACC_LFA=(TP_LFA+TN_LFA)/(TP_LFA+TN_LFA+FP_LFA+FN_LFA) + print("The ACC_LFAk result=", ACC_LFA) + + if ((TP_LFA+FP_LFA)*(TP_LFA+FN_LFA)*(TN_LFA+FP_LFA)*(TN_LFA+FN_LFA))>0: + MCC_LFA=(TP_LFA*TN_LFA-FP_LFA*FN_LFA)/math.sqrt((TP_LFA+FP_LFA)*(TP_LFA+FN_LFA)*(TN_LFA+FP_LFA)*(TN_LFA+FN_LFA)) + print("The Matthews correlation coefficient result=", MCC_LFA) + print(" ") + print("************************************************End of Label Flipping Attack part**********************************************") + print(" ") + print(" ") + print(" ") + print("*****************************************************KNN Based Semi-Supervised Defense(KSD)************************************") + print(" ") + + X_train_KNN=X_train.copy() + Y_train_KNN=flipped_Y_train[:] + + X_val_KNN=X_val.copy() + Y_val_KNN=Y_val[:] + + row_train_KNN,column_train_KNN=X_train_KNN.get_shape() + + Number_of_flipped_label=int(row_train_KNN/50) + Y_train_corrected_By_KNN=list(Y_train_KNN) + + c=0 + m=0 + t2=time.time() + + for i in list(range(Number_of_flipped_label)): + row_KNN=X_train_KNN.getrow(i) + distances = sklearn.metrics.pairwise.manhattan_distances(row_KNN,X_val_KNN) + indices = distances.argsort()[:10] + d=indices[0] + a=d[0:10] + + F=0 + for j in range(len(a)): + t=a[j] + F=F+Y_val_KNN[t] + fraction=F/10 + if fraction>=0.5: + Y_train_corrected_By_KNN[i]=1 + m=m+1 + else: + Y_train_corrected_By_KNN[i]=0 + c=c+1 + Y_train_corrected_By_KNN_Final=np.array(Y_train_corrected_By_KNN) + t3=time.time() + print("Time for KNN Based Semi-Supervised Defense(KSD) =",t3-t2) + print(" ") + + model_main_KNN = Sequential() + model_main_KNN.add(Embedding(row_train_NoAttack, 8, input_length=column_train_NoAttack)) + model_main_KNN.add(Conv1D(16,2, strides=2, padding='same')) + model_main_KNN.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_KNN.add(Conv1D(32,2, strides=2, padding='same')) + model_main_KNN.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_KNN.add(Conv1D(64,2, strides=2, padding='same')) + model_main_KNN.add(Flatten()) + model_main_KNN.add(Dense(1, activation='sigmoid')) + model_main_KNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main_KNN.fit(X_train_KNN,Y_train_corrected_By_KNN_Final, epochs=20, batch_size=32, verbose=0) + Y_predict_KNN=model_main_KNN.predict(X_test, verbose=0) + + Y_predict_KNN_Final=[0]*len(Y_predict_KNN) + for i in range(len(Y_predict_KNN)): + if Y_predict_KNN[i]<0.5: + Y_predict_KNN[i]=0 + else: + Y_predict_KNN[i]=1 + + for i in range(len(Y_predict_KNN)): + Y_predict_KNN_Final[i]= int(Y_predict_KNN[i]) + #*****************************************************Result of Model After KNN Based Defense***************************************** + print("************************Result After KNN_Based Defense************************************************************************") + print(" ") + + loss, accuracy = model_main_KNN.evaluate(X_train_KNN, Y_train_KNN, verbose=0) + print('Accuracy for Train set: %f' % (accuracy*100)) + print('Loss for Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main_KNN.evaluate(X_test, Y_test, batch_size=32, verbose=0) + print('Accuracy After KNN-Based Defense: %f' % (accuracy*100)) + print('Loss After KNN-Based Defense: %f' % (loss)) + print(" ") + + TN_KNN, FP_KNN, FN_KNN, TP_KNN = confusion_matrix(Y_test, Y_predict_KNN_Final).ravel() + print("TN_KNN=",TN_KNN) + print("FP_KNN=",FP_KNN) + print("FN_KNN=",FN_KNN) + print("TP_KNN=",TP_KNN) + print(" ") + + if (FP_KNN+TN_KNN)>0: + FPR_KNN=FP_KNN/(FP_KNN+TN_KNN) + print("The FPR_KNN result=", FPR_KNN) + + if (FP_KNN+TN_KNN)>0: + TPR_KNN=TP_KNN/(TP_KNN+FN_KNN) + print("The TPR_KNN result=", TPR_KNN) + + if (TN_KNN+FP_KNN)>0: + TNR_KNN=TN_KNN/(TN_KNN+FP_KNN) + print("The TNR_KNN result=", TNR_KNN) + + if (FN_KNN+TP_KNN)>0: + FNR_KNN=FN_KNN/(FN_KNN+TP_KNN) + print("The FNR_KNN result=", FNR_KNN) + + if ((TN_KNN/(TN_KNN+FP_KNN))+(TP_KNN/(TP_KNN+FP_KNN)))>0: + AUC_KNN=1/(2*((TN_KNN/(TN_KNN+FP_KNN))+(TP_KNN/(TP_KNN+FP_KNN)))) + print("The AUC_KNN result=", AUC_KNN) + + if (TP_KNN+TN_KNN+FP_KNN+FN_KNN)>0: + ACC_KNN=(TP_KNN+TN_KNN)/(TP_KNN+TN_KNN+FP_KNN+FN_KNN) + print("The ACC_KNN result=", ACC_KNN) + + if ((TP_KNN+FP_KNN)*(TP_KNN+FN_KNN)*(TN_KNN+FP_KNN)*(TN_KNN+FN_KNN))>0: + MCC_KNN=(TP_KNN*TN_KNN-FP_KNN*FN_KNN)/math.sqrt((TP_KNN+FP_KNN)*(TP_KNN+FN_KNN)*(TN_KNN+FP_KNN)*(TN_KNN+FN_KNN)) + print("The Matthews correlation coefficient result=", MCC_KNN) + print(" ") + print("************************************************End of KNN Based Semi-Supervised Defense(KSD) part*****************************") + print(" ") + print(" ") + print(" ") + print("*****************************************************Label Based Semi-supervised Defense(LSD)**********************************") + print(" ") + #***********************label Propagation and Label Spreading for Using in Label Based Semi-supervised Defense(LSD) ******************* + X_train_LSD=X_train.copy() + Y_train_LSD=flipped_Y_train[:] + + X_val_LSD=X_val.copy() + Y_val_LSD=Y_val[:] + row_val_LSD,column_val_LSD=X_val_LSD.get_shape() + row_train_LSD,column_train_LSD=X_train_LSD.get_shape() + + t4=time.time() + + labels = np.full(row_train_LSD, -1) + for i in range(row_val_LSD): + labels[i] = Y_val_LSD[i] + + X=X_train_LSD.toarray() + label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=0.8) + label_propa=label_propagation.LabelPropagation(kernel='knn', gamma=20, n_neighbors=7, max_iter=1000, tol=0.001, n_jobs=None) + label_spread.fit(X, labels) + label_propa.fit(X, labels) + output_labels_spread = label_spread.transduction_ + output_labels_propa = label_propa.transduction_ + #*******************Convolutional Neural Network for Using in Label Based Semi-supervised Defense(LSD) ****************************** + CNN_model_for_LSD = Sequential() + CNN_model_for_LSD.add(Embedding(row_train_LSD, 8, input_length=column_train_LSD)) + CNN_model_for_LSD.add(Conv1D(16,2, strides=2, padding='same')) + CNN_model_for_LSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + CNN_model_for_LSD.add(Conv1D(32,2, strides=2, padding='same')) + CNN_model_for_LSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + CNN_model_for_LSD.add(Conv1D(64,2, strides=2, padding='same')) + CNN_model_for_LSD.add(Flatten()) + + CNN_model_for_LSD.add(Dense(1, activation='sigmoid')) + CNN_model_for_LSD.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + CNN_model_for_LSD.fit(X_train_LSD, Y_train_LSD, epochs=200, verbose=0) + + Y_predict_CNN_for_LSD=CNN_model_for_LSD.predict(X_train_LSD, verbose=0) + + Y_predict_CNN_LSD_Final=[0]*len(Y_predict_CNN_for_LSD) + for i in range(len(Y_predict_CNN_for_LSD)): + if Y_predict_CNN_for_LSD[i]<0.5: + Y_predict_CNN_for_LSD[i]=0 + else: + Y_predict_CNN_for_LSD[i]=1 + + for i in range(len(Y_predict_CNN_for_LSD)): + Y_predict_CNN_LSD_Final[i]= int(Y_predict_CNN_for_LSD[i]) + #*******************************************Voting Between CNN , label Propagation and Label Spreading************************** + Y_predict_LSD_Final=[0]*len(Y_train) + for i in range(len(Y_train)): + c=Y_train_LSD[i]+Y_predict_CNN_LSD_Final[i]+output_labels_propa[i]+output_labels_spread[i] + if 2<=c: + Y_predict_LSD_Final[i]=1 + else: + Y_predict_LSD_Final[i]=0 + t5=time.time() + print("Time for Label Based Semi-supervised Defense =",t5-t4) + print(" ") + #********************************************************************************************************************************* + model_main_LSD = Sequential() + model_main_LSD.add(Embedding(row_train_LSD, 8, input_length=column_train_LSD)) + model_main_LSD.add(Conv1D(16,2, strides=2, padding='same')) + model_main_LSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_LSD.add(Conv1D(32,2, strides=2, padding='same')) + model_main_LSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_LSD.add(Conv1D(64,2, strides=2, padding='same')) + model_main_LSD.add(Flatten()) + model_main_LSD.add(Dense(1, activation='sigmoid')) + model_main_LSD.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main_LSD.fit(X_train_LSD, Y_predict_LSD_Final, epochs=200, verbose=0) + + Y_predict_LSD_Defense=model_main_LSD.predict(X_test, verbose=0) + Y_predict_LSD_Defense_Final=[0]*len(Y_predict_LSD_Defense) + + for i in range(len(Y_predict_LSD_Defense)): + if Y_predict_LSD_Defense[i]<0.5: + Y_predict_LSD_Defense[i]=0 + else: + Y_predict_LSD_Defense[i]=1 + + for i in range(len(Y_predict_LSD_Defense)): + Y_predict_LSD_Defense_Final[i]= int(Y_predict_LSD_Defense[i]) + #**************************************Result of Model after Label Based Semi-supervised Defense(LSD)********************************** + print("************************Result of Model after Label Based Semi-supervised Defense(LSD)*****************************************") + print(" ") + loss, accuracy = model_main.evaluate(X_train, Y_predict_LSD_Final, verbose=2) + print('Accuracy for Train set: %f' % (accuracy*100)) + print('Loss for Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main.evaluate(X_test, Y_test, verbose=2) + print('Accuracy for Test set: %f' % (accuracy*100)) + print('Loss for Test set:: %f' % (loss)) + print(" ") + + TN_LSD, FP_LSD, FN_LSD, TP_LSD = confusion_matrix(Y_test, Y_predict_LSD_Defense_Final).ravel() + print("TN_LSD=",TN_LSD) + print("FP_LSD=",FP_LSD) + print("FN_LSD=",FN_LSD) + print("TP_LSD=",TP_LSD) + print(" ") + + if (FP_LSD+TN_LSD)>0: + FPR_LSD=FP_LSD/(FP_LSD+TN_LSD) + print("The FPR_LSD result=", FPR_LSD) + + if (FP_LSD+TN_LSD)>0: + TPR_LSD=TP_LSD/(TP_LSD+FN_LSD) + print("The TPR_LSD result=", TPR_LSD) + + if (TN_LSD+FP_LSD)>0: + TNR_LSD=TN_LSD/(TN_LSD+FP_LSD) + print("The TNR_LSD result=", TNR_LSD) + + if (FN_LSD+TP_LSD)>0: + FNR_LSD=FN_LSD/(FN_LSD+TP_LSD) + print("The FNR_LSD result=", FNR_LSD) + + if ((TN_LSD/(TN_LSD+FP_LSD))+(TP_LSD/(TP_LSD+FP_LSD)))>0: + AUC_LSD=1/(2*((TN_LSD/(TN_LSD+FP_LSD))+(TP_LSD/(TP_LSD+FP_LSD)))) + print("The AUC result=", AUC_LSD) + + if (TP_LSD+TN_LSD+FP_LSD+FN_LSD)>0: + ACC_LSD=(TP_LSD+TN_LSD)/(TP_LSD+TN_LSD+FP_LSD+FN_LSD) + print("The ACC result=", ACC_LSD) + + if ((TP_LSD+FP_LSD)*(TP_LSD+FN_LSD)*(TN_LSD+FP_LSD)*(TN_LSD+FN_LSD))>0: + MCC_LSD=(TP_LSD*TN_LSD-FP_LSD*FN_LSD)/math.sqrt((TP_LSD+FP_LSD)*(TP_LSD+FN_LSD)*(TN_LSD+FP_LSD)*(TN_LSD+FN_LSD)) + print("The Matthews correlation coefficient result=", MCC_LSD) + print(" ") + print("*****************************************************End of Label Based Semi-supervised Defense(LSD)***************************") + print(" ") + print(" ") + print(" ") + print("*****************************************************Clustering Based Semi-supervised Defense(CSD)*****************************") + print(" ") + + X_train_CSD=X_train.copy() + Y_train_CSD=flipped_Y_train[:] + + X_val_CSD=X_val.copy() + Y_val_CSD=Y_val[:] + row_train_CSD,column_train_CSD=X_train_CSD.get_shape() + + t6=time.time() + + Y_predict_val_from_CNN_Model=model_main.predict(X_val_CSD, verbose=0) + + Y_predict_val_from_CNN_Model_Final=[0]*len(Y_predict_val_from_CNN_Model) + for i in range(len(Y_predict_val_from_CNN_Model)): + if Y_predict_val_from_CNN_Model[i]<0.5: + Y_predict_val_from_CNN_Model[i]=0 + else: + Y_predict_val_from_CNN_Model[i]=1 + for i in range(len(Y_predict_val_from_CNN_Model)): + Y_predict_val_from_CNN_Model_Final[i]= int(Y_predict_val_from_CNN_Model[i]) + + adjusted_rand_score_val=metrics.adjusted_rand_score(Y_val_CSD, Y_predict_val_from_CNN_Model_Final) + adjusted_mutual_info_score_val=metrics.adjusted_mutual_info_score(Y_val_CSD, Y_predict_val_from_CNN_Model_Final) + homogeneity_score_val=metrics.homogeneity_score(Y_val_CSD, Y_predict_val_from_CNN_Model_Final) + fowlkes_mallows_score_val=metrics.fowlkes_mallows_score(Y_val_CSD, Y_predict_val_from_CNN_Model_Final) + + for i in range(20): #row_train + Y_temp=Y_val_CSD.copy() + + row=X_train_CSD.getrow(i) + X_temp = sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((X_val_CSD, row)))) + Y_temp.append(Y_train_CSD[i]) + + Y_predict_CNN_compute_CSD=model_main.predict(X_temp, verbose=0) + + Y_predict_temp=[0]*len(Y_predict_CNN_compute_CSD) + + for n in range(len(Y_predict_CNN_compute_CSD)): + if Y_predict_CNN_compute_CSD[n]<0.5: + Y_predict_CNN_compute_CSD[n]=0 + else: + Y_predict_CNN_compute_CSD[n]=1 + + for m in range(len(Y_predict_CNN_compute_CSD)): + Y_predict_temp[m]= int(Y_predict_CNN_compute_CSD[m]) + + adjusted_rand_score_temp=metrics.adjusted_rand_score(Y_temp, Y_predict_temp) + adjusted_mutual_info_score_temp=metrics.adjusted_mutual_info_score(Y_temp, Y_predict_temp) + homogeneity_score_temp=metrics.homogeneity_score(Y_temp, Y_predict_temp) + fowlkes_mallows_score_temp=metrics.fowlkes_mallows_score(Y_temp, Y_predict_temp) + + landa1=abs(adjusted_rand_score_temp-adjusted_rand_score_val) + landa2=abs(adjusted_mutual_info_score_temp-adjusted_mutual_info_score_val) + landa3=abs(homogeneity_score_temp-homogeneity_score_val) + landa4=abs(fowlkes_mallows_score_temp-fowlkes_mallows_score_val) + + sum_of_diffrences=landa1+landa2+landa3+landa4 + + if sum_of_diffrences<0.1: + X_val_CSD = sp.lil.lil_matrix(sparse.csr_matrix(sparse.vstack((X_val_CSD, row)))) + Y_val_CSD.append(Y_train_CSD[i]) + Y_predict_CNN_inside_CSD=model_main.predict(X_val_CSD, verbose=0) + + Y_predict_CNN_inside_CSD_Final=[0]*len(Y_predict_CNN_inside_CSD) #Y_predict_CNN_inside + for j in range(len(Y_predict_CNN_inside_CSD)): #Y_predict_CNN_inside + if Y_predict_CNN_inside_CSD[j]<0.5: + Y_predict_CNN_inside_CSD[j]=0 + else: + Y_predict_CNN_inside_CSD[j]=1 + + for k in range(len(Y_predict_CNN_inside_CSD)): #Y_predict_CNN_inside + Y_predict_CNN_inside_CSD_Final[k]= int(Y_predict_CNN_inside_CSD[k]) + + adjusted_rand_score_val=metrics.adjusted_rand_score(Y_val_CSD, Y_predict_CNN_inside_CSD_Final) + adjusted_mutual_info_score_val=metrics.adjusted_mutual_info_score(Y_val_CSD, Y_predict_CNN_inside_CSD_Final) + homogeneity_score_val=metrics.homogeneity_score(Y_val_CSD, Y_predict_CNN_inside_CSD_Final) + fowlkes_mallows_score_val=metrics.fowlkes_mallows_score(Y_val_CSD, Y_predict_CNN_inside_CSD_Final) + t7=time.time() + print("Time for Clustering Based Semi-supervised Defense =",t7-t6) + print(" ") + #**************************************************************************************** + X_train_Final_CSD= X_val_CSD.copy() + Y_train_Final_CSD=Y_val_CSD.copy() + row_train_CSD_Final,col_train_CSD_Final=X_train_Final_CSD.get_shape() + + model_main_CSD = Sequential() + model_main_CSD.add(Embedding(row_train_CSD_Final, 8, input_length=col_train_CSD_Final)) + model_main_CSD.add(Conv1D(16,2, strides=2, padding='same')) + model_main_CSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_CSD.add(Conv1D(32,2, strides=2, padding='same')) + model_main_CSD.add(MaxPooling1D(pool_size = (4), strides=(2))) + model_main_CSD.add(Conv1D(64,2, strides=2, padding='same')) + model_main_CSD.add(Flatten()) + model_main_CSD.add(Dense(1, activation='sigmoid')) + model_main_CSD.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) + model_main_CSD.fit(X_train_Final_CSD, Y_train_Final_CSD, epochs=200, verbose=0) + + Y_test_predict_CSD=model_main_CSD.predict(X_test, verbose=0) + + Y_test_predict_CSD_Final=[0]*len(Y_test_predict_CSD) + for i in range(len(Y_test_predict_CSD)): + if Y_test_predict_CSD[i]<0.5: + Y_test_predict_CSD[i]=0 + else: + Y_test_predict_CSD[i]=1 + + for i in range(len(Y_test_predict_CSD)): + Y_test_predict_CSD_Final[i]= int(Y_test_predict_CSD[i]) + + #*****************************************************Result of Model after Clustering Based Semi-supervised Defense(CSD)************** + print("***********************Result of Model after Clustering Based Semi-supervised Defense(CSD)*************************************") + print(" ") + + loss, accuracy = model_main_CSD.evaluate(X_train_Final_CSD, Y_train_Final_CSD, verbose=2) + print('Accuracy for New Train set: %f' % (accuracy*100)) + print('Loss for New Train set: %f' % (loss)) + print(" ") + + loss, accuracy = model_main_CSD.evaluate(X_test, Y_test, verbose=2) + print('Accuracy for Test set: %f' % (accuracy*100)) + print('Loss for Test set:: %f' % (loss)) + print(" ") + + TN_CSD, FP_CSD, FN_CSD, TP_CSD = confusion_matrix(Y_test, Y_test_predict_CSD_Final).ravel() + print("TN_CSD=",TN_CSD) + print("FP_CSD=",FP_CSD) + print("FN_CSD=",FN_CSD) + print("TP_CSD=",TP_CSD) + print(" ") + + if (FP_CSD+TN_CSD)>0: + FPR_CSD=FP_CSD/(FP_CSD+TN_CSD) + print("The FPR_CSD result=", FPR_CSD) + + if (FP_CSD+TN_CSD)>0: + TPR_CSD=TP_CSD/(TP_CSD+FN_CSD) + print("The TPR_CSD result=", TPR_CSD) + + if (TN_CSD+FP_CSD)>0: + TNR_CSD=TN_CSD/(TN_CSD+FP_CSD) + print("The TNR_CSD result=", TNR_CSD) + + if (FN_CSD+TP_CSD)>0: + FNR_CSD=FN_CSD/(FN_CSD+TP_CSD) + print("The FNR_CSD result=", FNR_CSD) + + if ((TN_CSD/(TN_CSD+FP_CSD))+(TP_CSD/(TP_CSD+FP_CSD)))>0: + AUC_CSD=1/(2*((TN_CSD/(TN_CSD+FP_CSD))+(TP_CSD/(TP_CSD+FP_CSD)))) + print("The AUC_CSD result=", AUC_CSD) + + if (TP_CSD+TN_CSD+FP_CSD+FN_CSD)>0: + ACC_CSD=(TP_CSD+TN_CSD)/(TP_CSD+TN_CSD+FP_CSD+FN_CSD) + print("The ACC_CSD result=", ACC_CSD) + + if ((TP_CSD+FP_CSD)*(TP_CSD+FN_CSD)*(TN_CSD+FP_CSD)*(TN_CSD+FN_CSD))>0: + MCC_CSD=(TP_CSD*TN_CSD-FP_CSD*FN_CSD)/math.sqrt((TP_CSD+FP_CSD)*(TP_CSD+FN_CSD)*(TN_CSD+FP_CSD)*(TN_CSD+FN_CSD)) + print("The Matthews correlation coefficient result=", MCC_CSD) + print(" ") + print("************************************************End of Clustering Based Semi-supervised Defense(LSD)***************************") + print(" ") + print(" ") + print(" ") +#****************************************************************************************************************************************** +if __name__ == "__main__": + main() +#****************************************************************************** \ No newline at end of file diff --git a/Taheri2020NCAA-labelflipping_Sourcecode/README.txt b/Taheri2020NCAA-labelflipping_Sourcecode/README.txt index 855f77f6f498508911a43e93fca30e64f6671264..d557dd50551a7e881f5c37d5a5064ffaf7628897 100644 --- a/Taheri2020NCAA-labelflipping_Sourcecode/README.txt +++ b/Taheri2020NCAA-labelflipping_Sourcecode/README.txt @@ -28,5 +28,5 @@ Note: you need to preprocess and clean the dataset before implementation. I will be glad to cite our paper with the following details in your research papers: R. Taheri, R. Javidan, M. Shojafar, Z. Pooranian, A. Miri, M. Conti, "On Defending Against Label Flipping Attacks on Malware Detection Systems", Springer, Neural Computing and Applications (NCAA), Vol. 32, pp. 14781–14800, July 2020. - -DOI: https://doi.org/10.1007/s00521-020-04831-9 + +DOI: https://doi.org/10.1007/s00521-020-04831-9 \ No newline at end of file diff --git a/Taheri2020NCAA-labelflipping_Sourcecode/copyright notice.docx b/Taheri2020NCAA-labelflipping_Sourcecode/copyright notice.docx new file mode 100644 index 0000000000000000000000000000000000000000..e238f77a424bf7fa46148840f9c8485e73d491ab Binary files /dev/null and b/Taheri2020NCAA-labelflipping_Sourcecode/copyright notice.docx differ