From e2f3da1792f46ee87e9e1793dc040fdf13c53a2c Mon Sep 17 00:00:00 2001
From: VasilyShcherbinin <vasily.shcherbinin@outlook.com>
Date: Mon, 11 Feb 2019 15:08:55 +0000
Subject: [PATCH] Minor formatting changes

---
 UCS/Problem_Multiplexer.py                  |   2 +-
 UCS/Problem_Parity.py                       |   2 +-
 UCS/UCS_Configuration_File.txt              |   4 +-
 UCS/UCS_DataManagement.py                   |  57 +-----------------
 UCS/UCS_Run.py                              |   2 +-
 .../UCS_DataManagement.cpython-36.pyc       | Bin 12185 -> 10853 bytes
 XCS/Problem_Multiplexer.py                  |   2 +-
 XCS/XCS_Configuration_File.txt              |   4 +-
 XCS/XCS_DataManagement.py                   |  57 +-----------------
 XCS/XCS_Run.py                              |   5 +-
 10 files changed, 12 insertions(+), 123 deletions(-)

diff --git a/UCS/Problem_Multiplexer.py b/UCS/Problem_Multiplexer.py
index b0e0a80..86487cc 100644
--- a/UCS/Problem_Multiplexer.py
+++ b/UCS/Problem_Multiplexer.py
@@ -118,7 +118,7 @@ if __name__ == '__main__':
                 return i
         return None
 
-    bits = 3
+    bits = 11
     instances = 10
 
     generate_complete_multiplexer_data(str(bits)+"Multiplexer_Data_Complete.txt", bits)  # 3,6,11,20,37
diff --git a/UCS/Problem_Parity.py b/UCS/Problem_Parity.py
index 55b0f69..cd7f875 100644
--- a/UCS/Problem_Parity.py
+++ b/UCS/Problem_Parity.py
@@ -113,7 +113,7 @@ if __name__ == '__main__':
             return bits
         return None
 
-    bits = 6
+    bits = 11
     instances = 10
 
     #generate_parity_data(str(bits)+"-"+str(instances)+"Parity_Data.txt", bits, instances)
diff --git a/UCS/UCS_Configuration_File.txt b/UCS/UCS_Configuration_File.txt
index c0b3fc5..640167e 100644
--- a/UCS/UCS_Configuration_File.txt
+++ b/UCS/UCS_Configuration_File.txt
@@ -4,13 +4,13 @@
 ###### Major Run Parameters - Essential to be set correctly for a successful run of the algorithm
 ######--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 datasetDirectory=Demo_Datasets              # Directory/Path of training and testing datasets (assumes they are in the same directory)
-trainFile=6Multiplexer_Data_Complete.txt    # FileName of training dataset
+trainFile=11Parity_Data_Complete.txt        # FileName of training dataset
 testFile=None                               # FileName of testing dataset. If no testing data available or desired, put 'None'.
 outputDirectory=Local_Output                # Output file directory
 outputFile=ExampleRun                       # FileName of output files.
 learningIterations=20000                    # Specify complete algorithm evaluation checkpoints and maximum number of learning iterations (e.g. 1000.2000.5000 = A maximum of 5000 learning iterations with evaluations at 1000, 2000, and 5000 iterations)
 N=1000                                      # Maximum size of the rule population (a.k.a. Micro-classifier population size, where N is the sum of the classifier numerosities in the population)
-p_spec=0.9                                  # The probability of specifying an attribute when covering. (1-p_spec = the probability of adding '#' in ternary rule representations). Greater numbers of attributes in a dataset will require lower values of p_spec.
+p_spec=0.5                                  # The probability of specifying an attribute when covering. (1-p_spec = the probability of adding '#' in ternary rule representations). Greater numbers of attributes in a dataset will require lower values of p_spec.
 kfold=5                                     # if not used, set to 0.
 ######--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
diff --git a/UCS/UCS_DataManagement.py b/UCS/UCS_DataManagement.py
index cf71636..f2e151f 100644
--- a/UCS/UCS_DataManagement.py
+++ b/UCS/UCS_DataManagement.py
@@ -328,36 +328,9 @@ class DataManagement:
             formatted)  # One time randomization of the order the of the phenotypes in the data, so that if the data was ordered by phenotype, this potential learning bias (based on phenotype ordering) is eliminated.
         return formatted
 
-    def splitFolds(self):
+    def splitDataIntoKSets(self):
         """ divide data set into kfold sets. """
         data_size = len(self.trainFormatted)
-        class_counts = [0] * len(self.phenotypeList)
-        for instance in self.trainFormatted:
-            class_counts[self.phenotypeList.index(instance[1])] += 1
-        fold_size = int(data_size / cons.kfold)
-        split_again = True
-        while split_again:
-            split_again = False
-            self.folds = [[] for _ in range(cons.kfold)]
-            start_point = 0
-            for i in range(cons.kfold):
-                end_point = start_point + fold_size
-                if i < data_size % cons.kfold:
-                    end_point += 1
-                self.folds[i] = self.trainFormatted[start_point:end_point]
-                start_point = end_point
-                fold_class_counts = [0] * len(self.phenotypeList)
-                for instance in self.folds[i]:
-                    fold_class_counts[self.phenotypeList.index(instance[1])] += 1
-                for j in range(len(self.phenotypeList)):
-                    if fold_class_counts[j] == class_counts[j]:
-                        random.shuffle(self.trainFormatted)
-                        split_again = True
-
-    def splitFolds2(self):
-        """ divide data set into kfold sets. """
-        self.trainFormatted = stratify(self.trainFormatted)
-        data_size = len(self.trainFormatted)
         self.folds = [[] for _ in range(cons.kfold)]
         for fold_id in range(cons.kfold):
             fold_size = int(data_size / cons.kfold)
@@ -380,31 +353,3 @@ class DataManagement:
         self.numTestphenotypes = len(self.testFormatted)
         print("DataManagement: Number of Instances = " + str(self.numTrainphenotypes))
         print("DataManagement: Number of Instances = " + str(self.numTestphenotypes))
-
-
-def stratify(all_data):
-    """ divide data set into kfold sets. """
-    # sort by class
-    index = 1
-    numb_instances = len(all_data)
-    while index < numb_instances:
-        instance1 = all_data[index - 1]
-        for j in range(index, numb_instances):
-            instance2 = all_data[j]
-            if instance1[1] == instance2[1]:
-                # swap(index, j)
-                temp = all_data[index]
-                all_data[index] = all_data[j]
-                all_data[j] = temp
-                index += 1
-        index += 1
-    # rearrange classes to kfold trunks.
-    stratified_data = []
-    start = 0
-    while len(stratified_data) < numb_instances:
-        j = start
-        while j < numb_instances:
-            stratified_data.append(all_data[j])
-            j += cons.kfold
-        start += 1
-    return stratified_data
diff --git a/UCS/UCS_Run.py b/UCS/UCS_Run.py
index e1ba168..c3522ee 100644
--- a/UCS/UCS_Run.py
+++ b/UCS/UCS_Run.py
@@ -66,7 +66,7 @@ if __name__ == '__main__':
     t0 = time.clock()
     if cons.kfold > 0:
         total_instances = env.formatData.numTrainphenotypes
-        env.formatData.splitFolds2()
+        env.formatData.splitDataIntoKSets()
         accurate_numbs = [0.0] * cons.kfold
         for i in range(cons.kfold):
             env.formatData.selectTrainTestSets(i)
diff --git a/UCS/__pycache__/UCS_DataManagement.cpython-36.pyc b/UCS/__pycache__/UCS_DataManagement.cpython-36.pyc
index 90e92a9f349e96bd7db0f2eedbcd5b1fc0463cb0..090d386c4a3ac90637f80a54319103df117ded60 100644
Binary files a/UCS/__pycache__/UCS_DataManagement.cpython-36.pyc and b/UCS/__pycache__/UCS_DataManagement.cpython-36.pyc differ
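For reference, the retained fold-splitting routine (renamed from splitFolds2 to splitDataIntoKSets in both UCS and XCS) partitions the formatted training data into k folds. A minimal standalone sketch of that kind of contiguous partition follows; the names split_into_k_folds, data, and k are illustrative only, not the module's API, and the body of splitDataIntoKSets beyond the context lines quoted above is assumed rather than copied from the source:

import random

def split_into_k_folds(data, k):
    """Partition data into k folds: each fold gets floor(n/k) instances,
    and the first n % k folds take one extra so every instance is used."""
    random.shuffle(data)  # one-time shuffle to reduce ordering bias
    n = len(data)
    folds, start = [], 0
    for i in range(k):
        end = start + n // k + (1 if i < n % k else 0)
        folds.append(data[start:end])
        start = end
    return folds

With n = 103 and k = 5, this yields fold sizes 21, 21, 21, 20, 20; the removed splitFolds used the same size arithmetic but re-shuffled whenever a fold absorbed an entire class.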
diff --git a/XCS/Problem_Multiplexer.py b/XCS/Problem_Multiplexer.py
index a97a9ab..c0ea963 100644
--- a/XCS/Problem_Multiplexer.py
+++ b/XCS/Problem_Multiplexer.py
@@ -119,7 +119,7 @@ if __name__ == '__main__':
         return None
 
-    bits = 6
+    bits = 11
     instances = 10
 
     generate_complete_multiplexer_data(str(bits) + "Multiplexer_Data_Complete.txt", bits)  # 3,6,11,20,37
diff --git a/XCS/XCS_Configuration_File.txt b/XCS/XCS_Configuration_File.txt
index f5dcc65..49cf4e8 100644
--- a/XCS/XCS_Configuration_File.txt
+++ b/XCS/XCS_Configuration_File.txt
@@ -4,14 +4,14 @@
 ###### Major Run Parameters - Essential to be set correctly for a successful run of the algorithm
 ######--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 datasetDirectory=Demo_Datasets              # Directory/Path of training and testing datasets (assumes they are in the same directory)
-trainFile=6Multiplexer_Data_Complete.txt    # Path/FileName of training dataset
+trainFile=11Multiplexer_Data_Complete.txt   # Path/FileName of training dataset
 testFile=None                               # Path/FileName of testing dataset. If no testing data available or desired, put 'None'.
 outputDirectory=Local_Output                # Path/NewName for new algorithm output files. Note: Do not give a file extension, this is done automatically.
 outputFile=ExampleRun                       # FileName of output files.
 learningIterations=20000                    # Specify complete algorithm evaluation checkpoints and maximum number of learning iterations (e.g. 1000.2000.5000 = A maximum of 5000 learning iterations with evaluations at 1000, 2000, and 5000 iterations)
 N=1000                                      # Maximum size of the rule population (a.k.a. Micro-classifier population size, where N is the sum of the classifier numerosities in the population)
 p_spec=0.5                                  # The probability of specifying an attribute when covering. (1-p_spec = the probability of adding '#' in ternary rule representations). Greater numbers of attributes in a dataset will require lower values of p_spec.
-kfold=5                                     # if not used, set to 0.
+kfold=10                                    # if not used, set to 0.
 ######--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 
 ###### Logistical Run Parameters
diff --git a/XCS/XCS_DataManagement.py b/XCS/XCS_DataManagement.py
index 44d08d3..cfffdcd 100644
--- a/XCS/XCS_DataManagement.py
+++ b/XCS/XCS_DataManagement.py
@@ -293,36 +293,9 @@ class DataManagement:
         # random.shuffle(formatted)  # One time randomization of the order the of the instances in the data, so that if the data was ordered by phenotype, this potential learning bias (based on instance ordering) is eliminated.
         return formatted
 
-    def splitFolds(self):
+    def splitDataIntoKSets(self):
         """ divide data set into kfold sets. """
         data_size = len(self.trainFormatted)
-        class_counts = [0] * len(self.phenotypeList)
-        for instance in self.trainFormatted:
-            class_counts[self.phenotypeList.index(instance[1])] += 1
-        fold_size = int(data_size / cons.kfold)
-        split_again = True
-        while split_again:
-            split_again = False
-            self.folds = [[] for _ in range(cons.kfold)]
-            start_point = 0
-            for i in range(cons.kfold):
-                end_point = start_point + fold_size
-                if i < data_size % cons.kfold:
-                    end_point += 1
-                self.folds[i] = self.trainFormatted[start_point:end_point]
-                start_point = end_point
-                fold_class_counts = [0] * len(self.phenotypeList)
-                for instance in self.folds[i]:
-                    fold_class_counts[self.phenotypeList.index(instance[1])] += 1
-                for j in range(len(self.phenotypeList)):
-                    if fold_class_counts[j] == class_counts[j]:
-                        random.shuffle(self.trainFormatted)
-                        split_again = True
-
-    def splitFolds2(self):
-        """ divide data set into kfold sets. """
-        self.trainFormatted = stratify(self.trainFormatted)
-        data_size = len(self.trainFormatted)
         self.folds = [[] for _ in range(cons.kfold)]
         for fold_id in range(cons.kfold):
             fold_size = int(data_size / cons.kfold)
@@ -345,31 +318,3 @@ class DataManagement:
         self.numTestphenotypes = len(self.formatted_test_data)
         print("DataManagement: Number of Instances = " + str(self.numTrainphenotypes))
         print("DataManagement: Number of Instances = " + str(self.numTestphenotypes))
-
-
-def stratify(all_data):
-    """ divide data set into kfold sets. """
-    # sort by class
-    index = 1
-    numb_instances = len(all_data)
-    while index < numb_instances:
-        instance1 = all_data[index - 1]
-        for j in range(index, numb_instances):
-            instance2 = all_data[j]
-            if instance1[1] == instance2[1]:
-                # swap(index, j)
-                temp = all_data[index]
-                all_data[index] = all_data[j]
-                all_data[j] = temp
-                index += 1
-        index += 1
-    # rearrange classes to kfold trunks.
-    stratified_data = []
-    start = 0
-    while len(stratified_data) < numb_instances:
-        j = start
-        while j < numb_instances:
-            stratified_data.append(all_data[j])
-            j += cons.kfold
-        start += 1
-    return stratified_data
diff --git a/XCS/XCS_Run.py b/XCS/XCS_Run.py
index e4ac230..8fa5de6 100644
--- a/XCS/XCS_Run.py
+++ b/XCS/XCS_Run.py
@@ -52,14 +52,13 @@ if __name__ == '__main__':
     t0 = time.clock()
    if cons.kfold > 0:
         total_instances = env.format_data.numTrainphenotypes
-        env.format_data.splitFolds2()
+        env.format_data.splitDataIntoKSets()
         accurate_numbs = [0.0] * cons.kfold
         for i in range(cons.kfold):
             env.format_data.selectTrainTestSets(i)
             cons.parseIterations()  # Identify the maximum number of learning iterations as well as evaluation checkpoints.
             XCS().run_XCS()
             accuracy = XCS.standard_accuracy
-            # accuracy = XCS().run_XCS()[0]
             accurate_numbs[i] = accuracy * env.format_data.numTestphenotypes
         print("AVERAGE ACCURACY AFTER " + str(cons.kfold) + "-FOLD CROSS VALIDATION is " + str(
             sum(accurate_numbs) / total_instances))
@@ -68,4 +67,4 @@ if __name__ == '__main__':
         XCS().run_XCS()
     t1 = time.clock()
     total = t1 - t0
-    print("Run time in seconds: %.2f" % round(total, 2))
+    print("Total run time in seconds: %.2f" % round(total, 2))
-- 
GitLab
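The cross-validation loop in XCS_Run.py (and its UCS counterpart) weights each fold's accuracy by its test-set size before averaging: accurate_numbs[i] = accuracy * numTestphenotypes, then sum(accurate_numbs) / total_instances. A small self-contained sketch of that aggregation is given below; the names weighted_cv_accuracy, fold_accuracies, and fold_sizes are illustrative and not part of the run scripts:

def weighted_cv_accuracy(fold_accuracies, fold_sizes):
    """Micro-averaged accuracy: weight each fold's accuracy by its
    instance count, then divide by the total number of instances."""
    correct = sum(acc * n for acc, n in zip(fold_accuracies, fold_sizes))
    return correct / sum(fold_sizes)

# e.g. five folds of 20 instances each:
print(weighted_cv_accuracy([0.90, 0.80, 0.85, 0.95, 0.90], [20] * 5))  # 0.88

Weighting by fold size matters when the folds are uneven (as happens whenever the instance count is not divisible by kfold); with equal folds it reduces to the plain mean of the per-fold accuracies.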