From e2f3da1792f46ee87e9e1793dc040fdf13c53a2c Mon Sep 17 00:00:00 2001
From: VasilyShcherbinin <vasily.shcherbinin@outlook.com>
Date: Mon, 11 Feb 2019 15:08:55 +0000
Subject: [PATCH] Minor formatting changes

---
 UCS/Problem_Multiplexer.py                  |   2 +-
 UCS/Problem_Parity.py                       |   2 +-
 UCS/UCS_Configuration_File.txt              |   4 +-
 UCS/UCS_DataManagement.py                   |  57 +-----------------
 UCS/UCS_Run.py                              |   2 +-
 .../UCS_DataManagement.cpython-36.pyc       | Bin 12185 -> 10853 bytes
 XCS/Problem_Multiplexer.py                  |   2 +-
 XCS/XCS_Configuration_File.txt              |   4 +-
 XCS/XCS_DataManagement.py                   |  57 +-----------------
 XCS/XCS_Run.py                              |   5 +-
 10 files changed, 12 insertions(+), 123 deletions(-)

diff --git a/UCS/Problem_Multiplexer.py b/UCS/Problem_Multiplexer.py
index b0e0a80..86487cc 100644
--- a/UCS/Problem_Multiplexer.py
+++ b/UCS/Problem_Multiplexer.py
@@ -118,7 +118,7 @@ if __name__ == '__main__':
                 return i
         return None
 
-    bits = 3
+    bits = 11
     instances = 10
 
     generate_complete_multiplexer_data(str(bits)+"Multiplexer_Data_Complete.txt", bits)  # 3,6,11,20,37
diff --git a/UCS/Problem_Parity.py b/UCS/Problem_Parity.py
index 55b0f69..cd7f875 100644
--- a/UCS/Problem_Parity.py
+++ b/UCS/Problem_Parity.py
@@ -113,7 +113,7 @@ if __name__ == '__main__':
             return bits
         return None
 
-    bits = 6
+    bits = 11
     instances = 10
 
     #generate_parity_data(str(bits)+"-"+str(instances)+"Parity_Data.txt", bits, instances)
diff --git a/UCS/UCS_Configuration_File.txt b/UCS/UCS_Configuration_File.txt
index c0b3fc5..640167e 100644
--- a/UCS/UCS_Configuration_File.txt
+++ b/UCS/UCS_Configuration_File.txt
@@ -4,13 +4,13 @@
 ###### Major Run Parameters - Essential to be set correctly for a successful run of the algorithm
 ######--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 datasetDirectory=Demo_Datasets              # Directory/Path of training and testing datasets (assumes they are in the same directory)
-trainFile=6Multiplexer_Data_Complete.txt    # FileName of training dataset
+trainFile=11Parity_Data_Complete.txt        # FileName of training dataset
 testFile=None                               # FileName of testing dataset. If no testing data available or desired, put 'None'.
 outputDirectory=Local_Output                # Output file directory
 outputFile=ExampleRun                       # FileName of output files.
 learningIterations=20000                    # Specify complete algorithm evaluation checkpoints and maximum number of learning iterations (e.g. 1000.2000.5000 = A maximum of 5000 learning iterations with evaluations at 1000, 2000, and 5000 iterations)
 N=1000                                      # Maximum size of the rule population (a.k.a. Micro-classifier population size, where N is the sum of the classifier numerosities in the population)
-p_spec=0.9                                  # The probability of specifying an attribute when covering. (1-p_spec = the probability of adding '#' in ternary rule representations). Greater numbers of attributes in a dataset will require lower values of p_spec.
+p_spec=0.5                                  # The probability of specifying an attribute when covering. (1-p_spec = the probability of adding '#' in ternary rule representations). Greater numbers of attributes in a dataset will require lower values of p_spec.
 kfold=5                                     # if not used, set to 0.
 ######--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
diff --git a/UCS/UCS_DataManagement.py b/UCS/UCS_DataManagement.py
index cf71636..f2e151f 100644
--- a/UCS/UCS_DataManagement.py
+++ b/UCS/UCS_DataManagement.py
@@ -328,36 +328,9 @@ class DataManagement:
             formatted)  # One time randomization of the order the of the phenotypes in the data, so that if the data was ordered by phenotype, this potential learning bias (based on phenotype ordering) is eliminated.
         return formatted
 
-    def splitFolds(self):
+    def splitDataIntoKSets(self):
         """ divide data set into kfold sets. """
         data_size = len(self.trainFormatted)
-        class_counts = [0] * len(self.phenotypeList)
-        for instance in self.trainFormatted:
-            class_counts[self.phenotypeList.index(instance[1])] += 1
-        fold_size = int(data_size / cons.kfold)
-        split_again = True
-        while split_again:
-            split_again = False
-            self.folds = [[] for _ in range(cons.kfold)]
-            start_point = 0
-            for i in range(cons.kfold):
-                end_point = start_point + fold_size
-                if i < data_size % cons.kfold:
-                    end_point += 1
-                self.folds[i] = self.trainFormatted[start_point:end_point]
-                start_point = end_point
-                fold_class_counts = [0] * len(self.phenotypeList)
-                for instance in self.folds[i]:
-                    fold_class_counts[self.phenotypeList.index(instance[1])] += 1
-                for j in range(len(self.phenotypeList)):
-                    if fold_class_counts[j] == class_counts[j]:
-                        random.shuffle(self.trainFormatted)
-                        split_again = True
-
-    def splitFolds2(self):
-        """ divide data set into kfold sets. """
-        self.trainFormatted = stratify(self.trainFormatted)
-        data_size = len(self.trainFormatted)
         self.folds = [[] for _ in range(cons.kfold)]
         for fold_id in range(cons.kfold):
             fold_size = int(data_size / cons.kfold)
@@ -380,31 +353,3 @@ class DataManagement:
         self.numTestphenotypes = len(self.testFormatted)
         print("DataManagement: Number of Instances = " + str(self.numTrainphenotypes))
         print("DataManagement: Number of Instances = " + str(self.numTestphenotypes))
-
-
-def stratify(all_data):
-    """ divide data set into kfold sets. """
-    # sort by class
-    index = 1
-    numb_instances = len(all_data)
-    while index < numb_instances:
-        instance1 = all_data[index - 1]
-        for j in range(index, numb_instances):
-            instance2 = all_data[j]
-            if instance1[1] == instance2[1]:
-                # swap(index, j)
-                temp = all_data[index]
-                all_data[index] = all_data[j]
-                all_data[j] = temp
-                index += 1
-        index += 1
-    # rearrange classes to kfold trunks.
-    stratified_data = []
-    start = 0
-    while len(stratified_data) < numb_instances:
-        j = start
-        while j < numb_instances:
-            stratified_data.append(all_data[j])
-            j += cons.kfold
-        start += 1
-    return stratified_data
diff --git a/UCS/UCS_Run.py b/UCS/UCS_Run.py
index e1ba168..c3522ee 100644
--- a/UCS/UCS_Run.py
+++ b/UCS/UCS_Run.py
@@ -66,7 +66,7 @@ if __name__ == '__main__':
     t0 = time.clock()
     if cons.kfold > 0:
         total_instances = env.formatData.numTrainphenotypes
-        env.formatData.splitFolds2()
+        env.formatData.splitDataIntoKSets()
         accurate_numbs = [0.0] * cons.kfold
         for i in range(cons.kfold):
             env.formatData.selectTrainTestSets(i)
diff --git a/UCS/__pycache__/UCS_DataManagement.cpython-36.pyc b/UCS/__pycache__/UCS_DataManagement.cpython-36.pyc
index 90e92a9f349e96bd7db0f2eedbcd5b1fc0463cb0..090d386c4a3ac90637f80a54319103df117ded60 100644
Binary files a/UCS/__pycache__/UCS_DataManagement.cpython-36.pyc and b/UCS/__pycache__/UCS_DataManagement.cpython-36.pyc differ
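For reference, the retained fold-splitting routine (renamed from splitFolds2 to splitDataIntoKSets in both UCS and XCS) partitions the formatted training data into k folds. A minimal standalone sketch of that kind of contiguous partition follows; the names split_into_k_folds, data, and k are illustrative only, not the module's API, and the body of splitDataIntoKSets beyond the context lines quoted above is assumed rather than copied from the source:

import random

def split_into_k_folds(data, k):
    """Partition data into k folds: each fold gets floor(n/k) instances,
    and the first n % k folds take one extra so every instance is used."""
    random.shuffle(data)  # one-time shuffle to reduce ordering bias
    n = len(data)
    folds, start = [], 0
    for i in range(k):
        end = start + n // k + (1 if i < n % k else 0)
        folds.append(data[start:end])
        start = end
    return folds

With n = 103 and k = 5, this yields fold sizes 21, 21, 21, 20, 20; the removed splitFolds used the same size arithmetic but re-shuffled whenever a fold absorbed an entire class.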
diff --git a/XCS/Problem_Multiplexer.py b/XCS/Problem_Multiplexer.py
index a97a9ab..c0ea963 100644
--- a/XCS/Problem_Multiplexer.py
+++ b/XCS/Problem_Multiplexer.py
@@ -119,7 +119,7 @@ if __name__ == '__main__':
         return None
 
-    bits = 6
+    bits = 11
     instances = 10
 
     generate_complete_multiplexer_data(str(bits) + "Multiplexer_Data_Complete.txt", bits)  # 3,6,11,20,37
diff --git a/XCS/XCS_Configuration_File.txt b/XCS/XCS_Configuration_File.txt
index f5dcc65..49cf4e8 100644
--- a/XCS/XCS_Configuration_File.txt
+++ b/XCS/XCS_Configuration_File.txt
@@ -4,14 +4,14 @@
 ###### Major Run Parameters - Essential to be set correctly for a successful run of the algorithm
 ######--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 datasetDirectory=Demo_Datasets              # Directory/Path of training and testing datasets (assumes they are in the same directory)
-trainFile=6Multiplexer_Data_Complete.txt    # Path/FileName of training dataset
+trainFile=11Multiplexer_Data_Complete.txt   # Path/FileName of training dataset
 testFile=None                               # Path/FileName of testing dataset. If no testing data available or desired, put 'None'.
 outputDirectory=Local_Output                # Path/NewName for new algorithm output files. Note: Do not give a file extension, this is done automatically.
 outputFile=ExampleRun                       # FileName of output files.
 learningIterations=20000                    # Specify complete algorithm evaluation checkpoints and maximum number of learning iterations (e.g. 1000.2000.5000 = A maximum of 5000 learning iterations with evaluations at 1000, 2000, and 5000 iterations)
 N=1000                                      # Maximum size of the rule population (a.k.a. Micro-classifier population size, where N is the sum of the classifier numerosities in the population)
 p_spec=0.5                                  # The probability of specifying an attribute when covering. (1-p_spec = the probability of adding '#' in ternary rule representations). Greater numbers of attributes in a dataset will require lower values of p_spec.
-kfold=5                                     # if not used, set to 0.
+kfold=10                                    # if not used, set to 0.
 ######--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 
 ###### Logistical Run Parameters
diff --git a/XCS/XCS_DataManagement.py b/XCS/XCS_DataManagement.py
index 44d08d3..cfffdcd 100644
--- a/XCS/XCS_DataManagement.py
+++ b/XCS/XCS_DataManagement.py
@@ -293,36 +293,9 @@ class DataManagement:
         # random.shuffle(formatted)  # One time randomization of the order the of the instances in the data, so that if the data was ordered by phenotype, this potential learning bias (based on instance ordering) is eliminated.
         return formatted
 
-    def splitFolds(self):
+    def splitDataIntoKSets(self):
         """ divide data set into kfold sets. """
         data_size = len(self.trainFormatted)
-        class_counts = [0] * len(self.phenotypeList)
-        for instance in self.trainFormatted:
-            class_counts[self.phenotypeList.index(instance[1])] += 1
-        fold_size = int(data_size / cons.kfold)
-        split_again = True
-        while split_again:
-            split_again = False
-            self.folds = [[] for _ in range(cons.kfold)]
-            start_point = 0
-            for i in range(cons.kfold):
-                end_point = start_point + fold_size
-                if i < data_size % cons.kfold:
-                    end_point += 1
-                self.folds[i] = self.trainFormatted[start_point:end_point]
-                start_point = end_point
-                fold_class_counts = [0] * len(self.phenotypeList)
-                for instance in self.folds[i]:
-                    fold_class_counts[self.phenotypeList.index(instance[1])] += 1
-                for j in range(len(self.phenotypeList)):
-                    if fold_class_counts[j] == class_counts[j]:
-                        random.shuffle(self.trainFormatted)
-                        split_again = True
-
-    def splitFolds2(self):
-        """ divide data set into kfold sets. """
-        self.trainFormatted = stratify(self.trainFormatted)
-        data_size = len(self.trainFormatted)
         self.folds = [[] for _ in range(cons.kfold)]
         for fold_id in range(cons.kfold):
             fold_size = int(data_size / cons.kfold)
@@ -345,31 +318,3 @@ class DataManagement:
         self.numTestphenotypes = len(self.formatted_test_data)
         print("DataManagement: Number of Instances = " + str(self.numTrainphenotypes))
         print("DataManagement: Number of Instances = " + str(self.numTestphenotypes))
-
-
-def stratify(all_data):
-    """ divide data set into kfold sets. """
-    # sort by class
-    index = 1
-    numb_instances = len(all_data)
-    while index < numb_instances:
-        instance1 = all_data[index - 1]
-        for j in range(index, numb_instances):
-            instance2 = all_data[j]
-            if instance1[1] == instance2[1]:
-                # swap(index, j)
-                temp = all_data[index]
-                all_data[index] = all_data[j]
-                all_data[j] = temp
-                index += 1
-        index += 1
-    # rearrange classes to kfold trunks.
-    stratified_data = []
-    start = 0
-    while len(stratified_data) < numb_instances:
-        j = start
-        while j < numb_instances:
-            stratified_data.append(all_data[j])
-            j += cons.kfold
-        start += 1
-    return stratified_data
diff --git a/XCS/XCS_Run.py b/XCS/XCS_Run.py
index e4ac230..8fa5de6 100644
--- a/XCS/XCS_Run.py
+++ b/XCS/XCS_Run.py
@@ -52,14 +52,13 @@ if __name__ == '__main__':
     t0 = time.clock()
    if cons.kfold > 0:
         total_instances = env.format_data.numTrainphenotypes
-        env.format_data.splitFolds2()
+        env.format_data.splitDataIntoKSets()
         accurate_numbs = [0.0] * cons.kfold
         for i in range(cons.kfold):
             env.format_data.selectTrainTestSets(i)
             cons.parseIterations()  # Identify the maximum number of learning iterations as well as evaluation checkpoints.
             XCS().run_XCS()
             accuracy = XCS.standard_accuracy
-            # accuracy = XCS().run_XCS()[0]
             accurate_numbs[i] = accuracy * env.format_data.numTestphenotypes
         print("AVERAGE ACCURACY AFTER " + str(cons.kfold) + "-FOLD CROSS VALIDATION is " + str(
             sum(accurate_numbs) / total_instances))
@@ -68,4 +67,4 @@ if __name__ == '__main__':
         XCS().run_XCS()
     t1 = time.clock()
     total = t1 - t0
-    print("Run time in seconds: %.2f" % round(total, 2))
+    print("Total run time in seconds: %.2f" % round(total, 2))
-- 
GitLab
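The cross-validation loop in XCS_Run.py (and its UCS counterpart) weights each fold's accuracy by its test-set size before averaging: accurate_numbs[i] = accuracy * numTestphenotypes, then sum(accurate_numbs) / total_instances. A small self-contained sketch of that aggregation is given below; the names weighted_cv_accuracy, fold_accuracies, and fold_sizes are illustrative and not part of the run scripts:

def weighted_cv_accuracy(fold_accuracies, fold_sizes):
    """Micro-averaged accuracy: weight each fold's accuracy by its
    instance count, then divide by the total number of instances."""
    correct = sum(acc * n for acc, n in zip(fold_accuracies, fold_sizes))
    return correct / sum(fold_sizes)

# e.g. five folds of 20 instances each:
print(weighted_cv_accuracy([0.90, 0.80, 0.85, 0.95, 0.90], [20] * 5))  # 0.88

Weighting by fold size matters when the folds are uneven (as happens whenever the instance count is not divisible by kfold); with equal folds it reduces to the plain mean of the per-fold accuracies.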