From 72a046740f21bb2cda200b185a4fc008dc65bc1b Mon Sep 17 00:00:00 2001
From: mribary <m.ribary@surrey.ac.uk>
Date: Wed, 29 Apr 2020 18:51:04 +0100
Subject: [PATCH] K-means silhouette on tfidf_norm_top50

---
 dump/silhouette_scores_norm_top50.txt   | Bin 0 -> 1706 bytes
 images/norm_top50_silhouette_2to75.png  | Bin 0 -> 7149 bytes
 script/K-means_silhouette_norm_top50.py |  60 ++++++++++++++++++++++++
 3 files changed, 60 insertions(+)
 create mode 100644 dump/silhouette_scores_norm_top50.txt
 create mode 100644 images/norm_top50_silhouette_2to75.png
 create mode 100644 script/K-means_silhouette_norm_top50.py

diff --git a/dump/silhouette_scores_norm_top50.txt b/dump/silhouette_scores_norm_top50.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fb3306087a46f4cddf97b2d76c616c8dcd55e2f5
GIT binary patch
literal 1706
zcmXZadsNPM90%~{iFg)jD5j`vOpI)pq=z51Eqb=bM=qn9B$p_?QxYp(^vTMDYzb+V
zOOJ@uqDC%}OLM=j8!Ot*cFv~b%sQRUG5el<_WR@c<Mn%=&+Yjobqj*dEi^JVA}ZcJ
zbWL=adqnK&n3W;X(IN3RYePd;heX>zFYj%Z#>7X3*}!Unl}@KyHVNHKI+H#f-3iui
zrfv)LN{#5@@9*#TT0Q^$m~=7dxx~k?y~hDd&Tz8^1LF;X%?kYrYahc{tAoq(ukX;%
zi_x3VN7S<rndYKg4YmwB!keNkm4Pkh?Kv9yGWrqvi`K@T&GwzUSA#vnfiOVyxnGiH
z{nPy#1~T3v3=(a2|21S!U7-d?h7)11sJ}g}OYAJrFofYua1nK^8ME-|<$W4l8AAzg
zi+XqzI(YOtq~RULFv4)r+4X$}RNE<imob9iCfagp_NLp8MH)sjMiJf<-4@rg_ILXO
z8b&k55Zpx*jn!lCcPbsrc%SfrsG*>K!?g;FhH;GXgbAW~r^;S9wW_)wGA0r{M7!>l
zm#4ZO)G&$RNti5pp38?7)gRX2#V``4h&uOu)s{?Z-l+_YFin(SHcYN_SJ~+dZ-S4g
zSJu|1#u(M@BgV&sPee=07Y$2`JF3B#F@rEuG{DLyI3+`6{TL>~r=qE)9gVAwDgBJ$
zPxxH4IQvr5^DC;`EXHgCiF$i<+6HH-jpr}|2y;bs`qQs-Jyo}PjQNBGqAC4T<{G*m
z)9?jjAz_i|;?$MN4SR|;1TumM!J;S2yc2?smTLHtv6v7dnzb{#;zg*cyMz%+SSren
zr=26$9oG=XSVmYbI?Bc?c*4UH4J#Nc3E`qSmOh1kC)EtA7^?{pqW06%bHhp1jbyAL
zM2U`g^oPf-0F?zJny^+h@XYc~V}rUw3?r7XPIRM=3=f{DY#d`fAzn18-s$Xxd8+OP
z#zw+d|JSWddueDrq9K8iNZ2I$xOlpGWTVO^F_H->qITyZ)&@4I-BKBw32CC+a)+bC
zPi2{rPS_&)+rZ5iF<)iBW@HdDMW?#xmU=s=J8Wf`3EM=Q``ViFGgXD{j2(n5QA91y
zu+39f-^uuf@U5t}@qEY7Kh-~!&DcfA5v>|>%Q@zz+G00j4`HvUZ&dgVuLPyJjD3VW
zQP;Z3&wMRPzhmSR3Pknojdyoj)OS|M*iSehI_7?4TWg!z;vnM?p-42Y{9akY1U0XP
zahOmnYRn!zv#wfQ{RrbIp+wZqx2>!FvD)Go<2a#IbYZ`!fksx@6O1xKxu~Dl-oc*9
zWg5O`R1i*z7PRMHDRDWW;S{5iP$k-xTs5;fQC;!}#%aPC(Q#>+)6R9NZq<ylgma>?
zraM6sQ&qR~j0=Ps(dc|KKMPlvyvVpjs1@x!UVkEDgxdHr;|k%b=!@NdTgN%63fCBQ
zgdasqsCCz0RjS){#tp(v(c)KAj1{%&zPA{+33o&b@@CE1yj)ed%eY6lFWT_Pp+3w_
z=>x_?!Xr`ZtF`kp^3=vZG3p5oqK7BW=%}2hw2{$7Xci5~8q{u1R{EIngz&RyP0c^A
Qeo0c3wJ=%<Ps7~*1qqOtVgLXD

literal 0
HcmV?d00001

diff --git a/images/norm_top50_silhouette_2to75.png b/images/norm_top50_silhouette_2to75.png
new file mode 100644
index 0000000000000000000000000000000000000000..6af602ded7e88188d14e5b594c03c5ae54003ba7
GIT binary patch
literal 7149
zcmeAS@N?(olHy`uVBq!ia0y~yU<ERn4{)#nsjNF!DuEPBv6E*A2N2Y7q;vrJoCO|{
z#S9E`hd`Jy??R~^P|%{pHKHUqKdq!Zu_%=xH?gE3C%+^oGfAN=wWv5VKhIdtP|rw5
zA)}<Ez)D{qA+MKTl&-JZz0iw+LE78X#WAE}&YK&KybKBg2MiAEf9}kx7#YP{oqzs}
z4$u+?g_nQ$Ks3WWLk30={|o~QkZLes766GJ;&A{`9LA%Pqk%A*5=OHFFb#|b!e}6j
z)()dp<Y+r#w6O#%2}c8AG!RC+2cvx?2VgcB?IVo_!f5vZm<C1zVKfj%`$(gGq|rXo
z@aY~r*p|$01ghVg|EzugQPhngnF%~bevCx{B-+F20HP#>8$gtS8hFG!!H0nf$U86t
sGOBLK$N`cZ)i@eBpmZ>ra)wUs&|%==t_gX3y#N$(p00i_>zopr0B76`$^ZZW

literal 0
HcmV?d00001

diff --git a/script/K-means_silhouette_norm_top50.py b/script/K-means_silhouette_norm_top50.py
new file mode 100644
index 0000000..713687a
--- /dev/null
+++ b/script/K-means_silhouette_norm_top50.py
@@ -0,0 +1,60 @@
+# Import packages
+import pandas as pd 
+from sklearn import cluster
+from sklearn.metrics import silhouette_score
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Load normalized dataframes (only `sf` is used below; `df` and `tf` are read but never referenced in this script)
+df = pd.read_csv('./dump/D_lemmatized_norm.csv', index_col=0)
+sf = pd.read_csv('./dump/tfidf_sections_norm_top50.csv', index_col=0)
+tf = pd.read_csv('./dump/tfidf_titles_norm.csv', index_col=0)
+
+# Extract matrix from dataframe: rows follow the index of `sf`, columns are its data columns
+X = np.array(sf.values)         # Tfidf matrix of shape 340 (sections) x 3868 (terms) - NOTE(review): confirm this shape for the top50 file
+section_IDs = list(sf.index)    # List for section_IDs
+# X.shape
+
+# Generate silhouette scores for 2 to 74 clusters (range(2,75) excludes the end value 75)
+NumberOfClusters=range(2,75)
+silhouette_score_values=list()
+for i in NumberOfClusters:    # i is passed as the first KMeans argument (the number of clusters)
+    classifier=cluster.KMeans(i,init='k-means++', n_init=10, max_iter=300, \
+        tol=0.0001, verbose=0, random_state=None, copy_x=True)    # random_state=None: unseeded, so scores may differ between runs
+    classifier.fit(X)
+    labels = classifier.predict(X)    # cluster label assigned to every row of X
+    score = silhouette_score(X, labels, metric='euclidean', sample_size=None, random_state=None)
+    silhouette_score_values.append(score)    # position k in the list corresponds to NumberOfClusters[k]
+    print('|', end= '')    # progress marker: one bar per fitted model, no newline
+
+# Pickle silhouette scores into a binary file (pickle format, despite the .txt extension)
+import pickle
+with open('./dump/silhouette_scores_norm_top50.txt', 'wb') as fp:
+    pickle.dump(silhouette_score_values, fp)
+
+# Unpickle from the binary file (round-trip of the file written just above; only unpickle files you trust)
+with open('./dump/silhouette_scores_norm_top50.txt', 'rb') as fp:   
+    silhouette_score_values = pickle.load(fp)
+
+# Calculate the optimal number of clusters and its silhouette score (on ties, list.index picks the first, i.e. smallest, cluster count)
+optimal = NumberOfClusters[silhouette_score_values.index(max(silhouette_score_values))]
+print('Optimal number of components: ' + str(optimal) + '\n' + \
+        'Silhouette score: ' + str(max(silhouette_score_values)))
+
+# Plot silhouette scores against the number of clusters and mark the maximum
+plt.plot(NumberOfClusters, silhouette_score_values)
+plt.title('Silhouette score values' + '\n' +     'based on a normalized Tfidf matrix of the top 50 lemmas (tfidf) retained from' + '\n' +     'the 340 thematic sections exceeding 100 unique lemmas in the Digest')
+plt.xlabel('Number of clusters')
+plt.ylabel('Silhouette score')
+plt.legend(['Optimal number of components: ' + str(optimal) + '\n' +     'Silhouette score: ' + str(max(silhouette_score_values))], loc=8)
+plt.tick_params(
+    axis= 'both',       # changes apply to both the x- and the y-axis
+    which='both',       # both major and minor ticks are affected
+    direction='in'      # ticks inside the axis
+    )
+plt.axvline(x=optimal, color='r', alpha=0.5, linestyle=':')
+plt.axhline(y=max(silhouette_score_values), color='r', alpha=0.5, linestyle=':')
+# Save plot BEFORE plt.show(): once the shown window is closed the figure is gone and savefig would write a blank image
+plt.savefig('./images/norm_top50_silhouette_2to75.png', dpi=200)
+plt.show()
+plt.close()
\ No newline at end of file
-- 
GitLab