From 72a046740f21bb2cda200b185a4fc008dc65bc1b Mon Sep 17 00:00:00 2001 From: mribary <m.ribary@surrey.ac.uk> Date: Wed, 29 Apr 2020 18:51:04 +0100 Subject: [PATCH] K-means silhouette on tfidf_norm_top50 --- dump/silhouette_scores_norm_top50.txt | Bin 0 -> 1706 bytes images/norm_top50_silhouette_2to75.png | Bin 0 -> 7149 bytes script/K-means_silhouette_norm_top50.py | 60 ++++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 dump/silhouette_scores_norm_top50.txt create mode 100644 images/norm_top50_silhouette_2to75.png create mode 100644 script/K-means_silhouette_norm_top50.py diff --git a/dump/silhouette_scores_norm_top50.txt b/dump/silhouette_scores_norm_top50.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb3306087a46f4cddf97b2d76c616c8dcd55e2f5 GIT binary patch literal 1706 zcmXZadsNPM90%~{iFg)jD5j`vOpI)pq=z51Eqb=bM=qn9B$p_?QxYp(^vTMDYzb+V zOOJ@uqDC%}OLM=j8!Ot*cFv~b%sQRUG5el<_WR@c<Mn%=&+Yjobqj*dEi^JVA}ZcJ zbWL=adqnK&n3W;X(IN3RYePd;heX>zFYj%Z#>7X3*}!Unl}@KyHVNHKI+H#f-3iui zrfv)LN{#5@@9*#TT0Q^$m~=7dxx~k?y~hDd&Tz8^1LF;X%?kYrYahc{tAoq(ukX;% zi_x3VN7S<rndYKg4YmwB!keNkm4Pkh?Kv9yGWrqvi`K@T&GwzUSA#vnfiOVyxnGiH z{nPy#1~T3v3=(a2|21S!U7-d?h7)11sJ}g}OYAJrFofYua1nK^8ME-|<$W4l8AAzg zi+XqzI(YOtq~RULFv4)r+4X$}RNE<imob9iCfagp_NLp8MH)sjMiJf<-4@rg_ILXO z8b&k55Zpx*jn!lCcPbsrc%SfrsG*>K!?g;FhH;GXgbAW~r^;S9wW_)wGA0r{M7!>l zm#4ZO)G&$RNti5pp38?7)gRX2#V``4h&uOu)s{?Z-l+_YFin(SHcYN_SJ~+dZ-S4g zSJu|1#u(M@BgV&sPee=07Y$2`JF3B#F@rEuG{DLyI3+`6{TL>~r=qE)9gVAwDgBJ$ zPxxH4IQvr5^DC;`EXHgCiF$i<+6HH-jpr}|2y;bs`qQs-Jyo}PjQNBGqAC4T<{G*m z)9?jjAz_i|;?$MN4SR|;1TumM!J;S2yc2?smTLHtv6v7dnzb{#;zg*cyMz%+SSren zr=26$9oG=XSVmYbI?Bc?c*4UH4J#Nc3E`qSmOh1kC)EtA7^?{pqW06%bHhp1jbyAL zM2U`g^oPf-0F?zJny^+h@XYc~V}rUw3?r7XPIRM=3=f{DY#d`fAzn18-s$Xxd8+OP z#zw+d|JSWddueDrq9K8iNZ2I$xOlpGWTVO^F_H->qITyZ)&@4I-BKBw32CC+a)+bC zPi2{rPS_&)+rZ5iF<)iBW@HdDMW?#xmU=s=J8Wf`3EM=Q``ViFGgXD{j2(n5QA91y zu+39f-^uuf@U5t}@qEY7Kh-~!&DcfA5v>|>%Q@zz+G00j4`HvUZ&dgVuLPyJjD3VW 
zQP;Z3&wMRPzhmSR3Pknojdyoj)OS|M*iSehI_7?4TWg!z;vnM?p-42Y{9akY1U0XP zahOmnYRn!zv#wfQ{RrbIp+wZqx2>!FvD)Go<2a#IbYZ`!fksx@6O1xKxu~Dl-oc*9 zWg5O`R1i*z7PRMHDRDWW;S{5iP$k-xTs5;fQC;!}#%aPC(Q#>+)6R9NZq<ylgma>? zraM6sQ&qR~j0=Ps(dc|KKMPlvyvVpjs1@x!UVkEDgxdHr;|k%b=!@NdTgN%63fCBQ zgdasqsCCz0RjS){#tp(v(c)KAj1{%&zPA{+33o&b@@CE1yj)ed%eY6lFWT_Pp+3w_ z=>x_?!Xr`ZtF`kp^3=vZG3p5oqK7BW=%}2hw2{$7Xci5~8q{u1R{EIngz&RyP0c^A Qeo0c3wJ=%<Ps7~*1qqOtVgLXD literal 0 HcmV?d00001 diff --git a/images/norm_top50_silhouette_2to75.png b/images/norm_top50_silhouette_2to75.png new file mode 100644 index 0000000000000000000000000000000000000000..6af602ded7e88188d14e5b594c03c5ae54003ba7 GIT binary patch literal 7149 zcmeAS@N?(olHy`uVBq!ia0y~yU<ERn4{)#nsjNF!DuEPBv6E*A2N2Y7q;vrJoCO|{ z#S9E`hd`Jy??R~^P|%{pHKHUqKdq!Zu_%=xH?gE3C%+^oGfAN=wWv5VKhIdtP|rw5 zA)}<Ez)D{qA+MKTl&-JZz0iw+LE78X#WAE}&YK&KybKBg2MiAEf9}kx7#YP{oqzs} z4$u+?g_nQ$Ks3WWLk30={|o~QkZLes766GJ;&A{`9LA%Pqk%A*5=OHFFb#|b!e}6j z)()dp<Y+r#w6O#%2}c8AG!RC+2cvx?2VgcB?IVo_!f5vZm<C1zVKfj%`$(gGq|rXo z@aY~r*p|$01ghVg|EzugQPhngnF%~bevCx{B-+F20HP#>8$gtS8hFG!!H0nf$U86t sGOBLK$N`cZ)i@eBpmZ>ra)wUs&|%==t_gX3y#N$(p00i_>zopr0B76`$^ZZW literal 0 HcmV?d00001 diff --git a/script/K-means_silhouette_norm_top50.py b/script/K-means_silhouette_norm_top50.py new file mode 100644 index 0000000..713687a --- /dev/null +++ b/script/K-means_silhouette_norm_top50.py @@ -0,0 +1,60 @@ +# Import packages +import pandas as pd +from sklearn import cluster +from sklearn.metrics import silhouette_score +import numpy as np +import matplotlib.pyplot as plt + +# Load normalized dataframes +df = pd.read_csv('./dump/D_lemmatized_norm.csv', index_col=0) +sf = pd.read_csv('./dump/tfidf_sections_norm_top50.csv', index_col=0) +tf = pd.read_csv('./dump/tfidf_titles_norm.csv', index_col=0) + +# Extract matrix from dataframe +X = np.array(sf.values) # Tfidf matrix of shape 340 (sections) x 3868 (terms) +section_IDs = list(sf.index) # List for section_IDs +# X.shape + +# Generate silhouette scores for 
# Generate silhouette scores for every cluster count from 2 to 75 (inclusive),
# persist them, report the best-scoring count and plot the score curve.
import pickle

MIN_CLUSTERS = 2
MAX_CLUSTERS = 75  # inclusive; range() excludes its stop, hence the "+ 1" below
# NOTE: despite the .txt suffix this file is a binary pickle, not text.
SCORES_PATH = './dump/silhouette_scores_norm_top50.txt'
PLOT_PATH = './images/norm_top50_silhouette_2to75.png'


def compute_silhouette_scores(matrix, cluster_counts):
    """Fit K-means once per cluster count and return the silhouette scores.

    Args:
        matrix: feature matrix handed unchanged to ``KMeans`` and
            ``silhouette_score``.
        cluster_counts: iterable of cluster counts to try.

    Returns:
        A list with one silhouette score per entry of ``cluster_counts``,
        in the same order.
    """
    scores = []
    for n_clusters in cluster_counts:
        # random_state=None: every run starts from different centroids, so
        # the scores are not reproducible between runs.
        classifier = cluster.KMeans(
            n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
            tol=0.0001, verbose=0, random_state=None, copy_x=True)
        classifier.fit(matrix)
        labels = classifier.predict(matrix)
        scores.append(silhouette_score(
            matrix, labels, metric='euclidean', sample_size=None,
            random_state=None))
        # Progress marker; flushed explicitly because no newline is written,
        # so a line-buffered stdout would otherwise show nothing until the end.
        print('|', end='', flush=True)
    print()  # terminate the progress bar line
    return scores


def find_optimal_cluster_count(cluster_counts, scores):
    """Return ``(count, score)`` for the highest score.

    When several counts share the highest score, the first one wins.

    Raises:
        ValueError: if ``scores`` is empty or its length differs from
            ``cluster_counts``.
    """
    cluster_counts = list(cluster_counts)
    scores = list(scores)
    if not scores:
        raise ValueError('no silhouette scores to evaluate')
    if len(cluster_counts) != len(scores):
        raise ValueError('cluster_counts and scores differ in length')
    best_score = max(scores)
    return cluster_counts[scores.index(best_score)], best_score


def format_summary(optimal, best_score):
    """Return the two-line summary used for both the console and the legend."""
    return ('Optimal number of components: ' + str(optimal) + '\n' +
            'Silhouette score: ' + str(best_score))


def plot_silhouette_scores(cluster_counts, scores, optimal, best_score, path):
    """Plot the score curve, mark the optimum, save to ``path``, then show."""
    plt.plot(cluster_counts, scores)
    plt.title('Silhouette score values' + '\n' + 'based on a normalized Tfidf matrix of the top 50 lemmas (tfidf) retained from' + '\n' + 'the 340 thematic sections exceeding 100 unique lemmas in the Digest')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.legend([format_summary(optimal, best_score)], loc=8)  # 8 = lower center
    plt.tick_params(
        axis='both',      # changes apply to both the x- and the y-axis
        which='both',     # both major and minor ticks are affected
        direction='in'    # ticks inside the axis
        )
    # Dotted red cross-hair on the optimum.
    plt.axvline(x=optimal, color='r', alpha=0.5, linestyle=':')
    plt.axhline(y=best_score, color='r', alpha=0.5, linestyle=':')
    # Save BEFORE show(): with interactive backends the figure is typically
    # discarded once the shown window is closed, so saving afterwards can
    # write an empty canvas.
    plt.savefig(path, dpi=200)
    plt.show()
    plt.close()


if __name__ == '__main__':
    # X is the matrix extracted from the sections dataframe earlier in this
    # script.
    NumberOfClusters = range(MIN_CLUSTERS, MAX_CLUSTERS + 1)
    silhouette_score_values = compute_silhouette_scores(X, NumberOfClusters)

    # Pickle silhouette scores into a binary file
    with open(SCORES_PATH, 'wb') as fp:
        pickle.dump(silhouette_score_values, fp)

    # Unpickle from the binary file. The file was written just above by this
    # script; never unpickle data from an untrusted source.
    with open(SCORES_PATH, 'rb') as fp:
        silhouette_score_values = pickle.load(fp)

    # Calculate the optimal number of clusters and its silhouette score
    optimal, best_score = find_optimal_cluster_count(
        NumberOfClusters, silhouette_score_values)
    print(format_summary(optimal, best_score))

    # Plot silhouette scores and save the figure
    plot_silhouette_scores(
        NumberOfClusters, silhouette_score_values, optimal, best_score,
        PLOT_PATH)
file -- GitLab