diff --git a/dump/silhouette_scores_norm_top50.txt b/dump/silhouette_scores_norm_top50.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb3306087a46f4cddf97b2d76c616c8dcd55e2f5 Binary files /dev/null and b/dump/silhouette_scores_norm_top50.txt differ diff --git a/images/norm_top50_silhouette_2to75.png b/images/norm_top50_silhouette_2to75.png new file mode 100644 index 0000000000000000000000000000000000000000..6af602ded7e88188d14e5b594c03c5ae54003ba7 Binary files /dev/null and b/images/norm_top50_silhouette_2to75.png differ diff --git a/script/K-means_silhouette_norm_top50.py b/script/K-means_silhouette_norm_top50.py new file mode 100644 index 0000000000000000000000000000000000000000..713687a57e935eea8dde5392db8fe10fcc0791a5 --- /dev/null +++ b/script/K-means_silhouette_norm_top50.py @@ -0,0 +1,60 @@ +# Import packages +import pandas as pd +from sklearn import cluster +from sklearn.metrics import silhouette_score +import numpy as np +import matplotlib.pyplot as plt + +# Load normalized dataframes +df = pd.read_csv('./dump/D_lemmatized_norm.csv', index_col=0) +sf = pd.read_csv('./dump/tfidf_sections_norm_top50.csv', index_col=0) +tf = pd.read_csv('./dump/tfidf_titles_norm.csv', index_col=0) + +# Extract matrix from dataframe +X = np.array(sf.values) # Tfidf matrix of shape 340 (sections) x 3868 (terms) +section_IDs = list(sf.index) # List for section_IDs +# X.shape + +# Generate silhouette scores for the range between 2 and 75 clusters +NumberOfClusters=range(2,75) +silhouette_score_values=list() +for i in NumberOfClusters: + classifier=cluster.KMeans(i,init='k-means++', n_init=10, max_iter=300, \ + tol=0.0001, verbose=0, random_state=None, copy_x=True) + classifier.fit(X) + labels = classifier.predict(X) + score = silhouette_score(X, labels, metric='euclidean', sample_size=None, random_state=None) + silhouette_score_values.append(score) + print('|', end= '') + +# Pickle silhouette scores into a binary file +import pickle +with open('./dump/silhouette_scores_norm_top50.txt', 'wb') as fp: + pickle.dump(silhouette_score_values, fp) + +# Unpickle from the binary file +with open('./dump/silhouette_scores_norm_top50.txt', 'rb') as fp: + silhouette_score_values = pickle.load(fp) + +# Calculate the optimal number of clusters and its silhouette score +optimal = NumberOfClusters[silhouette_score_values.index(max(silhouette_score_values))] +print('Optimal number of components: ' + str(optimal) + '\n' + \ + 'Silhouette score: ' + str(max(silhouette_score_values))) + +# Plot silhouette scores +plt.plot(NumberOfClusters, silhouette_score_values) +plt.title('Silhouette score values' + '\n' + 'based on a normalized Tfidf matrix of the top 50 lemmas (tfidf) retained from' + '\n' + 'the 340 thematic sections exceeding 100 unique lemmas in the Digest') +plt.xlabel('Number of clusters') +plt.ylabel('Silhouette score') +plt.legend(['Optimal number of components: ' + str(optimal) + '\n' + 'Silhouette score: ' + str(max(silhouette_score_values))], loc=8) +plt.tick_params( + axis= 'both', # changes apply to the x-axis + which='both', # both major and minor ticks are affected + direction='in' # ticks inside the axis + ) +plt.axvline(x=optimal, color='r', alpha=0.5, linestyle=':') +plt.axhline(y=max(silhouette_score_values), color='r', alpha=0.5, linestyle=':') +plt.show() +# Save plot +plt.savefig('./images/norm_top50_silhouette_2to75.png', dpi=200) +plt.close() \ No newline at end of file