Skip to content
Snippets Groups Projects
Commit b6305717 authored by Ribary, Marton Dr (School of Law)'s avatar Ribary, Marton Dr (School of Law)
Browse files

Clustering attempts

parent 512774eb
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
```
# Import basic packages
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.metrics.pairwise import linear_kernel # cosine_similarity as linear_kernel#
from scipy.cluster.hierarchy import dendrogram, linkage
# from scipy.cluster.hierarchy import ward, dendrogram # Ward's method for hierarchical clustering
%matplotlib inline
```
%% Cell type:code id: tags:
```
# Read the section-level inputs from the local dump directory
path_doc = '/home/mribary/Dropbox/pyDigest/dump/D_doc_sections_001.csv'
path_tfidf = '/home/mribary/Dropbox/pyDigest/dump/D_tfidf_sections_001.csv'
# df: sections with documents including lemmas from text
df = pd.read_csv(path_doc, index_col=0)
# tf: original tfidf matrix
tf = pd.read_csv(path_tfidf, index_col=0)
# Terms which serve as the dimensions (columns) of the tfidf matrix
terms = [*tf.columns]
# Section IDs, taken from the index of df
section_IDs = [*df.index]
```
%% Cell type:code id: tags:
```
# Reduce the tfidf matrix to the terms which rank among the 100 highest
# tfidf scores of at least one section.
# Top 100 terms of a single section, for inspection:
# list(tf.loc[0].sort_values(ascending=False)[0:100].index)
top_term_set = set()
for _, section_scores in tf.iterrows():
    # Index labels of the 100 largest scores in this section's row
    top_term_set.update(section_scores.sort_values(ascending=False)[0:100].index)
# Filter tf.columns instead of converting the set to a list: the iteration
# order of a set of strings changes between interpreter runs, which would make
# the column order of top_tf (and of X) non-reproducible.
top_terms = [term for term in tf.columns if term in top_term_set] # 6286 unique terms
# Keep top terms in the tfidf matrix
top_tf = tf[top_terms] # Streamlined tfidf matrix
# len(top_tf.columns) # 6286 columns for 6286 unique terms
# len(top_tf) # 432 rows for 432 sections
```
%% Cell type:code id: tags:
```
# Dense copy of the streamlined tfidf matrix: 432 (sections) x 6286 (terms)
X = top_tf.to_numpy(copy=True)
# X.shape
```
%% Cell type:code id: tags:
```
# Build one linkage matrix per agglomeration method.
# The methods are listed in the order in which the results are unpacked below:
#   ward     - Ward variance minimization algorithm
#   single   - Nearest Point algorithm
#   complete - Farthest Point Algorithm or Voor Hees Algorithm
#   average  - UPGMA algorithm
#   weighted - WPGMA algorithm
#   centroid - Euclidean distance between the centroid and the centroid of a remaining cluster
#   median   - the merged clusters' centroid becomes the average
linkage_methods = ('ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median')
Z_ward, Z_single, Z_complete, Z_average, Z_weighted, Z_centroid, Z_median = (
    linkage(X, method=method_name) for method_name in linkage_methods
)
```
%% Cell type:code id: tags:
```
# NOTE: disabled experiment. The code below sits inside a string literal, so
# none of it is executed. It pairs every entry of `methods` with every entry of
# `metrics` and counts the resulting tuples.
# NOTE(review): 'euclidean' occurs twice in `metrics`, so the list of tuples
# would contain duplicate (method, 'euclidean') pairs.
# NOTE(review): the scipy documentation states that 'ward', 'centroid' and
# 'median' are correctly defined only for the Euclidean metric - confirm before
# re-enabling the full grid.
'''
methods = ['ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median']
metrics = ['euclidean', 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
tuples = []
for x in metrics:
d = [(k, x) for k in methods]
tuples.extend(d)
len(tuples)
'''
```
%% Cell type:markdown id: tags:
#### Default metric is 'euclidean'.
Possible metrics are ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’.
#### How to read the linkage matrix
`Z[i]` will tell us which clusters were merged in the i-th iteration.
```python
>>> Z[0]  # produces the following output
array([ 52. , 53. , 0.04151, 2. ])
```
In its first iteration the linkage algorithm decided to merge the two clusters (original samples here) with indices 52 and 53, as they only had a distance of 0.04151. This created a cluster with a total of 2 samples. We can see that each row of the resulting array has the format [idx1, idx2, dist, sample_count].
%% Cell type:code id: tags:
```
# Calculate Cophenetic Correlation Coefficient for the different linkage types
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
linkage_matrices = [Z_ward, Z_single, Z_complete, Z_average, Z_weighted, Z_centroid, Z_median]
# linkage_dict = {'ward':[], 'single':[], 'complete':[], 'average':[], 'weighted':[], 'centroid':[], 'median':[]}
# The pairwise distances depend only on X, so compute them once instead of
# once per linkage matrix.
pairwise_distances = pdist(X)
for Z in linkage_matrices:
    # c: cophenetic correlation coefficient of Z against the original distances
    # coph_dists: condensed matrix of cophenetic distances for Z
    c, coph_dists = cophenet(Z, pairwise_distances)
    print(c)
```
%% Output
0.19541725126458756
0.5008994958011028
0.49728620126501494
0.6829207838359184
0.6032541179525797
0.5821612977691866
0.4487048319117512
%% Cell type:code id: tags:
```
# NOTE: disabled cell. The code below sits inside a string literal, so none of
# it is executed. It computes `1 - linear_kernel(X, X)`, passes the result to
# ward(), draws a right-oriented dendrogram labelled with section_IDs and saves
# the figure as a PNG.
# NOTE(review): linear_kernel is a plain dot product; it equals cosine
# similarity only if the rows of X are L2-normalised - confirm.
# NOTE(review): `dist` is a square matrix. scipy's ward() documents its input
# as a condensed distance matrix or an array of observation vectors, so a
# square matrix is presumably treated as observations - verify before reuse.
# NOTE(review): the "uncomment below to save figure" remark precedes a savefig
# call which is not commented out.
# NOTE(review): tick_params receives the string 'off'; recent matplotlib
# versions expect booleans here - confirm against the installed version.
'''
# Create dendogram for hierarchical clustering with Ward's method
from scipy.cluster.hierarchy import dendrogram, ward
cosine_sim = linear_kernel(X, X) # Generate cosine similarity matrix: cosine_sim
dist = 1 - cosine_sim # Distance matrix for hierarchical clustering
linkage_matrix = ward(dist) #Linkage_matrix using Ward's method
fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=section_IDs)
plt.tick_params(\
axis= 'x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom='off', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
labelbottom='off')
plt.tight_layout() #show plot with tight layout
#uncomment below to save figure
plt.savefig('./_Work_in_progress/ward_clusters.png', dpi=200) #save figure as ward_clusters
'''
```
...@@ -15,7 +15,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer ...@@ -15,7 +15,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer() vectorizer = TfidfVectorizer()
# Load dataframes from GitHub repo # Load dataframes from GitHub repo
file_path_df = './dump/Ddf_v105.csv' file_path_df = './dump/Ddf_v106.csv'
file_path_s = './dump/Ddf_sections_v001.csv' file_path_s = './dump/Ddf_sections_v001.csv'
file_path_sID = './dump/Ddf_Section_IDs_v001.csv' file_path_sID = './dump/Ddf_Section_IDs_v001.csv'
file_path_stoplist = './dump/D_stoplist_001.txt' file_path_stoplist = './dump/D_stoplist_001.txt'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment