Clustering attempts

b6305717 · Ribary, Marton Dr (School of Law) · 512774eb · b6305717 · b6305717
Commit b6305717 authored 5 years ago by Ribary, Marton Dr (School of Law)
--- a/script/NLP_clustering_sections.ipynb
+++ b/script/NLP_clustering_sections.ipynb
+{
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "metadata": {
+  "language_info": {
+   "name": "python",
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "version": "3.7.5-final"
+  },
+  "orig_nbformat": 2,
+  "file_extension": ".py",
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "kernelspec": {
+   "name": "python37564bit13fddfa0140645c199f4c0ad8a176c2c",
+   "display_name": "Python 3.7.5 64-bit"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import basic packages\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib as mpl\n",
+    "from sklearn.metrics.pairwise import linear_kernel       # cosine_similarity as linear_kernel#\n",
+    "from scipy.cluster.hierarchy import dendrogram, linkage\n",
+    "# from scipy.cluster.hierarchy import ward, dendrogram   # Ward's method for hierarchical clustering\n",
+    "%matplotlib inline                                       "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load dataframe\n",
+    "path_doc = '/home/mribary/Dropbox/pyDigest/dump/D_doc_sections_001.csv'\n",
+    "path_tfidf = '/home/mribary/Dropbox/pyDigest/dump/D_tfidf_sections_001.csv'\n",
+    "df = pd.read_csv(path_doc, index_col=0)     # Sections with documents including lemmas from text\n",
+    "tf = pd.read_csv(path_tfidf, index_col=0)   # Original tfidf matrix\n",
+    "terms = list(tf.columns)                    # List of terms included in the Tfidf matrix as dimensions\n",
+    "section_IDs = list(df.index)                # List for section_IDs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List the first 100 terms with the highest tfidf scores in a section\n",
+    "# list(tf.loc[0].sort_values(ascending=False)[0:100].index)\n",
+    "\n",
+    "# Create list of unique terms which appear in the top 100 tfidf of sections\n",
+    "top_terms = []\n",
+    "for i in tf.index:\n",
+    "    t = list(tf.loc[i].sort_values(ascending=False)[0:100].index)\n",
+    "    top_terms.extend(t)\n",
+    "top_terms = list(set(top_terms))        # 6286 unique terms\n",
+    "\n",
+    "# Keep top terms in the tfidf matrix\n",
+    "top_tf = tf[top_terms]                  # Streamlined tfidf matrix\n",
+    "# len(top_tf.columns)                   # 6286 columns for 6286 unique terms\n",
+    "# len(top_tf)                           # 432 rows for 432 sections"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = np.array(top_tf.values)           # Tfidf matrix of shape 432 (sections) x 6286 (terms)\n",
+    "# X.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate linkage matrices with all available methods\n",
+    "Z_ward = linkage(X, method='ward')         # Uses the Ward variance minimization algorithm\n",
+    "Z_single = linkage(X, method='single')     # Nearest Point algorithm\n",
+    "Z_complete = linkage(X, method='complete') # Farthest Point Algorithm or Voor Hees Algorithm\n",
+    "Z_average = linkage(X, method='average')   # UPGMA algorithm\n",
+    "Z_weighted = linkage(X, method='weighted') # WPGMA algorithm\n",
+    "Z_centroid = linkage(X, method='centroid') # Euclidean distance between the centroid and the centroid of a remaining cluster\n",
+    "Z_median = linkage(X, method='median')     # WPGMC algorithm: merged cluster's centroid becomes the average of the two centroids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "'''\n",
+    "methods = ['ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median']\n",
+    "metrics = ['euclidean', 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice',  'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']\n",
+    "tuples = []\n",
+    "for x in metrics:\n",
+    "    d = [(k, x) for k in methods]\n",
+    "    tuples.extend(d)\n",
+    "len(tuples)\n",
+    "'''"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Default metric is 'euclidean'.\n",
+    "\n",
+    "Possible metrics are ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’.\n",
+    "\n",
+    "#### How to read the linkage matrix\n",
+    "\n",
+    "`Z[i]` will tell us which clusters were merged in the i-th iteration. \n",
+    "\n",
+    "```python\n",
+    ">>> Z[0]  # example output\n",
+    "array([ 52.     ,  53.     ,   0.04151,   2.     ])\n",
+    "```\n",
+    "\n",
+    "In its first iteration the linkage algorithm decided to merge the two clusters (original samples here) with indices 52 and 53, as they only had a distance of 0.04151. This created a cluster with a total of 2 samples. We can see that each row of the resulting array has the format [idx1, idx2, dist, sample_count]."
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "0.19541725126458756\n0.5008994958011028\n0.49728620126501494\n0.6829207838359184\n0.6032541179525797\n0.5821612977691866\n0.4487048319117512\n"
+    }
+   ],
+   "source": [
+    "# Calculate Cophenetic Correlation Coefficient for the different linkage types\n",
+    "from scipy.cluster.hierarchy import cophenet\n",
+    "from scipy.spatial.distance import pdist\n",
+    "linkage_matrices = [Z_ward, Z_single, Z_complete, Z_average, Z_weighted, Z_centroid, Z_median]\n",
+    "# linkage_dict = {'ward':[], 'single':[], 'complete':[], 'average':[], 'weighted':[], 'centroid':[], 'median':[]}\n",
+    "for Z in linkage_matrices:\n",
+    "    c, coph_dists = cophenet(Z, pdist(X))\n",
+    "    print(c)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "'''\n",
+    "# Create dendrogram for hierarchical clustering with Ward's method\n",
+    "from scipy.cluster.hierarchy import dendrogram, ward\n",
+    "\n",
+    "cosine_sim = linear_kernel(X, X)                    # Generate cosine similarity matrix: cosine_sim\n",
+    "dist = 1 - cosine_sim                               # Distance matrix for hierarchical clustering\n",
+    "\n",
+    "linkage_matrix = ward(dist)                         #Linkage_matrix using Ward's method\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(15, 20)) # set size\n",
+    "ax = dendrogram(linkage_matrix, orientation=\"right\", labels=section_IDs)\n",
+    "\n",
+    "plt.tick_params(\\\n",
+    "    axis= 'x',          # changes apply to the x-axis\n",
+    "    which='both',      # both major and minor ticks are affected\n",
+    "    bottom='off',      # ticks along the bottom edge are off\n",
+    "    top='off',         # ticks along the top edge are off\n",
+    "    labelbottom='off')\n",
+    "\n",
+    "plt.tight_layout() #show plot with tight layout\n",
+    "\n",
+    "#uncomment below to save figure\n",
+    "plt.savefig('./_Work_in_progress/ward_clusters.png', dpi=200) #save figure as ward_clusters\n",
+    "'''"
+   ]
+  }
+ ]
+}
\ No newline at end of file
+%% Cell type:code id: tags:
+``` 
+# Import basic packages
+import pandas as pd
+import re
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+from sklearn.metrics.pairwise import linear_kernel       # cosine_similarity as linear_kernel#
+from scipy.cluster.hierarchy import dendrogram, linkage
+# from scipy.cluster.hierarchy import ward, dendrogram   # Ward's method for hierarchical clustering
+%matplotlib inline
+```
+%% Cell type:code id: tags:
+``` 
+# Load dataframe
+path_doc = '/home/mribary/Dropbox/pyDigest/dump/D_doc_sections_001.csv'
+path_tfidf = '/home/mribary/Dropbox/pyDigest/dump/D_tfidf_sections_001.csv'
+df = pd.read_csv(path_doc, index_col=0)     # Sections with documents including lemmas from text
+tf = pd.read_csv(path_tfidf, index_col=0)   # Original tfidf matrix
+terms = list(tf.columns)                    # List of terms included in the Tfidf matrix as dimensions
+section_IDs = list(df.index)                # List for section_IDs
+```
+%% Cell type:code id: tags:
+``` 
+# List the first 100 terms with the highest tfidf scores in a section
+# list(tf.loc[0].sort_values(ascending=False)[0:100].index)
+# Create list of unique terms which appear in the top 100 tfidf of sections
+top_terms = []
+for i in tf.index:
+    t = list(tf.loc[i].sort_values(ascending=False)[0:100].index)
+    top_terms.extend(t)
+top_terms = list(set(top_terms))        # 6286 unique terms
+# Keep top terms in the tfidf matrix
+top_tf = tf[top_terms]                  # Streamlined tfidf matrix
+# len(top_tf.columns)                   # 6286 columns for 6286 unique terms
+# len(top_tf)                           # 432 rows for 432 sections
+```
+%% Cell type:code id: tags:
+``` 
+X = np.array(top_tf.values)           # Tfidf matrix of shape 432 (sections) x 6286 (terms)
+# X.shape
+```
+%% Cell type:code id: tags:
+``` 
+# Generate linkage matrices with all available methods
+Z_ward = linkage(X, method='ward')         # Uses the Ward variance minimization algorithm
+Z_single = linkage(X, method='single')     # Nearest Point algorithm
+Z_complete = linkage(X, method='complete') # Farthest Point Algorithm or Voor Hees Algorithm
+Z_average = linkage(X, method='average')   # UPGMA algorithm
+Z_weighted = linkage(X, method='weighted') # WPGMA algorithm
+Z_centroid = linkage(X, method='centroid') # Euclidean distance between the centroid and the centroid of a remaining cluster
+Z_median = linkage(X, method='median')     # WPGMC algorithm: merged cluster's centroid becomes the average of the two centroids
+```
+%% Cell type:code id: tags:
+``` 
+'''
+methods = ['ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median']
+metrics = ['euclidean', 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice',  'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
+tuples = []
+for x in metrics:
+    d = [(k, x) for k in methods]
+    tuples.extend(d)
+len(tuples)
+'''
+```
+%% Cell type:markdown id: tags:
+#### Default metric is 'euclidean'.
+Possible metrics are ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’.
+#### How to read the linkage matrix
+`Z[i]` will tell us which clusters were merged in the i-th iteration.
+```python
+>>> Z[0]  # example output
+array([ 52.     ,  53.     ,   0.04151,   2.     ])
+```
+In its first iteration the linkage algorithm decided to merge the two clusters (original samples here) with indices 52 and 53, as they only had a distance of 0.04151. This created a cluster with a total of 2 samples. We can see that each row of the resulting array has the format [idx1, idx2, dist, sample_count].
+%% Cell type:code id: tags:
+``` 
+# Calculate Cophenetic Correlation Coefficient for the different linkage types
+from scipy.cluster.hierarchy import cophenet
+from scipy.spatial.distance import pdist
+linkage_matrices = [Z_ward, Z_single, Z_complete, Z_average, Z_weighted, Z_centroid, Z_median]
+# linkage_dict = {'ward':[], 'single':[], 'complete':[], 'average':[], 'weighted':[], 'centroid':[], 'median':[]}
+for Z in linkage_matrices:
+    c, coph_dists = cophenet(Z, pdist(X))
+    print(c)
+```
+%% Output
+0.19541725126458756
+0.5008994958011028
+0.49728620126501494
+0.6829207838359184
+0.6032541179525797
+0.5821612977691866
+0.4487048319117512
+%% Cell type:code id: tags:
+``` 
+'''
+# Create dendrogram for hierarchical clustering with Ward's method
+from scipy.cluster.hierarchy import dendrogram, ward
+cosine_sim = linear_kernel(X, X)                    # Generate cosine similarity matrix: cosine_sim
+dist = 1 - cosine_sim                               # Distance matrix for hierarchical clustering
+linkage_matrix = ward(dist)                         #Linkage_matrix using Ward's method
+fig, ax = plt.subplots(figsize=(15, 20)) # set size
+ax = dendrogram(linkage_matrix, orientation="right", labels=section_IDs)
+plt.tick_params(\
+    axis= 'x',          # changes apply to the x-axis
+    which='both',      # both major and minor ticks are affected
+    bottom='off',      # ticks along the bottom edge are off
+    top='off',         # ticks along the top edge are off
+    labelbottom='off')
+plt.tight_layout() #show plot with tight layout
+#uncomment below to save figure
+plt.savefig('./_Work_in_progress/ward_clusters.png', dpi=200) #save figure as ward_clusters
+'''
+```
--- a/script/NLP_sections_001.py
+++ b/script/NLP_sections_001.py
@@ -15,7 +15,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 vectorizer = TfidfVectorizer()
 # Load dataframes from GitHub repo
-file_path_df = './dump/Ddf_v105.csv'
+file_path_df = './dump/Ddf_v106.csv'
 file_path_s = './dump/Ddf_sections_v001.csv'
 file_path_sID = './dump/Ddf_Section_IDs_v001.csv'
 file_path_stoplist = './dump/D_stoplist_001.txt'