diff --git a/script/NLP_clustering_sections.ipynb b/script/NLP_clustering_sections.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f788fe0fe31583c670468c0ec835c779c7d3bbe8
--- /dev/null
+++ b/script/NLP_clustering_sections.ipynb
@@ -0,0 +1,211 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "metadata": {
+ "language_info": {
+ "name": "python",
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "version": "3.7.5-final"
+ },
+ "orig_nbformat": 2,
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "npconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": 3,
+ "kernelspec": {
+ "name": "python37564bit13fddfa0140645c199f4c0ad8a176c2c",
+ "display_name": "Python 3.7.5 64-bit"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import basic packages\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib as mpl\n",
+ "from sklearn.metrics.pairwise import linear_kernel # equivalent to cosine_similarity on L2-normalised tf-idf vectors\n",
+ "from scipy.cluster.hierarchy import dendrogram, linkage\n",
+ "# from scipy.cluster.hierarchy import ward, dendrogram # Ward's method for hierarchical clustering\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load dataframes\n",
+ "path_doc = '/home/mribary/Dropbox/pyDigest/dump/D_doc_sections_001.csv'\n",
+ "path_tfidf = '/home/mribary/Dropbox/pyDigest/dump/D_tfidf_sections_001.csv'\n",
+ "df = pd.read_csv(path_doc, index_col=0) # Sections with documents including lemmas from the text\n",
+ "tf = pd.read_csv(path_tfidf, index_col=0) # Original tf-idf matrix\n",
+ "terms = list(tf.columns) # List of terms included in the tf-idf matrix as dimensions\n",
+ "section_IDs = list(df.index) # List of section IDs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# List the first 100 terms with the highest tf-idf scores in a section\n",
+ "# list(tf.loc[0].sort_values(ascending=False)[0:100].index)\n",
+ "\n",
+ "# Create a list of unique terms which appear among the top 100 tf-idf terms of any section\n",
+ "top_terms = []\n",
+ "for i in tf.index:\n",
+ "    t = list(tf.loc[i].sort_values(ascending=False)[0:100].index)\n",
+ "    top_terms.extend(t)\n",
+ "top_terms = list(set(top_terms)) # 6286 unique terms\n",
+ "\n",
+ "# Keep top terms in the tf-idf matrix\n",
+ "top_tf = tf[top_terms] # Streamlined tf-idf matrix\n",
+ "# len(top_tf.columns) # 6286 columns for 6286 unique terms\n",
+ "# len(top_tf) # 432 rows for 432 sections"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X = np.array(top_tf.values) # tf-idf matrix of shape 432 (sections) x 6286 (terms)\n",
+ "# X.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Generate linkage matrices with all available methods\n",
+ "Z_ward = linkage(X, method='ward') # Uses the Ward variance minimization algorithm\n",
+ "Z_single = linkage(X, method='single') # Nearest Point algorithm\n",
+ "Z_complete = linkage(X, method='complete') # Farthest Point (Voor Hees) algorithm\n",
+ "Z_average = linkage(X, method='average') # UPGMA algorithm\n",
+ "Z_weighted = linkage(X, method='weighted') # WPGMA algorithm\n",
+ "Z_centroid = linkage(X, method='centroid') # UPGMC algorithm: Euclidean distance between cluster centroids\n",
+ "Z_median = linkage(X, method='median') # WPGMC algorithm: the merged cluster's centroid becomes the average of the two merged clusters' centroids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''\n",
+ "methods = ['ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median']\n",
+ "metrics = ['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']\n",
+ "tuples = []\n",
+ "for x in metrics:\n",
+ "    d = [(k, x) for k in methods]\n",
+ "    tuples.extend(d)\n",
+ "len(tuples)\n",
+ "'''"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Default metric is 'euclidean'.\n",
+ "\n",
+ "Possible metrics are ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’ and ‘yule’.\n",
+ "\n",
+ "#### How to read the linkage matrix\n",
+ "\n",
+ "`Z[i]` tells us which two clusters were merged in the i-th iteration, for example:\n",
+ "\n",
+ "```python\n",
+ ">>> Z[0]\n",
+ "array([ 52. , 53. , 0.04151, 2. ])\n",
+ "```\n",
+ "\n",
+ "In the first iteration the linkage algorithm decided to merge the two clusters (still original samples here) with indices 52 and 53, as they were separated by a distance of only 0.04151. This created a cluster with a total of 2 samples. Each row of the resulting array thus has the format [idx1, idx2, dist, sample_count].\n",
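+ "\n",
+ "#### Cutting the linkage matrix into flat clusters\n",
+ "\n",
+ "A minimal sketch, not part of the original pipeline: assuming the `Z_ward` matrix computed above, `scipy.cluster.hierarchy.fcluster` cuts the hierarchy into a flat clustering; the cut-off of 10 clusters is an arbitrary value chosen for illustration.\n",
+ "\n",
+ "```python\n",
+ "from scipy.cluster.hierarchy import fcluster\n",
+ "# Cut the Ward tree into at most 10 flat clusters: one label per section (432 in total)\n",
+ "labels = fcluster(Z_ward, t=10, criterion='maxclust')\n",
+ "```\n",
+ "\n",
+ "`criterion='maxclust'` merges until at most `t` clusters remain; `criterion='distance'` with a height threshold is the common alternative."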
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "0.19541725126458756\n0.5008994958011028\n0.49728620126501494\n0.6829207838359184\n0.6032541179525797\n0.5821612977691866\n0.4487048319117512\n"
+ }
+ ],
+ "source": [
+ "# Calculate the Cophenetic Correlation Coefficient for the different linkage types\n",
+ "from scipy.cluster.hierarchy import cophenet\n",
+ "from scipy.spatial.distance import pdist\n",
+ "linkage_matrices = [Z_ward, Z_single, Z_complete, Z_average, Z_weighted, Z_centroid, Z_median]\n",
+ "# linkage_dict = {'ward':[], 'single':[], 'complete':[], 'average':[], 'weighted':[], 'centroid':[], 'median':[]}\n",
+ "for Z in linkage_matrices:\n",
+ "    c, coph_dists = cophenet(Z, pdist(X))\n",
+ "    print(c)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''\n",
+ "# Create a dendrogram for hierarchical clustering with Ward's method\n",
+ "from scipy.cluster.hierarchy import dendrogram, ward\n",
+ "\n",
+ "cosine_sim = linear_kernel(X, X) # Generate cosine similarity matrix: cosine_sim\n",
+ "dist = 1 - cosine_sim # Distance matrix for hierarchical clustering\n",
+ "\n",
+ "linkage_matrix = ward(dist) # Linkage matrix with Ward's method (NB: ward() treats a square array as observations; pass a condensed matrix for distances)\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(15, 20)) # set size\n",
+ "ax = dendrogram(linkage_matrix, orientation=\"right\", labels=section_IDs)\n",
+ "\n",
+ "plt.tick_params(\n",
+ "    axis='x', # changes apply to the x-axis\n",
+ "    which='both', # both major and minor ticks are affected\n",
+ "    bottom=False, # ticks along the bottom edge are off\n",
+ "    top=False, # ticks along the top edge are off\n",
+ "    labelbottom=False) # labels along the bottom edge are off\n",
+ "\n",
+ "plt.tight_layout() # show plot with tight layout\n",
+ "\n",
+ "# uncomment below to save the figure\n",
+ "# plt.savefig('./_Work_in_progress/ward_clusters.png', dpi=200) # save figure as ward_clusters.png\n",
+ "'''"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/script/NLP_sections_001.py b/script/NLP_sections_001.py
index 5d35e90ee344832589c45b358727b5f688aafd9c..42cb99421f33ff51670a628c37bfd4d0c741bfef 100644
--- a/script/NLP_sections_001.py
+++ b/script/NLP_sections_001.py
@@ -15,7 +15,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 vectorizer = TfidfVectorizer()
 
 # Load dataframes from GitHub repo
-file_path_df = './dump/Ddf_v105.csv'
+file_path_df = './dump/Ddf_v106.csv'
 file_path_s = './dump/Ddf_sections_v001.csv'
 file_path_sID = './dump/Ddf_Section_IDs_v001.csv'
 file_path_stoplist = './dump/D_stoplist_001.txt'