diff --git a/script/NLP_clustering_sections.ipynb b/script/NLP_clustering_sections.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..f788fe0fe31583c670468c0ec835c779c7d3bbe8
--- /dev/null
+++ b/script/NLP_clustering_sections.ipynb
@@ -0,0 +1,211 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 2,
+ "metadata": {
+ "language_info": {
+ "name": "python",
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "version": "3.7.5-final"
+ },
+ "orig_nbformat": 2,
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "npconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": 3,
+ "kernelspec": {
+ "name": "python37564bit13fddfa0140645c199f4c0ad8a176c2c",
+ "display_name": "Python 3.7.5 64-bit"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import basic packages\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib as mpl\n",
+ "from sklearn.metrics.pairwise import linear_kernel # equivalent to cosine_similarity on L2-normalised tf-idf vectors\n",
+ "from scipy.cluster.hierarchy import dendrogram, linkage\n",
+ "# from scipy.cluster.hierarchy import ward, dendrogram # Ward's method for hierarchical clustering\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load dataframes\n",
+ "path_doc = '/home/mribary/Dropbox/pyDigest/dump/D_doc_sections_001.csv'\n",
+ "path_tfidf = '/home/mribary/Dropbox/pyDigest/dump/D_tfidf_sections_001.csv'\n",
+ "df = pd.read_csv(path_doc, index_col=0) # Sections with documents including lemmas from the text\n",
+ "tf = pd.read_csv(path_tfidf, index_col=0) # Original tf-idf matrix\n",
+ "terms = list(tf.columns) # List of terms included in the tf-idf matrix as dimensions\n",
+ "section_IDs = list(df.index) # List of section IDs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# List the first 100 terms with the highest tf-idf scores in a section\n",
+ "# list(tf.loc[0].sort_values(ascending=False)[0:100].index)\n",
+ "\n",
+ "# Create a list of unique terms which appear among the top 100 tf-idf terms of any section\n",
+ "top_terms = []\n",
+ "for i in tf.index:\n",
+ "    t = list(tf.loc[i].sort_values(ascending=False)[0:100].index)\n",
+ "    top_terms.extend(t)\n",
+ "top_terms = list(set(top_terms)) # 6286 unique terms\n",
+ "\n",
+ "# Keep top terms in the tf-idf matrix\n",
+ "top_tf = tf[top_terms] # Streamlined tf-idf matrix\n",
+ "# len(top_tf.columns) # 6286 columns for 6286 unique terms\n",
+ "# len(top_tf) # 432 rows for 432 sections"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X = np.array(top_tf.values) # tf-idf matrix of shape 432 (sections) x 6286 (terms)\n",
+ "# X.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Generate linkage matrices with all available methods\n",
+ "Z_ward = linkage(X, method='ward') # Uses the Ward variance minimization algorithm\n",
+ "Z_single = linkage(X, method='single') # Nearest Point algorithm\n",
+ "Z_complete = linkage(X, method='complete') # Farthest Point (Voor Hees) algorithm\n",
+ "Z_average = linkage(X, method='average') # UPGMA algorithm\n",
+ "Z_weighted = linkage(X, method='weighted') # WPGMA algorithm\n",
+ "Z_centroid = linkage(X, method='centroid') # UPGMC algorithm: Euclidean distance between cluster centroids\n",
+ "Z_median = linkage(X, method='median') # WPGMC algorithm: the merged cluster's centroid becomes the average of the two merged clusters' centroids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''\n",
+ "methods = ['ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median']\n",
+ "metrics = ['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']\n",
+ "tuples = []\n",
+ "for x in metrics:\n",
+ "    d = [(k, x) for k in methods]\n",
+ "    tuples.extend(d)\n",
+ "len(tuples)\n",
+ "'''"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Default metric is 'euclidean'.\n",
+ "\n",
+ "Possible metrics are ‘braycurtis’, ‘canberra’, ‘chebyshev’, ‘cityblock’, ‘correlation’, ‘cosine’, ‘dice’, ‘euclidean’, ‘hamming’, ‘jaccard’, ‘jensenshannon’, ‘kulsinski’, ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’ and ‘yule’.\n",
+ "\n",
+ "#### How to read the linkage matrix\n",
+ "\n",
+ "`Z[i]` tells us which two clusters were merged in the i-th iteration, for example:\n",
+ "\n",
+ "```python\n",
+ ">>> Z[0]\n",
+ "array([ 52. , 53. , 0.04151, 2. ])\n",
+ "```\n",
+ "\n",
+ "In the first iteration the linkage algorithm decided to merge the two clusters (still original samples here) with indices 52 and 53, as they were separated by a distance of only 0.04151. This created a cluster with a total of 2 samples. Each row of the resulting array thus has the format [idx1, idx2, dist, sample_count].\n",
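+ "\n",
+ "#### Cutting the linkage matrix into flat clusters\n",
+ "\n",
+ "A minimal sketch, not part of the original pipeline: assuming the `Z_ward` matrix computed above, `scipy.cluster.hierarchy.fcluster` cuts the hierarchy into a flat clustering; the cut-off of 10 clusters is an arbitrary value chosen for illustration.\n",
+ "\n",
+ "```python\n",
+ "from scipy.cluster.hierarchy import fcluster\n",
+ "# Cut the Ward tree into at most 10 flat clusters: one label per section (432 in total)\n",
+ "labels = fcluster(Z_ward, t=10, criterion='maxclust')\n",
+ "```\n",
+ "\n",
+ "`criterion='maxclust'` merges until at most `t` clusters remain; `criterion='distance'` with a height threshold is the common alternative."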
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "0.19541725126458756\n0.5008994958011028\n0.49728620126501494\n0.6829207838359184\n0.6032541179525797\n0.5821612977691866\n0.4487048319117512\n"
+ }
+ ],
+ "source": [
+ "# Calculate the Cophenetic Correlation Coefficient for the different linkage types\n",
+ "from scipy.cluster.hierarchy import cophenet\n",
+ "from scipy.spatial.distance import pdist\n",
+ "linkage_matrices = [Z_ward, Z_single, Z_complete, Z_average, Z_weighted, Z_centroid, Z_median]\n",
+ "# linkage_dict = {'ward':[], 'single':[], 'complete':[], 'average':[], 'weighted':[], 'centroid':[], 'median':[]}\n",
+ "for Z in linkage_matrices:\n",
+ "    c, coph_dists = cophenet(Z, pdist(X))\n",
+ "    print(c)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''\n",
+ "# Create a dendrogram for hierarchical clustering with Ward's method\n",
+ "from scipy.cluster.hierarchy import dendrogram, ward\n",
+ "\n",
+ "cosine_sim = linear_kernel(X, X) # Generate cosine similarity matrix: cosine_sim\n",
+ "dist = 1 - cosine_sim # Distance matrix for hierarchical clustering\n",
+ "\n",
+ "linkage_matrix = ward(dist) # Linkage matrix with Ward's method (NB: ward() treats a square array as observations; pass a condensed matrix for distances)\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(15, 20)) # set size\n",
+ "ax = dendrogram(linkage_matrix, orientation=\"right\", labels=section_IDs)\n",
+ "\n",
+ "plt.tick_params(\n",
+ "    axis='x', # changes apply to the x-axis\n",
+ "    which='both', # both major and minor ticks are affected\n",
+ "    bottom=False, # ticks along the bottom edge are off\n",
+ "    top=False, # ticks along the top edge are off\n",
+ "    labelbottom=False) # labels along the bottom edge are off\n",
+ "\n",
+ "plt.tight_layout() # show plot with tight layout\n",
+ "\n",
+ "# uncomment below to save the figure\n",
+ "# plt.savefig('./_Work_in_progress/ward_clusters.png', dpi=200) # save figure as ward_clusters.png\n",
+ "'''"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/script/NLP_sections_001.py b/script/NLP_sections_001.py
index 5d35e90ee344832589c45b358727b5f688aafd9c..42cb99421f33ff51670a628c37bfd4d0c741bfef 100644
--- a/script/NLP_sections_001.py
+++ b/script/NLP_sections_001.py
@@ -15,7 +15,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 vectorizer = TfidfVectorizer()
 
 # Load dataframes from GitHub repo
-file_path_df = './dump/Ddf_v105.csv'
+file_path_df = './dump/Ddf_v106.csv'
 file_path_s = './dump/Ddf_sections_v001.csv'
 file_path_sID = './dump/Ddf_Section_IDs_v001.csv'
 file_path_stoplist = './dump/D_stoplist_001.txt'