From ed7b15a2d9cbc395001892d9cd47dc7277974594 Mon Sep 17 00:00:00 2001
From: mribary <m.ribary@surrey.ac.uk>
Date: Mon, 15 Jun 2020 11:01:50 +0100
Subject: [PATCH] neighbours notebook

---
 NLP_documentation.md      |   8 +-
 pyDigest_documentation.md |   2 +-
 script/fasttext_001.py    |   2 +-
 script/neighbours.ipynb   | 214 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 220 insertions(+), 6 deletions(-)
 create mode 100644 script/neighbours.ipynb

diff --git a/NLP_documentation.md b/NLP_documentation.md
index 520e675..d28b845 100644
--- a/NLP_documentation.md
+++ b/NLP_documentation.md
@@ -159,13 +159,13 @@ The `eval` function uses all 2579 test words, but it may be adjusted to test by
 
 ### Footnotes
 
-[^1]: Patrick J. Burns, "Building a text analysis pipeline for classical languages," in _Digital classical philology: Ancient Greek and Latin in the digital revolution_, edited by Monica Berti. Berlin: Walter de Gruyter, 2019, 159-176.
+[^1]: Burns, P. J., "Building a text analysis pipeline for classical languages," in _Digital classical philology: Ancient Greek and Latin in the digital revolution_, edited by Monica Berti. Berlin: Walter de Gruyter, 2019, 159-176.
 
-[^2]: Patrick J. Burns, "[Multiplex lemmatization with the Classical Language Toolkit](https://lila-erc.eu/wp-content/uploads/2019/06/burns-lemmatisation.pdf)," presented at the _First LiLa Workshop: Linguistic Resources & NLP Tools for Latin_ on 3 June 2019.
+[^2]: Burns, P. J., "[Multiplex lemmatization with the Classical Language Toolkit](https://lila-erc.eu/wp-content/uploads/2019/06/burns-lemmatisation.pdf)," presented at the _First LiLa Workshop: Linguistic Resources & NLP Tools for Latin_ on 3 June 2019.
 
-[^3]: Patrick J. Burns, "[Latin lemmatization: Tools, resources & future directions](https://github.com/diyclassics/lemmatizer-review/blob/master/lemmatizer-review.ipynb)," pre-publication draft available on GitHub, last updated on 3 June 2019.
+[^3]: Burns, P. J., "[Latin lemmatization: Tools, resources & future directions](https://github.com/diyclassics/lemmatizer-review/blob/master/lemmatizer-review.ipynb)," pre-publication draft available on GitHub, last updated on 3 June 2019.
 
-[^4]: Patrick J. Burns, "[Constructing stoplists for historical languages](https://journals.ub.uni-heidelberg.de/index.php/dco/article/view/52124/48812)," _Digital Classics Online_ 4:2 (2018): 4-20.
+[^4]: Burns, P. J., "[Constructing stoplists for historical languages](https://journals.ub.uni-heidelberg.de/index.php/dco/article/view/52124/48812)," _Digital Classics Online_ 4:2 (2018): 4-20.
 
 [^5]: The default value is `zou`, which stands for the composite measure proposed by Feng Zou and his colleagues. Their measure is calculated from mean probability, variance probability and entropy, which are some of the other possible measures that can be passed to `basis`. See Zou, F., Wang, F. L., Deng, X., Han, S., and Wang, L. S., "[Automatic Construction of Chinese Stop Word List](https://pdfs.semanticscholar.org/c543/8e216071f6180c228cc557fb1d3c77edb3a3.pdf)," in _Proceedings of the 5th WSEAS International Conference on Applied Computer Science_, 1010–1015.
diff --git a/pyDigest_documentation.md b/pyDigest_documentation.md
index 55530d6..28c294c 100644
--- a/pyDigest_documentation.md
+++ b/pyDigest_documentation.md
@@ -110,4 +110,4 @@ The function downloads a file from an online repository to the system's temporary
 
 The function takes a Latin gensim-FastText object and prints the TOEFL-style synonym evaluation score based on LiLa's benchmark.
The benchmark tsv file is downloaded directly from the [LiLa embeddings](https://embeddings.lila-erc.eu) website. The file includes almost 3000 Latin words, each accompanied by four other words, one of which is marked as a synonym by a Latin expert. The word with the highest similarity score is chosen as the model's answer in this virtual TOEFL-style multiple-choice synonym challenge. The higher the percentage, the closer the model comes to returning synonyms that agree with those chosen by the Latin expert.
 
-**Example for use**: `xxxxx.py`, to generate evaluation score for four Latin fasttext word embeddings models.
\ No newline at end of file
+**Example for use**: `fasttext_001.py`, to generate the evaluation scores for four Latin `fastText` word embeddings models.
\ No newline at end of file
diff --git a/script/fasttext_001.py b/script/fasttext_001.py
index 08de961..dca2c91 100644
--- a/script/fasttext_001.py
+++ b/script/fasttext_001.py
@@ -138,7 +138,7 @@ latinise_cbow = fasttext.train_unsupervised(latinise_lemma_path, model='cbow')
 latinise_cbow.save_model('/dump/wordvec/latinise_cbow.bin')
 
 ################
-#| Evaluation \#
+#| Evaluation |#
 ################
 
 # Import the eval function and gensim's load_facebook_model function
diff --git a/script/neighbours.ipynb b/script/neighbours.ipynb
new file mode 100644
index 0000000..85b811f
--- /dev/null
+++ b/script/neighbours.ipynb
@@ -0,0 +1,214 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "import os\n",
+    "import re\n",
+    "import fasttext\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['/home/mribary/Documents/wordvec/digest_skip.bin', '/home/mribary/Documents/wordvec/romtext_skip.bin', '/home/mribary/Documents/wordvec/latinise_skip.bin', '/home/mribary/Documents/wordvec/lasla_skip.bin']\n"
+     ]
+    }
+   ],
+   "source": [
+    "path = '/home/mribary/Documents/wordvec'\n",
+    "file_list = glob.glob(os.path.join(path, '*skip.bin'))\n",
+    "print(file_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
+      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
+      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
+      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load models into a dictionary (with fasttext)\n",
+    "# keyed by model name, e.g. 'digest_skip'\n",
+    "models = {}\n",
+    "for file_path in file_list:\n",
+    "    # Extract the model's name from its file path\n",
+    "    model_name = re.search(r'/([a-z]+_[a-z]*)\\.', file_path).group(1)\n",
+    "    models[model_name] = fasttext.load_model(file_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def neighbors(lemma, models, top=5):\n",
+    "    '''\n",
+    "    Based on a dictionary of fasttext `models`, where each key is a model's name\n",
+    "    and each value is a fasttext object, the function returns a pandas dataframe\n",
+    "    of the closest neighbours of a `lemma`. If no value is passed to `top`,\n",
+    "    the function returns the default 5 top neighbouring lemmas.\n",
+    "    '''\n",
+    "    dictionary = {}\n",
+    "    for name, model in models.items():\n",
+    "        # get_nearest_neighbors returns a list of (score, word) tuples\n",
+    "        tuples = model.get_nearest_neighbors(lemma, top)\n",
+    "        # Store them as (word, rounded score) pairs for readability\n",
+    "        pairs = []\n",
+    "        for score, word in tuples:\n",
+    "            pairs.append((word, round(score, 4)))\n",
+    "        # Each model contributes one column to the dataframe\n",
+    "        dictionary[name] = pairs\n",
+    "    df = pd.DataFrame(dictionary)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>digest_skip</th>\n",
+       "      <th>romtext_skip</th>\n",
+       "      <th>latinise_skip</th>\n",
+       "      <th>lasla_skip</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>(credo, 0.8073)</td>\n",
+       "      <td>(faeneravit, 0.7013)</td>\n",
+       "      <td>(pecuniaris, 0.8476)</td>\n",
+       "      <td>(pecuniarius, 0.8478)</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>(creditrix, 0.7814)</td>\n",
+       "      <td>(mutuus, 0.6952)</td>\n",
+       "      <td>(pecuniaque, 0.8445)</td>\n",
+       "      <td>(pecu, 0.7435)</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>(faenero, 0.7799)</td>\n",
+       "      <td>(faenero, 0.6832)</td>\n",
+       "      <td>(pecuniamque, 0.8443)</td>\n",
+       "      <td>(pecuniosus, 0.7342)</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>(debitrix, 0.7699)</td>\n",
+       "      <td>(mutuo, 0.6814)</td>\n",
+       "      <td>(pecuniaeque, 0.801)</td>\n",
+       "      <td>(syngrapha, 0.7096)</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>(faeneravit, 0.7622)</td>\n",
+       "      <td>(solverim, 0.6798)</td>\n",
+       "      <td>(Pecuniam, 0.7988)</td>\n",
+       "      <td>(creditor, 0.7093)</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            digest_skip          romtext_skip          latinise_skip  \\\n",
+       "0       (credo, 0.8073)  (faeneravit, 0.7013)   (pecuniaris, 0.8476)   \n",
+       "1   (creditrix, 0.7814)      (mutuus, 0.6952)   (pecuniaque, 0.8445)   \n",
+       "2     (faenero, 0.7799)     (faenero, 0.6832)  (pecuniamque, 0.8443)   \n",
+       "3    (debitrix, 0.7699)       (mutuo, 0.6814)   (pecuniaeque, 0.801)   \n",
+       "4  (faeneravit, 0.7622)    (solverim, 0.6798)     (Pecuniam, 0.7988)   \n",
+       "\n",
+       "              lasla_skip  \n",
+       "0  (pecuniarius, 0.8478)  \n",
+       "1         (pecu, 0.7435)  \n",
+       "2   (pecuniosus, 0.7342)  \n",
+       "3    (syngrapha, 0.7096)  \n",
+       "4    (creditor, 0.7093)  "
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "neighbors('pecunia', models, 5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.7.5 64-bit",
+   "language": "python",
+   "name": "python37564bit13fddfa0140645c199f4c0ad8a176c2c"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.2"
+  }
+ },
+ "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file -- GitLab