diff --git a/pyDigest.py b/pyDigest.py
index 0bbc0e4c25088ad9d6829ba439a2ecd4d7a8df50..eb53c4a3456c80da703d9ce818ddb8e4d66bae07 100644
--- a/pyDigest.py
+++ b/pyDigest.py
@@ -34,6 +34,7 @@ def similar_sections(id, size=10):
id: thematic section's id
size: number of similar thematic sections returned
'''
+ import pandas as pd
path_sID = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/Ddf_Section_IDs_v001.csv'
path_doc = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/D_doc_sections_001.csv'
path_df = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/Ddf_v105.csv'
diff --git a/pyDigest_documentation.md b/pyDigest_documentation.md
index 58f54ff3efd3ad77a9981e097ebe05973a62c0e0..df73c7ba2b4ec8b14cc63a912a69439c66094a0d 100644
--- a/pyDigest_documentation.md
+++ b/pyDigest_documentation.md
@@ -1,6 +1,16 @@
## "pyDigest" - General functions
-The documentation describes the functions defined and stored in `pyDigest.py` in the project's top directory. These functions can be called in other Python scripts of the repository in two forms:
+The documentation describes the functions defined and stored in `pyDigest.py`. A copy of the file is stored in the user site directory at `~/.local/lib/python3.7/site-packages`.
+
+After opening a Terminal window, the following bash commands could be used to navigate to the directory and create a copy of `pyDigest.py` which is then called by Python scripts. This is a strictly temporary solution for a privately managed module.
+
+```bash
+$ path=`python3 -m site --user-site`
+$ cd $path
+$ cp [path_for_pyDigest.py] .
+```
+
+These functions can be called in other Python scripts of the repository in two forms:
1. Importing a specific function
diff --git a/script/pyDigest.py b/script/pyDigest.py
deleted file mode 100644
index 0bbc0e4c25088ad9d6829ba439a2ecd4d7a8df50..0000000000000000000000000000000000000000
--- a/script/pyDigest.py
+++ /dev/null
@@ -1,62 +0,0 @@
-def similar(id, corpus, size=10):
- '''The function returns the most similar documents to the one passed into based on cosine similarity
- calculated on the Tfidf matrix of a given corpus
- id: the index of the "document" in the corpus queried for its most similar documents
- corpus: a list of plain word strings ("documents"), the position of the "document" in the list is the
- id where indexing runs from 0 until len(corpus)-1
- size: the number of documents returned, default value is set to 10.'''
- # Handle errors
- valid_id = range(len(corpus))
- if id not in valid_id:
- raise ValueError("id must be in the range of len(corpus) which is between 0 and %r." % len(corpus))
- if type(corpus) != list:
- raise TypeError("corpus must be a plain list of word strings.")
- if type(size) != int:
- raise TypeError("size must be an integer")
- valid_size = range(1, (len(corpus)-1))
- if size not in valid_size:
- raise ValueError("size must be between 1 and %r." % (len(corpus)-1))
- # Import modules and initilaize models
- from sklearn.metrics.pairwise import linear_kernel
- from sklearn.feature_extraction.text import TfidfVectorizer
- vectorizer = TfidfVectorizer()
- # Calculate Tfidf matrix (X) and cosine similarity matrix (cosine_sim)
- X = vectorizer.fit_transform(corpus)
- cosine_sim = linear_kernel(X, X)
- # Calculate most similar documents
- sim_scores = list(enumerate(cosine_sim[id]))
- sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
- sim_scores = sim_scores[1:(size + 1)]
- return sim_scores
-
-def similar_sections(id, size=10):
- '''Returns a dataframe with the most similar thematic sections
- id: thematic section's id
- size: number of similar thematic sections returned
- '''
- path_sID = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/Ddf_Section_IDs_v001.csv'
- path_doc = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/D_doc_sections_001.csv'
- path_df = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/Ddf_v105.csv'
- sID = pd.read_csv(path_sID, index_col=0) # sections with section IDs (432)
- doc_df = pd.read_csv(path_doc, index_col=0)
- df = pd.read_csv(path_df, index_col=0) # text units (21055)
- corpus = list(doc_df.doc)
- similar_to_id = similar(id, corpus, size)
- similar_dict_id = {'Section_id':[], 'Book_no':[], 'Section_no':[], 'Section_title':[], 'Similarity_score':[]}
- for i in range(size):
- section_id = similar_to_id[i][0]
- text_unit_id = sID.loc[sID.Section_id == section_id].index[0]
- book_no = df.loc[df.index == text_unit_id,'Book_no'].values[0]
- section_no = df.loc[df.index == text_unit_id,'Section_no'].values[0]
- section_title = df.loc[df.index == text_unit_id,'Section_title'].values[0]
- similarity_score = similar_to_id[i][1]
- similar_dict_id['Section_id'].append(section_id)
- similar_dict_id['Book_no'].append(book_no)
- similar_dict_id['Section_no'].append(section_no)
- similar_dict_id['Section_title'].append(section_title.lower())
- similar_dict_id['Similarity_score'].append(similarity_score)
- similar_df_id = pd.DataFrame(similar_dict_id)
- title = doc_df.Title[id]
- print("Thematic sections most similar to thematic section %r:" %id)
- print("%r" %title)
- return similar_df_id
\ No newline at end of file