Skip to content
Snippets Groups Projects
Commit 0dcabd82 authored by Ribary, Marton Dr (School of Law)
Browse files

Requirements for python venv

parent 03649db0
No related branches found
No related tags found
No related merge requests found
...@@ -2,4 +2,5 @@ ...@@ -2,4 +2,5 @@
_Work_in_progress/ _Work_in_progress/
_Scholarship/ _Scholarship/
_Notes/ _Notes/
_Drafts/ _Drafts/
\ No newline at end of file .venv
\ No newline at end of file
%% Cell type:markdown id: tags:
## 1 Getting text ready for vectorization
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Import packages and models from cltk and initialize tools # Import packages and models from cltk and initialize tools
from cltk.corpus.utils.importer import CorpusImporter from cltk.corpus.utils.importer import CorpusImporter
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
corpus_importer = CorpusImporter('latin') corpus_importer = CorpusImporter('latin')
corpus_importer.import_corpus('latin_models_cltk') corpus_importer.import_corpus('latin_models_cltk')
lemmatizer = BackoffLatinLemmatizer() lemmatizer = BackoffLatinLemmatizer()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
text = 'Hoc igitur ius nostrum constat aut ex scripto aut sine scripto, ut apud Graecos: τῶν νόμων οἱ μὲν ἔγγραφοι, οἱ δὲ ἄγραφοι .' text = 'Hoc igitur ius nostrum constat aut ex scripto aut sine scripto, ut apud Graecos: τῶν νόμων οἱ μὲν ἔγγραφοι, οἱ δὲ ἄγραφοι .'
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import re import re
punctuation = r"[\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»]" punctuation = r"[\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»]"
new_text = ''.join(["" if ord(i) < 32 or ord(i) > 126 else i for i in text]) new_text = ''.join(["" if ord(i) < 32 or ord(i) > 126 else i for i in text])
text_no_punct = re.sub(punctuation, '', new_text) text_no_punct = re.sub(punctuation, '', new_text)
text_one_white_space = re.sub(r"\s{2,}", ' ', text_no_punct) text_one_white_space = re.sub(r"\s{2,}", ' ', text_no_punct)
text_no_trailing_space = text_one_white_space.strip() text_no_trailing_space = text_one_white_space.strip()
text_lower = text_no_trailing_space.lower() text_lower = text_no_trailing_space.lower()
print(text_lower) print(text_lower)
``` ```
%% Output %% Output
hoc igitur ius nostrum constat aut ex scripto aut sine scripto ut apud graecos hoc igitur ius nostrum constat aut ex scripto aut sine scripto ut apud graecos
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
text_split = text_lower.split(' ') text_split = text_lower.split(' ')
print(text_split) print(text_split)
``` ```
%% Output %% Output
['hoc', 'igitur', 'ius', 'nostrum', 'constat', 'aut', 'ex', 'scripto', 'aut', 'sine', 'scripto', 'ut', 'apud', 'graecos'] ['hoc', 'igitur', 'ius', 'nostrum', 'constat', 'aut', 'ex', 'scripto', 'aut', 'sine', 'scripto', 'ut', 'apud', 'graecos']
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
lemmas = lemmatizer.lemmatize(text_split) lemmas = lemmatizer.lemmatize(text_split)
print(lemmas) print(lemmas)
``` ```
%% Output %% Output
[('hoc', 'hic'), ('igitur', 'igitur'), ('ius', 'ius'), ('nostrum', 'nos'), ('constat', 'consto'), ('aut', 'aut'), ('ex', 'ex'), ('scripto', 'scribo'), ('aut', 'aut'), ('sine', 'sine'), ('scripto', 'scribo'), ('ut', 'ut'), ('apud', 'apud'), ('graecos', 'graecus')] [('hoc', 'hic'), ('igitur', 'igitur'), ('ius', 'ius'), ('nostrum', 'nos'), ('constat', 'consto'), ('aut', 'aut'), ('ex', 'ex'), ('scripto', 'scribo'), ('aut', 'aut'), ('sine', 'sine'), ('scripto', 'scribo'), ('ut', 'ut'), ('apud', 'apud'), ('graecos', 'graecus')]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import pandas as pd import pandas as pd
path_stoplist = '/home/mribary/Dropbox/pyDigest/dump/D_stoplist_001.txt' path_stoplist = '/home/mribary/OneDrive/Git/pydigest/dump/D_stoplist_001.txt'
stopwords = list(pd.read_csv(path_stoplist, header=None)[0]) # 57 custom stopwords stopwords = list(pd.read_csv(path_stoplist, header=None)[0]) # 57 custom stopwords
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
textunit = '' textunit = ''
for y in range(len(lemmas)): for y in range(len(lemmas)):
if stopwords is not None: if stopwords is not None:
if lemmas[y][1] not in stopwords: if lemmas[y][1] not in stopwords:
textunit = textunit + str(lemmas[y][1] + ' ') textunit = textunit + str(lemmas[y][1] + ' ')
else: else:
textunit = textunit + str(lemmas[y][1] + ' ') textunit = textunit + str(lemmas[y][1] + ' ')
textunit = textunit.strip() textunit = textunit.strip()
print(textunit) print(textunit)
``` ```
%% Output %% Output
igitur ius nos consto scribo scribo apud graecus igitur ius nos consto scribo scribo apud graecus
%% Cell type:code id: tags:
``` python
```
......
cltk==0.1.117
fasttext==0.9.2
gensim==3.8.3
matplotlib==3.2.1
nltk==3.5
numpy==1.18.4
pandas==1.0.3
regex==2020.4.4
scikit-learn==0.22.2.post1
scipy==1.4.1
seaborn==0.10.1
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment