# pyDigest.py
def similar(id, corpus, size=10):
    '''Return the most similar documents to the one passed in, based on cosine similarity
    calculated on the Tfidf matrix of a given corpus.
    id: the index of the "document" in the corpus queried for its most similar documents
    corpus: a list of plain word strings ("documents"); the position of a "document" in the list is its
    id, where indexing runs from 0 to len(corpus)-1
    size: the number of documents returned, default value is set to 10.'''
    # Handle errors
    if not isinstance(corpus, list):
        raise TypeError("corpus must be a plain list of word strings.")
    valid_id = range(len(corpus))
    if id not in valid_id:
        raise ValueError("id must be a valid corpus index between 0 and %r." % (len(corpus) - 1))
    if not isinstance(size, int):
        raise TypeError("size must be an integer.")
    valid_size = range(1, len(corpus))
    if size not in valid_size:
        raise ValueError("size must be between 1 and %r." % (len(corpus) - 1))
    # Import modules and initialize models
    from sklearn.metrics.pairwise import linear_kernel
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    # Calculate Tfidf matrix (X) and cosine similarity matrix (cosine_sim)
    X = vectorizer.fit_transform(corpus)
    cosine_sim = linear_kernel(X, X)
    # Calculate most similar documents
    sim_scores = list(enumerate(cosine_sim[id]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:(size + 1)]
    return sim_scores
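
# A minimal usage sketch for similar(); the helper and the toy corpus below are
# illustrative only and are not part of the pyDigest input data.
def _example_similar():
    corpus = [
        'servus fundus emptio venditio',    # document 0
        'servus libertas manumissio',       # document 1
        'fundus usufructus servitus',       # document 2
    ]
    # Returns a list of (document index, cosine similarity) tuples for the two
    # documents most similar to document 0, excluding document 0 itself.
    return similar(0, corpus, size=2)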

def similar_sections(id, size=10):
    '''Returns a dataframe with the most similar thematic sections
    id: thematic section's id
    size: number of similar thematic sections returned
    '''
    import pandas as pd
    path_sID = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/Ddf_Section_IDs_v001.csv'
    path_doc = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/D_doc_sections_001.csv'
    path_df = 'https://raw.githubusercontent.com/mribary/pyDigest/master/input/Ddf_v105.csv'
    sID = pd.read_csv(path_sID, index_col=0)        # sections with section IDs (432)
    doc_df = pd.read_csv(path_doc, index_col=0)
    df = pd.read_csv(path_df, index_col=0)          # text units (21055)
    corpus = list(doc_df.doc)
    similar_to_id = similar(id, corpus, size)
    similar_dict_id = {'Section_id':[], 'Book_no':[], 'Section_no':[], 'Section_title':[], 'Similarity_score':[]}
    for i in range(size):
        section_id = similar_to_id[i][0]
        text_unit_id = sID.loc[sID.Section_id == section_id].index[0]
        book_no = df.loc[df.index == text_unit_id,'Book_no'].values[0]
        section_no = df.loc[df.index == text_unit_id,'Section_no'].values[0]
        section_title = df.loc[df.index == text_unit_id,'Section_title'].values[0]
        similarity_score = similar_to_id[i][1]
        similar_dict_id['Section_id'].append(section_id)
        similar_dict_id['Book_no'].append(book_no)
        similar_dict_id['Section_no'].append(section_no)
        similar_dict_id['Section_title'].append(section_title.lower())
        similar_dict_id['Similarity_score'].append(similarity_score)
    similar_df_id = pd.DataFrame(similar_dict_id)
    title = doc_df.Title[id]
    print("Thematic sections most similar to thematic section %r:" %id)
    print("%r" %title)
    return similar_df_id
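
# A minimal usage sketch for similar_sections(); requires network access to the
# pyDigest GitHub repository for the input CSV files. The helper is illustrative
# only and the section id 100 is chosen arbitrarily.
def _example_similar_sections():
    # Returns a dataframe with the five thematic sections most similar to
    # thematic section 100, including section metadata and similarity scores.
    return similar_sections(100, size=5)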

def linkage_for_clustering(X, threshold=0.0):
    ''' The function takes a matrix X with observations stored in rows and features stored in columns.
    It returns a dataframe with linkage combinations of method and metric used for hierarchical
    clustering, sorted in descending order of the absolute value of the cophenetic correlation
    coefficient (CCC). The CCC score ranges between -1 and 1 and measures how faithfully a
    dendrogram preserves the pairwise distances between the original unmodeled data points.
    The cophenetic correlation is expected to be positive if the original distances are compared
    to cophenetic distances (or similarities to similarities) and negative if distances are
    compared to similarities.
    It needs to be noted that CCC is calculated for the whole dendrogram. Ideally, one should
    calculate CCC at the specific cut point where the dendrogram's output is used to identify the
    clusters. It is recommended to calculate CCC at the specific cut level yielding k clusters to
    confirm that the correct method-metric combination has been used for hierarchical clustering.
    The 'average' method generally produces the best CCC score especially with matrices with high
    dimensionality. Instead of relying exclusively on the CCC score, one also needs to consider 
    what method-metric combination suits the particular dataset on which hierarchical clustering 
    is performed by scipy's linkage function.
    '''
    import numpy as np
    # Handle errors
    if not isinstance(X, np.ndarray):
        raise TypeError("X must be a NumPy array with observations in rows and features in columns")
    if not isinstance(threshold, float):
        raise TypeError("threshold must be a float")
    if abs(threshold) > 1:
        raise ValueError("threshold must be between -1 and 1")
    # Import basic packages
    import pandas as pd
    from scipy.cluster.hierarchy import linkage
    # List of 7 methods for the linkage function
    methods = ['ward', 'single', 'complete', 'average', 'weighted', 'centroid', 'median']
    # List of 22 metrics for the linkage function
    metrics = ['braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', \
        'dice',  'euclidean', 'hamming', 'jaccard', 'jensenshannon', 'kulsinski', 'mahalanobis', \
        'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', \
        'sokalsneath', 'sqeuclidean', 'yule']
    # Create a list of dictionaries for the 154 (7 x 22) method-metric combinations
    dicts = []
    for x in methods:
        for y in metrics:
            d = {'method':x, 'metric':y}
            dicts.append(d)
    # Load combinations into a dataframe
    l = pd.DataFrame(dicts, columns=['method', 'metric'])
    # Calculate linkage matrices (Z) from X
    Z_matrices = []
    valid_mms = []
    for i in range(len(dicts)):
        try:
            Z = linkage(X, method=dicts[i]['method'], metric=dicts[i]['metric'])
            Z_matrices.append(Z)
            valid_mms.append(True)
            print('|', end= '')
        except Exception:
            # Skip method-metric combinations that scipy's linkage rejects
            valid_mms.append(False)
    # Drop invalid combinations and reindex
    l = l.loc[valid_mms]
    l = l.reset_index(drop=True)
    # Calculate Cophenetic Correlation Coefficient for valid linkage combinations
    from scipy.cluster.hierarchy import cophenet
    from scipy.spatial.distance import pdist
    valid_scores = []
    CCC_scores = []
    for Z in Z_matrices:
        try:
            c, coph_dists = cophenet(Z, pdist(X))
            if np.isnan(c):
                valid_scores.append(False)
                CCC_scores.append(None)
            else:
                valid_scores.append(True)
                CCC_scores.append(c)
                print('|', end='')
        except Exception:
            # Keep the score lists aligned with Z_matrices even when cophenet fails
            valid_scores.append(False)
            CCC_scores.append(None)
    # Insert scores, drop missing values and reset index
    l['CCC_score'] = CCC_scores
    l['CCC_abs_score'] = [abs(number) if number is not None else number for number in CCC_scores]
    l = l.loc[valid_scores]
    l = l.reset_index(drop=True)
    # Sort method-metric pairs in descending order of the absolute CCC score
    l.sort_values(by=['CCC_abs_score', 'method', 'metric'], ascending=False, inplace=True)
    return l[l.CCC_score > threshold]
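
# A minimal usage sketch for linkage_for_clustering() on a small synthetic matrix;
# the helper and the random data below only illustrate the expected input shape.
def _example_linkage_for_clustering():
    import numpy as np
    rng = np.random.default_rng(0)
    X = rng.random((50, 8))                     # 50 observations, 8 features
    # Keep only method-metric pairs whose CCC score exceeds 0.5
    return linkage_for_clustering(X, threshold=0.5)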

def latin_lemma_text(list_of_texts, stopwords=None):
    '''
    Create a list of continuous lemma texts for Latin with cltk (prerequisite).
       
    list_of_texts: raw text items stored in a list object
    stopwords: list of stopwords to be removed, default is None where nothing is removed
    
    Latin lemmatizer is cltk's BackoffLatinLemmatizer. Install, import and load before using the function
    '''

    # Import packages and models from cltk and initialize tools
    from cltk.corpus.utils.importer import CorpusImporter
    from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
    corpus_importer = CorpusImporter('latin')                           # Initialize cltk's CorpusImporter
    corpus_importer.import_corpus('latin_models_cltk')                  # Import the latin_models_cltk corpus
    lemmatizer = BackoffLatinLemmatizer()                               # Initialize Latin lemmatizer
        
    import re
    punctuation = r"[\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»]"             # Punctuation pattern
    a = []
    for i in range(len(list_of_texts)):
        text = str(list_of_texts[i])
        new_text = ''.join(["" if ord(ch) < 32 or ord(ch) > 126 else ch for ch in text])    # Remove Greek (non-ASCII) characters
        text_no_punct = re.sub(punctuation, '', new_text)               # Remove punctuation
        text_one_white_space = re.sub(r"\s{2,}", ' ', text_no_punct)    # Leave only one white space b/w words
        text_no_trailing_space = text_one_white_space.strip()           # Remove trailing white space
        text_lower = text_no_trailing_space.lower()                     # Transform to all lower case
        text_split = text_lower.split(' ')                              # Split to a list of tokens
        lemmas = lemmatizer.lemmatize(text_split)                       # Lemmatize
        textunit = ''                                                   # Empty string for the text unit
        for y in range(len(lemmas)):
            if stopwords is not None:
                if lemmas[y][1] not in stopwords:
                    textunit = textunit + lemmas[y][1] + ' '
            else:
                textunit = textunit + lemmas[y][1] + ' '
        textunit = textunit.strip()
        a.append(textunit)                                              # Add the "document" to a list
    return a
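
# A minimal usage sketch for latin_lemma_text(); requires cltk with the
# latin_models_cltk corpus available (the pre-1.0 cltk API used above). The
# helper, the two sentences and the stopword list below are illustrative only.
def _example_latin_lemma_text():
    texts = [
        'Servus fundum emit.',
        'Dominus servo libertatem dedit.',
    ]
    stopwords = ['sum', 'et']
    # Returns a list of two lemmatized, lower-cased "documents" with stopwords removed
    return latin_lemma_text(texts, stopwords=stopwords)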