diff --git a/build.py b/build.py index 81c253ad1168d992b59698b6ee6896d03a259077..bf300b45784eba694322b7634542cba1cc166e0f 100644 --- a/build.py +++ b/build.py @@ -2,23 +2,19 @@ import warnings warnings.filterwarnings('ignore') import re +import os from random import seed import pandas as pd import numpy as np import pickle -import os -from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression -from sklearn.feature_extraction.text import CountVectorizer from sklearn.preprocessing import MultiLabelBinarizer from sklearn.utils import resample from sklearn.multiclass import OneVsRestClassifier -from numpy import array - from sklearn.feature_extraction.text import CountVectorizer from sklearn.model_selection import train_test_split -from datetime import datetime + #Functions for splitting into train/test sets. def split_dataset(df, test_size=0.25): @@ -41,24 +37,11 @@ def get_train_test(df, test_size=0.25): mlb = MultiLabelBinarizer() vectorizer = CountVectorizer() - #Fetching the original dataset. -# url = "https://github.com/Jamchello/WikiMoviePlots/blob/master/wiki_movie_plots_deduped.csv?raw=true" movies = pd.read_csv('./data/Data.csv', delimiter=",") print("Pre-processing data...") -MIN_MOVIES = 500 movies['Count'] = 1 - -genre_count = movies.Genre.value_counts() -filtered_genre_count = genre_count[genre_count >= MIN_MOVIES].values # Table of genres and count with more than 50 movies -genres_array = movies.Genre -genres = pd.DataFrame({'Genre':genres_array}) -genres['Count'] = 1 -genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False) -genres = genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False) -top_genres = np.array(genres[genres['Count'] >= MIN_MOVIES]['Genre']) - movies['GenreCorrected'] =movies['Genre'] movies['GenreCorrected']=movies['GenreCorrected'].str.strip() movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' - ', '|') @@ -340,17 +323,12 @@ genres['Count'] = 1 genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False) genres = genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False) genres = genres[genres['Genre'] != ''] # Remove 'unknown' genre -print(genres.head(25)) - -genres_count = np.array(genres[genres.Count >= MIN_MOVIES].Count) -genres_list = np.array(genres[genres.Count >= MIN_MOVIES].Genre) MIN_MOVIES = 2000 # Minimum number of movies per genre main_genres = np.array(genres[genres.Count >= MIN_MOVIES].Genre) # List of genres that will be used for classification movies['GenreSplitMain'] = movies['GenreSplit'].apply(lambda x: x[np.in1d(x, main_genres)]) -main_genres_count = np.array(genres[genres.Count >= MIN_MOVIES].Count) movies['GenreCount'] = movies['GenreSplitMain'].apply(len) movies_df = movies[movies['GenreCount'] != 0][["Plot", "GenreSplitMain"]] @@ -422,16 +400,10 @@ for character in replace: mlb.fit([main_genres]) #Upsampling - -genre_count_table = pd.DataFrame(columns=['Genre', 'MovieCount']) -i = 0 highest_movie_count = 0 for genre in main_genres: - genre_count_table.loc[i, 'Genre'] = genre - genre_count_table.loc[i, 'MovieCount'] = movies_df[encoded_genres[genre] == 1].shape[0] if movies_df[encoded_genres[genre] == 1].shape[0] >= highest_movie_count: highest_movie_count = movies_df[encoded_genres[genre] == 1].shape[0] - i += 1 resampled = [] for genre in main_genres: @@ -446,10 +418,8 @@ for genre in main_genres: upsampled_df = pd.concat(resampled) -movies = upsampled_df.copy() -optimal_val_split_lr = 0.1 - print("Training model...") +optimal_val_split_lr = 0.1 movies_train, movies_test, x_train_lr, y_train_lr, x_test_lr, y_test_lr = get_train_test(upsampled_df, test_size=optimal_val_split_lr) lr_classifier = LogisticRegression(max_iter=2000) classifier = OneVsRestClassifier(lr_classifier) @@ -466,15 +436,9 @@ with open('mlb.pickle', 'wb') as handle: with open('model.pickle', 'wb') as handle: pickle.dump(classifier, handle) -#Automatically restart the services: MAY NEED TO CHANGE TO PUSH TO GITHUB AND THEN SSHING INTO THE SERVER ETC ETC - -# print('Restarting services') -# os.chdir("~/Dev/coursework") -# os.system("forever restart;cd NodeAPI;pm2 restart /NodeAPI/index.js") -# print('Restarted services') print('Pushing trained pickle models...') os.system("git add mlb.pickle vectorizer.pickle model.pickle") -os.system("git commit -m \"Update re-trained models\"") +os.system("git commit -m \"Update re-trained model pickle files\"") os.system("git push -u origin master")