From be25681d1bb02727d869ef5f199571c5f5fd0224 Mon Sep 17 00:00:00 2001
From: "Mcintosh, Liam S (UG - Computer Science)" <lm00840@surrey.ac.uk>
Date: Mon, 24 May 2021 19:51:59 +0000
Subject: [PATCH] Clean up build.py: remove unused imports and dead code

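Remove duplicate and unused imports (second copies of train_test_split
and CountVectorizer, numpy.array, datetime) and group the os import
with the others. Drop dead code: the genre-count tables tied to the old
MIN_MOVIES = 500 threshold (superseded by MIN_MOVIES = 2000 later in
the script), the debug print of genres.head(25), the unused
genre_count_table from the upsampling step, the stale
movies = upsampled_df.copy(), and the commented-out dataset URL and
service-restart block. Move optimal_val_split_lr next to its use and
reword the auto-commit message for the re-trained pickle files.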
---
 build.py | 44 ++++----------------------------------------
 1 file changed, 4 insertions(+), 40 deletions(-)

diff --git a/build.py b/build.py
index 81c253a..bf300b4 100644
--- a/build.py
+++ b/build.py
@@ -2,23 +2,19 @@
 import warnings
 warnings.filterwarnings('ignore')
 import re
+import os
 from random import seed
 import pandas as pd
 import numpy as np
 import pickle
-import os
 
-from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LogisticRegression
-from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.preprocessing import MultiLabelBinarizer
 from sklearn.utils import resample
 from sklearn.multiclass import OneVsRestClassifier
-from numpy import array
-
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.model_selection import train_test_split
-from datetime import datetime
+
 
 #Functions for splitting into train/test sets.
 def split_dataset(df, test_size=0.25):
@@ -41,24 +37,11 @@ def get_train_test(df, test_size=0.25):
 mlb = MultiLabelBinarizer()
 vectorizer = CountVectorizer()
 
-
 #Fetching the original dataset.
-# url = "https://github.com/Jamchello/WikiMoviePlots/blob/master/wiki_movie_plots_deduped.csv?raw=true"
 movies = pd.read_csv('./data/Data.csv', delimiter=",")
 
 print("Pre-processing data...")
-MIN_MOVIES = 500
 movies['Count'] = 1
-
-genre_count = movies.Genre.value_counts()
-filtered_genre_count = genre_count[genre_count >= MIN_MOVIES].values # Table of genres and count with more than 50 movies
-genres_array = movies.Genre
-genres = pd.DataFrame({'Genre':genres_array})
-genres['Count'] = 1
-genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False)
-genres = genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False)
-top_genres = np.array(genres[genres['Count'] >= MIN_MOVIES]['Genre'])
-
 movies['GenreCorrected'] =movies['Genre'] 
 movies['GenreCorrected']=movies['GenreCorrected'].str.strip()
 movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' - ', '|')
@@ -340,17 +323,12 @@ genres['Count'] = 1
 genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False)
 genres = genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False)
 genres = genres[genres['Genre'] != '']  # Remove 'unknown' genre
-print(genres.head(25))
-
-genres_count = np.array(genres[genres.Count >= MIN_MOVIES].Count)
-genres_list = np.array(genres[genres.Count >= MIN_MOVIES].Genre)
 
 MIN_MOVIES = 2000  # Minimum number of movies per genre
 
 main_genres = np.array(genres[genres.Count >= MIN_MOVIES].Genre)  # List of genres that will be used for classification
 movies['GenreSplitMain'] = movies['GenreSplit'].apply(lambda x: x[np.in1d(x, main_genres)])
 
-main_genres_count = np.array(genres[genres.Count >= MIN_MOVIES].Count)
 
 movies['GenreCount'] = movies['GenreSplitMain'].apply(len)
 movies_df = movies[movies['GenreCount'] != 0][["Plot", "GenreSplitMain"]]
@@ -422,16 +400,10 @@ for character in replace:
 mlb.fit([main_genres])
 
 #Upsampling
-
-genre_count_table = pd.DataFrame(columns=['Genre', 'MovieCount'])
-i = 0
 highest_movie_count = 0
 for genre in main_genres:
-    genre_count_table.loc[i, 'Genre'] = genre
-    genre_count_table.loc[i, 'MovieCount'] = movies_df[encoded_genres[genre] == 1].shape[0]
     if movies_df[encoded_genres[genre] == 1].shape[0] >= highest_movie_count:
         highest_movie_count = movies_df[encoded_genres[genre] == 1].shape[0]
-    i += 1
 
 resampled = []
 for genre in main_genres:
@@ -446,10 +418,8 @@ for genre in main_genres:
 
 upsampled_df = pd.concat(resampled)
 
-movies = upsampled_df.copy()
-optimal_val_split_lr = 0.1
-
 print("Training model...")
+optimal_val_split_lr = 0.1  # hold out 10% of the upsampled data as the test split
 movies_train, movies_test, x_train_lr, y_train_lr, x_test_lr, y_test_lr = get_train_test(upsampled_df, test_size=optimal_val_split_lr)
 lr_classifier = LogisticRegression(max_iter=2000)
 classifier = OneVsRestClassifier(lr_classifier)
@@ -466,15 +436,9 @@ with open('mlb.pickle', 'wb') as handle:
 with open('model.pickle', 'wb') as handle:
     pickle.dump(classifier, handle)
 
-#Automatically restart the services: MAY NEED TO CHANGE TO PUSH TO GITHUB AND THEN SSHING INTO THE SERVER ETC ETC
-
-# print('Restarting services')
-# os.chdir("~/Dev/coursework")
-# os.system("forever restart;cd NodeAPI;pm2 restart /NodeAPI/index.js")
-# print('Restarted services')
 
 print('Pushing trained pickle models...')
 os.system("git add mlb.pickle vectorizer.pickle model.pickle")
-os.system("git commit -m \"Update re-trained models\"")
+os.system("git commit -m \"Update re-trained model pickle files\"")
 os.system("git push -u origin master")
 
-- 
GitLab