From bb4dfc9ffca39a127df193f2862dfed047e0201f Mon Sep 17 00:00:00 2001 From: "Mcintosh, Liam S (UG - Computer Science)" <lm00840@surrey.ac.uk> Date: Mon, 24 May 2021 13:14:34 +0100 Subject: [PATCH] Update build.py --- build.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/build.py b/build.py index 0f57d9f..dc83f4b 100644 --- a/build.py +++ b/build.py @@ -18,6 +18,9 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.model_selection import train_test_split from datetime import datetime +import warnings +warnings.filterwarnings('ignore') + #Functions for splitting into train/test sets. def split_dataset(df, test_size=0.25): movies_train, movies_test = train_test_split(df, test_size=test_size, shuffle=True) @@ -44,6 +47,7 @@ vectorizer = CountVectorizer() # url = "https://github.com/Jamchello/WikiMoviePlots/blob/master/wiki_movie_plots_deduped.csv?raw=true" movies = pd.read_csv('./data/Data.csv', delimiter=",") +print("Pre-processing data...") MIN_MOVIES = 500 movies['Count'] = 1 @@ -446,13 +450,16 @@ upsampled_df = pd.concat(resampled) movies = upsampled_df.copy() optimal_val_split_lr = 0.1 +print("Training model...") movies_train, movies_test, x_train_lr, y_train_lr, x_test_lr, y_test_lr = get_train_test(upsampled_df, test_size=optimal_val_split_lr) lr_classifier = LogisticRegression(max_iter=2000) classifier = OneVsRestClassifier(lr_classifier) classifier.fit(x_train_lr, y_train_lr) +print("Pickling files...") for item in ['mlb', 'vectorizer','model']: + print("Saving " + f'{item}.pickle') with open(f'{item}.pickle', 'wb') as handle: pickle.dump(item, handle) -- GitLab