Review notes (free text placed ahead of the first "diff --git" header):
- The patch arrived with all line breaks collapsed. Line breaks and Python
  indentation are restored below; the positions of blank context lines and
  the 4-space indent are a best-guess reconstruction - confirm against
  build.py before applying.
- Hunk 3 additionally fixes the pickling loop: pickle.dump(item, handle)
  wrote the name string ('mlb', 'vectorizer', 'model') into each file
  instead of the object. Its new-side line count grows from 16 to 21, so
  the post-image blob id on the "index" line no longer describes this patch.
- NOTE(review): warnings.filterwarnings('ignore') silences every warning
  category for the whole run; consider narrowing it to the specific
  warning categories that are known to be noise.

diff --git a/build.py b/build.py
index 0f57d9f2410a0b17d7782d47bd07932671324a55..dc83f4ba41a35f0c4eec562e99bf64b1ebef57f5 100644
--- a/build.py
+++ b/build.py
@@ -18,6 +18,9 @@ from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.model_selection import train_test_split
 from datetime import datetime
 
+import warnings
+warnings.filterwarnings('ignore')
+
 #Functions for splitting into train/test sets.
 def split_dataset(df, test_size=0.25):
     movies_train, movies_test = train_test_split(df, test_size=test_size, shuffle=True)
@@ -44,6 +47,7 @@ vectorizer = CountVectorizer()
 
 # url = "https://github.com/Jamchello/WikiMoviePlots/blob/master/wiki_movie_plots_deduped.csv?raw=true"
 movies = pd.read_csv('./data/Data.csv', delimiter=",")
+print("Pre-processing data...")
 
 MIN_MOVIES = 500
 movies['Count'] = 1
@@ -446,13 +450,21 @@ upsampled_df = pd.concat(resampled)
 movies = upsampled_df.copy()
 
 optimal_val_split_lr = 0.1
+print("Training model...")
 
 movies_train, movies_test, x_train_lr, y_train_lr, x_test_lr, y_test_lr = get_train_test(upsampled_df, test_size=optimal_val_split_lr)
 
 lr_classifier = LogisticRegression(max_iter=2000)
 classifier = OneVsRestClassifier(lr_classifier)
 classifier.fit(x_train_lr, y_train_lr)
 
+print("Pickling files...")
+# Map each pickle name to the object it should hold: dumping the loop
+# variable itself would only store the strings 'mlb'/'vectorizer'/'model'.
+# NOTE(review): assumes `mlb` is defined earlier in build.py (outside this
+# hunk) and that 'model' means the fitted `classifier` - confirm.
+artifacts = {'mlb': mlb, 'vectorizer': vectorizer, 'model': classifier}
 for item in ['mlb', 'vectorizer','model']:
+    print("Saving " + f'{item}.pickle')
     with open(f'{item}.pickle', 'wb') as handle:
-        pickle.dump(item, handle)
+        pickle.dump(artifacts[item], handle)