diff --git a/app.py b/app.py index 7aebde3acd003e8affdbc95fd26e2307ba743c7f..aa51c0b8859484a0e370c67dea3be8267fc027aa 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,7 @@ import numpy as np -from flask import Flask, request, jsonify, render_template +from flask import Flask, request, render_template import pickle +import re import json app = Flask(__name__) #Initialize the flask App @@ -9,13 +10,67 @@ model = pickle.load(open('model.pickle', 'rb')) mlb = pickle.load(open('mlb.pickle','rb')) vectorizer = pickle.load(open('vectorizer.pickle','rb')) -@app.route('/') -def home(): - return render_template('index.ejs') +replace = [':', ';', '<', '=', '>', '?', '@', '\\', '_', '`', + '\n', '\r', '#', '$', '%', '&', "'", '*', '+', '-', '{', '|', '}', + '\xa0', '¢', '£', 'Â¥', '«', '°', '´', '»', '¼', '½', '×', 'ß', 'à ', 'á', 'â', + 'ã', 'ä', 'Ã¥', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'Ã', 'î', 'ï', 'ð', 'ñ', + 'ò', 'ó', 'ô', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'Ä', 'ă', 'ć', 'Ä', 'Ä‘', + 'Ä“', 'Ä—', 'Ä›', 'ÄŸ', 'Ä¡', 'ħ', 'Ä©', 'Ä«', 'ı', 'ĺ', 'Å‚', 'Å„', 'Å', 'Å“', 'Å›', + 'ÅŸ', 'Å¡', 'Å£', 'Å©', 'Å«', 'Å', 'ź', 'ż', 'ž', 'Æ¡', 'ư', 'Ç”', 'È™', 'È›', 'É', + 'É”', 'É™', 'É¡', 'ɪ', 'ɾ', 'ʃ', 'ÊŠ', 'ʲ', 'Ê»', 'ʼ', 'ˈ', 'Ë', '̇', 'μ', 'Ï€', + 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', + 'Ñ€', 'Ñ', 'Ñ‚', 'у', 'Ñ…', 'ч', 'ш', 'Ñ‹', 'ÑŒ', 'Ñ', 'Ñ‘', 'Ñ’', 'ј', 'Ñš', '×”', + 'ו', '×™', 'ך', 'ל', 'מ', 'ש', 'ं', 'अ', 'आ', 'उ', 'क', 'ग', 'च', 'ज', 'ट', + 'ठ', 'ण', 'त', 'द', 'न', 'प', 'फ', 'ब', 'à¤', 'म', 'य', 'र', 'ल', 'व', 'ष', + 'स', 'ह', 'ा', 'ि', 'ी', 'à¥', 'ू', 'ृ', 'े', 'ै', 'ो', 'à¥', 'ক', 'ঠ', 'ত', 'থ', + 'দ', 'ধ', 'ন', 'ব', 'র', 'শ', 'ষ', 'স', 'া', 'ি', 'à§€', 'à§', 'ে', 'à§', 'ế', 'á»…', + 'ệ', 'á»™', '\u2009', '\u200a', '\u200b', '\u200c', '\u200d', 'â€', '–', '—', '―', + '‘', '’', '‚', '“', 'â€', '…', '′', '″', 'â„', '₤', '€', '₱', '₹', 'â…“', 'â…ž', 'â†', + '−', '♥', '\u3000', 'ã‚', 'ã„', 'ã†', 'ãŠ', 'ã‹', 'ãŒ', 'ã', 'ã', 'ã‘', 'ã’', 'ã“', + 
'ã—', 'ã™', 'ã›', 'ã«', 'ã®', 'ã»', 'ã¾', 'ã¿', 'ã‚‚', 'ã‚„', 'ã‚…', 'よ', 'り', 'ã‚“', + 'ã‚¢', 'イ', 'ã‚«', 'ガ', 'ã‚', 'ã‚®', 'ク', 'ケ', 'ã‚´', 'サ', 'ã‚¶', 'ã‚·', 'ジ', 'ス', + 'ズ', 'ソ', 'ゾ', 'ã‚¿', 'ダ', 'ッ', 'ツ', 'デ', 'ト', 'ド', 'ニ', 'ヌ', 'ãƒ', 'ノ', + 'ãƒ', 'パ', 'フ', 'ブ', 'プ', 'ペ', 'ボ', 'ム', 'メ', 'ャ', 'ヤ', 'ュ', 'ラ', 'リ', + 'ル', 'ãƒ', 'ワ', 'ン', 'ヶ', '・', 'ー', 'ㄜ', '一', '七', '三', '世', 'ä¸', '丸', + '丹', '举', 'ä¹…', '之', '也', '予', '二', '井', '京', '人', 'ä»', '介', 'ä¼½', '俊', + 'ä¿¡', 'å…ƒ', 'å…„', 'å…ˆ', 'å…‰', 'å…š', 'å…¬', 'å…', '冯', '刀', '刃', '刘', 'åŠ', '力', + '勇', 'å‹™', '化', 'å', 'åƒ', '原', 'å‹', 'å¸', 'åˆ', 'å', 'å‘', 'å›', 'å³', '命', + 'å’²', 'å››', '団', '図', '国', 'åœ', '城', '域', 'å¡”', '士', '外', '多', '夢', '大', + '天', '夫', '奪', '女', 'å§', '婆', '婉', 'å', 'å™', 'å¦', 'å«', '宇', '安', 'å®—', + 'å®™', 'å®®', 'å®¶', 'å°', 'å°š', 'å°¾', 'å±±', 'å³¶', 'å·ž', 'å·«', '布', '师', '師', '府', + 'åº', 'å¼', 'å¼', 'å¼ ', 'å¾·', '怪', 'æµ', '悟', '悪', '感', '我', '擊', 'æ•™', 'æ–¹', + 'æ—…', 'æ—¥', '春', '書', '月', '朋', '望', '木', '本', 'æ‘', 'æ¡', 'æ', 'æ°', 'æ±', + 'æž—', '柊', '柑', 'æ¡‘', '森', '椰', 'æ¥', '樵', '樹', 'æ£', 'æ¦', 'æ©', 'æ»', 'æ°', + 'æ°‘', 'æ°¸', '汤', 'æ²™', 'æ²¢', 'æ²»', '法', 'æ³°', 'æ´¥', 'æµ·', '清', '渡', 'æ¹–', 'æº', + 'æ½›', '澎', '澤', '濱', 'ç£', '点', '焉', 'ç„¡', '爱', '爷', '物', '犯', 'ç‹', 'ç‹™', + '狸', '王', 'çŠ', 'çª', '瑜', '瑞', '生', 'ç”°', 'ç”±', '甲', '町', 'ç•‘', 'çš„', '真', + '神', 'ç¥', 'ç©‚', '空', 'ç’', 'ç´€', 'ç¶¿', 'ç¹”', '美', 'è€', '者', 'è–', '興', '良', + '花', '芳', 'è‰', 'èœ', 'è©', '葵', 'è“', 'è–ˆ', 'è—', '號', '蛇', '行', 'è¡›', '裕', + '襄', '西', '語', '談', '諜', 'è˜', '讃', '识', 'è°·', 'è²´', 'è³¢', 'è´', 'è¶…', 'è¶³', + '轩', '逆', 'éŠ', 'é”', '邦', '郎', '部', '里', '野', '金', '鈴', '鉄', '鎮', 'é–“', + '防', '陆', '陸', '陽', '隊', '雄', '雪', 'é’', '響', '首', '香', '馮', 'é§¿', '马', + '髪', 'é”', 'éš', 'é¹…', '麻', '黃', '黄', 'é¾', 'é¾™', 'ë¶', 'ì´Œ', 'fl', '\ufeff',] + +def preProcess(text): + text = text.lower() + text = re.sub(r"what's", "what is ", text) + text = re.sub(r"\'s", " ", text) + 
text = re.sub(r"\'ve", " have ", text) + text = re.sub(r"can't", "can not ", text) + text = re.sub(r"n't", " not ", text) + text = re.sub(r"i'm", "i am ", text) + text = re.sub(r"\'re", " are ", text) + text = re.sub(r"\'d", " would ", text) + text = re.sub(r"\'ll", " will ", text) + text = re.sub(r"\'scuse", " excuse ", text) + text = text.strip(' ') + for character in replace: + text = text.replace(character,"") + return text @app.route('/predict',methods=['POST']) def predict(): - reqInput = request.form.get('text') + reqInput = preProcess(request.form.get('text')) vector = vectorizer.transform([reqInput]) prediction = model.predict(vector) output = json.dumps(mlb.inverse_transform(prediction)[0]) diff --git a/build.py b/build.py new file mode 100644 index 0000000000000000000000000000000000000000..0f57d9f2410a0b17d7782d47bd07932671324a55 --- /dev/null +++ b/build.py @@ -0,0 +1,470 @@ +#Imports +import re +from random import seed +import pandas as pd +import numpy as np +import pickle +import os + +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LogisticRegression +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.utils import resample +from sklearn.multiclass import OneVsRestClassifier +from numpy import array + +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.model_selection import train_test_split +from datetime import datetime + +#Functions for splitting into train/test sets. 
def split_dataset(df, test_size=0.25, random_state=None):
    """Shuffle ``df`` and split it into a training frame and a test frame.

    Args:
        df: DataFrame to split.
        test_size: Forwarded unchanged to ``train_test_split``.
        random_state: Optional seed forwarded to ``train_test_split`` so a run
            can be reproduced. ``None`` (the default) keeps the original
            unseeded shuffle.

    Returns:
        Tuple ``(movies_train, movies_test)``.
    """
    movies_train, movies_test = train_test_split(
        df, test_size=test_size, shuffle=True, random_state=random_state
    )
    return movies_train, movies_test


def get_train_test(df, test_size=0.25, random_state=None):
    """Split ``df`` and build the feature / label matrices for both halves.

    Uses the module-level ``mlb`` and ``vectorizer`` objects. ``mlb`` is only
    transformed here, so it must already have been fitted by the caller;
    ``vectorizer`` is (re)fitted on the training plots only, so the test plots
    never contribute to the vocabulary.

    Args:
        df: DataFrame with a ``Plot`` column and a ``GenreSplitMain`` column.
        test_size: Forwarded to ``split_dataset``.
        random_state: Optional seed forwarded to ``split_dataset``.

    Returns:
        Tuple ``(movies_train, movies_test, x_train, y_train, x_test, y_test)``.
    """
    movies_train, movies_test = split_dataset(df, test_size, random_state)

    y_train = mlb.transform(movies_train["GenreSplitMain"].tolist())
    y_test = mlb.transform(movies_test["GenreSplitMain"].tolist())

    # Fit on the training half only, then encode both halves with that vocabulary.
    vectorizer.fit(movies_train.Plot)
    x_train = vectorizer.transform(movies_train.Plot)
    x_test = vectorizer.transform(movies_test.Plot)

    return movies_train, movies_test, x_train, y_train, x_test, y_test


# Shared encoders read as globals by get_train_test().
mlb = MultiLabelBinarizer()
vectorizer = CountVectorizer()


# Fetching the original dataset.
# url = "https://github.com/Jamchello/WikiMoviePlots/blob/master/wiki_movie_plots_deduped.csv?raw=true"
movies = pd.read_csv('./data/Data.csv', delimiter=",")

MIN_MOVIES = 500
movies['Count'] = 1

# Raw (uncleaned) genre frequencies.
genre_count = movies.Genre.value_counts()
# Counts (values only) of the raw genres that have at least MIN_MOVIES movies.
filtered_genre_count = genre_count[genre_count >= MIN_MOVIES].values
genres_array = movies.Genre
genres = pd.DataFrame({'Genre': genres_array})
genres['Count'] = 1
# The original evaluated this groupby twice and discarded the first result
# (notebook residue); it is computed once here.
genres = (
    genres[['Genre', 'Count']]
    .groupby(['Genre'], as_index=False)
    .sum()
    .sort_values(['Count'], ascending=False)
)
top_genres = np.array(genres[genres['Count'] >= MIN_MOVIES]['Genre'])

# Start the cleaned genre column from the raw value with outer whitespace removed.
movies['GenreCorrected'] = movies['Genre'].str.strip()

# Normalise every separator variant to '|'. Order matters: the spaced forms
# must be replaced before the bare '/'. These are plain substrings, so
# regex=False is passed explicitly instead of relying on the pandas default,
# which changed from True to False in pandas 2.0.
for _separator in (' - ', ' / ', '/', ' & ', ', '):
    movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(
        _separator, '|', regex=False
    )
# Ordered (pattern, replacement) clean-up rules for ``movies['GenreCorrected']``.
# The rules are applied strictly top to bottom; later rules depend on the
# output of earlier ones, so do not reorder them.
#
# Matching mode: this chain was written for pandas < 2.0, where
# ``Series.str.replace`` treated every multi-character pattern as a regular
# expression and every single-character pattern as a literal. pandas 2.0
# changed the default to literal matching, which silently disables the
# escaped patterns below (``\(``, ``\[`` ...). The loop passes ``regex``
# explicitly so the result no longer depends on the installed pandas version.
_GENRE_RULES_SYNONYMS = (
    ('; ', '|'),
    ('bio-pic', 'biography'),
    ('biopic', 'biography'),
    ('biographical', 'biography'),
    ('biodrama', 'biography'),
    ('bio-drama', 'biography'),
    ('biographic', 'biography'),
    (r' \(film genre\)', ''),
    ('animated', 'animation'),
    ('anime', 'animation'),
    ("children's", 'children'),
    ('comedey', 'comedy'),
    (r'\[not in citation given\]', ''),
    (' set 4,000 years ago in the canadian arctic', ''),
    ('historical', 'history'),
    ('romantic', 'romance'),
    ('3-d', 'animation'),
    ('3d', 'animation'),
    ('viacom 18 motion pictures', ''),
    ('sci-fi', 'science_fiction'),
    ('ttriller', 'thriller'),
    ('.', ''),  # single character: matched literally, removes full stops
    ('based on radio serial', ''),
    (' on the early years of hitler', ''),
    ('sci fi', 'science_fiction'),
    ('science fiction', 'science_fiction'),
    # NOTE(review): the parentheses are unescaped, so as a regex this matches
    # " 30min" and not the literal " (30min)". Kept verbatim to preserve results.
    (' (30min)', ''),
    ('16 mm film', 'short'),
    (r'\[140\]', 'drama'),
    (r'\[144\]', ''),
    (' for ', ''),
    ('adventures', 'adventure'),
    ('kung fu', 'martial_arts'),
    ('kung-fu', 'martial_arts'),
    ('martial arts', 'martial_arts'),
    ('world war ii', 'war'),  # must precede 'world war i'
    ('world war i', 'war'),
    # NOTE(review): the unescaped '|' in the next two patterns is a regex
    # alternation, so each side is replaced independently. Kept verbatim.
    ('biography about montreal canadiens star|maurice richard', 'biography'),
    ('bholenath movies|cinekorn entertainment', ''),
    (r' \(volleyball\)', ''),
    ('spy film', 'spy'),
    ('anthology film', 'anthology'),
    ('biography fim', 'biography'),
    ('avant-garde', 'avant_garde'),
    ('biker film', 'biker'),
)

for _pat, _repl in _GENRE_RULES_SYNONYMS:
    movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(
        _pat, _repl, regex=len(_pat) > 1
    )
# Ordered (pattern, replacement) clean-up rules for ``movies['GenreCorrected']``.
# Applied strictly top to bottom (e.g. 'films' must be removed before 'film',
# 'movies' before 'movie').
#
# Every pattern here is longer than one character and was written as a regular
# expression for pandas < 2.0, where ``regex=True`` was the default of
# ``Series.str.replace``. pandas 2.0 switched the default to literal matching,
# which would silently disable the ``(( based).+)``-style rules, so
# ``regex=True`` is passed explicitly.
_GENRE_RULES_FILM_WORDS = (
    ('buddy cop', 'buddy'),
    ('buddy film', 'buddy'),
    ('comedy 2-reeler', 'comedy'),
    ('films', ''),
    ('film', ''),
    ('biography of pioneering american photographer eadweard muybridge', 'biography'),
    ('british-german co-production', ''),
    ('bruceploitation', 'martial_arts'),
    ('comedy-drama adaptation of the mordecai richler novel', 'comedy-drama'),
    (r'movies by the mob\|knkspl', ''),
    ('movies', ''),
    ('movie', ''),
    ('coming of age', 'coming_of_age'),
    ('coming-of-age', 'coming_of_age'),
    ('drama about child soldiers', 'drama'),
    # Drop everything from the marker word to the end of the genre string.
    ('(( based).+)', ''),
    ('(( co-produced).+)', ''),
    ('(( adapted).+)', ''),
    ('(( about).+)', ''),
    ('musical b', 'musical'),
    ('animationchildren', 'animation|children'),
)

for _pat, _repl in _GENRE_RULES_FILM_WORDS:
    movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(
        _pat, _repl, regex=True
    )
# Genre clean-up, in three ordered phases that all rewrite
# ``movies['GenreCorrected']``:
#   1. substring / regex rules that strip noise and unify spellings,
#   2. whole-value fixes for genres that need an exact match,
#   3. conversion of the remaining separators to '|' and splitting of
#      run-together compound genres.
#
# Matching mode: the chain was written for pandas < 2.0, where
# ``Series.str.replace`` treated every multi-character pattern as a regular
# expression and every single-character pattern as a literal. pandas 2.0
# changed the default to literal matching, which silently disables the escaped
# patterns below, so ``regex`` is passed explicitly.


def _apply_genre_rules(series, rules):
    """Return ``series`` after applying each (pattern, replacement) rule in order."""
    for pattern, replacement in rules:
        series = series.str.replace(pattern, replacement, regex=len(pattern) > 1)
    return series


_GENRE_RULES_NOISE = (
    (' period', 'period'),
    ('drama loosely', 'drama'),
    # NOTE(review): the unescaped '|' is a regex alternation (" (aquatics" or
    # "swimming)"); the rule appears twice in the original and is kept as is.
    (r' \(aquatics|swimming\)', ''),
    (r' \(aquatics|swimming\)', ''),
    (r"yogesh dattatraya gosavi's directorial debut \[9\]", ''),
    ('war-time', 'war'),
    ('wartime', 'war'),
    ('ww1', 'war'),
    ('unknown', ''),
    ('wwii', 'war'),
    ('psychological', 'psycho'),
    ('rom-coms', 'romance'),
    ('true crime', 'crime'),
    (r'\|007', ''),
    ('slice of life', 'slice_of_life'),
    ('computer animation', 'animation'),
    ('gun fu', 'martial_arts'),
    ('j-horror', 'horror'),
    (r' \(shogi|chess\)', ''),
    ('afghan war drama', 'war drama'),
    (r'\|6 separate stories', ''),
    (r' \(30min\)', ''),
    # NOTE(review): unescaped parentheses form a regex group, so this does not
    # match the literal text; the escaped rule at the end of this table does.
    (' (road bicycle racing)', ''),
    (' v-cinema', ''),
    ('tv miniseries', 'tv_miniseries'),
    # Fix: the original replacement was '\|documentary|drama'. In a regex
    # replacement '\|' is an unknown escape that is emitted unchanged, so a
    # literal backslash ended up inside the genre string.
    (r'\|docudrama', '|documentary|drama'),
    (' in animation', '|animation'),
    ('((adaptation).+)', ''),
    ('((adaptated).+)', ''),
    ('((adapted).+)', ''),
    ('(( on ).+)', ''),
    ('american football', 'sports'),
    (r'dev\|nusrat jahan', 'sports'),
    ('television miniseries', 'tv_miniseries'),
    (r' \(artistic\)', ''),
    (r' \|direct-to-dvd', ''),
    ('history dram', 'history drama'),
    ('martial art', 'martial_arts'),
    ('psycho thriller,', 'psycho thriller'),
    (r'\|1 girl\|3 suitors', ''),
    (r' \(road bicycle racing\)', ''),
)

# Whole-value replacements: only rows whose genre equals the key exactly change.
_GENRE_EXACT_FIXES = (
    ('ero', 'adult'),
    ('music', 'musical'),
    ('-', ''),
    ('comedy–drama', 'comedy|drama'),
    ('comedy–horror', 'comedy|horror'),
)

_GENRE_RULES_SEPARATORS = (
    (' ', '|'),
    (',', '|'),
    ('-', ''),
    ('actionadventure', 'action|adventure'),
    ('actioncomedy', 'action|comedy'),
    ('actiondrama', 'action|drama'),
    ('actionlove', 'action|love'),
    ('actionmasala', 'action|masala'),
    ('actionchildren', 'action|children'),
    (r'fantasychildren\|', 'fantasy|children'),
    ('fantasycomedy', 'fantasy|comedy'),
    ('fantasyperiod', 'fantasy|period'),
    ('cbctv_miniseries', 'tv_miniseries'),
    ('dramacomedy', 'drama|comedy'),
    # NOTE(review): can never match, 'dramacomedy' was rewritten just above.
    ('dramacomedysocial', 'drama|comedy|social'),
    ('dramathriller', 'drama|thriller'),
    ('comedydrama', 'comedy|drama'),
    ('dramathriller', 'drama|thriller'),
    ('comedyhorror', 'comedy|horror'),
    ('sciencefiction', 'science_fiction'),
)

movies['GenreCorrected'] = _apply_genre_rules(movies['GenreCorrected'], _GENRE_RULES_NOISE)

for _old_value, _new_value in _GENRE_EXACT_FIXES:
    filterE = movies['GenreCorrected'] == _old_value
    movies.loc[filterE, 'GenreCorrected'] = _new_value

movies['GenreCorrected'] = _apply_genre_rules(movies['GenreCorrected'], _GENRE_RULES_SEPARATORS)
# Ordered (pattern, replacement) clean-up rules for ``movies['GenreCorrected']``.
# Applied strictly top to bottom (e.g. 'erotica' before 'erotic').
#
# Every pattern is longer than one character and was written as a regular
# expression for pandas < 2.0, where ``regex=True`` was the default of
# ``Series.str.replace``. pandas 2.0 switched the default to literal matching,
# which would silently disable the escaped-pipe rules below, so ``regex=True``
# is passed explicitly.
_GENRE_RULES_COMPOUNDS = (
    ('adventurecomedy', 'adventure|comedy'),
    ('animationdrama', 'animation|drama'),
    (r'\|\|', '|'),  # collapse an empty genre slot left by earlier removals
    ('muslim', 'religious'),
    ('thriler', 'thriller'),
    ('crimethriller', 'crime|thriller'),
    ('fantay', 'fantasy'),
    ('actionthriller', 'action|thriller'),
    ('comedysocial', 'comedy|social'),
    ('martialarts', 'martial_arts'),
    (r'\|\(children\|poker\|karuta\)', ''),
    ('epichistory', 'epic|history'),
    ('erotica', 'adult'),
    ('erotic', 'adult'),
    # Drop everything from '|produced|' to the end of the genre string.
    (r'((\|produced\|).+)', ''),
    ('chanbara', 'chambara'),
    ('comedythriller', 'comedy|thriller'),
    ('biblical', 'religious'),
    ('biblical', 'religious'),  # duplicated in the original; harmless no-op
    (r'colour\|yellow\|productions\|eros\|international', ''),
    (r'\|directtodvd', ''),
)

for _pat, _repl in _GENRE_RULES_COMPOUNDS:
    movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(
        _pat, _repl, regex=True
    )
# Ordered (pattern, replacement) clean-up rules for ``movies['GenreCorrected']``.
# Applied strictly top to bottom: several rules feed each other
# ('superheroes' -> 'superheroe' -> 'superhero', 'heistcomedy' before 'heist').
#
# Every pattern is longer than one character and was written as a regular
# expression for pandas < 2.0, where ``regex=True`` was the default of
# ``Series.str.replace``. pandas 2.0 switched the default to literal matching
# (which would break the 'famil\|' rule), so ``regex=True`` is passed explicitly.
_GENRE_RULES_MERGES = (
    ('liveaction', 'live|action'),
    ('melodrama', 'drama'),
    ('superheroes', 'superheroe'),
    ('gangsterthriller', 'gangster|thriller'),
    ('heistcomedy', 'comedy'),
    ('heist', 'action'),
    ('historic', 'history'),
    ('historydisaster', 'history|disaster'),
    ('warcomedy', 'war|comedy'),
    ('westerncomedy', 'western|comedy'),
    ('ancientcostume', 'costume'),
    ('computeranimation', 'animation'),
    ('dramatic', 'drama'),
    ('familya', 'family'),
    ('familya', 'family'),  # duplicated in the original; kept to preserve results
    ('dramedy', 'drama|comedy'),
    ('dramaa', 'drama'),
    (r'famil\|', 'family'),  # also consumes the '|' that follows 'famil'
    ('superheroe', 'superhero'),
    ('biogtaphy', 'biography'),
    ('devotionalbiography', 'devotional|biography'),
    ('docufiction', 'documentary|fiction'),
)

for _pat, _repl in _GENRE_RULES_MERGES:
    movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(
        _pat, _repl, regex=True
    )
# Ordered (pattern, replacement) clean-up rules for ``movies['GenreCorrected']``.
# Applied strictly top to bottom (e.g. 'homosexual' before 'sexual',
# 'romcom' before the escaped 'rom|com').
#
# Every pattern is longer than one character and was written as a regular
# expression for pandas < 2.0, where ``regex=True`` was the default of
# ``Series.str.replace``. pandas 2.0 switched the default to literal matching,
# which would silently disable the escaped patterns below, so ``regex=True``
# is passed explicitly.
_GENRE_RULES_ROMANCE_MISC = (
    ('familydrama', 'family|drama'),
    ('espionage', 'spy'),
    ('supeheroes', 'superhero'),
    ('romancefiction', 'romance|fiction'),
    ('horrorthriller', 'horror|thriller'),
    ('suspensethriller', 'suspense|thriller'),
    ('musicaliography', 'musical|biography'),
    ('triller', 'thriller'),
    (r'\|\(fiction\)', '|fiction'),
    ('romanceaction', 'romance|action'),
    ('romancecomedy', 'romance|comedy'),
    ('romancehorror', 'romance|horror'),
    ('romcom', 'romance|comedy'),
    (r'rom\|com', 'romance|comedy'),
    ('satirical', 'satire'),
    ('science_fictionchildren', 'science_fiction|children'),
    ('homosexual', 'adult'),
    ('sexual', 'adult'),
    ('mockumentary', 'documentary'),
    ('periodic', 'period'),
    # Fix: the original mapped the typo to 'romantic', but the
    # 'romantic' -> 'romance' rule has already run earlier in the script, so
    # those rows were left with a stray 'romantic' label. Map straight to the
    # canonical 'romance'.
    ('romanctic', 'romance'),
)

for _pat, _repl in _GENRE_RULES_ROMANCE_MISC:
    movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(
        _pat, _repl, regex=True
    )
+movies['GenreCorrected']=movies['GenreCorrected'].str.replace('politics','political') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('samurai','martial_arts') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('tv_miniseries','series') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('serial','series') + +filterE = movies['GenreCorrected']=="musical–comedy" +movies.loc[filterE,'GenreCorrected'] = "musical|comedy" + +filterE = movies['GenreCorrected']=="roman|porno" +movies.loc[filterE,'GenreCorrected'] = "adult" + + +filterE = movies['GenreCorrected']=="action—masala" +movies.loc[filterE,'GenreCorrected'] = "action|masala" + + +filterE = movies['GenreCorrected']=="horror–thriller" +movies.loc[filterE,'GenreCorrected'] = "horror|thriller" + +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('family','children') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial_arts','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('horror','thriller') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('war','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('adventure','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science_fiction','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('western','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('western','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('noir','black') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('spy','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superhero','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('social','') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('suspense','action') + + +filterE = movies['GenreCorrected']=="drama|romance|adult|children" +movies.loc[filterE,'GenreCorrected'] = 
"drama|romance|adult" + +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|–\|','|') +movies['GenreCorrected']=movies['GenreCorrected'].str.strip(to_strip='\|') +movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionner','action') +movies['GenreCorrected']=movies['GenreCorrected'].str.strip() + +moviesGenre = movies[['GenreCorrected','Count']].groupby(['GenreCorrected']).count() + +movies[['GenreCorrected','Count']].groupby(['GenreCorrected'], as_index=False).count().shape[0] + +movies['GenreSplit']=movies['GenreCorrected'].str.split('|') +movies['GenreSplit']= movies['GenreSplit'].apply(np.sort).apply(np.unique) +movies[['GenreCorrected', 'GenreSplit']][100:120] + +genres_array = np.array([]) + +for i in range(0,movies.shape[0]-1): + genres_array = np.concatenate((genres_array, movies['GenreSplit'][i])) + +genres = pd.DataFrame({'Genre':genres_array}) +genres['Count'] = 1 +genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False) +genres = genres[['Genre', 'Count']].groupby(['Genre'], as_index=False).sum().sort_values(['Count'], ascending=False) +genres = genres[genres['Genre'] != ''] # Remove 'unknown' genre +print(genres.head(25)) + +genres_count = np.array(genres[genres.Count >= MIN_MOVIES].Count) +genres_list = np.array(genres[genres.Count >= MIN_MOVIES].Genre) + +MIN_MOVIES = 2000 # Minimum number of movies per genre + +main_genres = np.array(genres[genres.Count >= MIN_MOVIES].Genre) # List of genres that will be used for classification +movies['GenreSplitMain'] = movies['GenreSplit'].apply(lambda x: x[np.in1d(x, main_genres)]) + +main_genres_count = np.array(genres[genres.Count >= MIN_MOVIES].Count) + +movies['GenreCount'] = movies['GenreSplitMain'].apply(len) +movies_df = movies[movies['GenreCount'] != 0][["Plot", "GenreSplitMain"]] + +encoded_genres = movies_df.GenreSplitMain.apply(lambda x: '-'.join(x)).str.get_dummies(sep='-') + + +def clean_text(text): + text = text.lower() 
+ text = re.sub(r"what's", "what is ", text) + text = re.sub(r"\'s", " ", text) + text = re.sub(r"\'ve", " have ", text) + text = re.sub(r"can't", "can not ", text) + text = re.sub(r"n't", " not ", text) + text = re.sub(r"i'm", "i am ", text) + text = re.sub(r"\'re", " are ", text) + text = re.sub(r"\'d", " would ", text) + text = re.sub(r"\'ll", " will ", text) + text = re.sub(r"\'scuse", " excuse ", text) + text = text.strip(' ') + return text + +replace = [':', ';', '<', '=', '>', '?', '@', '\\', '_', '`', + '\n', '\r', '#', '$', '%', '&', "'", '*', '+', '-', '{', '|', '}', + '\xa0', '¢', '£', 'Â¥', '«', '°', '´', '»', '¼', '½', '×', 'ß', 'à ', 'á', 'â', + 'ã', 'ä', 'Ã¥', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'Ã', 'î', 'ï', 'ð', 'ñ', + 'ò', 'ó', 'ô', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'Ä', 'ă', 'ć', 'Ä', 'Ä‘', + 'Ä“', 'Ä—', 'Ä›', 'ÄŸ', 'Ä¡', 'ħ', 'Ä©', 'Ä«', 'ı', 'ĺ', 'Å‚', 'Å„', 'Å', 'Å“', 'Å›', + 'ÅŸ', 'Å¡', 'Å£', 'Å©', 'Å«', 'Å', 'ź', 'ż', 'ž', 'Æ¡', 'ư', 'Ç”', 'È™', 'È›', 'É', + 'É”', 'É™', 'É¡', 'ɪ', 'ɾ', 'ʃ', 'ÊŠ', 'ʲ', 'Ê»', 'ʼ', 'ˈ', 'Ë', '̇', 'μ', 'Ï€', + 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', + 'Ñ€', 'Ñ', 'Ñ‚', 'у', 'Ñ…', 'ч', 'ш', 'Ñ‹', 'ÑŒ', 'Ñ', 'Ñ‘', 'Ñ’', 'ј', 'Ñš', '×”', + 'ו', '×™', 'ך', 'ל', 'מ', 'ש', 'ं', 'अ', 'आ', 'उ', 'क', 'ग', 'च', 'ज', 'ट', + 'ठ', 'ण', 'त', 'द', 'न', 'प', 'फ', 'ब', 'à¤', 'म', 'य', 'र', 'ल', 'व', 'ष', + 'स', 'ह', 'ा', 'ि', 'ी', 'à¥', 'ू', 'ृ', 'े', 'ै', 'ो', 'à¥', 'ক', 'ঠ', 'ত', 'থ', + 'দ', 'ধ', 'ন', 'ব', 'র', 'শ', 'ষ', 'স', 'া', 'ি', 'à§€', 'à§', 'ে', 'à§', 'ế', 'á»…', + 'ệ', 'á»™', '\u2009', '\u200a', '\u200b', '\u200c', '\u200d', 'â€', '–', '—', '―', + '‘', '’', '‚', '“', 'â€', '…', '′', '″', 'â„', '₤', '€', '₱', '₹', 'â…“', 'â…ž', 'â†', + '−', '♥', '\u3000', 'ã‚', 'ã„', 'ã†', 'ãŠ', 'ã‹', 'ãŒ', 'ã', 'ã', 'ã‘', 'ã’', 'ã“', + 'ã—', 'ã™', 'ã›', 'ã«', 'ã®', 'ã»', 'ã¾', 'ã¿', 'ã‚‚', 'ã‚„', 'ã‚…', 'よ', 'り', 'ã‚“', + 'ã‚¢', 'イ', 'ã‚«', 'ガ', 'ã‚', 'ã‚®', 'ク', 'ケ', 'ã‚´', 'サ', 'ã‚¶', 'ã‚·', 'ジ', 
'ス', + 'ズ', 'ソ', 'ゾ', 'ã‚¿', 'ダ', 'ッ', 'ツ', 'デ', 'ト', 'ド', 'ニ', 'ヌ', 'ãƒ', 'ノ', + 'ãƒ', 'パ', 'フ', 'ブ', 'プ', 'ペ', 'ボ', 'ム', 'メ', 'ャ', 'ヤ', 'ュ', 'ラ', 'リ', + 'ル', 'ãƒ', 'ワ', 'ン', 'ヶ', '・', 'ー', 'ㄜ', '一', '七', '三', '世', 'ä¸', '丸', + '丹', '举', 'ä¹…', '之', '也', '予', '二', '井', '京', '人', 'ä»', '介', 'ä¼½', '俊', + 'ä¿¡', 'å…ƒ', 'å…„', 'å…ˆ', 'å…‰', 'å…š', 'å…¬', 'å…', '冯', '刀', '刃', '刘', 'åŠ', '力', + '勇', 'å‹™', '化', 'å', 'åƒ', '原', 'å‹', 'å¸', 'åˆ', 'å', 'å‘', 'å›', 'å³', '命', + 'å’²', 'å››', '団', '図', '国', 'åœ', '城', '域', 'å¡”', '士', '外', '多', '夢', '大', + '天', '夫', '奪', '女', 'å§', '婆', '婉', 'å', 'å™', 'å¦', 'å«', '宇', '安', 'å®—', + 'å®™', 'å®®', 'å®¶', 'å°', 'å°š', 'å°¾', 'å±±', 'å³¶', 'å·ž', 'å·«', '布', '师', '師', '府', + 'åº', 'å¼', 'å¼', 'å¼ ', 'å¾·', '怪', 'æµ', '悟', '悪', '感', '我', '擊', 'æ•™', 'æ–¹', + 'æ—…', 'æ—¥', '春', '書', '月', '朋', '望', '木', '本', 'æ‘', 'æ¡', 'æ', 'æ°', 'æ±', + 'æž—', '柊', '柑', 'æ¡‘', '森', '椰', 'æ¥', '樵', '樹', 'æ£', 'æ¦', 'æ©', 'æ»', 'æ°', + 'æ°‘', 'æ°¸', '汤', 'æ²™', 'æ²¢', 'æ²»', '法', 'æ³°', 'æ´¥', 'æµ·', '清', '渡', 'æ¹–', 'æº', + 'æ½›', '澎', '澤', '濱', 'ç£', '点', '焉', 'ç„¡', '爱', '爷', '物', '犯', 'ç‹', 'ç‹™', + '狸', '王', 'çŠ', 'çª', '瑜', '瑞', '生', 'ç”°', 'ç”±', '甲', '町', 'ç•‘', 'çš„', '真', + '神', 'ç¥', 'ç©‚', '空', 'ç’', 'ç´€', 'ç¶¿', 'ç¹”', '美', 'è€', '者', 'è–', '興', '良', + '花', '芳', 'è‰', 'èœ', 'è©', '葵', 'è“', 'è–ˆ', 'è—', '號', '蛇', '行', 'è¡›', '裕', + '襄', '西', '語', '談', '諜', 'è˜', '讃', '识', 'è°·', 'è²´', 'è³¢', 'è´', 'è¶…', 'è¶³', + '轩', '逆', 'éŠ', 'é”', '邦', '郎', '部', '里', '野', '金', '鈴', '鉄', '鎮', 'é–“', + '防', '陆', '陸', '陽', '隊', '雄', '雪', 'é’', '響', '首', '香', '馮', 'é§¿', '马', + '髪', 'é”', 'éš', 'é¹…', '麻', '黃', '黄', 'é¾', 'é¾™', 'ë¶', 'ì´Œ', 'fl', '\ufeff',] + + +movies_df['Plot'] = movies_df['Plot'].apply(clean_text) +for character in replace: + movies_df['Plot'] = movies_df['Plot'].apply(lambda x: x.replace(character, '')) + +mlb.fit([main_genres]) + +#Upsampling + +genre_count_table = pd.DataFrame(columns=['Genre', 'MovieCount']) +i = 0 
+# Tabulate how many movies carry each main genre and remember the largest
+# count; that count is the upsampling target used below.
+highest_movie_count = 0
+for genre in main_genres:
+    # Select the genre's rows once and reuse the count for the table and the maximum.
+    genre_movie_count = movies_df[encoded_genres[genre] == 1].shape[0]
+    genre_count_table.loc[i, 'Genre'] = genre
+    genre_count_table.loc[i, 'MovieCount'] = genre_movie_count
+    highest_movie_count = max(highest_movie_count, genre_movie_count)
+    i += 1
+
+# Upsample (sampling with replacement) every genre that has fewer rows than
+# the largest one, so each genre contributes highest_movie_count rows.
+resampled = []
+for genre in main_genres:
+    df = movies_df[encoded_genres[genre] == 1]
+    if len(df) == highest_movie_count:
+        df_upsample = df
+    else:
+        print("Resampling {}: [{}] to {} samples".format(genre, len(df), highest_movie_count))
+        df_upsample = resample(df, replace=True, n_samples=highest_movie_count)
+
+    resampled.append(df_upsample)
+
+upsampled_df = pd.concat(resampled)
+
+movies = upsampled_df.copy()
+optimal_val_split_lr = 0.1
+
+# Fit a one-vs-rest logistic regression on the split returned by get_train_test.
+movies_train, movies_test, x_train_lr, y_train_lr, x_test_lr, y_test_lr = get_train_test(upsampled_df, test_size=optimal_val_split_lr)
+lr_classifier = LogisticRegression(max_iter=2000)
+classifier = OneVsRestClassifier(lr_classifier)
+classifier.fit(x_train_lr, y_train_lr)
+
+
+# Persist the fitted objects themselves. Dumping the loop variable (as before)
+# wrote the strings 'mlb', 'vectorizer' and 'model' into the pickle files
+# instead of the binarizer, vectorizer and classifier.
+# NOTE(review): assumes `vectorizer` is the fitted vectorizer bound earlier in
+# this script (outside this hunk) -- confirm.
+trained_artifacts = {'mlb': mlb, 'vectorizer': vectorizer, 'model': classifier}
+for item, artifact in trained_artifacts.items():
+    with open(f'{item}.pickle', 'wb') as handle:
+        pickle.dump(artifact, handle)
+
+#Automatically restart the services: MAY NEED TO CHANGE TO PUSH TO GITHUB AND THEN SSHING INTO THE SERVER ETC ETC
+
+# print('Restarting services')
+# os.chdir("~/Dev/coursework")
+# os.system("forever restart;cd NodeAPI;pm2 restart /NodeAPI/index.js")
+# print('Restarted services')
+
+# Commit and push the regenerated pickle files. Exit codes are not checked, so
+# a failed step (e.g. nothing to commit) does not stop the following ones.
+print('Pushing trained pickle models...')
+os.system("git add mlb.pickle vectorizer.pickle model.pickle")
+os.system("git commit -m \"Update re-trained models\"")
+os.system("git push -u origin master")
+
diff --git a/data/.gitkeep b/data/.gitkeep
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/data/Data.csv b/data/Data.csv
new file mode 100644
index 0000000000000000000000000000000000000000..fd67c9bc1014c7e86ad0ac4f753f2ffd53b1521e
Binary files /dev/null and b/data/Data.csv differ diff --git a/data/test.txt b/data/test.txt deleted file mode 100644 index 524acfffa760fd0b8c1de7cf001f8dd348b399d8..0000000000000000000000000000000000000000 --- a/data/test.txt +++ /dev/null @@ -1 +0,0 @@ -Test file diff --git a/mlb.pickle b/mlb.pickle index de75a1db7de12d7f07fef6d54f72d35f7c5eed71..6a74ded4d09524c9ba37f45765efa03d06f32ca5 100644 Binary files a/mlb.pickle and b/mlb.pickle differ diff --git a/model.pickle b/model.pickle index 85c8237522740a4c73a3244bded7a692e37e5e91..7482bf47b53d5482ff76918cca53894ed08f4782 100644 Binary files a/model.pickle and b/model.pickle differ diff --git a/push_updates.sh b/push_updates.sh deleted file mode 100644 index 258b40351fdebec507ee09a5b6d26f69c27f03de..0000000000000000000000000000000000000000 --- a/push_updates.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -ehco "Pushing updates to repo..." diff --git a/update_model.sh b/update_model.sh new file mode 100644 index 0000000000000000000000000000000000000000..2dd62e4943016c4e08664ebf9a3682f9e4979d3a --- /dev/null +++ b/update_model.sh @@ -0,0 +1,6 @@ +#!/bin/bash +echo "Pulling updates..." +git pull + +echo "Restarting server..." +pm2 restart all diff --git a/vectorizer.pickle b/vectorizer.pickle index 9fb289c932bdb6aec603b028b1d59ad4e5fa3796..bc3bcee39a706c7887dd62eb6d0ece7bec214ebd 100644 Binary files a/vectorizer.pickle and b/vectorizer.pickle differ