diff --git a/update.py b/update.py index 0f71b682794dd0a017425971206e2da83f81aa08..50bef6d058de9ce6a3543f4c8a0999030927f849 100644 --- a/update.py +++ b/update.py @@ -1,306 +1,306 @@ -import re - -from random import seed - import pandas as pd - import numpy as np - -import seaborn as sn - -import matplotlib.pyplot as plt +import pickle from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression -from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer -from sklearn.metrics import accuracy_score, multilabel_confusion_matrix +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.metrics import accuracy_score from sklearn.preprocessing import MultiLabelBinarizer from sklearn.utils import resample from sklearn.multiclass import OneVsRestClassifier -from nltk.corpus import stopwords from nltk import download -download('stopwords') - -from string import punctuation - import warnings +download('stopwords') warnings.filterwarnings('ignore') url = "https://github.com/Jamchello/WikiMoviePlots/blob/master/wiki_movie_plots_deduped.csv?raw=true" movies = pd.read_csv(url, delimiter=",") movies['Count'] = 1 -movies['GenreCorrected'] =movies['Genre'] -movies['GenreCorrected']=movies['GenreCorrected'].str.strip() -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' - ', '|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' / ', '|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('/', '|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' & ', '|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(', ', '|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('; ', '|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bio-pic', 'biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biopic', 'biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biographical', 
'biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biodrama', 'biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bio-drama', 'biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biographic', 'biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(film genre\)', '') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('animated','animation') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('anime','animation') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('children\'s','children') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedey','comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\[not in citation given\]','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' set 4,000 years ago in the canadian arctic','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historical','history') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romantic','romance') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('3-d','animation') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('3d','animation') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('viacom 18 motion pictures','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sci-fi','science_fiction') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('ttriller','thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('.','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('based on radio serial','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' on the early years of hitler','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sci fi','science_fiction') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science fiction','science_fiction') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' (30min)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('16 mm film','short') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\[140\]','drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\[144\]','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' for ','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('adventures','adventure') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('kung fu','martial_arts') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('kung-fu','martial_arts') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial arts','martial_arts') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('world war ii','war') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('world war i','war') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biography about montreal canadiens star|maurice richard','biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bholenath movies|cinekorn entertainment','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(volleyball\)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('spy film','spy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('anthology film','anthology') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biography fim','biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('avant-garde','avant_garde') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biker film','biker') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('buddy cop','buddy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('buddy film','buddy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedy 2-reeler','comedy') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace('films','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('film','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biography of pioneering american photographer eadweard muybridge','biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('british-german co-production','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('bruceploitation','martial_arts') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedy-drama adaptation of the mordecai richler novel','comedy-drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movies by the mob\|knkspl','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movies','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('movie','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('coming of age','coming_of_age') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('coming-of-age','coming_of_age') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('drama about child soldiers','drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( based).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( co-produced).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( adapted).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( about).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('musical b','musical') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('animationchildren','animation|children') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' period','period') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('drama loosely','drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(aquatics|swimming\)','') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(aquatics|swimming\)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace("yogesh dattatraya gosavi's directorial debut \[9\]",'') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace("war-time","war") -movies['GenreCorrected']=movies['GenreCorrected'].str.replace("wartime","war") -movies['GenreCorrected']=movies['GenreCorrected'].str.replace("ww1","war") -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('unknown','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace("wwii","war") -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('psychological','psycho') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('rom-coms','romance') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('true crime','crime') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|007','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('slice of life','slice_of_life') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('computer animation','animation') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('gun fu','martial_arts') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('j-horror','horror') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(shogi|chess\)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('afghan war drama','war drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|6 separate stories','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(30min\)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' (road bicycle racing)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' v-cinema','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('tv miniseries','tv_miniseries') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|docudrama','\|documentary|drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' in animation','|animation') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adaptation).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adaptated).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((adapted).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('(( on ).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('american football','sports') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dev\|nusrat jahan','sports') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('television miniseries','tv_miniseries') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(artistic\)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \|direct-to-dvd','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('history dram','history drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial art','martial_arts') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('psycho thriller,','psycho thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|1 girl\|3 suitors','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' \(road bicycle racing\)','') -filterE = movies['GenreCorrected']=="ero" -movies.loc[filterE,'GenreCorrected']="adult" -filterE = movies['GenreCorrected']=="music" -movies.loc[filterE,'GenreCorrected']="musical" -filterE = movies['GenreCorrected']=="-" -movies.loc[filterE,'GenreCorrected']='' -filterE = movies['GenreCorrected']=="comedy–drama" -movies.loc[filterE,'GenreCorrected'] = "comedy|drama" -filterE = movies['GenreCorrected']=="comedy–horror" -movies.loc[filterE,'GenreCorrected'] = "comedy|horror" 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace(' ','|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace(',','|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('-','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionadventure','action|adventure') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actioncomedy','action|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actiondrama','action|drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionlove','action|love') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionmasala','action|masala') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionchildren','action|children') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasychildren\|','fantasy|children') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasycomedy','fantasy|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantasyperiod','fantasy|period') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('cbctv_miniseries','tv_miniseries') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramacomedy','drama|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramacomedysocial','drama|comedy|social') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramathriller','drama|thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedydrama','comedy|drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramathriller','drama|thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedyhorror','comedy|horror') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sciencefiction','science_fiction') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('adventurecomedy','adventure|comedy') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace('animationdrama','animation|drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\|','|') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('muslim','religious') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('thriler','thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('crimethriller','crime|thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('fantay','fantasy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionthriller','action|thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedysocial','comedy|social') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martialarts','martial_arts') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\(children\|poker\|karuta\)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('epichistory','epic|history') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('erotica','adult') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('erotic','adult') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('((\|produced\|).+)','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('chanbara','chambara') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('comedythriller','comedy|thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biblical','religious') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biblical','religious') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('colour\|yellow\|productions\|eros\|international','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|directtodvd','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('liveaction','live|action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('melodrama','drama') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superheroes','superheroe') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('gangsterthriller','gangster|thriller') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('heistcomedy','comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('heist','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historic','history') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('historydisaster','history|disaster') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('warcomedy','war|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('westerncomedy','western|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('ancientcostume','costume') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('computeranimation','animation') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramatic','drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familya','family') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familya','family') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramedy','drama|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('dramaa','drama') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('famil\|','family') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superheroe','superhero') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('biogtaphy','biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('devotionalbiography','devotional|biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('docufiction','documentary|fiction') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('familydrama','family|drama') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('espionage','spy') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace('supeheroes','superhero') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancefiction','romance|fiction') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('horrorthriller','horror|thriller') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('suspensethriller','suspense|thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('musicaliography','musical|biography') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('triller','thriller') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|\(fiction\)','|fiction') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romanceaction','romance|action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancecomedy','romance|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romancehorror','romance|horror') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romcom','romance|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('rom\|com','romance|comedy') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('satirical','satire') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science_fictionchildren','science_fiction|children') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('homosexual','adult') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('sexual','adult') - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('mockumentary','documentary') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('periodic','period') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('romanctic','romantic') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('politics','political') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('samurai','martial_arts') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace('tv_miniseries','series') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('serial','series') - -filterE = movies['GenreCorrected']=="musical–comedy" -movies.loc[filterE,'GenreCorrected'] = "musical|comedy" - -filterE = movies['GenreCorrected']=="roman|porno" -movies.loc[filterE,'GenreCorrected'] = "adult" - - -filterE = movies['GenreCorrected']=="action—masala" -movies.loc[filterE,'GenreCorrected'] = "action|masala" - - -filterE = movies['GenreCorrected']=="horror–thriller" -movies.loc[filterE,'GenreCorrected'] = "horror|thriller" - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('family','children') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('martial_arts','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('horror','thriller') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('war','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('adventure','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('science_fiction','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('western','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('western','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('noir','black') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('spy','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('superhero','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('social','') -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('suspense','action') - - -filterE = movies['GenreCorrected']=="drama|romance|adult|children" -movies.loc[filterE,'GenreCorrected'] = "drama|romance|adult" - -movies['GenreCorrected']=movies['GenreCorrected'].str.replace('\|–\|','|') -movies['GenreCorrected']=movies['GenreCorrected'].str.strip(to_strip='\|') 
-movies['GenreCorrected']=movies['GenreCorrected'].str.replace('actionner','action') -movies['GenreCorrected']=movies['GenreCorrected'].str.strip() - -moviesGenre = movies[['GenreCorrected','Count']].groupby(['GenreCorrected']).count() -moviesGenre.to_csv('GenreCorrected.csv',sep=',') - -movies[['GenreCorrected','Count']].groupby(['GenreCorrected'], as_index=False).count().shape[0] - -movies['GenreSplit']=movies['GenreCorrected'].str.split('|') -movies['GenreSplit']= movies['GenreSplit'].apply(np.sort).apply(np.unique) +movies['GenreCorrected'] = movies['Genre'] +movies['GenreCorrected'] = movies['GenreCorrected'].str.strip() +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' - ', '|') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' / ', '|') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('/', '|') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' & ', '|') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(', ', '|') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('; ', '|') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('bio-pic', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('biopic', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('biographical', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('biodrama', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('bio-drama', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('biographic', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' \(film genre\)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('animated', 'animation') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('anime', 'animation') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('children\'s', 'children') 
+movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('comedey', 'comedy') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('\[not in citation given\]', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' set 4,000 years ago in the canadian arctic', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('historical', 'history') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('romantic', 'romance') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('3-d', 'animation') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('3d', 'animation') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('viacom 18 motion pictures', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('sci-fi', 'science_fiction') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('ttriller', 'thriller') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('.', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('based on radio serial', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' on the early years of hitler', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('sci fi', 'science_fiction') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('science fiction', 'science_fiction') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' (30min)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('16 mm film', 'short') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('\[140\]', 'drama') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('\[144\]', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' for ', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('adventures', 'adventure') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('kung fu', 'martial_arts') 
+movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('kung-fu', 'martial_arts') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('martial arts', 'martial_arts') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('world war ii', 'war') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('world war i', 'war') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace( + 'biography about montreal canadiens star|maurice richard', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('bholenath movies|cinekorn entertainment', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' \(volleyball\)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('spy film', 'spy') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('anthology film', 'anthology') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('biography fim', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('avant-garde', 'avant_garde') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('biker film', 'biker') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('buddy cop', 'buddy') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('buddy film', 'buddy') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('comedy 2-reeler', 'comedy') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('films', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('film', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace( + 'biography of pioneering american photographer eadweard muybridge', 'biography') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('british-german co-production', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('bruceploitation', 'martial_arts') +movies['GenreCorrected'] = 
movies['GenreCorrected'].str.replace('comedy-drama adaptation of the mordecai richler novel', + 'comedy-drama') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('movies by the mob\|knkspl', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('movies', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('movie', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('coming of age', 'coming_of_age') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('coming-of-age', 'coming_of_age') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('drama about child soldiers', 'drama') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('(( based).+)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('(( co-produced).+)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('(( adapted).+)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('(( about).+)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('musical b', 'musical') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('animationchildren', 'animation|children') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' period', 'period') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace('drama loosely', 'drama') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' \(aquatics|swimming\)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace(' \(aquatics|swimming\)', '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace("yogesh dattatraya gosavi's directorial debut \[9\]", + '') +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace("war-time", "war") +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace("wartime", "war") +movies['GenreCorrected'] = movies['GenreCorrected'].str.replace("ww1", "war") +movies['GenreCorrected'] = 
movies['GenreCorrected'].str.replace('unknown', '')  # completes the assignment opened on the preceding line

# Rule kinds understood by _apply_genre_rules.
_LIT = "literal"   # plain substring replacement
_RE = "regex"      # regular-expression replacement
_EXACT = "exact"   # replace the whole value when it equals the target


def _apply_genre_rules(labels, rules):
    """Return ``labels`` after applying every rewrite rule, in order.

    Args:
        labels: pandas Series of genre strings.
        rules: iterable of ``(kind, target, replacement)`` tuples where
            ``kind`` is ``_LIT``, ``_RE`` or ``_EXACT``.

    Returns:
        A new Series; the input Series is not modified.
    """
    for kind, target, replacement in rules:
        if kind == _EXACT:
            # Only rows whose whole value equals ``target`` are rewritten.
            labels = labels.mask(labels == target, replacement)
        else:
            # ``regex`` is passed explicitly: the pandas default changed
            # from True to False in 2.0, which would silently disable every
            # pattern that relies on regex syntax.
            labels = labels.str.replace(target, replacement, regex=(kind == _RE))
    return labels


# Order matters: later rules rely on the output of earlier ones
# (e.g. 'superheroes' -> 'superheroe' -> 'superhero' -> 'action').
_GENRE_RULES = [
    (_LIT, 'wwii', 'war'),
    (_LIT, 'psychological', 'psycho'),
    (_LIT, 'rom-coms', 'romance'),
    (_LIT, 'true crime', 'crime'),
    (_RE, r'\|007', ''),
    (_LIT, 'slice of life', 'slice_of_life'),
    (_LIT, 'computer animation', 'animation'),
    (_LIT, 'gun fu', 'martial_arts'),
    (_LIT, 'j-horror', 'horror'),
    # NOTE(review): the unescaped '|' makes this an alternation between
    # ' (shogi' and 'chess)'; kept as written — confirm that is intended.
    (_RE, r' \(shogi|chess\)', ''),
    (_LIT, 'afghan war drama', 'war drama'),
    (_RE, r'\|6 separate stories', ''),
    (_RE, r' \(30min\)', ''),
    # Unescaped parentheses form a group, so this matches the text without
    # brackets; the bracketed form is handled by a later rule.
    (_RE, r' (road bicycle racing)', ''),
    (_LIT, ' v-cinema', ''),
    (_LIT, 'tv miniseries', 'tv_miniseries'),
    # The replacement must not contain a backslash: re leaves '\|' in a
    # replacement template untouched, which would leak '\' into the label.
    (_RE, r'\|docudrama', '|documentary|drama'),
    (_LIT, ' in animation', '|animation'),
    (_RE, r'((adaptation).+)', ''),
    (_RE, r'((adaptated).+)', ''),
    (_RE, r'((adapted).+)', ''),
    (_RE, r'(( on ).+)', ''),
    (_LIT, 'american football', 'sports'),
    (_RE, r'dev\|nusrat jahan', 'sports'),
    (_LIT, 'television miniseries', 'tv_miniseries'),
    (_RE, r' \(artistic\)', ''),
    (_RE, r' \|direct-to-dvd', ''),
    (_LIT, 'history dram', 'history drama'),
    (_LIT, 'martial art', 'martial_arts'),
    (_LIT, 'psycho thriller,', 'psycho thriller'),
    (_RE, r'\|1 girl\|3 suitors', ''),
    (_RE, r' \(road bicycle racing\)', ''),
    (_EXACT, 'ero', 'adult'),
    (_EXACT, 'music', 'musical'),
    (_EXACT, '-', ''),
    (_EXACT, 'comedy–drama', 'comedy|drama'),
    (_EXACT, 'comedy–horror', 'comedy|horror'),
    # From here on '|' is the only separator and hyphens are gone, so the
    # remaining rules work on run-together words such as 'actioncomedy'.
    (_LIT, ' ', '|'),
    (_LIT, ',', '|'),
    (_LIT, '-', ''),
    (_LIT, 'actionadventure', 'action|adventure'),
    (_LIT, 'actioncomedy', 'action|comedy'),
    (_LIT, 'actiondrama', 'action|drama'),
    (_LIT, 'actionlove', 'action|love'),
    (_LIT, 'actionmasala', 'action|masala'),
    (_LIT, 'actionchildren', 'action|children'),
    (_RE, r'fantasychildren\|', 'fantasy|children'),
    (_LIT, 'fantasycomedy', 'fantasy|comedy'),
    (_LIT, 'fantasyperiod', 'fantasy|period'),
    (_LIT, 'cbctv_miniseries', 'tv_miniseries'),
    # 'dramacomedysocial' needs no rule of its own: this rule followed by
    # 'comedysocial' below already yields 'drama|comedy|social'.
    (_LIT, 'dramacomedy', 'drama|comedy'),
    (_LIT, 'dramathriller', 'drama|thriller'),
    (_LIT, 'comedydrama', 'comedy|drama'),
    (_LIT, 'comedyhorror', 'comedy|horror'),
    (_LIT, 'sciencefiction', 'science_fiction'),
    (_LIT, 'adventurecomedy', 'adventure|comedy'),
    (_LIT, 'animationdrama', 'animation|drama'),
    (_RE, r'\|\|', '|'),
    (_LIT, 'muslim', 'religious'),
    (_LIT, 'thriler', 'thriller'),
    (_LIT, 'crimethriller', 'crime|thriller'),
    (_LIT, 'fantay', 'fantasy'),
    (_LIT, 'actionthriller', 'action|thriller'),
    (_LIT, 'comedysocial', 'comedy|social'),
    (_LIT, 'martialarts', 'martial_arts'),
    (_RE, r'\|\(children\|poker\|karuta\)', ''),
    (_LIT, 'epichistory', 'epic|history'),
    (_LIT, 'erotica', 'adult'),
    (_LIT, 'erotic', 'adult'),
    (_RE, r'((\|produced\|).+)', ''),
    (_LIT, 'chanbara', 'chambara'),
    (_LIT, 'comedythriller', 'comedy|thriller'),
    (_LIT, 'biblical', 'religious'),
    (_RE, r'colour\|yellow\|productions\|eros\|international', ''),
    (_RE, r'\|directtodvd', ''),
    (_LIT, 'liveaction', 'live|action'),
    (_LIT, 'melodrama', 'drama'),
    (_LIT, 'superheroes', 'superheroe'),
    (_LIT, 'gangsterthriller', 'gangster|thriller'),
    (_LIT, 'heistcomedy', 'comedy'),
    (_LIT, 'heist', 'action'),
    (_LIT, 'historic', 'history'),
    (_LIT, 'historydisaster', 'history|disaster'),
    (_LIT, 'warcomedy', 'war|comedy'),
    (_LIT, 'westerncomedy', 'western|comedy'),
    (_LIT, 'ancientcostume', 'costume'),
    (_LIT, 'computeranimation', 'animation'),
    (_LIT, 'dramatic', 'drama'),
    (_LIT, 'familya', 'family'),
    (_LIT, 'dramedy', 'drama|comedy'),
    (_LIT, 'dramaa', 'drama'),
    (_RE, r'famil\|', 'family'),
    (_LIT, 'superheroe', 'superhero'),
    (_LIT, 'biogtaphy', 'biography'),
    (_LIT, 'devotionalbiography', 'devotional|biography'),
    (_LIT, 'docufiction', 'documentary|fiction'),
    (_LIT, 'familydrama', 'family|drama'),
    (_LIT, 'espionage', 'spy'),
    (_LIT, 'supeheroes', 'superhero'),
    (_LIT, 'romancefiction', 'romance|fiction'),
    (_LIT, 'horrorthriller', 'horror|thriller'),
    (_LIT, 'suspensethriller', 'suspense|thriller'),
    (_LIT, 'musicaliography', 'musical|biography'),
    (_LIT, 'triller', 'thriller'),
    (_RE, r'\|\(fiction\)', '|fiction'),
    (_LIT, 'romanceaction', 'romance|action'),
    (_LIT, 'romancecomedy', 'romance|comedy'),
    (_LIT, 'romancehorror', 'romance|horror'),
    (_LIT, 'romcom', 'romance|comedy'),
    (_RE, r'rom\|com', 'romance|comedy'),
    (_LIT, 'satirical', 'satire'),
    (_LIT, 'science_fictionchildren', 'science_fiction|children'),
    (_LIT, 'homosexual', 'adult'),
    (_LIT, 'sexual', 'adult'),
    (_LIT, 'mockumentary', 'documentary'),
    (_LIT, 'periodic', 'period'),
    (_LIT, 'romanctic', 'romantic'),
    (_LIT, 'politics', 'political'),
    (_LIT, 'samurai', 'martial_arts'),
    (_LIT, 'tv_miniseries', 'series'),
    (_LIT, 'serial', 'series'),
    (_EXACT, 'musical–comedy', 'musical|comedy'),
    (_EXACT, 'roman|porno', 'adult'),
    (_EXACT, 'action—masala', 'action|masala'),
    (_EXACT, 'horror–thriller', 'horror|thriller'),
    # Fold fine-grained genres into a handful of broad ones.
    (_LIT, 'family', 'children'),
    (_LIT, 'martial_arts', 'action'),
    (_LIT, 'horror', 'thriller'),
    (_LIT, 'war', 'action'),
    (_LIT, 'adventure', 'action'),
    (_LIT, 'science_fiction', 'action'),
    (_LIT, 'western', 'action'),
    (_LIT, 'noir', 'black'),
    (_LIT, 'spy', 'action'),
    (_LIT, 'superhero', 'action'),
    (_LIT, 'social', ''),
    (_LIT, 'suspense', 'action'),
    (_EXACT, 'drama|romance|adult|children', 'drama|romance|adult'),
    (_RE, r'\|–\|', '|'),
]

genre_labels = _apply_genre_rules(movies['GenreCorrected'], _GENRE_RULES)
# ``to_strip`` is a set of characters, so only '|' is listed: this trims the
# separators left dangling at either end by the removals above.
genre_labels = genre_labels.str.strip(to_strip='|')
genre_labels = genre_labels.str.replace('actionner', 'action', regex=False)
movies['GenreCorrected'] = genre_labels.str.strip()

# Write out how many movies carry each normalised genre string.
moviesGenre = movies[['GenreCorrected', 'Count']].groupby(['GenreCorrected']).count()
moviesGenre.to_csv('GenreCorrected.csv', sep=',')

# Turn each pipe-joined string into a sorted array of distinct labels.
movies['GenreSplit'] = movies['GenreCorrected'].str.split('|')
movies['GenreSplit'] = movies['GenreSplit'].apply(np.sort).apply(np.unique)

# Flatten the labels of *every* row with a single concatenate call (one pass,
# no row skipped, independent of the DataFrame's index labels).
genre_lists = movies['GenreSplit'].tolist()
genres_array = np.concatenate(genre_lists) if genre_lists else np.array([])

genres = pd.DataFrame({'Genre': genres_array})
genres['Count'] = 1
genres = (
    genres[['Genre', 'Count']]
    .groupby(['Genre'], as_index=False)
    .sum()
    .sort_values(['Count'], ascending=False)
)
genres = genres[genres['Genre'] != '']  # Remove 'unknown' genre (mapped to '' above)

MIN_MOVIES = 2000  # Minimum number of movies per genre

# NOTE(review): `main_genres`, `movies_df` and `encoded_genres` are used below
# but are not assigned anywhere in the visible part of this script —
# presumably they are built in lines elided from this excerpt; confirm.
mlb = MultiLabelBinarizer()
mlb.fit([main_genres])

vectorizer = CountVectorizer()

# Select each genre's rows once, then up-sample every genre to the size of
# the largest one so that all genres are equally represented.
genre_frames = [(genre, movies_df[encoded_genres[genre] == 1]) for genre in main_genres]
highest_movie_count = max((len(frame) for _, frame in genre_frames), default=0)

resampled = []
for genre, df in genre_frames:
    if len(df) == highest_movie_count:
        df_upsample = df
    else:
        print("Resampling {}: [{}] to {} samples".format(genre, len(df), highest_movie_count))
        df_upsample = resample(df, replace=True, n_samples=highest_movie_count)

    resampled.append(df_upsample)

upsampled_df = pd.concat(resampled)

movies_train, movies_test = train_test_split(upsampled_df, test_size=0.2, shuffle=True)

y_train_lr = mlb.transform(movies_train["GenreSplitMain"].tolist())
y_test_lr = mlb.transform(movies_test["GenreSplitMain"].tolist())

# NOTE(review): `classifier`, `x_train_lr` and `x_test_lr` are likewise not
# assigned in the visible part of this script — confirm they are defined in
# the elided lines before this point.
classifier.fit(x_train_lr, y_train_lr)

print("Accuracy: ", accuracy_score(y_train_lr, classifier.predict(x_train_lr)))
print("Validation Accuracy:", accuracy_score(y_test_lr, classifier.predict(x_test_lr)))

# Persist the fitted objects; `with` guarantees each file is flushed and
# closed even if pickling fails part-way.
for artefact, path in (
    (vectorizer, "test_data/new_vectorizer.pickle"),
    (classifier, "test_data/new_model.pickle"),
    (mlb, "test_data/new_mlb.pickle"),
):
    with open(path, "wb") as handle:
        pickle.dump(artefact, handle)