diff --git a/Disertationmainfile5.ipynb b/Disertationmainfile5.ipynb deleted file mode 100644 index 1a872625163868b59b4b39ceefefa04835e513fa..0000000000000000000000000000000000000000 --- a/Disertationmainfile5.ipynb +++ /dev/null @@ -1,1478 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "_uuid": "886dde36801814b005047755c12ce3e3a9b1c441" - }, - "source": [ - "# **Accident Severity Classification**" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", - "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" - }, - "outputs": [], - "source": [ - "\n", - "import numpy as np\n", - "import pandas as pd \n", - "from datetime import datetime as dt\n", - "import time\n", - "import matplotlib.pyplot as plt\n", - "import warnings\n", - "from sklearn.feature_selection import VarianceThreshold\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.metrics import roc_auc_score\n", - "from sklearn.pipeline import Pipeline, FeatureUnion\n", - "from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, OneHotEncoder, KBinsDiscretizer, MaxAbsScaler\n", - "from sklearn.model_selection import train_test_split as split\n", - "from sklearn.metrics import confusion_matrix, classification_report, accuracy_score\n", - "from sklearn.linear_model import LogisticRegression\n", - "import seaborn as sns\n", - "sns.set()\n", - "import math\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", - "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(2058408, 57)\n" - ] - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe 
tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Accident_Index</th>\n", - " <th>Age_Band_of_Driver</th>\n", - " <th>Age_of_Vehicle</th>\n", - " <th>Driver_Home_Area_Type</th>\n", - " <th>Driver_IMD_Decile</th>\n", - " <th>Engine_Capacity_.CC.</th>\n", - " <th>Hit_Object_in_Carriageway</th>\n", - " <th>Hit_Object_off_Carriageway</th>\n", - " <th>Journey_Purpose_of_Driver</th>\n", - " <th>Junction_Location</th>\n", - " <th>...</th>\n", - " <th>Police_Force</th>\n", - " <th>Road_Surface_Conditions</th>\n", - " <th>Road_Type</th>\n", - " <th>Special_Conditions_at_Site</th>\n", - " <th>Speed_limit</th>\n", - " <th>Time</th>\n", - " <th>Urban_or_Rural_Area</th>\n", - " <th>Weather_Conditions</th>\n", - " <th>Year_y</th>\n", - " <th>InScotland</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>200501BS00002</td>\n", - " <td>36 - 45</td>\n", - " <td>3.0</td>\n", - " <td>Data missing or out of range</td>\n", - " <td>NaN</td>\n", - " <td>8268.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>Journey as part of work</td>\n", - " <td>Leaving roundabout</td>\n", - " <td>...</td>\n", - " <td>Metropolitan Police</td>\n", - " <td>Dry</td>\n", - " <td>Dual carriageway</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>17:36</td>\n", - " <td>Urban</td>\n", - " <td>Fine no high winds</td>\n", - " <td>2005</td>\n", - " <td>No</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>200501BS00003</td>\n", - " <td>26 - 35</td>\n", - " <td>5.0</td>\n", - " <td>Urban area</td>\n", - " <td>3.0</td>\n", - " <td>8300.0</td>\n", - " <td>Parked vehicle</td>\n", - " <td>None</td>\n", - " <td>Journey as part of work</td>\n", - " <td>Not at or within 20 metres of junction</td>\n", - " 
<td>...</td>\n", - " <td>Metropolitan Police</td>\n", - " <td>Dry</td>\n", - " <td>Single carriageway</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>00:15</td>\n", - " <td>Urban</td>\n", - " <td>Fine no high winds</td>\n", - " <td>2005</td>\n", - " <td>No</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>200501BS00004</td>\n", - " <td>46 - 55</td>\n", - " <td>4.0</td>\n", - " <td>Urban area</td>\n", - " <td>1.0</td>\n", - " <td>1769.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>Other/Not known (2005-10)</td>\n", - " <td>Not at or within 20 metres of junction</td>\n", - " <td>...</td>\n", - " <td>Metropolitan Police</td>\n", - " <td>Dry</td>\n", - " <td>Single carriageway</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>10:35</td>\n", - " <td>Urban</td>\n", - " <td>Fine no high winds</td>\n", - " <td>2005</td>\n", - " <td>No</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>200501BS00005</td>\n", - " <td>46 - 55</td>\n", - " <td>10.0</td>\n", - " <td>Data missing or out of range</td>\n", - " <td>NaN</td>\n", - " <td>85.0</td>\n", - " <td>Kerb</td>\n", - " <td>None</td>\n", - " <td>Other/Not known (2005-10)</td>\n", - " <td>Not at or within 20 metres of junction</td>\n", - " <td>...</td>\n", - " <td>Metropolitan Police</td>\n", - " <td>Wet or damp</td>\n", - " <td>Single carriageway</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>21:13</td>\n", - " <td>Urban</td>\n", - " <td>Fine no high winds</td>\n", - " <td>2005</td>\n", - " <td>No</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>200501BS00006</td>\n", - " <td>46 - 55</td>\n", - " <td>1.0</td>\n", - " <td>Urban area</td>\n", - " <td>4.0</td>\n", - " <td>2976.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>Other/Not known (2005-10)</td>\n", - " <td>Not at or within 20 metres of junction</td>\n", - " <td>...</td>\n", - " <td>Metropolitan Police</td>\n", - " <td>Wet or damp</td>\n", - " <td>Single 
carriageway</td>\n", - " <td>Oil or diesel</td>\n", - " <td>30.0</td>\n", - " <td>12:40</td>\n", - " <td>Urban</td>\n", - " <td>Raining no high winds</td>\n", - " <td>2005</td>\n", - " <td>No</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>5 rows × 57 columns</p>\n", - "</div>" - ], - "text/plain": [ - " Accident_Index Age_Band_of_Driver Age_of_Vehicle \\\n", - "0 200501BS00002 36 - 45 3.0 \n", - "1 200501BS00003 26 - 35 5.0 \n", - "2 200501BS00004 46 - 55 4.0 \n", - "3 200501BS00005 46 - 55 10.0 \n", - "4 200501BS00006 46 - 55 1.0 \n", - "\n", - " Driver_Home_Area_Type Driver_IMD_Decile Engine_Capacity_.CC. \\\n", - "0 Data missing or out of range NaN 8268.0 \n", - "1 Urban area 3.0 8300.0 \n", - "2 Urban area 1.0 1769.0 \n", - "3 Data missing or out of range NaN 85.0 \n", - "4 Urban area 4.0 2976.0 \n", - "\n", - " Hit_Object_in_Carriageway Hit_Object_off_Carriageway \\\n", - "0 None None \n", - "1 Parked vehicle None \n", - "2 None None \n", - "3 Kerb None \n", - "4 None None \n", - "\n", - " Journey_Purpose_of_Driver Junction_Location ... \\\n", - "0 Journey as part of work Leaving roundabout ... \n", - "1 Journey as part of work Not at or within 20 metres of junction ... \n", - "2 Other/Not known (2005-10) Not at or within 20 metres of junction ... \n", - "3 Other/Not known (2005-10) Not at or within 20 metres of junction ... \n", - "4 Other/Not known (2005-10) Not at or within 20 metres of junction ... 
\n", - "\n", - " Police_Force Road_Surface_Conditions Road_Type \\\n", - "0 Metropolitan Police Dry Dual carriageway \n", - "1 Metropolitan Police Dry Single carriageway \n", - "2 Metropolitan Police Dry Single carriageway \n", - "3 Metropolitan Police Wet or damp Single carriageway \n", - "4 Metropolitan Police Wet or damp Single carriageway \n", - "\n", - " Special_Conditions_at_Site Speed_limit Time Urban_or_Rural_Area \\\n", - "0 None 30.0 17:36 Urban \n", - "1 None 30.0 00:15 Urban \n", - "2 None 30.0 10:35 Urban \n", - "3 None 30.0 21:13 Urban \n", - "4 Oil or diesel 30.0 12:40 Urban \n", - "\n", - " Weather_Conditions Year_y InScotland \n", - "0 Fine no high winds 2005 No \n", - "1 Fine no high winds 2005 No \n", - "2 Fine no high winds 2005 No \n", - "3 Fine no high winds 2005 No \n", - "4 Raining no high winds 2005 No \n", - "\n", - "[5 rows x 57 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "acc = pd.read_csv('../Downloads/Accident_Information.csv')\n", - "veh = pd.read_csv('../Downloads/Vehicle_Information.csv', encoding='ISO-8859-1')\n", - "\n", - "dataofcoll = pd.merge(veh, acc, how = 'inner', on = 'Accident_Index')\n", - "\n", - "print(dataofcoll.shape)\n", - "dataofcoll.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "_uuid": "1da95d00002e2e75d44b81b9be1e1a6471d7664e", - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(617522, 57)\n" - ] - } - ], - "source": [ - "\n", - "weights = np.where(dataofcoll['Accident_Severity'] == 'Slight', .2, .8)\n", - "accident = dataofcoll.sample(frac=0.3, replace=True, weights=weights)\n", - "print(accident.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "_uuid": "3dc8d223d250fa4b986b1f82eababf7e13f4fedf" - }, - "outputs": [], - "source": [ - "accident2 = accident[['Accident_Index', '1st_Road_Class','Day_of_Week', 
'Junction_Detail','Light_Conditions', 'Number_of_Casualties',\n", - " 'Number_of_Vehicles', 'Road_Surface_Conditions', 'Road_Type', 'Special_Conditions_at_Site', 'Speed_limit',\n", - " 'Time', 'Urban_or_Rural_Area', 'Weather_Conditions', 'Age_Band_of_Driver', 'Age_of_Vehicle',\n", - " 'Hit_Object_in_Carriageway', 'Hit_Object_off_Carriageway', 'make', 'Engine_Capacity_.CC.', 'Sex_of_Driver',\n", - " 'Skidding_and_Overturning', 'Vehicle_Manoeuvre', 'Vehicle_Type', 'Accident_Severity'\n", - " ]]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "_uuid": "dd7d70a22c6f0813fc8dfd47e33d9c998a0ba0ef" - }, - "source": [ - "## **From multiclass to two-classes**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "_uuid": "6312d213b3fd3484e20fa83a1f2ad5bb6e665983", - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1 0.601067\n", - "0 0.398933\n", - "Name: Accident_Severity_Slight, dtype: float64" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "accident2['Accident_Severity'] = accident2['Accident_Severity'].replace(['Serious', 'Fatal'], 'Serious or Fatal')\n", - "accident2 = pd.get_dummies(accident2, columns=['Accident_Severity'])\n", - "accident2 = accident2.drop('Accident_Severity_Serious or Fatal', axis=1)\n", - "accident2.Accident_Severity_Slight.value_counts(normalize=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "_uuid": "1dae034fb2052895e86a02f619a6877f67375861" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "<Figure size 1008x360 with 0 Axes>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "accident_slight = accident2.Accident_Severity_Slight == 1\n", - "accident_slight = accident2.Accident_Severity_Slight == 0\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "_uuid": "8e64e49c0d3b7f88d739186aa67312576f06dc82" - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(617522, 23) (617522,)\n" - ] - } - ], - "source": [ - "X = accident2.drop(['Accident_Index','Accident_Severity_Slight'], axis=1)\n", - "y = accident2.Accident_Severity_Slight\n", - "print(X.shape,\n", - " y.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "_uuid": "e002c2d140cf892124efcd331e5564ffdf43e682" - }, - "source": [ - "# **3. Training/Predicting Pipeline**\n", - "## **Transform Speed Limit**" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "_uuid": "de6a9c22bfe0ae0fd69631b27a4bb174da9d8e2f" - }, - "outputs": [], - "source": [ - "def onecodespeed(accident):\n", - " return accident[['Speed_limit']]\n", - "\n", - "FullTransformer_SpeedLimit = Pipeline([(\"Select_Speed_Limit\", FunctionTransformer(func=onecodespeed, validate=False)),\n", - " (\"Fill_Null\", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),\n", - " (\"One_Hot_Encoder\", OneHotEncoder(sparse = False, handle_unknown='ignore'))\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "_uuid": "b0d8ea39452e796de213ee23d2f2a18684be547a" - }, - "outputs": [], - "source": [ - "def car(accident):\n", - " list_of_small_makers = list(dataofcoll['make'].value_counts()[dataofcoll['make'].value_counts() < 2000].index)\n", - " return accident['make'].replace(list_of_small_makers, 'Other').to_frame()\n", - "\n", - "FullTransformertoOnMake = Pipeline([(\"Select_Make\", FunctionTransformer(func=car, validate=False)),\n", - " (\"Fill_Null\", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='Other')),\n", - " (\"One_Hot_Encoder\", OneHotEncoder(sparse = False, handle_unknown='ignore'))])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "_uuid": "1e990e01def800571511c13fe54bc6036e934126" - }, - "outputs": [], - "source": [ - "def enginesize(accident):\n", - " return 
accident[['Engine_Capacity_.CC.']]\n", - "\n", - "FullTransformerOnEnginesize = Pipeline([(\"Select_Engine_Capacity\", FunctionTransformer(func=enginesize, validate=False)),\n", - " (\"Fill_Null\", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),\n", - " (\"Car_Types_by_Engine_Capacity\", KBinsDiscretizer(n_bins=7, encode='ordinal', strategy='quantile')),\n", - " (\"One_Hot_Encoder\", OneHotEncoder(sparse = False, handle_unknown='ignore'))\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "_uuid": "54cd276128c4fec26d922b9b332468be1114dbf6" - }, - "outputs": [], - "source": [ - "def column_to_onehotencode(accident):\n", - " return accident[['1st_Road_Class', 'Day_of_Week', 'Junction_Detail', 'Light_Conditions', 'Number_of_Casualties', \n", - " 'Number_of_Vehicles', 'Road_Surface_Conditions', 'Road_Type', 'Special_Conditions_at_Site', \n", - " 'Urban_or_Rural_Area', 'Weather_Conditions', 'Age_Band_of_Driver', 'Hit_Object_in_Carriageway',\n", - " 'Hit_Object_off_Carriageway', 'Sex_of_Driver', 'Skidding_and_Overturning',\n", - " 'Vehicle_Manoeuvre', 'Vehicle_Type'\n", - " ]]\n", - "\n", - "DataToOneHotTransformerOnColumns = Pipeline([(\"Select_Columns\", FunctionTransformer(func=column_to_onehotencode, validate=False)),\n", - " (\"One_Hot_Encoder\", OneHotEncoder(sparse = False, handle_unknown='ignore'))])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "_uuid": "ad450742878f97b87195e37c2ee4ec11ad255443" - }, - "outputs": [], - "source": [ - "FeatureUnionTransformer = FeatureUnion([\n", - " (\"FTEngineCapacity\", FullTransformerOnEnginesize),\n", - " (\"FTMake\", FullTransformertoOnMake),\n", - " (\"FTSpeedLimit\", FullTransformer_SpeedLimit),\n", - " (\"OHEColumns\", DataToOneHotTransformerOnColumns)])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "_uuid": "eeeae8ba4b89dcab70c249b6edce35a720fe2f8a" - }, - "outputs": [], - "source": [ - 
"Full_Transformer = Pipeline([\n", - " (\"Feature_Engineering\", FeatureUnionTransformer),\n", - " (\"Min_Max_Transformer\", MaxAbsScaler())\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "_uuid": "01f0ff3782eee344740648031d7a82381c807d4e" - }, - "outputs": [], - "source": [ - "X_train, X_test, y_train, y_test = split(X, y)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "_uuid": "f452ddcd8d4d2919252116f9e4553e91a378b7a4" - }, - "source": [ - "## **4.1 Logistic Regression**" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Classification Report:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.56 0.66 0.60 61610\n", - " 1 0.74 0.65 0.69 92771\n", - "\n", - " accuracy 0.65 154381\n", - " macro avg 0.65 0.65 0.65 154381\n", - "weighted avg 0.67 0.65 0.66 154381\n", - "\n", - "Score: 0.7102721982118161\n" - ] - } - ], - "source": [ - "Full_Transformer.fit(X_train)\n", - "\n", - "lf = LogisticRegression(class_weight = \"balanced\")\n", - "\n", - "X_train_transformed = Full_Transformer.transform(X_train)\n", - "lf.fit(X_train_transformed, y_train)\n", - "\n", - "X_test_transformed = Full_Transformer.transform(X_test)\n", - "\n", - "y_pred = lf.predict(X_test_transformed)\n", - "\n", - "print('Classification Report:' '\\n',classification_report(y_test, y_pred))\n", - "\n", - "print('Score:',roc_auc_score(y_test.values, lf.predict_proba(X_test_transformed)[:, 1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "<Figure size 432x288 with 1 Axes>" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# create confusion matrix# create confusion matrix\n", - "matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# create 
dataframe\n", - "class_names = dataofcoll.Accident_Severity.values\n", - "dataframe = pd.DataFrame(matrix, index=['Serious or Fatal', 'Slight'], \n", - " columns=['Serious or Fatal', 'Slight'])\n", - "\n", - "# create heatmap\n", - "sns.heatmap(dataframe, annot=True, cbar=None, cmap='Blues')\n", - "plt.title('Confusion Matrix')\n", - "plt.tight_layout(), plt.xlabel('True Values'), plt.ylabel('Predicted Values')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "_uuid": "19879a9691ce3a8aa6165962366e583d4af5b2fa" - }, - "source": [ - "## **4.2 Random Forest Classifier**" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "_uuid": "a39a44a3d5dcaff0afc10821872011246e011db6" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Classification Report:\n", - " precision recall f1-score support\n", - "\n", - " 0 0.78 0.67 0.72 61742\n", - " 1 0.80 0.87 0.83 92639\n", - "\n", - " accuracy 0.79 154381\n", - " macro avg 0.79 0.77 0.78 154381\n", - "weighted avg 0.79 0.79 0.79 154381\n", - "\n", - "Score: 0.8540924830957961\n" - ] - } - ], - "source": [ - "\n", - "rf = RandomForestClassifier(n_estimators=100, n_jobs=3)\n", - "\n", - "Full_Transformer.fit(X_train)\n", - "X_train_transformed = Full_Transformer.transform(X_train)\n", - "rf.fit(X_train_transformed, y_train)\n", - "\n", - "X_test_transformed = Full_Transformer.transform(X_test)\n", - "\n", - "y_pred = rf.predict(X_test_transformed)\n", - "\n", - "print('Classification Report:' '\\n',classification_report(y_test, y_pred))\n", - "\n", - "print('Score:',roc_auc_score(y_test.values, rf.predict_proba(X_test_transformed)[:, 1]))" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: 
top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>1st_Road_Class</th>\n", - " <th>Day_of_Week</th>\n", - " <th>Junction_Detail</th>\n", - " <th>Light_Conditions</th>\n", - " <th>Number_of_Casualties</th>\n", - " <th>Number_of_Vehicles</th>\n", - " <th>Road_Surface_Conditions</th>\n", - " <th>Road_Type</th>\n", - " <th>Special_Conditions_at_Site</th>\n", - " <th>Speed_limit</th>\n", - " <th>...</th>\n", - " <th>Age_Band_of_Driver</th>\n", - " <th>Age_of_Vehicle</th>\n", - " <th>Hit_Object_in_Carriageway</th>\n", - " <th>Hit_Object_off_Carriageway</th>\n", - " <th>make</th>\n", - " <th>Engine_Capacity_.CC.</th>\n", - " <th>Sex_of_Driver</th>\n", - " <th>Skidding_and_Overturning</th>\n", - " <th>Vehicle_Manoeuvre</th>\n", - " <th>Vehicle_Type</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>456708</th>\n", - " <td>B</td>\n", - " <td>Thursday</td>\n", - " <td>T or staggered junction</td>\n", - " <td>Daylight</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>Dry</td>\n", - " <td>Single carriageway</td>\n", - " <td>None</td>\n", - " <td>20.0</td>\n", - " <td>...</td>\n", - " <td>36 - 45</td>\n", - " <td>4.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>RENAULT</td>\n", - " <td>1870.0</td>\n", - " <td>Female</td>\n", - " <td>None</td>\n", - " <td>Turning right</td>\n", - " <td>Car</td>\n", - " </tr>\n", - " <tr>\n", - " <th>370736</th>\n", - " <td>A</td>\n", - " <td>Saturday</td>\n", - " <td>Not at junction or within 20 metres</td>\n", - " <td>Daylight</td>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " <td>Dry</td>\n", - " <td>Single carriageway</td>\n", - " <td>None</td>\n", - " <td>60.0</td>\n", - " <td>...</td>\n", - " <td>36 - 45</td>\n", - " <td>13.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>KAWASAKI</td>\n", - " 
<td>749.0</td>\n", - " <td>Male</td>\n", - " <td>None</td>\n", - " <td>Overtaking static vehicle - offside</td>\n", - " <td>Motorcycle over 500cc</td>\n", - " </tr>\n", - " <tr>\n", - " <th>243494</th>\n", - " <td>B</td>\n", - " <td>Tuesday</td>\n", - " <td>Not at junction or within 20 metres</td>\n", - " <td>Daylight</td>\n", - " <td>1</td>\n", - " <td>4</td>\n", - " <td>Dry</td>\n", - " <td>Single carriageway</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>...</td>\n", - " <td>26 - 35</td>\n", - " <td>8.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>PEUGEOT</td>\n", - " <td>1997.0</td>\n", - " <td>Male</td>\n", - " <td>None</td>\n", - " <td>Slowing or stopping</td>\n", - " <td>Car</td>\n", - " </tr>\n", - " <tr>\n", - " <th>785303</th>\n", - " <td>A</td>\n", - " <td>Saturday</td>\n", - " <td>Roundabout</td>\n", - " <td>Daylight</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>Dry</td>\n", - " <td>Roundabout</td>\n", - " <td>None</td>\n", - " <td>60.0</td>\n", - " <td>...</td>\n", - " <td>46 - 55</td>\n", - " <td>1.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>FIAT</td>\n", - " <td>1108.0</td>\n", - " <td>Female</td>\n", - " <td>None</td>\n", - " <td>Going ahead other</td>\n", - " <td>Car</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1359698</th>\n", - " <td>Unclassified</td>\n", - " <td>Wednesday</td>\n", - " <td>Roundabout</td>\n", - " <td>Darkness - lights lit</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>Wet or damp</td>\n", - " <td>Roundabout</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>...</td>\n", - " <td>36 - 45</td>\n", - " <td>6.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>HONDA</td>\n", - " <td>1799.0</td>\n", - " <td>Female</td>\n", - " <td>None</td>\n", - " <td>Turning left</td>\n", - " <td>Car</td>\n", - " </tr>\n", - " <tr>\n", - " <th>...</th>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " 
<td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1782633</th>\n", - " <td>A</td>\n", - " <td>Wednesday</td>\n", - " <td>T or staggered junction</td>\n", - " <td>Daylight</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>Dry</td>\n", - " <td>Single carriageway</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>...</td>\n", - " <td>11 - 15</td>\n", - " <td>NaN</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>NaN</td>\n", - " <td>NaN</td>\n", - " <td>Male</td>\n", - " <td>None</td>\n", - " <td>Overtaking static vehicle - offside</td>\n", - " <td>Pedal cycle</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1857601</th>\n", - " <td>Unclassified</td>\n", - " <td>Saturday</td>\n", - " <td>Crossroads</td>\n", - " <td>Daylight</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>Dry</td>\n", - " <td>One way street</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>...</td>\n", - " <td>36 - 45</td>\n", - " <td>1.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>MAZDA</td>\n", - " <td>2191.0</td>\n", - " <td>Male</td>\n", - " <td>None</td>\n", - " <td>Parked</td>\n", - " <td>Car</td>\n", - " </tr>\n", - " <tr>\n", - " <th>490713</th>\n", - " <td>A</td>\n", - " <td>Wednesday</td>\n", - " <td>Crossroads</td>\n", - " <td>Daylight</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>Dry</td>\n", - " <td>Dual carriageway</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>...</td>\n", - " <td>36 - 45</td>\n", - " <td>4.0</td>\n", - " <td>Bollard or refuge</td>\n", - " <td>None</td>\n", - " <td>SUZUKI</td>\n", - " <td>1586.0</td>\n", - " <td>Female</td>\n", - " <td>None</td>\n", - " <td>Going ahead other</td>\n", 
- " <td>Car</td>\n", - " </tr>\n", - " <tr>\n", - " <th>124192</th>\n", - " <td>A</td>\n", - " <td>Thursday</td>\n", - " <td>Roundabout</td>\n", - " <td>Daylight</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>Dry</td>\n", - " <td>Roundabout</td>\n", - " <td>None</td>\n", - " <td>30.0</td>\n", - " <td>...</td>\n", - " <td>26 - 35</td>\n", - " <td>2.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>RENAULT</td>\n", - " <td>4116.0</td>\n", - " <td>Male</td>\n", - " <td>None</td>\n", - " <td>Going ahead left-hand bend</td>\n", - " <td>Goods over 3.5t. and under 7.5t</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1855856</th>\n", - " <td>Unclassified</td>\n", - " <td>Monday</td>\n", - " <td>T or staggered junction</td>\n", - " <td>Darkness - lights lit</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>Wet or damp</td>\n", - " <td>Single carriageway</td>\n", - " <td>None</td>\n", - " <td>40.0</td>\n", - " <td>...</td>\n", - " <td>46 - 55</td>\n", - " <td>3.0</td>\n", - " <td>None</td>\n", - " <td>None</td>\n", - " <td>TOYOTA</td>\n", - " <td>1329.0</td>\n", - " <td>Female</td>\n", - " <td>None</td>\n", - " <td>Turning right</td>\n", - " <td>Car</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>463141 rows × 23 columns</p>\n", - "</div>" - ], - "text/plain": [ - " 1st_Road_Class Day_of_Week Junction_Detail \\\n", - "456708 B Thursday T or staggered junction \n", - "370736 A Saturday Not at junction or within 20 metres \n", - "243494 B Tuesday Not at junction or within 20 metres \n", - "785303 A Saturday Roundabout \n", - "1359698 Unclassified Wednesday Roundabout \n", - "... ... ... ... 
\n", - "1782633 A Wednesday T or staggered junction \n", - "1857601 Unclassified Saturday Crossroads \n", - "490713 A Wednesday Crossroads \n", - "124192 A Thursday Roundabout \n", - "1855856 Unclassified Monday T or staggered junction \n", - "\n", - " Light_Conditions Number_of_Casualties Number_of_Vehicles \\\n", - "456708 Daylight 1 2 \n", - "370736 Daylight 1 3 \n", - "243494 Daylight 1 4 \n", - "785303 Daylight 1 2 \n", - "1359698 Darkness - lights lit 1 2 \n", - "... ... ... ... \n", - "1782633 Daylight 1 2 \n", - "1857601 Daylight 1 2 \n", - "490713 Daylight 1 1 \n", - "124192 Daylight 1 1 \n", - "1855856 Darkness - lights lit 1 2 \n", - "\n", - " Road_Surface_Conditions Road_Type \\\n", - "456708 Dry Single carriageway \n", - "370736 Dry Single carriageway \n", - "243494 Dry Single carriageway \n", - "785303 Dry Roundabout \n", - "1359698 Wet or damp Roundabout \n", - "... ... ... \n", - "1782633 Dry Single carriageway \n", - "1857601 Dry One way street \n", - "490713 Dry Dual carriageway \n", - "124192 Dry Roundabout \n", - "1855856 Wet or damp Single carriageway \n", - "\n", - " Special_Conditions_at_Site Speed_limit ... Age_Band_of_Driver \\\n", - "456708 None 20.0 ... 36 - 45 \n", - "370736 None 60.0 ... 36 - 45 \n", - "243494 None 30.0 ... 26 - 35 \n", - "785303 None 60.0 ... 46 - 55 \n", - "1359698 None 30.0 ... 36 - 45 \n", - "... ... ... ... ... \n", - "1782633 None 30.0 ... 11 - 15 \n", - "1857601 None 30.0 ... 36 - 45 \n", - "490713 None 30.0 ... 36 - 45 \n", - "124192 None 30.0 ... 26 - 35 \n", - "1855856 None 40.0 ... 46 - 55 \n", - "\n", - " Age_of_Vehicle Hit_Object_in_Carriageway Hit_Object_off_Carriageway \\\n", - "456708 4.0 None None \n", - "370736 13.0 None None \n", - "243494 8.0 None None \n", - "785303 1.0 None None \n", - "1359698 6.0 None None \n", - "... ... ... ... 
\n", - "1782633 NaN None None \n", - "1857601 1.0 None None \n", - "490713 4.0 Bollard or refuge None \n", - "124192 2.0 None None \n", - "1855856 3.0 None None \n", - "\n", - " make Engine_Capacity_.CC. Sex_of_Driver Skidding_and_Overturning \\\n", - "456708 RENAULT 1870.0 Female None \n", - "370736 KAWASAKI 749.0 Male None \n", - "243494 PEUGEOT 1997.0 Male None \n", - "785303 FIAT 1108.0 Female None \n", - "1359698 HONDA 1799.0 Female None \n", - "... ... ... ... ... \n", - "1782633 NaN NaN Male None \n", - "1857601 MAZDA 2191.0 Male None \n", - "490713 SUZUKI 1586.0 Female None \n", - "124192 RENAULT 4116.0 Male None \n", - "1855856 TOYOTA 1329.0 Female None \n", - "\n", - " Vehicle_Manoeuvre Vehicle_Type \n", - "456708 Turning right Car \n", - "370736 Overtaking static vehicle - offside Motorcycle over 500cc \n", - "243494 Slowing or stopping Car \n", - "785303 Going ahead other Car \n", - "1359698 Turning left Car \n", - "... ... ... \n", - "1782633 Overtaking static vehicle - offside Pedal cycle \n", - "1857601 Parked Car \n", - "490713 Going ahead other Car \n", - "124192 Going ahead left-hand bend Goods over 3.5t. 
and under 7.5t \n", - "1855856 Turning right Car \n", - "\n", - "[463141 rows x 23 columns]" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_train" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "463141" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(X_train_transformed)" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "feature_names = [f\"feature {i}\" for i in range(X_train_transformed.shape[1])]" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['feature 0',\n", - " 'feature 1',\n", - " 'feature 2',\n", - " 'feature 3',\n", - " 'feature 4',\n", - " 'feature 5',\n", - " 'feature 6',\n", - " 'feature 7',\n", - " 'feature 8',\n", - " 'feature 9',\n", - " 'feature 10',\n", - " 'feature 11',\n", - " 'feature 12',\n", - " 'feature 13',\n", - " 'feature 14',\n", - " 'feature 15',\n", - " 'feature 16',\n", - " 'feature 17',\n", - " 'feature 18',\n", - " 'feature 19',\n", - " 'feature 20',\n", - " 'feature 21',\n", - " 'feature 22',\n", - " 'feature 23',\n", - " 'feature 24',\n", - " 'feature 25',\n", - " 'feature 26',\n", - " 'feature 27',\n", - " 'feature 28',\n", - " 'feature 29',\n", - " 'feature 30',\n", - " 'feature 31',\n", - " 'feature 32',\n", - " 'feature 33',\n", - " 'feature 34',\n", - " 'feature 35',\n", - " 'feature 36',\n", - " 'feature 37',\n", - " 'feature 38',\n", - " 'feature 39',\n", - " 'feature 40',\n", - " 'feature 41',\n", - " 'feature 42',\n", - " 'feature 43',\n", - " 'feature 44',\n", - " 'feature 45',\n", - " 'feature 46',\n", - " 'feature 47',\n", - " 'feature 48',\n", - " 'feature 49',\n", - " 'feature 50',\n", - " 'feature 51',\n", - " 'feature 52',\n", - " 'feature 53',\n", - " 
'feature 54',\n", - " 'feature 55',\n", - " 'feature 56',\n", - " 'feature 57',\n", - " 'feature 58',\n", - " 'feature 59',\n", - " 'feature 60',\n", - " 'feature 61',\n", - " 'feature 62',\n", - " 'feature 63',\n", - " 'feature 64',\n", - " 'feature 65',\n", - " 'feature 66',\n", - " 'feature 67',\n", - " 'feature 68',\n", - " 'feature 69',\n", - " 'feature 70',\n", - " 'feature 71',\n", - " 'feature 72',\n", - " 'feature 73',\n", - " 'feature 74',\n", - " 'feature 75',\n", - " 'feature 76',\n", - " 'feature 77',\n", - " 'feature 78',\n", - " 'feature 79',\n", - " 'feature 80',\n", - " 'feature 81',\n", - " 'feature 82',\n", - " 'feature 83',\n", - " 'feature 84',\n", - " 'feature 85',\n", - " 'feature 86',\n", - " 'feature 87',\n", - " 'feature 88',\n", - " 'feature 89',\n", - " 'feature 90',\n", - " 'feature 91',\n", - " 'feature 92',\n", - " 'feature 93',\n", - " 'feature 94',\n", - " 'feature 95',\n", - " 'feature 96',\n", - " 'feature 97',\n", - " 'feature 98',\n", - " 'feature 99',\n", - " 'feature 100',\n", - " 'feature 101',\n", - " 'feature 102',\n", - " 'feature 103',\n", - " 'feature 104',\n", - " 'feature 105',\n", - " 'feature 106',\n", - " 'feature 107',\n", - " 'feature 108',\n", - " 'feature 109',\n", - " 'feature 110',\n", - " 'feature 111',\n", - " 'feature 112',\n", - " 'feature 113',\n", - " 'feature 114',\n", - " 'feature 115',\n", - " 'feature 116',\n", - " 'feature 117',\n", - " 'feature 118',\n", - " 'feature 119',\n", - " 'feature 120',\n", - " 'feature 121',\n", - " 'feature 122',\n", - " 'feature 123',\n", - " 'feature 124',\n", - " 'feature 125',\n", - " 'feature 126',\n", - " 'feature 127',\n", - " 'feature 128',\n", - " 'feature 129',\n", - " 'feature 130',\n", - " 'feature 131',\n", - " 'feature 132',\n", - " 'feature 133',\n", - " 'feature 134',\n", - " 'feature 135',\n", - " 'feature 136',\n", - " 'feature 137',\n", - " 'feature 138',\n", - " 'feature 139',\n", - " 'feature 140',\n", - " 'feature 141',\n", - " 'feature 142',\n", - 
" 'feature 143',\n", - " 'feature 144',\n", - " 'feature 145',\n", - " 'feature 146',\n", - " 'feature 147',\n", - " 'feature 148',\n", - " 'feature 149',\n", - " 'feature 150',\n", - " 'feature 151',\n", - " 'feature 152',\n", - " 'feature 153',\n", - " 'feature 154',\n", - " 'feature 155',\n", - " 'feature 156',\n", - " 'feature 157',\n", - " 'feature 158',\n", - " 'feature 159',\n", - " 'feature 160',\n", - " 'feature 161',\n", - " 'feature 162',\n", - " 'feature 163',\n", - " 'feature 164',\n", - " 'feature 165',\n", - " 'feature 166',\n", - " 'feature 167',\n", - " 'feature 168',\n", - " 'feature 169',\n", - " 'feature 170',\n", - " 'feature 171',\n", - " 'feature 172',\n", - " 'feature 173',\n", - " 'feature 174',\n", - " 'feature 175',\n", - " 'feature 176',\n", - " 'feature 177',\n", - " 'feature 178',\n", - " 'feature 179',\n", - " 'feature 180',\n", - " 'feature 181',\n", - " 'feature 182',\n", - " 'feature 183',\n", - " 'feature 184',\n", - " 'feature 185',\n", - " 'feature 186',\n", - " 'feature 187',\n", - " 'feature 188',\n", - " 'feature 189',\n", - " 'feature 190',\n", - " 'feature 191',\n", - " 'feature 192',\n", - " 'feature 193',\n", - " 'feature 194',\n", - " 'feature 195',\n", - " 'feature 196',\n", - " 'feature 197',\n", - " 'feature 198',\n", - " 'feature 199',\n", - " 'feature 200',\n", - " 'feature 201',\n", - " 'feature 202',\n", - " 'feature 203',\n", - " 'feature 204',\n", - " 'feature 205',\n", - " 'feature 206',\n", - " 'feature 207',\n", - " 'feature 208',\n", - " 'feature 209',\n", - " 'feature 210',\n", - " 'feature 211',\n", - " 'feature 212',\n", - " 'feature 213',\n", - " 'feature 214',\n", - " 'feature 215',\n", - " 'feature 216',\n", - " 'feature 217',\n", - " 'feature 218']" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "feature_names" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "importances = 
rf.feature_importances_\n", - "std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "feature 0 0.023685\n", - "feature 1 0.000850\n", - "feature 2 0.016691\n", - "feature 3 0.013066\n", - "feature 4 0.003740\n", - " ... \n", - "feature 214 0.003173\n", - "feature 215 0.000064\n", - "feature 216 0.003639\n", - "feature 217 0.000013\n", - "feature 218 0.006902\n", - "Length: 219, dtype: float64" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "forest_importances = pd.Series(importances, index=feature_names)\n", - "forest_importances" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "<Figure size 432x288 with 1 Axes>" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# create confusion matrix\n", - "matrix = confusion_matrix(y_test, y_pred)\n", - "\n", - "# create dataframe\n", - "class_names = dataofcoll.Accident_Severity.values\n", - "dataframe = pd.DataFrame(matrix, index=['Serious or Fatal', 'Slight'], \n", - " columns=['Serious or Fatal', 'Slight'])\n", - "\n", - "# create heatmap\n", - "sns.heatmap(dataframe, annot=True, cbar=None, cmap='Blues')\n", - "plt.title('Confusion Matrix')\n", - "plt.tight_layout(), plt.xlabel('True Values'), plt.ylabel('Predicted Values')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - 
"name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}