diff --git a/Q_LEARNING_DATESET3.ipynb b/Q_LEARNING_DATESET3.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b6bbb376d225813de79e456c99260e20878e56a9 --- /dev/null +++ b/Q_LEARNING_DATESET3.ipynb @@ -0,0 +1,2649 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# This file implements Q-Learning for Dataset 3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q-Learning and Double Q-Learning Algorithm Description**<br>\n", + "\n", + "**Q-Learning:**\n", + "\n", + "Q-learning is a reinforcement learning algorithm used to find the optimal action-selection policy for a given finite Markov decision process (MDP).\n", + "The algorithm uses a Q-table to store the expected utility of actions taken in specific states.\n", + "The agent updates the Q-values based on the rewards received and the expected future rewards (using the Bellman equation).\n", + "\n", + "**Double Q-Learning:**\n", + "\n", + "Double Q-learning addresses the overestimation bias in Q-learning by using two separate Q-tables (Q1 and Q2).\n", + "Each Q-table is updated independently, using the other Q-table to estimate the value of the next state.\n", + "\n", + "**Implementation Steps**\n", + "1. Preprocess the Data: Load and preprocess the bank dataset.<br>\n", + "\n", + "2. Implement Q-Learning: Set up and train a Q-learning model.<br>\n", + "\n", + "3. Implement Double Q-Learning: Set up and train a Double Q-learning model.<br>\n", + "\n", + "4. Evaluate the Models: Evaluate the performance of both models.<br>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": 
"code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read a CSV file, print its size, and report null values and duplicate rows\n",
+ "def read_dataset(dataset):\n",
+ "    data = pd.read_csv(dataset)\n",
+ "    nrow = len(data.index)\n",
+ "    ncol = len(data.columns)\n",
+ "\n",
+ "    print(\"The dataset contains\", format(nrow, \",d\"), \"rows and\", ncol, \"columns.\")\n",
+ "\n",
+ "    # Check for null values\n",
+ "    if ((data.isna().sum()).sum()) > 0:\n",
+ "        print(\"There are null items in the dataset\")\n",
+ "    else:\n",
+ "        print(\"There are no null items in the dataset\")\n",
+ "\n",
+ "    # Check for fully duplicated rows\n",
+ "    if (data.duplicated().sum()) > 0:\n",
+ "        print(\"There are duplicates in the dataset\")\n",
+ "    else:\n",
+ "        print(\"There are no duplicates in the dataset\")\n",
+ "\n",
+ "    return data\n",
+ "\n",
+ "\n",
+ "# Split the column names of data into numeric and categorical lists by dtype\n",
+ "def categorize_data(data):\n",
+ "    numeric = []\n",
+ "    categorical = []\n",
+ "    numeric_dtypes = [\"int64\", \"int32\", \"float64\", \"float32\"]\n",
+ "\n",
+ "    for col in data.columns:\n",
+ "        if data[col].dtype in numeric_dtypes:\n",
+ "            numeric.append(col)\n",
+ "        else:\n",
+ "            categorical.append(col)\n",
+ "\n",
+ "    return numeric, categorical\n",
+ "\n",
+ "\n",
+ "# Print how many values of each column in numeric lie more than 10 * IQR outside\n",
+ "# the quartiles. Returns the count for the last column checked (0 if numeric is empty).\n",
+ "# NOTE(review): the per-column counts in outliers_sum look like the intended return\n",
+ "# value; the last-column return is kept so existing callers are unaffected - confirm.\n",
+ "def outliers_check(data, numeric):\n",
+ "    outliers_sum = []\n",
+ "\n",
+ "    # Use the numeric argument, not a notebook-level global, so any column list works\n",
+ "    for col in numeric:\n",
+ "        Q1 = data[col].quantile(0.25)\n",
+ "        Q3 = data[col].quantile(0.75)\n",
+ "        IQR = Q3 - Q1\n",
+ "        outliers = (data[col] < (Q1 - 10 * IQR)) | (data[col] > (Q3 + 10 * IQR))\n",
+ "        print(col, \"\", outliers.sum())\n",
+ "        outliers_sum.append(outliers.sum())\n",
+ "\n",
+ "    return outliers_sum[-1] if outliers_sum else 0\n",
+ "\n",
+ "\n",
+ "# Drop rows flagged by data.duplicated() and print how many were found\n",
+ "def remove_duplicates(data):\n",
+ "    duplicated_sum = data.duplicated().sum()\n",
+ "    print(\"Number of duplicated rows in dataset =\", duplicated_sum)\n",
+ "    if duplicated_sum == 0:\n",
+ "        return data\n",
+ "\n",
+ "    data = data[~data.duplicated()]\n",
+ "    print(\"Duplicated rows have been removed\")\n",
+ "    return data\n",
+ "\n",
+ "\n",
+ "# Replace values more than 10 * IQR outside the quartiles with the column median.\n",
+ "# The DataFrame is modified in place and also returned.\n",
+ "def remove_outliers(data, numeric):\n",
+ "    for col in numeric:\n",
+ "        median_value = np.median(data[col])\n",
+ "        Q1 = data[col].quantile(0.25)\n",
+ "        Q3 = data[col].quantile(0.75)\n",
+ "        IQR = Q3 - Q1\n",
+ "        outliers = (data[col] < (Q1 - 10 * IQR)) | (data[col] > (Q3 + 10 * IQR))\n",
+ "        data.loc[outliers, col] = median_value\n",
+ "    return data\n",
+ "\n",
+ "\n",
+ "# Remove special (non-word) characters from the given string columns, in place\n",
+ "def remove_spec_chars(data, categorical_cols):\n",
+ "    for col in categorical_cols:\n",
+ "        # regex=True is explicit because pandas is changing the default to a literal match\n",
+ "        data[col] = data[col].str.replace(r'\\W+', '', regex=True).str.strip()\n",
+ "    return data\n",
+ "\n",
+ "\n",
+ "# Replace \"unknown\" with the column's most frequent value (no-op if that value is \"unknown\")\n",
+ "def replace_unknown(data, categorical_cols):\n",
+ "    for col in categorical_cols:\n",
+ "        if \"unknown\" in data[col].values:\n",
+ "            #source: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mode.html\n",
+ "            mode = data[col].mode()[0]\n",
+ "            data[col] = data[col].replace(\"unknown\", mode)\n",
+ "    return data\n",
+ "\n",
+ "\n",
+ "# One-hot encode the categorical columns with pandas.get_dummies\n",
+ "def oneHotEncoding(data, categorical, drop_first):\n",
+ "    data_final = pd.get_dummies(data, columns=categorical, drop_first=drop_first)\n",
+ "    return data_final"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The dataset contains 11,162 rows and 17 columns.\n",
+ "There are no null items in the dataset\n",
+ "There are no duplicates in the dataset\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "data = read_dataset(\"bank1.csv\")"
+ ] +
}, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>job</th>\n", + " <th>marital</th>\n", + " <th>education</th>\n", + " <th>default</th>\n", + " <th>balance</th>\n", + " <th>housing</th>\n", + " <th>loan</th>\n", + " <th>contact</th>\n", + " <th>day</th>\n", + " <th>month</th>\n", + " <th>duration</th>\n", + " <th>campaign</th>\n", + " <th>pdays</th>\n", + " <th>previous</th>\n", + " <th>poutcome</th>\n", + " <th>deposit</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>59</td>\n", + " <td>admin.</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2343</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1042</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>56</td>\n", + " <td>admin.</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>45</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1467</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>41</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>1270</td>\n", 
+ " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1389</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>55</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2476</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>579</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54</td>\n", + " <td>admin.</td>\n", + " <td>married</td>\n", + " <td>tertiary</td>\n", + " <td>no</td>\n", + " <td>184</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>673</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " age job marital education default balance housing loan contact \\\n", + "0 59 admin. married secondary no 2343 yes no unknown \n", + "1 56 admin. married secondary no 45 no no unknown \n", + "2 41 technician married secondary no 1270 yes no unknown \n", + "3 55 services married secondary no 2476 yes no unknown \n", + "4 54 admin. 
married tertiary no 184 no no unknown \n", + "\n", + " day month duration campaign pdays previous poutcome deposit \n", + "0 5 may 1042 1 -1 0 unknown yes \n", + "1 5 may 1467 1 -1 0 unknown yes \n", + "2 5 may 1389 1 -1 0 unknown yes \n", + "3 5 may 579 1 -1 0 unknown yes \n", + "4 5 may 673 2 -1 0 unknown yes " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.set_option('display.max_columns', None)\n", + "\n", + "(data).head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 11162 entries, 0 to 11161\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 age 11162 non-null int64 \n", + " 1 job 11162 non-null object\n", + " 2 marital 11162 non-null object\n", + " 3 education 11162 non-null object\n", + " 4 default 11162 non-null object\n", + " 5 balance 11162 non-null int64 \n", + " 6 housing 11162 non-null object\n", + " 7 loan 11162 non-null object\n", + " 8 contact 11162 non-null object\n", + " 9 day 11162 non-null int64 \n", + " 10 month 11162 non-null object\n", + " 11 duration 11162 non-null int64 \n", + " 12 campaign 11162 non-null int64 \n", + " 13 pdays 11162 non-null int64 \n", + " 14 previous 11162 non-null int64 \n", + " 15 poutcome 11162 non-null object\n", + " 16 deposit 11162 non-null object\n", + "dtypes: int64(7), object(10)\n", + "memory usage: 1.4+ MB\n" + ] + } + ], + "source": [ + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe 
thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>balance</th>\n", + " <th>day</th>\n", + " <th>duration</th>\n", + " <th>campaign</th>\n", + " <th>pdays</th>\n", + " <th>previous</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>11162.000000</td>\n", + " <td>11162.000000</td>\n", + " <td>11162.000000</td>\n", + " <td>11162.000000</td>\n", + " <td>11162.000000</td>\n", + " <td>11162.000000</td>\n", + " <td>11162.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>41.231948</td>\n", + " <td>1528.538524</td>\n", + " <td>15.658036</td>\n", + " <td>371.993818</td>\n", + " <td>2.508421</td>\n", + " <td>51.330407</td>\n", + " <td>0.832557</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>11.913369</td>\n", + " <td>3225.413326</td>\n", + " <td>8.420740</td>\n", + " <td>347.128386</td>\n", + " <td>2.722077</td>\n", + " <td>108.758282</td>\n", + " <td>2.292007</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>18.000000</td>\n", + " <td>-6847.000000</td>\n", + " <td>1.000000</td>\n", + " <td>2.000000</td>\n", + " <td>1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>32.000000</td>\n", + " <td>122.000000</td>\n", + " <td>8.000000</td>\n", + " <td>138.000000</td>\n", + " <td>1.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>39.000000</td>\n", + " <td>550.000000</td>\n", + " <td>15.000000</td>\n", + " <td>255.000000</td>\n", + " <td>2.000000</td>\n", + " <td>-1.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>49.000000</td>\n", + " <td>1708.000000</td>\n", + " <td>22.000000</td>\n", + " <td>496.000000</td>\n", + 
" <td>3.000000</td>\n", + " <td>20.750000</td>\n", + " <td>1.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>95.000000</td>\n", + " <td>81204.000000</td>\n", + " <td>31.000000</td>\n", + " <td>3881.000000</td>\n", + " <td>63.000000</td>\n", + " <td>854.000000</td>\n", + " <td>58.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " age balance day duration campaign \\\n", + "count 11162.000000 11162.000000 11162.000000 11162.000000 11162.000000 \n", + "mean 41.231948 1528.538524 15.658036 371.993818 2.508421 \n", + "std 11.913369 3225.413326 8.420740 347.128386 2.722077 \n", + "min 18.000000 -6847.000000 1.000000 2.000000 1.000000 \n", + "25% 32.000000 122.000000 8.000000 138.000000 1.000000 \n", + "50% 39.000000 550.000000 15.000000 255.000000 2.000000 \n", + "75% 49.000000 1708.000000 22.000000 496.000000 3.000000 \n", + "max 95.000000 81204.000000 31.000000 3881.000000 63.000000 \n", + "\n", + " pdays previous \n", + "count 11162.000000 11162.000000 \n", + "mean 51.330407 0.832557 \n", + "std 108.758282 2.292007 \n", + "min -1.000000 0.000000 \n", + "25% -1.000000 0.000000 \n", + "50% -1.000000 0.000000 \n", + "75% 20.750000 1.000000 \n", + "max 854.000000 58.000000 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Summary statistics of the dataset\n", + "data.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "no 5873\n", + "yes 5289\n", + "Name: deposit, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Class imbalance\n", + "data['deposit'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7 numeric columns and 10 categorical columns\n" + ] 
+ } + ], + "source": [ + "#Categorize data into numeric and categorical\n", + "numeric_cols, categorical_cols = categorize_data(data)\n", + "\n", + "print(f\"{len(numeric_cols)} numeric columns and {len(categorical_cols)} categorical columns\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "age 0\n", + "balance 74\n", + "day 0\n", + "duration 0\n", + "campaign 27\n", + "pdays 917\n", + "previous 73\n" + ] + } + ], + "source": [ + "#Check for outliers\n", + "outliers = outliers_check(data, numeric_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 1008x720 with 2 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Convert categorical variables to numerical using one-hot encoding\n", + "data_encoded = pd.get_dummies(data)\n", + "\n", + "# Calculate the correlation matrix\n", + "corr = data_encoded.corr()\n", + "\n", + "# Create a heatmap\n", + "plt.figure(figsize=(14, 10))\n", + "sns.heatmap(corr, annot=False, cmap='coolwarm', fmt='.2f', linewidths=0.5)\n", + "plt.title('Heatmap of Correlation Matrix')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " 
<th>job</th>\n", + " <th>marital</th>\n", + " <th>education</th>\n", + " <th>default</th>\n", + " <th>balance</th>\n", + " <th>housing</th>\n", + " <th>loan</th>\n", + " <th>contact</th>\n", + " <th>day</th>\n", + " <th>month</th>\n", + " <th>duration</th>\n", + " <th>campaign</th>\n", + " <th>pdays</th>\n", + " <th>previous</th>\n", + " <th>poutcome</th>\n", + " <th>deposit</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>59</td>\n", + " <td>admin.</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2343</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1042</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>56</td>\n", + " <td>admin.</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>45</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1467</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>41</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>1270</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1389</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>55</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2476</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>579</td>\n", + " 
<td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54</td>\n", + " <td>admin.</td>\n", + " <td>married</td>\n", + " <td>tertiary</td>\n", + " <td>no</td>\n", + " <td>184</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>673</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11157</th>\n", + " <td>33</td>\n", + " <td>blue-collar</td>\n", + " <td>single</td>\n", + " <td>primary</td>\n", + " <td>no</td>\n", + " <td>1</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>20</td>\n", + " <td>apr</td>\n", + " <td>257</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11158</th>\n", + " <td>39</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>733</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>16</td>\n", + " <td>jun</td>\n", + " <td>83</td>\n", + " <td>4</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11159</th>\n", + " <td>32</td>\n", + " <td>technician</td>\n", + " <td>single</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>29</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " 
<td>cellular</td>\n", + " <td>19</td>\n", + " <td>aug</td>\n", + " <td>156</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11160</th>\n", + " <td>43</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " <td>yes</td>\n", + " <td>cellular</td>\n", + " <td>8</td>\n", + " <td>may</td>\n", + " <td>9</td>\n", + " <td>2</td>\n", + " <td>172</td>\n", + " <td>5</td>\n", + " <td>failure</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11161</th>\n", + " <td>34</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>9</td>\n", + " <td>jul</td>\n", + " <td>628</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>11162 rows × 17 columns</p>\n", + "</div>" + ], + "text/plain": [ + " age job marital education default balance housing loan \\\n", + "0 59 admin. married secondary no 2343 yes no \n", + "1 56 admin. married secondary no 45 no no \n", + "2 41 technician married secondary no 1270 yes no \n", + "3 55 services married secondary no 2476 yes no \n", + "4 54 admin. married tertiary no 184 no no \n", + "... ... ... ... ... ... ... ... ... 
\n", + "11157 33 blue-collar single primary no 1 yes no \n", + "11158 39 services married secondary no 733 no no \n", + "11159 32 technician single secondary no 29 no no \n", + "11160 43 technician married secondary no 0 no yes \n", + "11161 34 technician married secondary no 0 no no \n", + "\n", + " contact day month duration campaign pdays previous poutcome \\\n", + "0 unknown 5 may 1042 1 -1 0 unknown \n", + "1 unknown 5 may 1467 1 -1 0 unknown \n", + "2 unknown 5 may 1389 1 -1 0 unknown \n", + "3 unknown 5 may 579 1 -1 0 unknown \n", + "4 unknown 5 may 673 2 -1 0 unknown \n", + "... ... ... ... ... ... ... ... ... \n", + "11157 cellular 20 apr 257 1 -1 0 unknown \n", + "11158 unknown 16 jun 83 4 -1 0 unknown \n", + "11159 cellular 19 aug 156 2 -1 0 unknown \n", + "11160 cellular 8 may 9 2 172 5 failure \n", + "11161 cellular 9 jul 628 1 -1 0 unknown \n", + "\n", + " deposit \n", + "0 yes \n", + "1 yes \n", + "2 yes \n", + "3 yes \n", + "4 yes \n", + "... ... \n", + "11157 no \n", + "11158 no \n", + "11159 no \n", + "11160 no \n", + "11161 no \n", + "\n", + "[11162 rows x 17 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "#remove outliers\n", + "removed_outliers=remove_outliers(data, numeric_cols)\n", + "(removed_outliers)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "<ipython-input-4-c8f6f2d77743>:89: FutureWarning: The default value of regex will change from True to False in a future version.\n", + " data[col] = data[col].str.replace(r'\\W+', '').str.strip() #replaces special characters with white sapaces and removes the white spaces\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " 
vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>job</th>\n", + " <th>marital</th>\n", + " <th>education</th>\n", + " <th>default</th>\n", + " <th>balance</th>\n", + " <th>housing</th>\n", + " <th>loan</th>\n", + " <th>contact</th>\n", + " <th>day</th>\n", + " <th>month</th>\n", + " <th>duration</th>\n", + " <th>campaign</th>\n", + " <th>pdays</th>\n", + " <th>previous</th>\n", + " <th>poutcome</th>\n", + " <th>deposit</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>59</td>\n", + " <td>admin</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2343</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1042</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>56</td>\n", + " <td>admin</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>45</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1467</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>41</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>1270</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1389</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " 
<td>55</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2476</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>579</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54</td>\n", + " <td>admin</td>\n", + " <td>married</td>\n", + " <td>tertiary</td>\n", + " <td>no</td>\n", + " <td>184</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>673</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11157</th>\n", + " <td>33</td>\n", + " <td>bluecollar</td>\n", + " <td>single</td>\n", + " <td>primary</td>\n", + " <td>no</td>\n", + " <td>1</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>20</td>\n", + " <td>apr</td>\n", + " <td>257</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11158</th>\n", + " <td>39</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>733</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>unknown</td>\n", + " <td>16</td>\n", + " <td>jun</td>\n", + " <td>83</td>\n", + " <td>4</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", 
+ " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11159</th>\n", + " <td>32</td>\n", + " <td>technician</td>\n", + " <td>single</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>29</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>19</td>\n", + " <td>aug</td>\n", + " <td>156</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11160</th>\n", + " <td>43</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " <td>yes</td>\n", + " <td>cellular</td>\n", + " <td>8</td>\n", + " <td>may</td>\n", + " <td>9</td>\n", + " <td>2</td>\n", + " <td>172</td>\n", + " <td>5</td>\n", + " <td>failure</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11161</th>\n", + " <td>34</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>9</td>\n", + " <td>jul</td>\n", + " <td>628</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>11162 rows × 17 columns</p>\n", + "</div>" + ], + "text/plain": [ + " age job marital education default balance housing loan \\\n", + "0 59 admin married secondary no 2343 yes no \n", + "1 56 admin married secondary no 45 no no \n", + "2 41 technician married secondary no 1270 yes no \n", + "3 55 services married secondary no 2476 yes no \n", + "4 54 admin married tertiary no 184 no no \n", + "... ... ... ... ... ... ... ... ... 
\n", + "11157 33 bluecollar single primary no 1 yes no \n", + "11158 39 services married secondary no 733 no no \n", + "11159 32 technician single secondary no 29 no no \n", + "11160 43 technician married secondary no 0 no yes \n", + "11161 34 technician married secondary no 0 no no \n", + "\n", + " contact day month duration campaign pdays previous poutcome \\\n", + "0 unknown 5 may 1042 1 -1 0 unknown \n", + "1 unknown 5 may 1467 1 -1 0 unknown \n", + "2 unknown 5 may 1389 1 -1 0 unknown \n", + "3 unknown 5 may 579 1 -1 0 unknown \n", + "4 unknown 5 may 673 2 -1 0 unknown \n", + "... ... ... ... ... ... ... ... ... \n", + "11157 cellular 20 apr 257 1 -1 0 unknown \n", + "11158 unknown 16 jun 83 4 -1 0 unknown \n", + "11159 cellular 19 aug 156 2 -1 0 unknown \n", + "11160 cellular 8 may 9 2 172 5 failure \n", + "11161 cellular 9 jul 628 1 -1 0 unknown \n", + "\n", + " deposit \n", + "0 yes \n", + "1 yes \n", + "2 yes \n", + "3 yes \n", + "4 yes \n", + "... ... \n", + "11157 no \n", + "11158 no \n", + "11159 no \n", + "11160 no \n", + "11161 no \n", + "\n", + "[11162 rows x 17 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "removed_special_chars = remove_spec_chars(removed_outliers, categorical_cols)\n", + "\n", + "removed_special_chars" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>job</th>\n", + " <th>marital</th>\n", + " <th>education</th>\n", + " <th>default</th>\n", + " 
<th>balance</th>\n", + " <th>housing</th>\n", + " <th>loan</th>\n", + " <th>contact</th>\n", + " <th>day</th>\n", + " <th>month</th>\n", + " <th>duration</th>\n", + " <th>campaign</th>\n", + " <th>pdays</th>\n", + " <th>previous</th>\n", + " <th>poutcome</th>\n", + " <th>deposit</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>59</td>\n", + " <td>admin</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2343</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1042</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>56</td>\n", + " <td>admin</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>45</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1467</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>41</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>1270</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1389</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>55</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2476</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>579</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54</td>\n", + " <td>admin</td>\n", + " <td>married</td>\n", + " <td>tertiary</td>\n", + " <td>no</td>\n", + " <td>184</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>673</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11157</th>\n", + " <td>33</td>\n", + " <td>bluecollar</td>\n", + " <td>single</td>\n", + " <td>primary</td>\n", + " <td>no</td>\n", + " <td>1</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>20</td>\n", + " <td>apr</td>\n", + " <td>257</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11158</th>\n", + " <td>39</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>733</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>16</td>\n", + " <td>jun</td>\n", + " <td>83</td>\n", + " <td>4</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11159</th>\n", + " <td>32</td>\n", + " <td>technician</td>\n", + " <td>single</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>29</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>19</td>\n", + " <td>aug</td>\n", + " <td>156</td>\n", + " <td>2</td>\n", + " 
<td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11160</th>\n", + " <td>43</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " <td>yes</td>\n", + " <td>cellular</td>\n", + " <td>8</td>\n", + " <td>may</td>\n", + " <td>9</td>\n", + " <td>2</td>\n", + " <td>172</td>\n", + " <td>5</td>\n", + " <td>failure</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11161</th>\n", + " <td>34</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>9</td>\n", + " <td>jul</td>\n", + " <td>628</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>unknown</td>\n", + " <td>no</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>11162 rows × 17 columns</p>\n", + "</div>" + ], + "text/plain": [ + " age job marital education default balance housing loan \\\n", + "0 59 admin married secondary no 2343 yes no \n", + "1 56 admin married secondary no 45 no no \n", + "2 41 technician married secondary no 1270 yes no \n", + "3 55 services married secondary no 2476 yes no \n", + "4 54 admin married tertiary no 184 no no \n", + "... ... ... ... ... ... ... ... ... 
\n", + "11157 33 bluecollar single primary no 1 yes no \n", + "11158 39 services married secondary no 733 no no \n", + "11159 32 technician single secondary no 29 no no \n", + "11160 43 technician married secondary no 0 no yes \n", + "11161 34 technician married secondary no 0 no no \n", + "\n", + " contact day month duration campaign pdays previous poutcome \\\n", + "0 cellular 5 may 1042 1 -1 0 unknown \n", + "1 cellular 5 may 1467 1 -1 0 unknown \n", + "2 cellular 5 may 1389 1 -1 0 unknown \n", + "3 cellular 5 may 579 1 -1 0 unknown \n", + "4 cellular 5 may 673 2 -1 0 unknown \n", + "... ... ... ... ... ... ... ... ... \n", + "11157 cellular 20 apr 257 1 -1 0 unknown \n", + "11158 cellular 16 jun 83 4 -1 0 unknown \n", + "11159 cellular 19 aug 156 2 -1 0 unknown \n", + "11160 cellular 8 may 9 2 172 5 failure \n", + "11161 cellular 9 jul 628 1 -1 0 unknown \n", + "\n", + " deposit \n", + "0 yes \n", + "1 yes \n", + "2 yes \n", + "3 yes \n", + "4 yes \n", + "... ... \n", + "11157 no \n", + "11158 no \n", + "11159 no \n", + "11160 no \n", + "11161 no \n", + "\n", + "[11162 rows x 17 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#replace unknown with mode\n", + "\n", + "removed_unknown = replace_unknown(removed_special_chars, categorical_cols)\n", + "\n", + "removed_unknown" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>job</th>\n", + " <th>marital</th>\n", + " 
<th>education</th>\n", + " <th>default</th>\n", + " <th>balance</th>\n", + " <th>housing</th>\n", + " <th>loan</th>\n", + " <th>contact</th>\n", + " <th>day</th>\n", + " <th>month</th>\n", + " <th>duration</th>\n", + " <th>campaign</th>\n", + " <th>pdays</th>\n", + " <th>previous</th>\n", + " <th>deposit</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>59</td>\n", + " <td>admin</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2343</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1042</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>56</td>\n", + " <td>admin</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>45</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1467</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>41</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>1270</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>1389</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>55</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>2476</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>579</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54</td>\n", + " 
<td>admin</td>\n", + " <td>married</td>\n", + " <td>tertiary</td>\n", + " <td>no</td>\n", + " <td>184</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>5</td>\n", + " <td>may</td>\n", + " <td>673</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>yes</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11157</th>\n", + " <td>33</td>\n", + " <td>bluecollar</td>\n", + " <td>single</td>\n", + " <td>primary</td>\n", + " <td>no</td>\n", + " <td>1</td>\n", + " <td>yes</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>20</td>\n", + " <td>apr</td>\n", + " <td>257</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11158</th>\n", + " <td>39</td>\n", + " <td>services</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>733</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>16</td>\n", + " <td>jun</td>\n", + " <td>83</td>\n", + " <td>4</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11159</th>\n", + " <td>32</td>\n", + " <td>technician</td>\n", + " <td>single</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>29</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>19</td>\n", + " <td>aug</td>\n", + " <td>156</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11160</th>\n", + " <td>43</td>\n", + " <td>technician</td>\n", + " 
<td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " <td>yes</td>\n", + " <td>cellular</td>\n", + " <td>8</td>\n", + " <td>may</td>\n", + " <td>9</td>\n", + " <td>2</td>\n", + " <td>172</td>\n", + " <td>5</td>\n", + " <td>no</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11161</th>\n", + " <td>34</td>\n", + " <td>technician</td>\n", + " <td>married</td>\n", + " <td>secondary</td>\n", + " <td>no</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " <td>no</td>\n", + " <td>cellular</td>\n", + " <td>9</td>\n", + " <td>jul</td>\n", + " <td>628</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>no</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>11162 rows × 16 columns</p>\n", + "</div>" + ], + "text/plain": [ + " age job marital education default balance housing loan \\\n", + "0 59 admin married secondary no 2343 yes no \n", + "1 56 admin married secondary no 45 no no \n", + "2 41 technician married secondary no 1270 yes no \n", + "3 55 services married secondary no 2476 yes no \n", + "4 54 admin married tertiary no 184 no no \n", + "... ... ... ... ... ... ... ... ... \n", + "11157 33 bluecollar single primary no 1 yes no \n", + "11158 39 services married secondary no 733 no no \n", + "11159 32 technician single secondary no 29 no no \n", + "11160 43 technician married secondary no 0 no yes \n", + "11161 34 technician married secondary no 0 no no \n", + "\n", + " contact day month duration campaign pdays previous deposit \n", + "0 cellular 5 may 1042 1 -1 0 yes \n", + "1 cellular 5 may 1467 1 -1 0 yes \n", + "2 cellular 5 may 1389 1 -1 0 yes \n", + "3 cellular 5 may 579 1 -1 0 yes \n", + "4 cellular 5 may 673 2 -1 0 yes \n", + "... ... ... ... ... ... ... ... ... 
\n", + "11157 cellular 20 apr 257 1 -1 0 no \n", + "11158 cellular 16 jun 83 4 -1 0 no \n", + "11159 cellular 19 aug 156 2 -1 0 no \n", + "11160 cellular 8 may 9 2 172 5 no \n", + "11161 cellular 9 jul 628 1 -1 0 no \n", + "\n", + "[11162 rows x 16 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#drop \"poutcome\" column\n", + "categorical_cols.remove('poutcome')\n", + "\n", + "dropped_column = removed_unknown.drop(columns=[\"poutcome\"])\n", + "\n", + "dropped_column" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>age</th>\n", + " <th>balance</th>\n", + " <th>day</th>\n", + " <th>duration</th>\n", + " <th>campaign</th>\n", + " <th>pdays</th>\n", + " <th>previous</th>\n", + " <th>job_bluecollar</th>\n", + " <th>job_entrepreneur</th>\n", + " <th>job_housemaid</th>\n", + " <th>job_management</th>\n", + " <th>job_retired</th>\n", + " <th>job_selfemployed</th>\n", + " <th>job_services</th>\n", + " <th>job_student</th>\n", + " <th>job_technician</th>\n", + " <th>job_unemployed</th>\n", + " <th>marital_married</th>\n", + " <th>marital_single</th>\n", + " <th>education_secondary</th>\n", + " <th>education_tertiary</th>\n", + " <th>default_yes</th>\n", + " <th>housing_yes</th>\n", + " <th>loan_yes</th>\n", + " <th>contact_telephone</th>\n", + " <th>month_aug</th>\n", + " <th>month_dec</th>\n", + " <th>month_feb</th>\n", + " <th>month_jan</th>\n", + " <th>month_jul</th>\n", + " <th>month_jun</th>\n", + " 
<th>month_mar</th>\n", + " <th>month_may</th>\n", + " <th>month_nov</th>\n", + " <th>month_oct</th>\n", + " <th>month_sep</th>\n", + " <th>deposit_yes</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>59</td>\n", + " <td>2343</td>\n", + " <td>5</td>\n", + " <td>1042</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>56</td>\n", + " <td>45</td>\n", + " <td>5</td>\n", + " <td>1467</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>41</td>\n", + " <td>1270</td>\n", + " <td>5</td>\n", + " <td>1389</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " 
<td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>55</td>\n", + " <td>2476</td>\n", + " <td>5</td>\n", + " <td>579</td>\n", + " <td>1</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>54</td>\n", + " <td>184</td>\n", + " <td>5</td>\n", + " <td>673</td>\n", + " <td>2</td>\n", + " <td>-1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " 
</tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " age balance day duration campaign pdays previous job_bluecollar \\\n", + "0 59 2343 5 1042 1 -1 0 0 \n", + "1 56 45 5 1467 1 -1 0 0 \n", + "2 41 1270 5 1389 1 -1 0 0 \n", + "3 55 2476 5 579 1 -1 0 0 \n", + "4 54 184 5 673 2 -1 0 0 \n", + "\n", + " job_entrepreneur job_housemaid job_management job_retired \\\n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + " job_selfemployed job_services job_student job_technician \\\n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 1 \n", + "3 0 1 0 0 \n", + "4 0 0 0 0 \n", + "\n", + " job_unemployed marital_married marital_single education_secondary \\\n", + "0 0 1 0 1 \n", + "1 0 1 0 1 \n", + "2 0 1 0 1 \n", + "3 0 1 0 1 \n", + "4 0 1 0 0 \n", + "\n", + " education_tertiary default_yes housing_yes loan_yes contact_telephone \\\n", + "0 0 0 1 0 0 \n", + "1 0 0 0 0 0 \n", + "2 0 0 1 0 0 \n", + "3 0 0 1 0 0 \n", + "4 1 0 0 0 0 \n", + "\n", + " month_aug month_dec month_feb month_jan month_jul month_jun \\\n", + "0 0 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 \n", + "\n", + " month_mar month_may month_nov month_oct month_sep deposit_yes \n", + "0 0 1 0 0 0 1 \n", + "1 0 1 0 0 0 1 \n", + "2 0 1 0 0 0 1 \n", + "3 0 1 0 0 0 1 \n", + "4 0 1 0 0 0 1 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#One hot encoding\n", + "\n", + "data_encoded = oneHotEncoding(dropped_column, categorical_cols, True)\n", + "\n", + "data_encoded.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(11162, 37)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_encoded.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + " age balance day duration campaign pdays previous job_bluecollar \\\n", + "0 59 2343 5 1042 1 -1 0 0 \n", + "1 56 45 5 1467 1 -1 0 0 \n", + "2 41 1270 5 1389 1 -1 0 0 \n", + "3 55 2476 5 579 1 -1 0 0 \n", + "4 54 184 5 673 2 -1 0 0 \n", + "\n", + " job_entrepreneur job_housemaid job_management job_retired \\\n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + " job_selfemployed job_services job_student job_technician \\\n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 1 \n", + "3 0 1 0 0 \n", + "4 0 0 0 0 \n", + "\n", + " job_unemployed marital_married marital_single education_secondary \\\n", + "0 0 1 0 1 \n", + "1 0 1 0 1 \n", + "2 0 1 0 1 \n", + "3 0 1 0 1 \n", + "4 0 1 0 0 \n", + "\n", + " education_tertiary default_yes housing_yes loan_yes contact_telephone \\\n", + "0 0 0 1 0 0 \n", + "1 0 0 0 0 0 \n", + "2 0 0 1 0 0 \n", + "3 0 0 1 0 0 \n", + "4 1 0 0 0 0 \n", + "\n", + " month_aug month_dec month_feb month_jan month_jul month_jun \\\n", + "0 0 0 0 0 0 0 \n", + "1 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 \n", + "\n", + " month_mar month_may month_nov month_oct month_sep deposit_yes \n", + "0 0 1 0 0 0 1 \n", + "1 0 1 0 0 0 1 \n", + "2 0 1 0 0 0 1 \n", + "3 0 1 0 0 0 1 \n", + "4 0 1 0 0 0 1 \n" + ] + } + ], + "source": [ + "# Convert boolean columns from True/False to 1/0\n", + "binary_data = data_encoded.applymap(lambda x: 1 if x is True else (0 if x is False else x))\n", + "\n", + "# Check if the conversion is successful\n", + "print(binary_data.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + 
"execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 59 2343 5 ... 0 0 0]\n", + " [ 56 45 5 ... 0 0 0]\n", + " [ 41 1270 5 ... 0 0 0]\n", + " ...\n", + " [ 32 29 19 ... 0 0 0]\n", + " [ 43 0 8 ... 0 0 0]\n", + " [ 34 0 9 ... 0 0 0]]\n" + ] + } + ], + "source": [ + "# Define the states and actions\n", + "states = binary_data.drop(columns=['deposit_yes']).values\n", + "print(states)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "actions = [0, 1] # 0: no deposit, 1: deposit\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Split data into training and testing\n", + "train_states, test_states, train_target, test_target = train_test_split(states, binary_data['deposit_yes'], test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy of Q-learning policy: 49.57\n", + "Confusion Matrix for Q-learning:\n", + "[[572 594]\n", + " [532 535]]\n", + "Classification Report for Q-learning:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.52 0.49 0.50 1166\n", + " 1 0.47 0.50 0.49 1067\n", + "\n", + " accuracy 0.50 2233\n", + " macro avg 0.50 0.50 0.50 2233\n", + "weighted avg 0.50 0.50 0.50 2233\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x360 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy of Double Q-learning policy: 49.84\n", + "Confusion Matrix for Double Q-learning:\n", + "[[582 584]\n", + " [536 531]]\n", + "Classification Report for Double Q-learning:\n", + " precision recall f1-score support\n", + "\n", + " 0 0.52 0.50 0.51 1166\n", 
+ " 1 0.48 0.50 0.49 1067\n", + "\n", + " accuracy 0.50 2233\n", + " macro avg 0.50 0.50 0.50 2233\n", + "weighted avg 0.50 0.50 0.50 2233\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x360 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Initialize Q-table\n", + "q_table = np.zeros((train_states.shape[0], len(actions)))\n", + "\n", + "# Hyperparameters\n", + "alpha = 0.1 # Learning rate\n", + "gamma = 0.9 # Discount factor\n", + "epsilon = 0.1 # Exploration factor\n", + "rewards_per_episod =[]\n", + "\n", + "\n", + "# Initialize rewards_per_episode list\n", + "rewards_per_episode = []\n", + "\n", + "# Training loop for Q-learning\n", + "for episode in range(1000):\n", + " total_reward = 0\n", + " for state_index, state in enumerate(train_states):\n", + " if np.random.uniform(0, 1) < epsilon:\n", + " action = np.random.choice(actions)\n", + " else:\n", + " action = np.argmax(q_table[state_index])\n", + "\n", + " # Reward is whether the action matches the actual deposit status\n", + " reward = 1 if action == train_target.iloc[state_index] else -1\n", + " total_reward += reward\n", + "\n", + " # Update Q-table\n", + " old_value = q_table[state_index, action]\n", + " next_max = np.max(q_table[state_index])\n", + " new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)\n", + " q_table[state_index, action] = new_value\n", + " \n", + " # Append total reward for this episode\n", + " rewards_per_episode.append(total_reward)\n", + "\n", + "# Test the learned Q-table\n", + "predictions = []\n", + "for state in test_states:\n", + " state_index = np.where(np.all(train_states == state, axis=1))[0]\n", + " if state_index.size > 0:\n", + " action = np.argmax(q_table[state_index[0]])\n", + " else:\n", + " action = np.random.choice(actions) # Random action if state not in training\n", + " predictions.append(action)\n", + 
"\n", + "# Calculate accuracy\n", + "accuracy_rl = round(accuracy_score(test_target, predictions) * 100, 2)\n", + "print(\"Accuracy of Q-learning policy:\", accuracy_rl)\n", + "\n", + "# Print confusion matrix and classification report for Q-learning\n", + "print(\"Confusion Matrix for Q-learning:\")\n", + "print(confusion_matrix(test_target, predictions))\n", + "print(\"Classification Report for Q-learning:\")\n", + "print(classification_report(test_target, predictions))\n", + "\n", + "# Plotting the rewards\n", + "plt.figure(figsize=(10, 5))\n", + "plt.plot(rewards_per_episode, label='Rewards per Episode')\n", + "plt.xlabel('Episode')\n", + "plt.ylabel('Total Reward')\n", + "plt.title('Rewards per Episode Over Training')\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "# Scale the continuous features\n", + "scaler = StandardScaler()\n", + "train_states = scaler.fit_transform(train_states)\n", + "test_states = scaler.transform(test_states)\n", + "\n", + "# Initialize Q-tables for Double Q-learning\n", + "q_table_1 = np.zeros((len(train_states), len(actions)))\n", + "q_table_2 = np.zeros((len(train_states), len(actions)))\n", + "\n", + "# Hyperparameters\n", + "alpha = 0.1\n", + "gamma = 0.99\n", + "epsilon = 1.0\n", + "epsilon_min = 0.01\n", + "epsilon_decay = 0.995\n", + "num_episodes = 1000\n", + "rewards_per_episode = []\n", + "\n", + "# Training loop for Double Q-learning\n", + "for episode in range(num_episodes):\n", + " total_reward = 0\n", + " for state_index, state in enumerate(train_states):\n", + " if np.random.rand() < epsilon:\n", + " action = np.random.choice(actions)\n", + " else:\n", + " action = np.argmax(q_table_1[state_index] + q_table_2[state_index])\n", + "\n", + " next_state_index = np.random.randint(0, len(train_states))\n", + " next_state = train_states[next_state_index]\n", + " reward = 1 if action == train_target.iloc[state_index] else -1\n", + " total_reward += reward\n", + "\n", + " if np.random.rand() < 0.5:\n", + " best_next_action 
= np.argmax(q_table_1[next_state_index])\n", + " q_table_1[state_index, action] = (1 - alpha) * q_table_1[state_index, action] + alpha * (reward + gamma * q_table_2[next_state_index, best_next_action])\n", + " else:\n", + " best_next_action = np.argmax(q_table_2[next_state_index])\n", + " q_table_2[state_index, action] = (1 - alpha) * q_table_2[state_index, action] + alpha * (reward + gamma * q_table_1[next_state_index, best_next_action])\n", + " \n", + " rewards_per_episode.append(total_reward)\n", + " epsilon = max(epsilon_min, epsilon * epsilon_decay)\n", + "\n", + "# Testing the learned policy\n", + "predictions = []\n", + "for state in test_states:\n", + " state_index = np.where(np.all(train_states == state, axis=1))[0]\n", + " if state_index.size > 0:\n", + " action = np.argmax(q_table_1[state_index[0]] + q_table_2[state_index[0]])\n", + " else:\n", + " action = np.random.choice(actions) # Random action if state not in training\n", + " predictions.append(action)\n", + "\n", + "accuracy_double_q = round(accuracy_score(test_target, predictions) * 100, 2)\n", + "print(\"Accuracy of Double Q-learning policy:\", accuracy_double_q)\n", + "\n", + "# Print confusion matrix and classification report for Double Q-learning\n", + "print(\"Confusion Matrix for Double Q-learning:\")\n", + "print(confusion_matrix(test_target, predictions))\n", + "print(\"Classification Report for Double Q-learning:\")\n", + "print(classification_report(test_target, predictions))\n", + "\n", + "# Plotting the rewards\n", + "plt.figure(figsize=(10, 5))\n", + "plt.plot(rewards_per_episode, label='Rewards per Episode')\n", + "plt.xlabel('Episode')\n", + "plt.ylabel('Total Reward')\n", + "plt.title('Rewards per Episode Over Training')\n", + "plt.legend()\n", + "plt.show()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}