# %% [markdown]
# ### Loading the dataset

# %%
from pathlib import Path

import pandas as pd

# Locate the data relative to the repository instead of one user's Desktop so
# that the notebook runs on any checkout.
# NOTE(review): assumes the kernel's working directory is `notebooks/` (the
# Jupyter default for a notebook stored there) -- adjust DATA_DIR otherwise.
DATA_DIR = Path("..") / "data"

# Load the raw dataset
file_path = DATA_DIR / "raw" / "census_income_rawdata.csv"
df = pd.read_csv(file_path)
# %%
df.head()

# %% [markdown]
# ### Displaying each column's data type

# %%
df.dtypes

# %% [markdown]
# ### Checking for null values

# %%
df.isnull().sum()

# %% [markdown]
# ### Handling the null values and replacing them with "Unknown"

# %%
# Single placeholder used for every kind of missing value in this notebook.
MISSING_LABEL = "Unknown"

# Reassign instead of mutating in place so the cell has explicit lineage.
df = df.fillna(MISSING_LABEL)

# %% [markdown]
# ### Handling "?" by replacing it with "Unknown"

# %%
# Match "?" even when it is padded with whitespace: strings are only stripped
# further down, so an exact-match replace would let " ?" survive as "?".
df = df.replace(r"^\s*\?\s*$", MISSING_LABEL, regex=True)

# %% [markdown]
# ### Discretizing 'age' into age groups

# %%
def _three_way_bin(value, low_below, mid_upto, labels):
    """Bin a number into one of three labels.

    Parameters
    ----------
    value : number, str or missing
        The value to classify.
    low_below : number
        Values strictly below this threshold get ``labels[0]``.
    mid_upto : number
        Values from ``low_below`` up to and including this threshold get
        ``labels[1]``; anything larger gets ``labels[2]``.
    labels : tuple of three str
        Labels for the low, middle and high bins, in that order.

    Returns
    -------
    str
        The bin label. Strings are returned unchanged (this covers the
        "Unknown" placeholder and labels assigned by an earlier run, which
        keeps the calling cells re-runnable). NaN / None become
        ``MISSING_LABEL`` instead of silently landing in the highest bin.
    """
    if isinstance(value, str):
        return value
    if pd.isna(value):
        return MISSING_LABEL
    low_label, mid_label, high_label = labels
    if value < low_below:
        return low_label
    if value <= mid_upto:
        return mid_label
    return high_label


def age_group(age):
    """Label an age as "young" (< 30), "middle" (30-55) or "senior" (> 55)."""
    return _three_way_bin(age, 30, 55, ("young", "middle", "senior"))


df["age"] = df["age"].apply(age_group)

# %% [markdown]
# ### Discretizing 'hours-per-week' into hour groups

# %%
def hours_group(hours):
    """Label weekly hours as "low" (< 25), "average" (25-40) or "high" (> 40)."""
    return _three_way_bin(hours, 25, 40, ("low", "average", "high"))


df["hours-per-week"] = df["hours-per-week"].apply(hours_group)

# %% [markdown]
# ### Displaying all the unique values and their frequency in the target variable

# %%
df["income"].value_counts()

# %% [markdown]
# ### Removing "." from the values in the target variable to clean the inconsistent labels

# %%
# Part of the rows carry a trailing "." ("<=50K." / ">50K."); drop it so that
# only two distinct labels remain.
df["income"] = df["income"].str.strip().str.rstrip(".")

df["income"].value_counts()

# %% [markdown]
# ### Normalising the strings across the dataframe for cleaner data

# %%
def _normalise_text(value):
    """Trim and lower-case strings; return every other value untouched."""
    return value.strip().lower() if isinstance(value, str) else value


# Column-wise Series.map gives the same element-wise result as
# DataFrame.applymap without relying on a method that newer pandas deprecates.
# NOTE: this also lower-cases the target labels to "<=50k" / ">50k" and the
# placeholder to "unknown", so any later lookup has to use lower-case keys.
df = df.apply(lambda column: column.map(_normalise_text))
# %% [markdown]
# ### Encoding the target variable

# %%
# The normalisation cell above lower-cases every string in the frame, so the
# labels read "<=50k" / ">50k" at this point. Looking them up with upper-case
# "K" keys matches nothing and turns the whole target column into NaN, hence
# the lower-case keys here.
INCOME_CODES = {"<=50k": 0, ">50k": 1}

# Normalise the labels again locally so the lookup does not depend on exactly
# which clean-up steps ran before this cell.
income_labels = df["income"].str.strip().str.rstrip(".").str.lower()
income_encoded = income_labels.map(INCOME_CODES)

# Series.map yields NaN for any label missing from the dict; fail loudly
# rather than exporting a partly (or entirely) empty target column.
unmapped_labels = income_labels[income_encoded.isna()].unique().tolist()
assert not unmapped_labels, f"income labels without a code: {unmapped_labels}"

df["income"] = income_encoded

# %% [markdown]
# ### Removing all the unnecessary columns

# %%
columns_to_keep = [
    "age", "workclass", "education", "marital-status",
    "occupation", "relationship", "race", "sex",
    "hours-per-week", "native-country", "income",
]

df = df[columns_to_keep]

# %% [markdown]
# ### Creating a new CSV file for the preprocessed data

# %%
# Repo-relative target instead of one user's Desktop path.
# NOTE(review): assumes the kernel's working directory is `notebooks/` and
# that `../data/processed/` already exists. The file name (including the
# "cencus" spelling) is kept as-is because other code may load it by name.
df.to_csv("../data/processed/preprocessed_cencus_income_data.csv", index=False)