{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "is_executing": true,
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from shared.DescriptiveStats import create_stats\n",
    "\n",
    "path = '../data/insurance.csv'\n",
    "df = pd.read_csv(path)\n",
    "\n",
    "descriptiveData = {}\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>children</th>\n",
       "      <th>smoker</th>\n",
       "      <th>region</th>\n",
       "      <th>charges</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1338.000000</td>\n",
       "      <td>1338</td>\n",
       "      <td>1338.000000</td>\n",
       "      <td>1338.000000</td>\n",
       "      <td>1338</td>\n",
       "      <td>1338</td>\n",
       "      <td>1338.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>NaN</td>\n",
       "      <td>male</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>no</td>\n",
       "      <td>southeast</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>NaN</td>\n",
       "      <td>676</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1064</td>\n",
       "      <td>364</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>39.207025</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30.663397</td>\n",
       "      <td>1.094918</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13270.422265</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>14.049960</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.098187</td>\n",
       "      <td>1.205493</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12110.011237</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>18.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15.960000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1121.873900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>27.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.296250</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4740.287150</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>39.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30.400000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9382.033000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>51.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.693750</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16639.912515</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>64.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>53.130000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>63770.428010</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                age   sex          bmi     children smoker     region  \\\n",
       "count   1338.000000  1338  1338.000000  1338.000000   1338       1338   \n",
       "unique          NaN     2          NaN          NaN      2          4   \n",
       "top             NaN  male          NaN          NaN     no  southeast   \n",
       "freq            NaN   676          NaN          NaN   1064        364   \n",
       "mean      39.207025   NaN    30.663397     1.094918    NaN        NaN   \n",
       "std       14.049960   NaN     6.098187     1.205493    NaN        NaN   \n",
       "min       18.000000   NaN    15.960000     0.000000    NaN        NaN   \n",
       "25%       27.000000   NaN    26.296250     0.000000    NaN        NaN   \n",
       "50%       39.000000   NaN    30.400000     1.000000    NaN        NaN   \n",
       "75%       51.000000   NaN    34.693750     2.000000    NaN        NaN   \n",
       "max       64.000000   NaN    53.130000     5.000000    NaN        NaN   \n",
       "\n",
       "             charges  \n",
       "count    1338.000000  \n",
       "unique           NaN  \n",
       "top              NaN  \n",
       "freq             NaN  \n",
       "mean    13270.422265  \n",
       "std     12110.011237  \n",
       "min      1121.873900  \n",
       "25%      4740.287150  \n",
       "50%      9382.033000  \n",
       "75%     16639.912515  \n",
       "max     63770.428010  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe(include='all')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# Descriptive Analysis\n",
    "This type of analysis describes and summarises the data\n",
    "\n",
    "## Measures of central tendency\n",
    "These types of calculations look at the averages of the data set\n",
    "- Mean - The average. Add all the values and divide by the number of values.\n",
    "- Median - The middle number.\n",
    "- Mode - The value which appears the most in the set.\n",
    "\n",
    "## Measures of Spread\n",
    "How similar or different the data in the data set is.\n",
    "- Range - The difference between the highest and lowest value.\n",
    "- Interquartile Range - The spread of the middle half of the distribution, when the set has been ordered from lowest to highest.\n",
    "- Standard Deviation - Measures the amount of variation of dispersion in a set of values.\n",
    "- Variance - measure of how different/ variable values are from the average and from each other.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>median</th>\n",
       "      <th>mode</th>\n",
       "      <th>range</th>\n",
       "      <th>IQR</th>\n",
       "      <th>std</th>\n",
       "      <th>var</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>39.207025</td>\n",
       "      <td>39.000</td>\n",
       "      <td>0    18\n",
       "Name: age, dtype: int64</td>\n",
       "      <td>46.00000</td>\n",
       "      <td>24.000000</td>\n",
       "      <td>14.044709</td>\n",
       "      <td>1.972539e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bmi</th>\n",
       "      <td>30.663397</td>\n",
       "      <td>30.400</td>\n",
       "      <td>0    32.3\n",
       "Name: bmi, dtype: float64</td>\n",
       "      <td>37.17000</td>\n",
       "      <td>8.397500</td>\n",
       "      <td>6.095908</td>\n",
       "      <td>3.716009e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>children</th>\n",
       "      <td>1.094918</td>\n",
       "      <td>1.000</td>\n",
       "      <td>0    0\n",
       "Name: children, dtype: int64</td>\n",
       "      <td>5.00000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>1.205042</td>\n",
       "      <td>1.452127e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>charges</th>\n",
       "      <td>13270.422265</td>\n",
       "      <td>9382.033</td>\n",
       "      <td>0    1639.5631\n",
       "Name: charges, dtype: float64</td>\n",
       "      <td>62648.55411</td>\n",
       "      <td>11899.625365</td>\n",
       "      <td>12105.484976</td>\n",
       "      <td>1.465428e+08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  mean    median  \\\n",
       "age          39.207025    39.000   \n",
       "bmi          30.663397    30.400   \n",
       "children      1.094918     1.000   \n",
       "charges   13270.422265  9382.033   \n",
       "\n",
       "                                                  mode        range  \\\n",
       "age                    0    18\n",
       "Name: age, dtype: int64     46.00000   \n",
       "bmi                0    32.3\n",
       "Name: bmi, dtype: float64     37.17000   \n",
       "children           0    0\n",
       "Name: children, dtype: int64      5.00000   \n",
       "charges   0    1639.5631\n",
       "Name: charges, dtype: float64  62648.55411   \n",
       "\n",
       "                   IQR           std           var  \n",
       "age          24.000000     14.044709  1.972539e+02  \n",
       "bmi           8.397500      6.095908  3.716009e+01  \n",
       "children      2.000000      1.205042  1.452127e+00  \n",
       "charges   11899.625365  12105.484976  1.465428e+08  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "number_columns = df.select_dtypes(include='number')\n",
    "for column in number_columns:\n",
    "    descriptiveData.update(create_stats(column, df[column]))\n",
    "pd.DataFrame.from_dict(descriptiveData, orient = 'index')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}