{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Fitted Values and Residuals\n",
        "\n",
        "Use the wage regression to calculate predicted wages, residuals, and squared residuals. Then visualize the vertical gaps between actual and fitted values."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "df = pd.read_csv(\"wage_sample.csv\")\n",
        "x = df[\"education\"]\n",
        "y = df[\"wage\"]\n",
        "\n",
        "slope = ((x - x.mean()) * (y - y.mean())).sum() / ((x - x.mean()) ** 2).sum()\n",
        "intercept = y.mean() - slope * x.mean()\n",
        "\n",
        "df[\"fitted_wage\"] = intercept + slope * df[\"education\"]\n",
        "df[\"residual\"] = df[\"wage\"] - df[\"fitted_wage\"]\n",
        "df[\"squared_residual\"] = df[\"residual\"] ** 2\n",
        "\n",
        "print(\"Fitted equation: wage_hat =\", round(intercept, 2), \"+\", round(slope, 2), \"* education\")\n",
        "print(df[[\"education\", \"wage\", \"fitted_wage\", \"residual\", \"squared_residual\"]].round(2))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "education_grid = np.linspace(df[\"education\"].min(), df[\"education\"].max(), 50)\n",
        "line = intercept + slope * education_grid\n",
        "\n",
        "plt.scatter(df[\"education\"], df[\"wage\"], label=\"Actual wage\")\n",
        "plt.plot(education_grid, line, label=\"Fitted line\")\n",
        "\n",
        "for _, row in df.iterrows():\n",
        "    plt.plot([row[\"education\"], row[\"education\"]], [row[\"wage\"], row[\"fitted_wage\"]], linestyle=\"--\")\n",
        "\n",
        "plt.xlabel(\"Years of education\")\n",
        "plt.ylabel(\"Hourly wage\")\n",
        "plt.title(\"Residuals are vertical gaps\")\n",
        "plt.legend()\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "plt.axhline(0, linestyle=\"--\")\n",
        "plt.scatter(df[\"education\"], df[\"residual\"])\n",
        "plt.xlabel(\"Years of education\")\n",
        "plt.ylabel(\"Residual\")\n",
        "plt.title(\"Residual plot\")\n",
        "plt.show()\n",
        "\n",
        "print(\"Largest absolute residual:\")\n",
        "print(df.loc[df[\"residual\"].abs().idxmax(), [\"education\", \"wage\", \"fitted_wage\", \"residual\"]].round(2))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Interpretation practice\n",
        "\n",
        "A positive residual means the worker earned more than the fitted line predicted. A negative residual means the worker earned less than the fitted line predicted."
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.11"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
