{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# OLS and R-squared by Formula\n",
        "\n",
        "Compute the simple regression slope, intercept, residual sum of squares (SSR), total sum of squares (SST), explained sum of squares (SSE), and R-squared step by step, then compare the SSR across a grid of trial slopes."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Load the sample and keep only rows where both variables are observed.\n",
        "# pandas reductions skip NaN column by column, so without this step the means,\n",
        "# the slope numerator and the slope denominator could each be computed on a\n",
        "# different set of rows.\n",
        "df = pd.read_csv(\"wage_sample.csv\")\n",
        "complete_rows = df.dropna(subset=[\"education\", \"wage\"])\n",
        "x = complete_rows[\"education\"]\n",
        "y = complete_rows[\"wage\"]\n",
        "\n",
        "# Sample means of the regressor and the outcome.\n",
        "x_bar = x.mean()\n",
        "y_bar = y.mean()\n",
        "\n",
        "# Slope = sum of cross-deviations / sum of squared deviations of x.\n",
        "numerator = ((x - x_bar) * (y - y_bar)).sum()\n",
        "denominator = ((x - x_bar) ** 2).sum()\n",
        "assert denominator > 0, \"education has no variation, so the OLS slope is undefined\"\n",
        "slope = numerator / denominator\n",
        "\n",
        "# The fitted line passes through the point of means (x_bar, y_bar).\n",
        "intercept = y_bar - slope * x_bar\n",
        "\n",
        "print(\"x mean:\", round(x_bar, 2))\n",
        "print(\"y mean:\", round(y_bar, 2))\n",
        "print(\"slope numerator:\", round(numerator, 2))\n",
        "print(\"slope denominator:\", round(denominator, 2))\n",
        "print(\"intercept:\", round(intercept, 2))\n",
        "print(\"slope:\", round(slope, 2))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Predicted outcome on the estimated line, and the part the line misses.\n",
        "fitted = slope * x + intercept\n",
        "residuals = y.sub(fitted)\n",
        "\n",
        "# Variation decomposition: the explained part is total minus residual.\n",
        "sst = y.sub(y_bar).pow(2).sum()\n",
        "ssr = residuals.pow(2).sum()\n",
        "sse = sst - ssr\n",
        "\n",
        "# One minus the share of total variation left in the residuals.\n",
        "r_squared = 1 - ssr / sst\n",
        "\n",
        "print(\"SSR, residual variation:\", round(ssr, 2))\n",
        "print(\"SST, total variation:\", round(sst, 2))\n",
        "print(\"SSE, explained variation:\", round(sse, 2))\n",
        "print(\"R-squared:\", round(r_squared, 3))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Candidate slopes: 14 evenly spaced values from 1.0 to 3.6.\n",
        "trial_slopes = np.linspace(1.0, 3.6, 14)\n",
        "\n",
        "\n",
        "def ssr_for_slope(candidate_slope):\n",
        "    \"\"\"Sum of squared residuals for a line with this slope through (x_bar, y_bar).\"\"\"\n",
        "    candidate_intercept = y_bar - candidate_slope * x_bar\n",
        "    candidate_fitted = candidate_intercept + candidate_slope * x\n",
        "    return ((y - candidate_fitted) ** 2).sum()\n",
        "\n",
        "\n",
        "# One record per candidate slope, collected into a single frame.\n",
        "records = [\n",
        "    {\"trial_slope\": trial_slope, \"SSR\": ssr_for_slope(trial_slope)}\n",
        "    for trial_slope in trial_slopes\n",
        "]\n",
        "\n",
        "comparison = pd.DataFrame(records)\n",
        "print(comparison.round(2))\n",
        "\n",
        "# SSR against the candidate slope, with the OLS estimate marked by a dashed line.\n",
        "fig, ax = plt.subplots()\n",
        "ax.plot(comparison[\"trial_slope\"], comparison[\"SSR\"], marker=\"o\")\n",
        "ax.axvline(slope, linestyle=\"--\", label=\"OLS slope\")\n",
        "ax.set_xlabel(\"Trial slope\")\n",
        "ax.set_ylabel(\"Sum of squared residuals\")\n",
        "ax.set_title(\"OLS chooses the slope with the lowest SSR\")\n",
        "ax.legend()\n",
        "plt.show()"
      ]
    },
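    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Interpretation practice\n",
        "\n",
        "OLS picks the line with the smallest sum of squared residuals. R-squared compares residual variation with total variation in the outcome: $R^2 = 1 - SSR/SST$, the share of the total variation in the outcome that the fitted line explains. In this notebook SSR is the residual sum of squares and SSE is the explained sum of squares; some textbooks use these two abbreviations the other way around."
      ]
    }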
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.11"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
