{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Module 2 Regression Project Template\n",
        "\n",
        "Use this notebook as a student project starter. Change `y_name` and `x_name` (and the file name passed to `pd.read_csv`) if you add a different dataset later. For now, the template uses `wage_sample.csv`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# Load the project data (relative path, so the CSV is looked up from the working directory).\n",
        "df = pd.read_csv(\"wage_sample.csv\")\n",
        "\n",
        "# Student choice: edit these two names for a different project.\n",
        "y_name = \"wage\"\n",
        "x_name = \"education\"\n",
        "\n",
        "# Restate the research question using the chosen variable names.\n",
        "question = f\"How is {y_name} associated with {x_name} in this sample?\"\n",
        "print(\"Project question:\", question, sep=\"\\n\")\n",
        "\n",
        "# List every column so alternatives are visible, then summarise the two in use.\n",
        "print(\"\\nAvailable columns:\", df.columns.tolist())\n",
        "summary = df[[y_name, x_name]].describe()\n",
        "print(summary)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Scatter of the raw data: x on the horizontal axis, y on the vertical axis.\n",
        "fig, ax = plt.subplots()\n",
        "ax.scatter(df[x_name], df[y_name])\n",
        "ax.set_xlabel(x_name)\n",
        "ax.set_ylabel(y_name)\n",
        "ax.set_title(f\"{y_name} and {x_name}\")\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Use only rows where both variables are observed, so the means, slope,\n",
        "# residuals and R-squared are all computed on the same observations.\n",
        "complete = df[x_name].notna() & df[y_name].notna()\n",
        "x = df.loc[complete, x_name]\n",
        "y = df.loc[complete, y_name]\n",
        "\n",
        "# Deviations from the sample means are reused by the slope and by R-squared.\n",
        "x_dev = x - x.mean()\n",
        "y_dev = y - y.mean()\n",
        "sxx = (x_dev ** 2).sum()\n",
        "if sxx == 0:\n",
        "    # Constant x (or no usable rows) would otherwise divide by zero and print nan.\n",
        "    raise ValueError(f\"{x_name} has no variation in the usable rows, so a slope cannot be estimated.\")\n",
        "\n",
        "# Simple OLS: slope = Sxy / Sxx, and the fitted line passes through (mean x, mean y).\n",
        "slope = (x_dev * y_dev).sum() / sxx\n",
        "intercept = y.mean() - slope * x.mean()\n",
        "fitted = intercept + slope * x\n",
        "residuals = y - fitted\n",
        "# R-squared = 1 - SSR / SST; it is undefined if y has no variation.\n",
        "r_squared = 1 - (residuals ** 2).sum() / (y_dev ** 2).sum()\n",
        "\n",
        "print(\"Estimated regression\")\n",
        "print(f\"{y_name}_hat = {intercept:.2f} + {slope:.2f} * {x_name}\")\n",
        "print(\"R-squared:\", round(r_squared, 3))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Evaluate the fitted line at 50 evenly spaced x values across the observed range.\n",
        "x_grid = np.linspace(x.min(), x.max(), num=50)\n",
        "line = intercept + slope * x_grid\n",
        "\n",
        "# Overlay the fitted line on the data so the fit can be judged by eye.\n",
        "fig, ax = plt.subplots()\n",
        "ax.scatter(x, y, label=\"Actual data\")\n",
        "ax.plot(x_grid, line, label=\"Fitted line\")\n",
        "ax.set_xlabel(x_name)\n",
        "ax.set_ylabel(y_name)\n",
        "ax.set_title(\"Final project graph\")\n",
        "ax.legend()\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Word the slope by its sign so a negative estimate reads \"0.50 fewer units\"\n",
        "# instead of \"-0.50 more units\"; non-negative slopes print exactly as before.\n",
        "direction = \"more\" if slope >= 0 else \"fewer\"\n",
        "interpretation = f\"In this sample, one more unit of {x_name} is associated with {abs(slope):.2f} {direction} units of predicted {y_name}.\"\n",
        "# R-squared is reported both as a proportion and as a percentage of sample variation.\n",
        "fit_sentence = f\"The R-squared is {r_squared:.3f}, so the model explains about {100*r_squared:.1f}% of the sample variation in {y_name}.\"\n",
        "# Standing reminder that a fitted slope describes association, not causation.\n",
        "caution = \"This is a simple association. A causal claim would require stronger assumptions or a stronger research design.\"\n",
        "\n",
        "print(interpretation)\n",
        "print(fit_sentence)\n",
        "print(caution)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Student submission checklist\n",
        "\n",
        "- State the research question.\n",
        "- Identify y and x.\n",
        "- Include one scatter plot and one fitted line.\n",
        "- Report the fitted equation and R-squared.\n",
        "- Interpret the slope in units.\n",
        "- Add one caution about causation or omitted variables."
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.11"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
