diff --git a/lectures/_static/quant-econ.bib b/lectures/_static/quant-econ.bib
index 48cf833a8..b23438f1c 100644
--- a/lectures/_static/quant-econ.bib
+++ b/lectures/_static/quant-econ.bib
@@ -3,6 +3,28 @@
 Note: Extended Information (like abstracts, doi, url's etc.) can be found in quant-econ-extendedinfo.bib file in _static/
 ###
+@article{evans2005interview,
+ title={An interview with {Thomas J. Sargent}},
+ author={Evans, George W and Honkapohja, Seppo},
+ journal={Macroeconomic Dynamics},
+ volume={9},
+ number={4},
+ pages={561--583},
+ year={2005},
+ publisher={Cambridge University Press}
+}
+
+@article{hansen2014nobel,
+ title={Nobel lecture: Uncertainty outside and inside economic models},
+ author={Hansen, Lars Peter},
+ journal={Journal of Political Economy},
+ volume={122},
+ number={5},
+ pages={945--987},
+ year={2014},
+ publisher={University of Chicago Press}
+}
+
 @book{Sargent_Stachurski_2025,
 place={Cambridge},
 title={Dynamic Programming: Finite States},
@@ -2460,6 +2482,16 @@ @article{hicks1937mr
 year={1937}
 }
 
+@article{GrossmanShiller1981,
+ title={The determinants of the variability of stock market prices},
+ author={Grossman, Sanford J and Shiller, Robert J},
+ journal={American Economic Review},
+ volume={71},
+ number={2},
+ pages={222--227},
+ year={1981}
+}
+
 @article{hansen1983stochastic,
 title={Stochastic consumption, risk aversion, and the temporal behavior of asset returns},
 author={Hansen, Lars Peter and Singleton, Kenneth J},
@@ -2480,6 +2512,69 @@ @article{hansen1982generalized
 publisher={JSTOR}
 }
 
+@article{abel1990asset,
+ title={Asset prices under habit formation and catching up with the {Joneses}},
+ author={Abel, Andrew B},
+ journal={American Economic Review},
+ volume={80},
+ number={2},
+ pages={38--42},
+ year={1990}
+}
+
+@article{campbell1999force,
+ title={By force of habit: A consumption-based explanation of aggregate stock market behavior},
+ author={Campbell, John Y and Cochrane, John H},
+ journal={Journal of
Political Economy},
+ volume={107},
+ number={2},
+ pages={205--251},
+ year={1999},
+ publisher={The University of Chicago Press}
+}
+
+@article{barro2006rare,
+ title={Rare disasters and asset markets in the twentieth century},
+ author={Barro, Robert J},
+ journal={The Quarterly Journal of Economics},
+ volume={121},
+ number={3},
+ pages={823--866},
+ year={2006},
+ publisher={MIT Press}
+}
+
+@incollection{Brock1982,
+ title={Asset prices in a production economy},
+ author={Brock, William A},
+ booktitle={The Economics of Information and Uncertainty},
+ pages={1--43},
+ year={1982},
+ publisher={University of Chicago Press}
+}
+
+@article{PrescottMehra1980,
+ title={Recursive competitive equilibrium: The case of homogeneous households},
+ author={Prescott, Edward C and Mehra, Rajnish},
+ journal={Econometrica},
+ volume={48},
+ number={6},
+ pages={1365--1379},
+ year={1980},
+ publisher={JSTOR}
+}
+
+@article{Hansen1982,
+ title={Large sample properties of generalized method of moments estimators},
+ author={Hansen, Lars Peter},
+ journal={Econometrica},
+ volume={50},
+ number={4},
+ pages={1029--1054},
+ year={1982},
+ publisher={JSTOR}
+}
+
 @incollection{Uhlig2001,
 author = {Uhlig, H},
 booktitle = {Computational Methods for the Study of Dynamic Economies},
diff --git a/lectures/_toc.yml b/lectures/_toc.yml
index 6744d84f6..d3f58830d 100644
--- a/lectures/_toc.yml
+++ b/lectures/_toc.yml
@@ -124,6 +124,8 @@ parts:
   numbered: true
   chapters:
   - file: markov_asset
+  - file: hansen_singleton_1983
+  - file: hansen_singleton_1982
   - file: doubts_or_variability
   - file: ge_arrow
   - file: harrison_kreps
diff --git a/lectures/doubts_or_variability.md b/lectures/doubts_or_variability.md
index 5aec9f635..33472d9b5 100644
--- a/lectures/doubts_or_variability.md
+++ b/lectures/doubts_or_variability.md
@@ -35,14 +35,14 @@ kernelspec:
 
 ## Overview
 
-This lecture describes machinery that empirical macro-finance economists have used to evaluate the fits of structural statistical models
that link asset prices to aggregate consumption. +This lecture describes machinery that empirical macro-finance economists have used to evaluate the fits of structural statistical models that link asset prices to aggregate consumption. The Lucas asset pricing model {cite}`Lucas1978` functions as a benchmark that motivates much of this work. ```{note} -New Keynesians call the consumption Euler equation for a one-period risk-free bond in the Lucas {cite}`Lucas1978` model the **IS curve**. +New Keynesians call the consumption Euler equation for a one-period risk-free bond in the Lucas {cite}`Lucas1978` model the **IS curve**. -The distinguished **old Keynesian** disapproved of that name because the object it described was so remote from the investment function that was an important component of the IS curve of John R. Hicks {cite}`hicks1937mr` that Tobin used. +The distinguished **old Keynesian** disapproved of that name because the object it described was so remote from the investment function that was an important component of the IS curve of John R. Hicks {cite}`hicks1937mr` that Tobin used. See {cite}`tobin1992old`. ``` @@ -59,13 +59,13 @@ The Hansen-Singleton papers systematically organized evidence about directions i ```{note} {cite:t}`MehraPrescott1985` is widely credited for naming the **equity premium** puzzle. -{cite:t}`Weil_1989` is widely credited for naming the **risk-free rate** puzzle. +{cite:t}`Weil_1989` is widely credited for naming the **risk-free rate** puzzle. ``` These *puzzles* are just ways of summarizing particular dimensions along which a particular asset pricing model -- such as Lucas's -- fails empirically. -They are thus special cases of specification failures detected by statistical diagnostics constructed earlier by {cite}`hansen1983stochastic` and {cite}`hansen1982generalized`. 
+They are thus special cases of specification failures detected by statistical diagnostics constructed earlier by {cite:t}`hansen1983stochastic` and {cite:t}`hansen1982generalized`. Macro-finance models that purport to resolve such puzzles all do so by changing features of the economic environment assumed by Lucas {cite}`Lucas1978`. @@ -75,39 +75,39 @@ Hansen-Jagannathan bounds are a key tool for evaluating how well such re-specifi correcting those misfits of Lucas's 1978 model. -This lecture begins with a description of the {cite}`Hansen_Jagannathan_1991` machinery. +This lecture begins with a description of the {cite:t}`Hansen_Jagannathan_1991` machinery. After doing that, we proceed to describe a line of research that altered Lucas's preference specification in ways that we can think of as being designed with the Hansen-Jagannathan bounds in mind. We'll organize much of this lecture around parts of the paper by Thomas Tallarini {cite}`Tall2000`. -His paper is particularly enlightening for macro-finance researchers because it showed that a recursive preference specification could fit both the equity premium and the risk-free rate, thus *resolving* both of the puzzles mentioned above. +His paper is particularly enlightening for macro-finance researchers because it showed that a recursive preference specification could fit both the equity premium and the risk-free rate, thus *resolving* both of the puzzles mentioned above. But like any good paper in applied economics, in answering some questions (i.e., resolving some puzzles), Tallarini's paper naturally posed new ones. -Thus, Tallarini's puzzles-resolving required setting the risk-aversion coefficient $\gamma$ to around 50 for a random-walk consumption model and around 75 for a trend-stationary model, exactly the range that provoked the skepticism in the above quote from {cite:t}`Lucas_2003`. 
+Thus, Tallarini's puzzles-resolving required setting the risk-aversion coefficient $\gamma$ to around 50 for a random-walk consumption model and around 75 for a trend-stationary model, exactly the range that provoked the skepticism in the above quote from {cite:t}`Lucas_2003`. This brings us to the next parts of this lecture. Lucas's skeptical response to Tallarini's explanation of the two puzzles led {cite:t}`BHS_2009` to ask whether those large $\gamma$ values really measure aversion to atemporal risk, or whether they instead measure the agent's doubts about the underlying probability model. -Their answer, and the theme of the remaining parts of this lecture, is that much of what looks like "risk aversion" can be reinterpreted as **model uncertainty**. +Their answer, and the theme of the remaining parts of this lecture, is that much of what looks like "risk aversion" can be reinterpreted as **model uncertainty**. -The same recursion that defines Tallarini's risk-sensitive agent is observationally equivalent to another recursion that expresses an agent's concern that the probability model governing consumption growth may be wrong. +The same recursion that defines Tallarini's risk-sensitive agent is observationally equivalent to another recursion that expresses an agent's concern that the probability model governing consumption growth may be wrong. -Under this reading, a parameter value that indicates extreme risk aversion in one interpretation of the recursion indicates concerns about *misspecification* in another interpretation of the same recursion. +Under this reading, a parameter value that indicates extreme risk aversion in one interpretation of the recursion indicates concerns about *misspecification* in another interpretation of the same recursion. {cite:t}`BHS_2009` show that modest amounts of model uncertainty can substitute for large amounts of risk aversion in terms of choices and effects on asset prices. 
This reinterpretation changes the welfare question that asset prices answer. -Do large risk premia measure the benefits from reducing well-understood aggregate fluctuations, or do they measure benefits from reducing doubts about the model describing consumption growth? +Do large risk premia measure the benefits from reducing well-understood aggregate fluctuations, or do they measure benefits from reducing doubts about the model describing consumption growth? -To proceed, we begin by describing {cite:t}`Hansen_Jagannathan_1991` bounds, then specify the statistical environment, lay out four related preference specifications and the connections among them, and finally revisit Tallarini's calibration through the lens of detection-error probabilities. +To proceed, we begin by describing {cite:t}`Hansen_Jagannathan_1991` bounds, then specify the statistical environment, lay out four related preference specifications and the connections among them, and finally revisit Tallarini's calibration through the lens of detection-error probabilities. Along the way, we draw on ideas and techniques from @@ -245,7 +245,7 @@ In words, no asset's Sharpe ratio can exceed the market price of risk. The bound {eq}`bhs_hj_bound` is stated in conditional terms. -There is an unconditional counterpart that involves a vector of $n$ gross returns $R_{t+1}$ (e.g., equity and risk-free) with unconditional mean $E(R)$ and covariance matrix $\Sigma_R$: +There is an unconditional counterpart that involves a vector of $n$ gross returns $R_{t+1}$ (e.g., equity and risk-free) with unconditional mean $E(R)$ and covariance matrix $\Sigma_R$: ```{math} :label: bhs_hj_unconditional @@ -258,6 +258,8 @@ b = \mathbf{1} - E(m) E(R). {ref}`Exercise 1 ` walks through a derivation of this unconditional bound. +Here $\mathbf{1}$ denotes an $n \times 1$ vector of ones. + The function below computes the right-hand side of {eq}`bhs_hj_unconditional` for any given value of $E(m)$. 
```{code-cell} ipython3 @@ -273,7 +275,7 @@ Reconciling formula {eq}`bhs_crra_sdf` with the market price of risk extracted f This is the **equity premium puzzle**. -But high values of $\gamma$ bring another difficulty. +But high values of $\gamma$ bring another difficulty. High values of $\gamma$ that deliver enough volatility $\sigma(m)$ also push $E(m)$, the reciprocal of the gross risk-free rate, too far down, away from the Hansen--Jagannathan bound. @@ -283,7 +285,7 @@ This is the **risk-free rate puzzle** of {cite:t}`Weil_1989`. The figure below reproduces Tallarini's key diagnostic. -Because it motivates much of what follows, we show Tallarini's figure before developing the underlying theory. +Because it motivates much of what follows, we show Tallarini's figure before developing the underlying theory. Closed-form expressions for the Epstein--Zin SDF moments used in the plot are derived in {ref}`Exercise 2 `. @@ -391,7 +393,7 @@ Instead, they reflect the agent's doubts about the probability model itself. ## The choice setting -To understand their reinterpretation, we first need to describe their statistical models of consumption growth. +To understand their reinterpretation, we first need to describe their statistical models of consumption growth. ### Shocks and consumption plans @@ -492,9 +494,10 @@ print(f"std[r_e-r_f]={r_excess_std:.4f}") We compare four preference specifications over consumption plans $C^\infty \in \mathcal{C}$. ```{note} -For origins of the names **multipler** and **constraint** preferences, see {cite:t}`HansenSargent2001`. -The risk-sensitive preference specification used here comes from {cite:t}`hansen1995discounted`, which adjusts specifications used earlier by -{cite:t}`jacobson1973optimal`, {cite:t}`Whittle_1981`, and {cite:t}`Whittle_1990` to accommodate discounting in a way that preserves time-invariant optimal decision rules. +For origins of the names **multiplier** and **constraint** preferences, see {cite:t}`HansenSargent2001`. 
+ +The risk-sensitive preference specification used here comes from {cite:t}`hansen1995discounted`, which adjusts specifications used earlier by +{cite:t}`jacobson1973optimal`, {cite:t}`Whittle_1981`, and {cite:t}`Whittle_1990` to accommodate discounting in a way that preserves time-invariant optimal decision rules. ``` *Type I agent (Kreps--Porteus--Epstein--Zin--Tallarini)* with @@ -543,7 +546,7 @@ $$ where $\hat g_{t+1}$ is a likelihood-ratio distortion that we will define in each case. -Along the way, we introduce the likelihood-ratio distortion that enters the stochastic discount factor and describe detection-error probabilities that will serve as our new calibration tool. +Along the way, we introduce the likelihood-ratio distortion that enters the stochastic discount factor and describe detection-error probabilities that will serve as our new calibration tool. ### Type I: Kreps--Porteus--Epstein--Zin--Tallarini preferences @@ -969,7 +972,7 @@ Under the worst-case measure $\varepsilon \sim \mathcal{N}(w(\theta),1)$, so $E_ E_t[\hat g_{t+1}\log \hat g_{t+1}] = w(\theta) \cdot w(\theta) - \frac{1}{2}w(\theta)^2 = \frac{1}{2}w(\theta)^2. ``` -Because the distortion is i.i.d., the conditional entropy $E_t[\hat g_{t+1}\log \hat g_{t+1}] = \frac{1}{2}w(\theta)^2$ from {eq}`bhs_conditional_entropy` is constant and $N(x)$ does not depend on $x$. +Because the distortion is iid, the conditional entropy $E_t[\hat g_{t+1}\log \hat g_{t+1}] = \frac{1}{2}w(\theta)^2$ from {eq}`bhs_conditional_entropy` is constant and $N(x)$ does not depend on $x$. The recursion {eq}`bhs_N_recursion` then reduces to $N(x) = \beta(\frac{1}{2}w(\theta)^2 + N(x))$, where we have used $\int \hat g(\varepsilon)\pi(\varepsilon)d\varepsilon = 1$ (since $\hat g$ is a likelihood ratio). 
@@ -1262,9 +1265,9 @@ p_hj_rw = brentq(sharpe_gap, 1e-4, 0.49, args=("rw",)) p_hj_ts = brentq(sharpe_gap, 1e-4, 0.49, args=("ts",)) fig, ax = plt.subplots(figsize=(7, 5)) -ax.plot(Em_rw_p, σ_m_rw_p, "o", +ax.plot(Em_rw_p, σ_m_rw_p, "o", lw=2, label="random walk") -ax.plot(Em_ts_p, σ_m_ts_p, "+", markersize=12, +ax.plot(Em_ts_p, σ_m_ts_p, "+", lw=2, markersize=12, label="trend stationary") ax.plot(Em_grid, HJ_std, lw=2, color="black", label="Hansen-Jagannathan bound") @@ -1282,9 +1285,9 @@ for p_hj, model, color, name, marker in [ ax.axhline(σ_m_hj, ls="--", lw=1, color=color, label=f"{name} reaches bound at $p = {p_hj:.3f}$") if model == "ts": - ax.plot(Em_hj, σ_m_hj, marker, markersize=12, color=color) + ax.plot(Em_hj, σ_m_hj, marker, lw=2, markersize=12, color=color) else: - ax.plot(Em_hj, σ_m_hj, marker, color=color) + ax.plot(Em_hj, σ_m_hj, marker, lw=2, color=color) ax.set_xlabel(r"$E(m)$") ax.set_ylabel(r"$\sigma(m)$") @@ -1322,8 +1325,7 @@ The right panel reveals the cumulative consequence: a per-period shift that is v --- mystnb: figure: - caption: Small one-step density shift (left) produces large cumulative consumption - gap (right) at detection-error probability $p = 0.03$ with $T = 240$ quarters + caption: Density shift and cumulative consumption name: fig-bhs-fear --- p_star = 0.03 @@ -1339,7 +1341,7 @@ f0 = norm.pdf(ε, 0, 1) fw = norm.pdf(ε, w_star, 1) ax1.fill_between(ε, f0, alpha=0.15, color='k') -ax1.plot(ε, f0, 'k', lw=2.5, +ax1.plot(ε, f0, 'k', lw=2, label=r'Approximating $\mathcal{N}(0, 1)$') ax1.fill_between(ε, fw, alpha=0.15, color='C3') ax1.plot(ε, fw, 'C3', lw=2, ls='--', @@ -1352,7 +1354,7 @@ ax1.text(w_star / 2, 0.59 * peak, f'$w = {w_star:.2f}$', ha='center', fontsize=11, color='C3') ax1.set_xlabel(r'$\varepsilon_{t+1}$') -ax1.set_ylabel('Density') +ax1.set_ylabel('density') ax1.legend(frameon=False) quarters = np.arange(0, 241) @@ -1361,8 +1363,8 @@ years = quarters / 4 gap_rw = 100 * σ_ε * w_star * quarters gap_ts = 100 * σ_ε * w_star * 
(1 - ρ**quarters) / (1 - ρ) -ax2.plot(years, gap_rw, 'C0', lw=2.5, label='Random walk') -ax2.plot(years, gap_ts, 'C1', lw=2.5, label='Trend stationary') +ax2.plot(years, gap_rw, 'C0', lw=2, label='random walk') +ax2.plot(years, gap_ts, 'C1', lw=2, label='trend stationary') ax2.fill_between(years, gap_rw, alpha=0.1, color='C0') ax2.fill_between(years, gap_ts, alpha=0.1, color='C1') ax2.axhline(0, color='k', lw=0.5, alpha=0.3) @@ -1373,8 +1375,8 @@ ax2.text(61, gap_rw[-1], f'{gap_rw[-1]:.1f}%', ax2.text(61, gap_ts[-1], f'{gap_ts[-1]:.1f}%', fontsize=10, color='C1', va='center') -ax2.set_xlabel('Years') -ax2.set_ylabel('Gap in expected log consumption (%)') +ax2.set_xlabel('years') +ax2.set_ylabel('gap in expected log consumption (%)') ax2.legend(frameon=False, loc='lower left') ax2.set_xlim(0, 68) @@ -1424,8 +1426,7 @@ What looks like extreme risk aversion ($\gamma \approx 34$) is really just log u --- mystnb: figure: - caption: Doubts or variability? Decomposition of the robust SDF into log-utility - IMRS and worst-case distortion at $p = 0.10$ + caption: Robust SDF log-utility decomposition name: fig-bhs-sdf-decomp --- θ_cal = θ_from_detection_probability(0.10, "rw") @@ -1449,8 +1450,8 @@ ax.plot(100 * Δc, log_ghat, 'C3', lw=2, ls='--', ax.plot(100 * Δc, log_sdf, 'k', lw=2, label=r'SDF: $\log m = \log\mathrm{IMRS} + \log\hat{g}$') ax.axhline(0, color='k', lw=0.5, alpha=0.3) -ax.set_xlabel(r'Consumption growth $\Delta c_{t+1}$ (%)') -ax.set_ylabel('Log SDF component') +ax.set_xlabel(r'consumption growth $\Delta c_{t+1}$ (%)') +ax.set_ylabel('log SDF component') ax.legend(frameon=False, fontsize=10, loc='upper right') plt.show() @@ -1703,7 +1704,7 @@ The following table collects all compensating variations for the random walk mod | III | $c_0^{III}(r) - c_0^{III}$ | $\frac{\beta\sigma_\varepsilon^2}{(1-\beta)^2\theta}$ | uncertainty only (vs. deterministic) | | III | $c_0 - c_0^{III}(u)$ | $\frac{\beta\sigma_\varepsilon^2}{(1-\beta)^3\theta}$ | uncertainty only (vs. 
risky path) | -The "vs. deterministic" rows use the certainty-equivalent path {eq}`bhs_ce_path` as a benchmark. +The "versus deterministic" rows use the certainty-equivalent path {eq}`bhs_ce_path` as a benchmark. The "vs. risky path" rows use the risky-but-uncertainty-free comparison of {eq}`bhs_comp_type2u`--{eq}`bhs_comp_type3u`. @@ -1917,8 +1918,7 @@ A comparison of the two panels reveals that the random-walk model generates much --- mystnb: figure: - caption: Type II compensation across detection-error probability and consumption - volatility + caption: Type II compensation contours name: fig-bhs-contour --- p_grid = np.linspace(0.02, 0.49, 300) @@ -1941,19 +1941,17 @@ levels = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50] fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5.5), sharey=True) -for ax, comp, title in [(ax1, comp_rw, 'Random walk'), - (ax2, comp_ts, 'Trend stationary')]: +for ax, comp in [(ax1, comp_rw), (ax2, comp_ts)]: cf = ax.contourf(100 * P, 100 * Σ, comp, levels=levels, cmap='Blues', extend='both') cs = ax.contour(100 * P, 100 * Σ, comp, levels=levels, colors='k', linewidths=0.5) ax.clabel(cs, fmt='%g%%', fontsize=8) - ax.plot(10, 0.5, 'x', markersize=14, color='w', + ax.plot(10, 0.5, 'x', lw=2, markersize=14, color='w', mec='k', mew=1, zorder=5) - ax.set_xlabel(r'Detection-error probability $p$ (%)') - ax.set_title(title) + ax.set_xlabel(r'detection-error probability $p$ (%)') -ax1.set_ylabel(r'Consumption volatility $\sigma_\varepsilon$ (%)') +ax1.set_ylabel(r'consumption volatility $\sigma_\varepsilon$ (%)') plt.tight_layout() plt.show() @@ -1961,11 +1959,11 @@ plt.show() ## Learning doesn't eliminate misspecification fears -A reasonable question arises: if the consumer has 235 quarters of data, can't she learn enough to dismiss the worst-case model? +A reasonable question arises: if the consumer has 235 quarters of data, can't she learn enough to dismiss the worst-case model? The answer is no. 
-This is because the drift is a low-frequency feature that is very hard to pin down. +This is because the drift is a low-frequency feature that is very hard to pin down. Estimating the mean of a random walk to the precision needed to reject small but economically meaningful shifts requires far more data than estimating volatility precisely does. @@ -2132,7 +2130,7 @@ plt.tight_layout() plt.show() ``` -In the left panel, postwar U.S. log consumption is shown alongside two deterministic trend lines: the approximating-model drift $\mu$ and the worst-case drift $\mu + \sigma_\varepsilon w(\theta)$ for $p(\theta^{-1}) = 0.20$. +In the left panel, postwar US log consumption is shown alongside two deterministic trend lines: the approximating-model drift $\mu$ and the worst-case drift $\mu + \sigma_\varepsilon w(\theta)$ for $p(\theta^{-1}) = 0.20$. The two trends are close enough that, even with six decades of data, it is hard to distinguish them by eye. @@ -2847,7 +2845,7 @@ In the Gaussian mean-shift setting of {ref}`Exercise 5 `, let $L_T$ be :class: dropdown ``` -Let the approximating model be $\varepsilon_i \sim \mathcal{N}(0,1)$ and the worst-case model be $\varepsilon_i \sim \mathcal{N}(w,1)$, i.i.d. for $i=1,\ldots,T$. +Let the approximating model be $\varepsilon_i \sim \mathcal{N}(0,1)$ and the worst-case model be $\varepsilon_i \sim \mathcal{N}(w,1)$, iid for $i=1,\ldots,T$. 
Take the log likelihood ratio in the direction that matches the definitions in the text: diff --git a/lectures/hansen_singleton_1982.md b/lectures/hansen_singleton_1982.md new file mode 100644 index 000000000..862b7ba03 --- /dev/null +++ b/lectures/hansen_singleton_1982.md @@ -0,0 +1,1192 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.17.1 +kernelspec: + name: python3 + display_name: Python 3 (ipykernel) + language: python +--- + +(hansen_singleton_1982)= +```{raw} jupyter + +``` + +# Estimating Euler Equations by Generalized Method of Moments + +```{index} single: Asset Pricing; GMM Estimation +``` + +```{contents} Contents +:depth: 2 +``` + +## Overview + +This lecture implements the generalized instrumental variables estimator of {cite:t}`hansen1982generalized` for nonlinear rational expectations models. + +The preceding lecture {doc}`hansen_singleton_1983` derives the consumption Euler equation from the representative consumer's problem with CRRA preferences and estimates it by maximum likelihood under normality assumptions. + +That approach requires specifying the joint distribution of consumption and returns, and its validity depends on lognormality being correct. + +However, as we saw in {doc}`hansen_singleton_1983`, the lognormal model is rejected by the data. + +Moreover, outside of linear-quadratic environments, closed-form solutions for equilibrium typically require strong assumptions about the stochastic properties of forcing variables, the nature of preferences, or the production technology. + +{cite:t}`hansen1982generalized` propose an estimation strategy that circumvents this requirement. + +The key idea is that the Euler equations from economic agents' optimization problems imply a set of population orthogonality conditions that depend on observable variables and unknown preference parameters. 
+ +By making sample counterparts of these orthogonality conditions close to zero, the parameters can be estimated without explicitly solving for the stochastic equilibrium and without specifying the distribution of the observable variables. + +Though maximum likelihood estimators (such as the MLE in {doc}`hansen_singleton_1983`) will be asymptotically more efficient when the distributional assumptions are correctly specified. + +Relative to the full paper, we only estimate one return at a time (value-weighted stock returns), using only monthly nondurable consumption (`ND`), and omitting their maximum-likelihood comparison (Table II) and multi-return systems (Table III). + +In addition to what comes with Anaconda, this lecture requires `pandas-datareader` + +```{code-cell} ipython3 +:tags: [hide-output] + +!pip install pandas-datareader +``` + +```{code-cell} ipython3 +import warnings + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from IPython.display import Latex +from numba import njit +from pandas_datareader import data as web +from scipy import stats +from scipy.optimize import minimize +from statsmodels.sandbox.regression import gmm +from statsmodels.tsa.stattools import acf + +warnings.filterwarnings( + "ignore", message=".*date_parser.*", category=FutureWarning +) +``` + +We also define a helper to display DataFrames as LaTeX arrays in the hidden cell below + +```{code-cell} ipython3 +:tags: [hide-cell] + +def display_table(df, title=None, fmt=None): + """ + Display a DataFrame as a LaTeX array. 
+ """ + if fmt is None: + fmt = {} + formatted = df.copy() + for col in formatted.columns: + if col in fmt: + formatted[col] = formatted[col].apply( + lambda x: fmt[col].format(x) if np.isfinite(x) else str(x)) + n_cols = len(formatted.columns) + idx_header = r"\text{" + df.index.name + "}" if df.index.name else "" + columns = " & ".join( + [idx_header] + [r"\text{" + c + "}" if "\\" not in c + and "^" not in c and "_" not in c + else c for c in formatted.columns]) + rows = r" \\".join( + [" & ".join([str(idx)] + [str(v) for v in row]) + for idx, row in zip(formatted.index, formatted.values)]) + col_format = "r" + "c" * n_cols + lines = [r"\begin{array}{" + col_format + "}"] + lines.append(columns + r" \\") + lines.append(r"\hline") + lines.append(rows) + lines.append(r"\end{array}") + latex = "\n".join(lines) + if title: + latex = rf"\textbf{{{title}}}" + r"\\" + "\n" + latex + display(Latex("$" + latex + "$")) +``` + + +## The economic model + +We consider a single-good economy with a representative consumer whose preferences are of the CRRA type, following {cite:t}`hansen1982generalized` and {cite:t}`hansen1983stochastic`. + +```{note} +The following discussion is very close to the setup in {doc}`hansen_singleton_1983`, but it is more general in that it allows for multiple assets with different maturities and does not assume lognormality. +``` + +The representative consumer chooses stochastic consumption and investment plans to maximize + +```{math} +:label: hs82-problem + +\max E_0 \sum_{t=0}^{\infty} \beta^t u(C_t) +``` + +where $C_t$ is consumption in period $t$, $\beta \in (0,1)$ is a subjective discount factor, and $u(\cdot)$ is a strictly concave period utility function. + +The consumer trades $N$ assets with potentially different maturities. 
+ +Let $Q_{jt}$ denote the quantity of asset $j$ held at the end of date $t$, $P_{jt}$ its price at date $t$, and $R_{jt}$ the date-$t$ payoff from holding one unit of an $M_j$-period asset purchased at date $t - M_j$. + +Feasible consumption and investment plans must satisfy the sequence of budget constraints + +```{math} +:label: hs82-budget + +C_t + \sum_{j=1}^{N} P_{jt} Q_{jt} \leq \sum_{j=1}^{N} R_{jt} Q_{jt-M_j} + W_t, +``` + +where $W_t$ is real labor income at date $t$. + +We specialize to CRRA preferences + +```{math} +:label: hs82-crra + +u(C_t) = \frac{C_t^{1-\gamma}}{1-\gamma}, \quad \gamma > 0, +``` + +where $\gamma$ is the coefficient of relative risk aversion. + +The maximization of {eq}`hs82-problem` subject to {eq}`hs82-budget` gives the first-order necessary conditions (see {cite:t}`Lucas1978`, {cite:t}`Brock1982`, {cite:t}`PrescottMehra1980`, and {doc}`hansen_singleton_1983`): + +```{math} +:label: hs82-general-euler + +P_{jt} u'(C_t) = \beta^{M_j} E_t\!\left[R_{jt+M_j} u'(C_{t+M_j})\right], \quad j = 1, \ldots, N. +``` + +When asset $j$ is a one-period stock ($M_j = 1$) with payoff $R_{jt+1} = P_{jt+1} + D_{jt+1}$ where $D_{jt}$ is the dividend, the **gross real return** is $R_{t+1}^i = (P_{i,t+1}+D_{i,t+1})/P_{i,t}$. + +Substituting the CRRA marginal utility into {eq}`hs82-general-euler` and dividing both sides by $P_{jt} u'(C_t)$ yields the Euler equation + +```{math} +:label: hs82-euler + +E_t\!\left[\beta \left(\frac{C_{t+1}}{C_t}\right)^{-\gamma} R_{t+1}^i\right] = 1, +``` + +where $R_{t+1}^i$ is the gross real return on asset $i$. + +We define the **stochastic discount factor** $M_{t+1}(\theta) = \beta (C_{t+1}/C_t)^{-\gamma}$ with parameter vector $\theta = (\gamma, \beta)$. + +In this notation the Euler equation becomes $E_t[M_{t+1}(\theta) R_{t+1}^i - 1] = 0$. + +As we have seen and will see, equation {eq}`hs82-euler` is the central object of both {doc}`hansen_singleton_1983` and this lecture. 
+ +It holds for every traded asset for which the agent's optimality conditions apply (interior solution, no binding portfolio constraints or transaction costs). + +It depends on observable quantities (consumption growth and returns) and unknown preference parameters ($\gamma$ and $\beta$). + +The challenge that {cite:t}`hansen1982generalized` address is how to estimate $\theta$ from {eq}`hs82-euler` without specifying the rest of the economic environment. + +## From conditional to unconditional moments + +A natural approach to estimating $\theta$ from {eq}`hs82-euler` would be to specify the entire economic environment, solve for equilibrium, and apply maximum likelihood. + +Just like what we did together in {doc}`hansen_singleton_1983`, + +{cite:t}`hansen1982generalized` argue that this is impractical for most nonlinear models because it requires strong assumptions about the stochastic properties of "forcing variables" and the production technology. + +Their alternative is to work directly with the Euler equation's implications for observable moments. + +The Euler equation {eq}`hs82-euler` states that $E_t[M_{t+1}(\theta_0) R_{t+1}^i - 1] = 0$. + +Let $z_t$ denote any $q$-dimensional vector of variables that are in the agent's time-$t$ information set and observed by the econometrician. + +Because $z_t$ is known at date $t$, + +$$ +E\!\left[\left(M_{t+1}(\theta_0)R_{t+1}^i - 1\right) \otimes z_t\right] += E\!\left[z_t \otimes \underbrace{E_t\!\left[M_{t+1}(\theta_0)R_{t+1}^i - 1\right]}_{=\,0\text{ by the Euler equation}}\right] = 0. +$$ + +This yields the moment restriction + +```{math} +:label: hs82-uncond + +E\!\left[\left(M_{t+1}(\theta_0)R_{t+1}^i - 1\right) \otimes z_t\right] = 0, +``` + +where $\otimes$ denotes the Kronecker product. + +The vector $z_t$ plays the role of **instruments**. + +The conditional Euler equation $E_t[M_{t+1}R_{t+1}^i - 1] = 0$ says that the pricing error is unpredictable given *everything* in the agent's time-$t$ information set. 
+ +That is a very strong restriction — it says the pricing error is orthogonal to every time-$t$ measurable random variable. + +We cannot use the entire information set in practice, but we can pick any finite collection of time-$t$ observable variables $z_t$ and the orthogonality must still hold. + +Each variable we include in $z_t$ gives us one sample moment condition $\frac{1}{T}\sum_t (M_{t+1}R_{t+1}^i - 1)\, z_{kt} \approx 0$ that we can compute from data. + +More instruments means more orthogonality conditions to match, which can improve efficiency and provides more overidentifying restrictions to test the model against. + +With $q$ instruments and $m$ Euler equations we obtain $mq$ moment restrictions for estimating the parameter vector $\theta$. + +For one return and $p$ lags of instruments, {cite:t}`hansen1982generalized` use + +```{math} +:label: hs82-instruments + +z_t = \left[1, R_t, g_t, R_{t-1}, g_{t-1}, \ldots, R_{t-p+1}, g_{t-p+1}\right]^\top, +``` + +where $g_t = C_t / C_{t-1}$ is gross consumption growth. + +```{note} +{cite:t}`hansen1982generalized` describe $z_t$ as "lagged values of $x_{t+1}$" where $x_{t+1} = (R_{t+1}, g_{t+1})$. + +The constant 1 is not stated explicitly but is implied by the degrees of freedom reported in their Table I. +``` + +More generally, {cite:t}`hansen1982generalized` write the first-order condition as $E_t[h(x_{t+n}, b_0)] = 0$, where $x_{t+n}$ is a vector of observables dated $t+n$ and $b_0$ is the true parameter vector. + +The disturbance $u_{t+1} = h(x_{t+1}, b_0)$ is serially uncorrelated in the one-period stock case. + +Stacking moment conditions via the Kronecker product gives $f(x_{t+n}, z_t, b) = h(x_{t+n}, b) \otimes z_t$, a vector of dimension $mq$. + +The resulting unconditional restriction $E[f(x_{t+n}, z_t, b_0)] = 0$ nests both the single-return one-period Euler equation and the multi-maturity asset pricing restrictions. 
+ +The orthogonality condition and lagged instrument vector follow equations {eq}`hs82-uncond` and {eq}`hs82-instruments` + +```{code-cell} ipython3 +def euler_error_horizon(params, exog, horizon=1): + """ + Compute Euler-equation pricing errors for (γ, β) at a given horizon. + """ + if horizon < 1: + raise ValueError("horizon must be at least one.") + γ, β = params + gross_return = exog[:, 0] + gross_cons_growth = exog[:, 1] + return (β ** horizon) * gross_cons_growth ** (-γ) * gross_return - 1.0 + + +def euler_error_grad_horizon(params, exog, horizon=1): + """ + Gradient of the Euler error wrt (γ, β) at a given horizon. + """ + if horizon < 1: + raise ValueError("horizon must be at least one.") + γ, β = params + gross_return = exog[:, 0] + gross_cons_growth = exog[:, 1] + + g_pow = gross_cons_growth ** (-γ) + common = (β ** horizon) * g_pow * gross_return + + dγ = -common * np.log(gross_cons_growth) + dβ = horizon * (β ** (horizon - 1)) * g_pow * gross_return + return np.column_stack([dγ, dβ]) + + +def euler_error(params, exog): + """ + One-period Euler-equation pricing errors for (γ, β). + """ + return euler_error_horizon(params, exog, horizon=1) +``` + +A helper aligns outcomes and lagged instruments for nonlinear IV-GMM + +```{code-cell} ipython3 +def build_gmm_arrays(data, n_lags): + """ + Build endog, exog, and instruments for nonlinear IV-GMM. 
+ """ + if n_lags < 1: + raise ValueError("n_lags must be at least one.") + if data.shape[0] <= n_lags: + raise ValueError("Sample size must exceed n_lags.") + + t_obs = data.shape[0] + exog = data[n_lags:, :] + endog = np.zeros(exog.shape[0]) + n_obs = t_obs - n_lags + n_instr = 2 * n_lags + 1 + + instruments = np.empty((n_obs, n_instr)) + instruments[:, 0] = 1.0 + + for j in range(n_lags): + left = 2 * j + 1 + right = left + 2 + instruments[:, left:right] = data[n_lags - 1 - j : t_obs - 1 - j, :] + + return endog, exog, instruments +``` + +When the asset has maturity $n > 1$, the Euler equation involves $n$-period compounded returns and consumption growth. + +For CRRA preferences, the $n$-period Euler restriction is + +```{math} +:label: hs82-euler-n + +E_t\!\left[\beta^n \left(\frac{C_{t+n}}{C_t}\right)^{-\gamma} R_{t,t+n}^i\right] = 1. +``` + +For estimation, the $n$-period exog can either use directly observed $n$-period returns/payoffs, or be constructed by compounding one-period returns and consumption growth over $n$ consecutive periods. + +Again, a key requirement from {cite:t}`hansen1982generalized` is that instruments $z_t$ must lie in the agent's time-$t$ information set $\mathcal{I}_t$. + +For the multi-period case, instruments must be measurable with respect to $\mathcal{I}_t$. + +In particular, one should avoid any lagged multi-period aggregates that still include periods after $t$, since these would embed realizations not in $\mathcal{I}_t$. + +The $n$-period exog is constructed by compounding one-period data, and instruments are timed to lie in $\mathcal{I}_t$. + +```{code-cell} ipython3 +def build_gmm_arrays_horizon(one_period_data, n_lags, horizon): + """ + Build endog, exog, and instruments for multi-period GMM. + + Exog contains n-period compounded returns and consumption growth. 
+ """ + if horizon < 1: + raise ValueError("horizon must be at least one.") + if n_lags < 1: + raise ValueError("n_lags must be at least one.") + T = one_period_data.shape[0] + if T <= n_lags + horizon: + raise ValueError("Sample size too small for given n_lags and horizon.") + + # Each observation starts at index t (the first period in the window) + # The window spans one_period_data[t : t + horizon] + # Instruments use one_period_data[t - 1], ..., one_period_data[t - n_lags] + starts = np.arange(n_lags, T - horizon + 1) + n_obs = len(starts) + + exog = np.empty((n_obs, 2)) + n_instr = 2 * n_lags + 1 + instruments = np.empty((n_obs, n_instr)) + instruments[:, 0] = 1.0 + + for i, t in enumerate(starts): + window = one_period_data[t : t + horizon, :] + exog[i, 0] = np.prod(window[:, 0]) # n-period return + exog[i, 1] = np.prod(window[:, 1]) # n-period consumption growth + for j in range(n_lags): + instruments[ + i, 2 * j + 1 : 2 * j + 3] = one_period_data[t - 1 - j, :] + + endog = np.zeros(n_obs) + return endog, exog, instruments +``` + +When $n > 1$, the Euler equation involves variables dated $t + n$, and the disturbance $u_{t+n} = h(x_{t+n}, b_0)$ will generally be serially correlated. + +As {cite:t}`hansen1982generalized` note, if the $m$ assets are all one-period stocks, then $u$ is serially uncorrelated because observations on $x_{t-s}$, $s \geq 0$, are in the agent's time-$t$ information set and $E_t[h(x_{t+1}, b_0)] = 0$. + +But if $n_j > 1$ for some asset $j$, the condition $E_t[h(x_{t+n}, b_0)] = 0$ does not preclude serial correlation in $u$, since $x_{t+n-1}$ is not necessarily in $I_t$ when $n > 1$. + +The number of population autocovariances in the long-run covariance $S_0$ is determined by $n$, the order of the moving average disturbance term $u_t$. 
+ +We implement this directly as a finite-order covariance estimator + +```{code-cell} ipython3 +def finite_ma_covariance(moment_series, ma_order): + """ + Estimate + S = Gamma_0 + sum_{j=1}^{ma_order}(Gamma_j + Gamma_j.T) for moment vectors. + """ + if ma_order < 0: + raise ValueError("ma_order must be nonnegative.") + if moment_series.ndim != 2: + raise ValueError("moment_series must be 2D.") + + t_obs, n_mom = moment_series.shape + if t_obs <= ma_order: + raise ValueError("Need more observations than ma_order.") + + # Use the *uncentered* cross products + # T^{-1} sum_t f_t f_{t-j}' and then add the symmetric lag terms. + s_hat = moment_series.T @ moment_series / t_obs + + for j in range(1, ma_order + 1): + gamma_j = moment_series[j:, :].T @ moment_series[:-j, :] / t_obs + s_hat += gamma_j + gamma_j.T + + ridge = 1e-8 * np.eye(n_mom) + return s_hat + ridge +``` + +The estimation procedure in {cite:t}`hansen1982generalized` is a two-step generalized instrumental variables procedure. + +In the first step, we minimize the GMM criterion with a suboptimal weighting matrix (the identity) to obtain a consistent preliminary estimate $b_T$. + +In the second step, we use $b_T$ to estimate the covariance matrix of the sample moment conditions and invert it to form the optimal weighting matrix, then re-minimize the criterion. + +Let's implement this algorithm + +```{code-cell} ipython3 +def two_step_gmm(data, n_lags, ma_order=0, horizon=1, start_params=None): + """ + Two-step GMM with finite-order covariance. + + The Euler error uses β**horizon. 
+ """ + if start_params is None: + start_params = np.array([1.0, 0.99]) + else: + start_params = np.asarray(start_params, dtype=float) + + if horizon == 1: + _, exog, instruments = build_gmm_arrays(data, n_lags) + else: + _, exog, instruments = build_gmm_arrays_horizon(data, n_lags, horizon) + n_obs = exog.shape[0] + + def sample_moments(params): + err = euler_error_horizon(params, exog, horizon=horizon) + return err[:, None] * instruments + + def objective(params, weight_matrix): + g_bar = sample_moments(params).mean(axis=0) + return float(g_bar @ weight_matrix @ g_bar) + + def objective_grad(params, weight_matrix): + g_bar = sample_moments(params).mean(axis=0) + grad_err = euler_error_grad_horizon(params, exog, horizon=horizon) + d_bar = (instruments.T @ grad_err) / n_obs + return 2.0 * d_bar.T @ weight_matrix @ g_bar + + q = instruments.shape[1] + w_identity = np.eye(q) + + bounds = [(-2.0, 10.0), (0.85, 1.5)] + + def coarse_starts(weight_matrix, n_best=5): + γ_grid = np.linspace(bounds[0][0], bounds[0][1], 33) + β_grid = np.linspace(bounds[1][0], bounds[1][1], 33) + scored = [] + for γ0 in γ_grid: + for β0 in β_grid: + params0 = np.array([γ0, β0]) + val = objective(params0, weight_matrix) + if np.isfinite(val): + scored.append((val, params0)) + scored.sort(key=lambda item: item[0]) + return [params for _, params in scored[:n_best]] or [start_params] + + def best_local_minimize(weight_matrix, starts): + best = None + for x0 in starts: + res = minimize( + objective, + x0=x0, + args=(weight_matrix,), + jac=objective_grad, + method="L-BFGS-B", + bounds=bounds, + options={"maxiter": 25_000}, + ) + if not np.isfinite(res.fun): + continue + if best is None or (res.fun < best.fun): + best = res + return best if best is not None else minimize( + objective, + x0=start_params, + args=(weight_matrix,), + jac=objective_grad, + method="L-BFGS-B", + bounds=bounds, + options={"maxiter": 25_000}, + ) + + step1_starts = [start_params] + coarse_starts(w_identity, n_best=5) + 
step1 = best_local_minimize(w_identity, step1_starts) + params1 = step1.x + + m1 = sample_moments(params1) + s_hat = finite_ma_covariance(m1, ma_order=ma_order) + w_opt = np.linalg.pinv(s_hat) + + step2_starts = [params1] + coarse_starts(w_opt, n_best=5) + step2 = best_local_minimize(w_opt, step2_starts) + params2 = step2.x + + m2 = sample_moments(params2) + s_hat2 = finite_ma_covariance(m2, ma_order=ma_order) + w_opt2 = np.linalg.pinv(s_hat2) + g2 = m2.mean(axis=0) + j_stat = float(n_obs * (g2 @ w_opt2 @ g2)) + df = instruments.shape[1] - len(params2) + j_prob = float(stats.chi2.cdf(j_stat, df=df)) if df > 0 else np.nan + p_value = float(1.0 - j_prob) if df > 0 else np.nan + + # Asymptotic covariance under optimal weighting: (D' S^{-1} D)^{-1} / T. + grad_err = euler_error_grad_horizon(params2, exog, horizon=horizon) + d_hat = (instruments.T @ grad_err) / n_obs + cov_hat = np.linalg.pinv(d_hat.T @ w_opt2 @ d_hat) / n_obs + se_hat = np.sqrt(np.diag(cov_hat)) + + return { + "params_step1": params1, + "params_step2": params2, + "se_step2": se_hat, + "weight_opt": w_opt2, + "j_stat": j_stat, + "j_df": int(df), + "j_prob": j_prob, + "j_pval": p_value, + "n_obs": int(n_obs), + "success": bool(step2.success), + } +``` + +## GMM criterion and asymptotic theory + +We now formalize the estimation procedure. + +Let $m_t(\theta) = (M_{t+1}(\theta) R_{t+1}^i - 1) \otimes z_t$ denote the vector of moment conditions at date $t$, and define the sample mean + +```{math} +:label: hs82-sample-moments + +g_T(\theta) = \frac{1}{T} \sum_{t=1}^T m_t(\theta). +``` + +If the model is correctly specified, $g_T(\theta_0)$ should be close to zero for large $T$. + +We estimate $\theta$ by choosing the parameter vector that makes $g_T$ as close to zero as possible, measured by a quadratic form: + +```{math} +:label: hs82-criterion + +\hat\theta = \arg\min_\theta g_T(\theta)^\top W_T g_T(\theta) +``` + +where $W_T$ is a symmetric positive-definite weighting matrix. 
+ +Under regularity conditions given in {cite:t}`Hansen1982`, the GMM estimator is consistent, asymptotically normal, and has the sandwich covariance matrix + +```{math} +:label: hs82-asymptotic + +\sqrt{T}(\hat\theta-\theta_0) \Rightarrow N\!\left(0, (D^\top W D)^{-1} D^\top W S W D (D^\top W D)^{-1}\right), +``` + +where $D = E[\partial m_t(\theta_0)/\partial\theta^\top]$ is the Jacobian of the moment conditions, $S$ is the long-run covariance matrix of $m_t(\theta_0)$, and $W$ is the probability limit of $W_T$. + +{cite:t}`Hansen1982` shows that the optimal weighting matrix is $W^* = S^{-1}$, which yields the smallest asymptotic covariance matrix among all choices of $W$. + +Under $W = S^{-1}$ the sandwich simplifies to $(D^\top S^{-1} D)^{-1}$. + +When the number of moment conditions $r$ (e.g., $r = mq$ for $m$ Euler equations and $q$ instruments) exceeds the number of parameters $k$, the model is overidentified and we can test whether the data are consistent with the maintained restrictions. + +{cite:t}`hansen1982generalized` test the overidentifying restrictions using a result from {cite:t}`Hansen1982`: + +```{math} +:label: hs82-jtest + +J_T = T\, g_T(\hat\theta)^\top \hat S^{-1} g_T(\hat\theta) \Rightarrow \chi^2_{r-k}, +``` + +where $\hat S$ is a consistent estimator of $S$. + +A large $J_T$ relative to $\chi^2_{r-k}$ critical values leads to rejection of the model's overidentifying restrictions. + +For the multi-period case ($n > 1$), the disturbance is at most MA($n-1$) as discussed above, so {cite:t}`hansen1982generalized` obtain the optimal weighting matrix by setting $W_0 = S_0^{-1}$ where + +```{math} +:label: hs82-finite-so + +S_0 = \sum_{j=-n+1}^{n-1} E\!\left[f(x_{t+n}, z_t, b_0)\, f(x_{t+n-j}, z_{t-j}, b_0)^\top\right]. +``` + +## Covariance estimation and the choice of instruments + +For the one-period Euler equation ($n = 1$), the disturbance $u_{t+1} = M_{t+1}(\theta_0) R_{t+1} - 1$ is a martingale difference sequence. 
+ +In this case the moment vector $m_t(\theta_0) = u_{t+1} \otimes z_t$ is serially uncorrelated, and the long-run covariance $S$ equals the contemporaneous variance $E[m_t m_t^\top]$. + +The covariance estimator therefore requires no kernel or bandwidth choice. + +We simply use the sample analogue $\hat S = T^{-1} \sum_t m_t m_t^\top$. + +In our implementation below, we use a HAC (heteroskedasticity and autocorrelation consistent) estimator against possible mild serial dependence from time aggregation or measurement timing. + +This is a modern precaution, not part of the original {cite:t}`hansen1982generalized` procedure, which exploits the known MA order directly as in {eq}`hs82-finite-so`. + +The number of instrument lags $p$ determines how many orthogonality conditions we use and hence the power of the $J$ test. + +{cite:t}`hansen1982generalized` report results for NLAG $= 1, 2, 4, 6$ and note (footnote 12) that using more orthogonality conditions may lead to estimators with less desirable small-sample properties. + +Below we implement the estimation procedure and allow the user to choose the number of lags and whether to use a HAC estimator for the covariance matrix + +```{code-cell} ipython3 +def estimate_gmm( + data, + n_lags, + start_params=None, + use_hac=True, + hac_maxlag=None, + maxiter=2, +): + """ + Estimate Euler-equation parameters with nonlinear IV-GMM. 
+ """ + if start_params is None: + start_params = np.array([1.0, 0.99]) + + endog, exog, instruments = build_gmm_arrays(data, n_lags) + model = gmm.NonlinearIVGMM(endog, exog, instruments, euler_error) + + if use_hac: + if hac_maxlag is None: + hac_maxlag = max( + 1, int( + np.floor(4.0 * (endog.shape[0] / 100.0) ** (2.0 / 9.0)))) + result = model.fit( + start_params=start_params, + maxiter=maxiter, + optim_method="bfgs", + optim_args={"disp": False}, + weights_method="hac", + wargs={"maxlag": hac_maxlag}, + ) + else: + result = model.fit( + start_params=start_params, + maxiter=maxiter, + optim_method="bfgs", + optim_args={"disp": False}, + ) + + return result +``` + +Next we include a helper to run GMM estimation across lag lengths and summarize results in a table + +```{code-cell} ipython3 +def run_gmm_by_lag( + data, + lags=(1, 2, 4, 6), + use_hac=True, + hac_maxlag=None, +): + """ + Estimate GMM models by lag length and return a summary table. + """ + rows = [] + results = {} + + for lag in lags: + res = estimate_gmm(data, n_lags=lag, use_hac=use_hac, hac_maxlag=hac_maxlag) + results[lag] = res + j_stat, j_pval, j_df = res.jtest() + j_prob = float(stats.chi2.cdf(j_stat, df=j_df)) if j_df > 0 else np.nan + rows.append( + { + "NLAG": lag, + "γ_hat": res.params[0], + "se_γ": res.bse[0], + "β_hat": res.params[1], + "se_β": res.bse[1], + "j_stat": j_stat, + "j_prob": j_prob, + "j_pval": j_pval, + "j_df": int(j_df), + "n_obs": int(res.nobs), + } + ) + + table = pd.DataFrame(rows).set_index("NLAG") + return table, results + + +def run_two_step_by_lag( + data, + lags=(1, 2, 4, 6), + horizon=1, +): + """ + Two-step GMM with exact S0 (MA order 0) across lag lengths. 
+ """ + rows = [] + start_params = None + for lag in lags: + res = two_step_gmm( + data, + n_lags=lag, + ma_order=0, + horizon=horizon, + start_params=start_params, + ) + start_params = res["params_step2"] + rows.append( + { + "NLAG": lag, + "γ_hat": res["params_step2"][0], + "se_γ": res["se_step2"][0], + "β_hat": res["params_step2"][1], + "se_β": res["se_step2"][1], + "j_stat": res["j_stat"], + "j_prob": res["j_prob"], + "j_pval": res["j_pval"], + "j_df": res["j_df"], + "n_obs": res["n_obs"], + } + ) + return pd.DataFrame(rows).set_index("NLAG") +``` + + +### Simulation + +Before applying the estimator to real data, we verify that GMM recovers known parameters from simulated data. + +We build a simulator that generates synthetic return-growth pairs satisfying the Euler equation by construction. + +We generate log consumption growth from a stationary AR(1), compute the stochastic discount factor at known true parameters, and construct gross returns as +$R_{t+1} = \xi_{t+1} / M_{t+1}(\theta_0)$ where $\xi_{t+1}$ is an iid lognormal shock with mean one. + +```{code-cell} ipython3 +@njit +def _ar1_simulate(mu_c, phi_c, sigma_c, shocks_c, total_n): + """ + Simulate AR(1) log consumption growth. + """ + delta_c = np.empty(total_n) + delta_c[0] = mu_c + for t in range(1, total_n): + delta_c[t] = mu_c * (1.0 - phi_c) + phi_c * delta_c[t - 1] + sigma_c * shocks_c[t] + return delta_c + + +def simulate_euler_sample( + n_obs, + γ_true=0.8, + β_true=0.993, + seed=0, +): + """ + Simulate [gross real return, gross consumption growth]. + """ + rng = np.random.default_rng(seed) + mu_c = 0.0015 + sigma_c = 0.006 + phi_c = 0.4 + sigma_eta = 0.02 + burn_in = 200 + + total_n = n_obs + burn_in + shocks_c = rng.standard_normal(total_n) + delta_c = _ar1_simulate(mu_c, phi_c, sigma_c, shocks_c, total_n) + + cons_growth = np.exp(delta_c[burn_in:]) + sdf = β_true * cons_growth ** (-γ_true) + + # Positive mean-one return shock: E[ξ]=1 so E[M R]=1 by construction. 
+ eps = rng.standard_normal(n_obs) + xi = np.exp(sigma_eta * eps - 0.5 * sigma_eta**2) + gross_return = xi / sdf + + return np.column_stack([gross_return, cons_growth]) +``` + +We set $\gamma = 2$ and $\beta = 0.995$ as the true parameters and generate 700 monthly observations from the Euler-consistent DGP. + +```{code-cell} ipython3 +γ_true = 2.0 +β_true = 0.995 +sim_data = simulate_euler_sample( + n_obs=5000, + γ_true=γ_true, + β_true=β_true, + seed=0, +) + +print(f"Simulation sample size: {sim_data.shape[0]}") +print(f"True γ: {γ_true:.3f}") +print(f"True β: {β_true:.3f}") +``` + +We now estimate GMM across lag lengths, following the format of Table I in {cite:t}`hansen1982generalized`. + +```{code-cell} ipython3 +sim_table = run_two_step_by_lag(sim_data, lags=(1, 2, 4, 6), horizon=1) +``` + +```{code-cell} ipython3 +:tags: [hide-input] + +sim_pretty = sim_table[ + ["γ_hat", "se_γ", "β_hat", "se_β", "j_stat", "j_df", "j_prob"]].rename( + columns={ + "γ_hat": r"\hat{\gamma}", + "se_γ": r"\mathrm{se}(\hat{\gamma})", + "β_hat": r"\hat{\beta}", + "se_β": r"\mathrm{se}(\hat{\beta})", + "j_stat": "J", + "j_df": "df", + "j_prob": "Prob(J)", + } +) +display_table( + sim_pretty, + fmt={ + r"\hat{\gamma}": "{:.4f}", + r"\mathrm{se}(\hat{\gamma})": "{:.4f}", + r"\hat{\beta}": "{:.4f}", + r"\mathrm{se}(\hat{\beta})": "{:.4f}", + "J": "{:.3f}", + "Prob(J)": "{:.3f}", + "df": "{:.0f}", + }, +) +``` + +GMM recovers the true $\gamma$ and $\beta$ across lag specifications pretty closely. + +For hypothesis testing, the right-tail $p$ value is $1-\mathrm{Prob}(J)$. + +The large standard errors visible in both papers' tables suggest that the preference parameters $\gamma$ and $\beta$ can be weakly identified. 
+ +To visualize this, we plot the GMM criterion over a $(\gamma, \beta)$ grid using the simulated data + + +```{code-cell} ipython3 +def gmm_objective_surface( + data, + n_lags=2, + γ_grid=None, + β_grid=None, +): + """ + Compute identity-weighted GMM objective on a parameter grid. + """ + _, exog, instruments = build_gmm_arrays(data, n_lags) + + if γ_grid is None: + γ_grid = np.linspace(-1.0, 8.0, 70) + if β_grid is None: + β_grid = np.linspace(0.96, 1.02, 70) + + objective = np.empty((len(β_grid), len(γ_grid))) + + for i, β_val in enumerate(β_grid): + for j, γ_val in enumerate(γ_grid): + err = euler_error(np.array([γ_val, β_val]), exog) + moments = (err[:, None] * instruments).mean(axis=0) + objective[i, j] = moments @ moments + + return γ_grid, β_grid, objective +``` + +```{code-cell} ipython3 +--- +mystnb: + figure: + caption: GMM objective contour surface (simulated data) + name: fig-hs82-objective-contour +--- +γ_grid, β_grid, objective = gmm_objective_surface(sim_data, n_lags=2) +log_obj = np.log10(objective + 1e-12) + +fig, ax = plt.subplots() +contours = ax.contourf(γ_grid, β_grid, log_obj, levels=30, cmap="viridis") +ax.set_xlabel(r"$\gamma$") +ax.set_ylabel(r"$\beta$") +ax.plot(γ_true, β_true, "k*", ms=12, lw=2, label="true values") +ax.legend() +plt.colorbar(contours, ax=ax) +plt.tight_layout() +plt.show() +``` + +The criterion surface may have elongated valleys where many parameter combinations fit the moments nearly equally well. + +To illustrate the multi-period case from Section 2 of {cite:t}`hansen1982generalized`, we estimate the three-period Euler restriction using overlapping-horizon returns and consumption growth, with instruments formed from one-period data dated $t$ or earlier and the finite-order covariance appropriate for MA(2) disturbances. 
+ +```{code-cell} ipython3 +horizon_n = 3 +two_step = two_step_gmm( + sim_data, + n_lags=2, + ma_order=horizon_n - 1, + horizon=horizon_n, +) + +print(f"Horizon n: {horizon_n}") +print(f"Step-2 converged: {two_step['success']}") +print(f"Step-2 gamma: {two_step['params_step2'][0]:.4f}") +print(f"Step-2 beta (one-period): {two_step['params_step2'][1]:.4f}") +print( + f"J({two_step['j_df']}): {two_step['j_stat']:.3f}, " + f"Prob={two_step['j_prob']:.3f}, p={two_step['j_pval']:.3f}" +) + +_, exog_n, _ = build_gmm_arrays_horizon(sim_data, n_lags=2, horizon=horizon_n) +acf_n = acf( + euler_error_horizon(two_step["params_step2"], exog_n, horizon=horizon_n), + nlags=6, + fft=True, +) +print("Euler-error ACF lags 1-3:", ", ".join([f"{v:.3f}" for v in acf_n[1:4]])) +``` + +The low-lag ACF is consistent with the MA(2) dependence implied by the 3-period horizon. + +We now run a Monte Carlo exercise with 500 replications to visualize the finite-sample distribution of $\hat\gamma$, $\hat\beta$, and the $J$ statistic and verify that the asymptotic theory from Section 3 of {cite:t}`hansen1982generalized` provides a reasonable approximation. 
+ +```{code-cell} ipython3 +--- +mystnb: + figure: + caption: Monte Carlo GMM sampling distributions + name: fig-hs82-monte-carlo +--- +n_rep = 500 +estimates = [] +j_stats = [] + +for rep in range(n_rep): + rep_data = simulate_euler_sample( + n_obs=900, + γ_true=γ_true, + β_true=β_true, + seed=rep, + ) + rep_res = estimate_gmm(rep_data, n_lags=2, use_hac=True, maxiter=2) + estimates.append(rep_res.params) + j_stats.append(rep_res.jval) + +estimates = np.asarray(estimates) +j_stats = np.asarray(j_stats) + +fig, axes = plt.subplots(1, 3, figsize=(15, 4)) +axes[0].hist(estimates[:, 0], bins=20, edgecolor="white") +axes[0].axvline(γ_true, color="red", ls="--", lw=2) +axes[0].set_xlabel(r"$\hat{\gamma}$") +axes[1].hist(estimates[:, 1], bins=20, edgecolor="white") +axes[1].axvline(β_true, color="red", ls="--", lw=2) +axes[1].set_xlabel(r"$\hat{\beta}$") + +df_j = 2 * 2 + 1 - 2 +axes[2].hist(j_stats, bins=20, density=True, edgecolor="white") +grid = np.linspace(0.0, max(j_stats.max(), 1.0), 200) +axes[2].plot(grid, stats.chi2.pdf(grid, df_j), "r-", lw=2) +axes[2].set_xlabel("j-statistic") +plt.tight_layout() +plt.show() +``` + +Both $\hat\gamma$ and $\hat\beta$ are centered near their true values, and the $J$ histogram tracks the $\chi^2$ density, supporting the asymptotic approximation at this sample size. + +## Empirical GMM estimation + +We now apply GMM to observed data, following the empirical strategy of Section 5 in {cite:t}`hansen1982generalized`. + +{cite:t}`hansen1982generalized` use monthly per capita consumption of nondurables (ND) and nondurables plus services (NDS) paired with the equally-weighted (EWR) and value-weighted (VWR) aggregate stock returns from CRSP, for 1959:2 through 1978:12. + +We focus on their ND+VWR specification using FRED nondurables consumption and the Ken French value-weighted market return as a proxy for CRSP, on the same 1959:2--1978:12 sample period. 
+ +Because the Ken French return is not identical to the original CRSP NYSE value-weighted return, we only want to match the paper qualitatively. + +Both this lecture and the companion lecture {doc}`hansen_singleton_1983` use the same data construction. + +The hidden cell below pulls the relevant FRED series, constructs per capita real consumption, and joins with Ken French market returns via `pandas-datareader`. + +```{code-cell} ipython3 +:tags: [hide-cell] + +fred_codes = { + "population_16plus": "CNP16OV", + "cons_nd_real_index": "DNDGRA3M086SBEA", + "cons_nd_price_index": "DNDGRG3M086SBEA", +} + +def to_month_end(index): + """ + Convert a date index to month-end timestamps. + """ + return pd.PeriodIndex(pd.DatetimeIndex(index), freq="M").to_timestamp("M") + + +def load_hs_monthly_data( + start="1959-02-01", + end="1978-12-01", +): + """ + Build monthly gross real return and gross consumption-growth series. + """ + start_period = pd.Timestamp(start).to_period("M") + end_period = pd.Timestamp(end).to_period("M") + + # Pull one extra month to build the first in-sample growth rate. 
+ fetch_start = (start_period - 1).to_timestamp(how="start") + fetch_end = end_period.to_timestamp("M") + sample_start = start_period.to_timestamp("M") + sample_end = end_period.to_timestamp("M") + + fred = web.DataReader( + list(fred_codes.values()), "fred", fetch_start, fetch_end) + fred = fred.rename(columns={v: k for k, v in fred_codes.items()}) + fred.index = to_month_end(fred.index) + fred["cons_real_level"] = fred["cons_nd_real_index"] + fred["cons_price_index"] = fred["cons_nd_price_index"] + fred["consumption_per_capita"] = fred["cons_real_level"] \ + / fred["population_16plus"] + fred["gross_cons_growth"] = ( + fred["consumption_per_capita"] + / fred["consumption_per_capita"].shift(1) + ) + fred["gross_inflation_cons"] = ( + fred["cons_price_index"] / fred["cons_price_index"].shift(1) + ) + + ff = web.DataReader( + "F-F_Research_Data_Factors", "famafrench", + fetch_start, fetch_end)[0].copy() + ff.columns = [str(col).strip() for col in ff.columns] + if ("Mkt-RF" not in ff.columns) or ("RF" not in ff.columns): + raise KeyError( + "Fama-French data missing required columns: 'Mkt-RF' and 'RF'.") + + # Mkt-RF and RF are reported in percent per month. + ff["gross_nom_return"] = 1.0 + (ff["Mkt-RF"] + ff["RF"]) / 100.0 + ff.index = ff.index.to_timestamp(how="end") + ff.index = to_month_end(ff.index) + market = ff[["gross_nom_return"]] + + out = fred.join(market, how="inner") + out["gross_real_return"] = out["gross_nom_return"] \ + / out["gross_inflation_cons"] + out = out.loc[sample_start:sample_end].dropna() + + required_cols = [ + "gross_real_return", + "gross_cons_growth", + ] + return out[required_cols].copy() + + +def get_estimation_data( + start="1959-02-01", + end="1978-12-01", +): + """ + Return (dataframe, array) using observed data. + """ + frame = load_hs_monthly_data(start=start, end=end) + data = frame[["gross_real_return", "gross_cons_growth"]].to_numpy() + return frame, data +``` + +We first examine the raw data moments. 
+ +```{code-cell} ipython3 +LAGS = (1, 2, 4, 6) + +emp_frame, emp_data = get_estimation_data() + +print(f"Mean net real return: {(emp_data[:, 0].mean() - 1.0) * 100:.3f}%") +print(f"Std net real return: {emp_data[:, 0].std() * 100:.3f}%") +print(f"Mean net consumption growth: {(emp_data[:, 1].mean() - 1.0) * 100:.3f}%") +print(f"Std net consumption growth: {emp_data[:, 1].std() * 100:.3f}%") +print(f"Std log consumption growth: {np.log(emp_data[:, 1]).std() * 100:.3f}%") +``` + +One feature of these data is the large gap between the volatility of returns and the volatility of consumption growth. + +This is again an empirical fact underlying the equity premium puzzle of {cite:t}`MehraPrescott1985`: matching the observed equity premium with CRRA preferences requires implausibly high risk aversion. + +We now estimate the Euler equation using the two-step generalized instrumental variables (GIV) / GMM procedure in {cite:t}`hansen1982generalized`. + +For the one-period stock-return Euler equation ($n=1$), the disturbance is a martingale difference sequence, so the optimal weighting matrix uses the contemporaneous covariance $S_0 = E[m_t m_t^\top]$. + +To match Table I, we report the paper's exponent parameter $\alpha$ in +$E_t[\beta (C_{t+1}/C_t)^\alpha R_{t+1} - 1] = 0$. 
+ +The two-step GMM estimates of $\hat{\alpha}$ and $\hat{\beta}$ by lag length are + +```{code-cell} ipython3 +gmm_raw = run_two_step_by_lag(emp_data, lags=LAGS, horizon=1) +gmm_raw.index.name = "NLAG" + +table_i = pd.DataFrame(index=gmm_raw.index) +table_i.index.name = "NLAG" +table_i[r"\hat{\alpha}"] = -gmm_raw["γ_hat"] +table_i[r"SE(\hat{\alpha})"] = gmm_raw["se_γ"] +table_i[r"\beta"] = gmm_raw["β_hat"] +table_i[r"\mathrm{SE}(\beta)"] = gmm_raw["se_β"] +table_i[r"\chi^2"] = gmm_raw["j_stat"] +table_i["DF"] = gmm_raw["j_df"] +table_i["Prob"] = gmm_raw["j_prob"] + +display_table( + table_i, + fmt={ + r"\hat{\alpha}": "{:.4f}", + r"SE(\hat{\alpha})": "{:.4f}", + r"\beta": "{:.4f}", + r"\mathrm{SE}(\beta)": "{:.4f}", + r"\chi^2": "{:.4f}", + "DF": "{:.0f}", + "Prob": "{:.4f}", + }, +) +``` + +For comparison, Table I of {cite:t}`hansen1982generalized` (as corrected in the [1984 *Econometrica* errata](https://www.jstor.org/stable/1911486?seq=2)) reports the following ND+VWR values for 1959:2--1978:12: + +```{code-cell} ipython3 +:tags: [hide-input] + +table_i_paper = pd.DataFrame( + { + r"\alpha": [-1.2028, -0.5761, -0.6565, -0.9638], + r"SE(\alpha)": [0.7789, 0.7067, 0.6896, 0.6425], + r"\beta": [0.9976, 0.9975, 0.9978, 0.9985], + r"\mathrm{SE}(\beta)": [0.0027, 0.0027, 0.0027, 0.0027], + r"\chi^2": [1.457, 5.819, 7.923, 10.522], + "DF": [1, 3, 7, 11], + "Prob": [0.7726, 0.8792, 0.6606, 0.5159], + }, + index=pd.Index([1, 2, 4, 6], name="NLAG"), +) + +display_table( + table_i_paper, + fmt={ + r"\alpha": "{:.4f}", + r"SE(\alpha)": "{:.4f}", + r"\beta": "{:.4f}", + r"\mathrm{SE}(\beta)": "{:.4f}", + r"\chi^2": "{:.4f}", + "DF": "{:.0f}", + "Prob": "{:.4f}", + }, +) +``` + +They are very close to our estimates. + +## Summary + +The GMM estimator requires only the orthogonality conditions implied by the Euler equation and a set of conditioning variables. 
+
+It does not require assumptions about the joint distribution of consumption and returns, the production technology, or any other part of the economic environment beyond the representative agent's first-order conditions.
+
+So GMM provides a way to estimate some objects of interest while not estimating all of the parameters that {cite:t}`hansen1983stochastic` estimated in their loglinear model of consumption growth and returns.
+
+ * If the complete model of {cite:t}`hansen1983stochastic` is correctly specified, then their maximum likelihood estimators promise to be more efficient than are the GMM estimators described in this lecture.
+ * The theme of estimating *something* while not estimating *everything* runs through much of Lars Peter Hansen's work. See {cite:t}`hansen2014nobel`.
+
+
+
diff --git a/lectures/hansen_singleton_1983.md b/lectures/hansen_singleton_1983.md
new file mode 100644
index 000000000..c2df80578
--- /dev/null
+++ b/lectures/hansen_singleton_1983.md
@@ -0,0 +1,1896 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+  jupytext_version: 1.17.1
+kernelspec:
+  name: python3
+  display_name: Python 3 (ipykernel)
+  language: python
+---
+
+(hansen_singleton_1983)=
+```{raw} jupyter
+
+```
+
+# Stochastic Consumption, Risk Aversion, and the Temporal Behavior of Asset Returns
+
+```{index} single: Asset Pricing; MLE Estimation
+```
+
+```{contents} Contents
+:depth: 2
+```
+
+> Evans and Honkapohja: What were the profession’s most important responses
+> to the Lucas Critique?
+>
+> Sargent: There were two. The first and most optimistic response was complete
+> rational expectations econometrics. A rational expectations equilibrium is a
+> likelihood function. Maximize it.
+>
+> — An Interview with Thomas J. Sargent {cite}`evans2005interview`
+
+## Overview
+
+This lecture describes how {cite:t}`hansen1983stochastic` formulated a complete statistical model of asset returns and consumption growth, then estimated its parameters by the method of maximum likelihood.
+
+They detect defects in their model, one of which {cite:t}`MehraPrescott1985` later called the **equity premium puzzle**.
+
+{cite:t}`hansen1983stochastic` study a consumption-based asset pricing model in which a representative consumer with CRRA preferences chooses how to allocate wealth across traded assets.
+
+First-order conditions for asset holdings are stochastic Euler equations that connect consumption growth, asset returns, and preference parameters.
+
+{cite:t}`hansen1983stochastic` assume that consumption growth and asset returns are *jointly lognormal*.
+
+Then Euler equations imply a set of restrictions on the joint distribution of asset prices and returns.
+
+These restrict a linear time-series model for logarithms of consumption growth and returns in which predictable movements in log returns are proportional to predictable movements in log consumption growth.
+
+Hansen and Singleton estimated their model by {doc}`mle`.
+
+
+
+The empirical findings of {cite:t}`hansen1983stochastic` constitute what {cite:t}`MehraPrescott1985` would later call the **equity premium puzzle**.
+
+To keep this lecture narrowly focused, we estimate one return at a time (either a market proxy or a T-bill) rather than the full multi-asset systems studied by {cite:t}`hansen1983stochastic`.
+
+ * we use only monthly nondurable consumption (`ND`).
+ +In addition to what comes with Anaconda, this lecture requires `pandas-datareader` + +```{code-cell} ipython3 +:tags: [hide-output] + +!pip install pandas-datareader +``` + +```{code-cell} ipython3 +import warnings +from itertools import combinations + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from IPython.display import Latex +from pandas_datareader import data as web +from scipy import stats +from scipy.linalg import LinAlgError, cholesky, solve_triangular +from scipy.optimize import minimize +from statsmodels.stats.stattools import durbin_watson + +warnings.filterwarnings( + "ignore", message=".*date_parser.*", category=FutureWarning +) +``` + +We also define a helper to display DataFrames as LaTeX arrays in the hidden cell below + +```{code-cell} ipython3 +:tags: [hide-cell] + +def display_table(df, title=None, fmt=None): + """ + Display a DataFrame as a LaTeX array. + """ + if fmt is None: + fmt = {} + formatted = df.copy() + for col in formatted.columns: + if col in fmt: + formatted[col] = formatted[col].apply( + lambda x: fmt[col].format(x) if np.isfinite(x) else str(x)) + n_cols = len(formatted.columns) + idx_header = r"\text{" + df.index.name + "}" if df.index.name else "" + columns = " & ".join( + [idx_header] + [r"\text{" + c + "}" if "\\" not in c + and "^" not in c and "_" not in c + else c for c in formatted.columns]) + rows = r" \\".join( + [" & ".join([str(idx)] + [str(v) for v in row]) + for idx, row in zip(formatted.index, formatted.values)]) + col_format = "r" + "c" * n_cols + lines = [r"\begin{array}{" + col_format + "}"] + lines.append(columns + r" \\") + lines.append(r"\hline") + lines.append(rows) + lines.append(r"\end{array}") + latex = "\n".join(lines) + if title: + latex = rf"\textbf{{{title}}}" + r"\\" + "\n" + latex + display(Latex("$" + latex + "$")) +``` + +## Euler equation + +Consider a single-good economy of identical consumers whose utility functions are in the CRRA form + +```{math} +:label: 
hs83-crra + +U(c_t) = c_t^{\gamma}/\gamma, \quad \gamma < 1, +``` + +where $c_t$ is aggregate real per capita consumption and $U(\cdot)$ is the period utility function. + +The representative consumer chooses a stochastic consumption plan to maximize the expected value of a time-additive utility function, + +```{math} +:label: hs83-objective + +E_0 \sum_{t=0}^{\infty} \beta^t U(c_t), \quad 0 < \beta < 1. +``` + +Consumers substitute present for future consumption by trading the ownership rights of $N$ financial and capital assets. + +Let $\mathbf{w}_t$ denote the holdings of the $N$ assets at date $t$, $\mathbf{q}_t$ the vector of asset prices, $\mathbf{d}_t$ the vector of dividends, and $y_t$ real labor income. + +A feasible consumption and investment plan $\{c_t, \mathbf{w}_t\}$ must satisfy the sequence of budget constraints + +```{math} +:label: hs83-budget + +c_t + \mathbf{q}_t \cdot \mathbf{w}_{t+1} \leq (\mathbf{q}_t + \mathbf{d}_t) \cdot \mathbf{w}_t + y_t, +``` + +where $(\mathbf{q}_t + \mathbf{d}_t) \cdot \mathbf{w}_t$ is the cum-dividend value of the portfolio carried into period $t$. + +To derive the first-order conditions, attach a Lagrange multiplier $\lambda_t$ to the budget constraint {eq}`hs83-budget` at each date $t$ and form the Lagrangian + +```{math} +:label: hs83-lagrangian + +\mathcal{L} = E_0 \sum_{t=0}^{\infty} \beta^t \left\{ U(c_t) + \lambda_t \left[ (\mathbf{q}_t + \mathbf{d}_t) \cdot \mathbf{w}_t + y_t - c_t - \mathbf{q}_t \cdot \mathbf{w}_{t+1} \right] \right\}. +``` + +Differentiating $\mathcal{L}$ with respect to $c_t$ gives + +```{math} +\frac{\partial \mathcal{L}}{\partial c_t} = 0 \implies \lambda_t = U'(c_t). 
+``` + +Differentiating with respect to $w_{i,t+1}$, the holdings of asset $i$ carried from date $t$ into date $t+1$, collects two terms: $w_{i,t+1}$ appears in the date-$t$ budget constraint as well as in the date-$(t+1)$ constraint: + +```{math} +\frac{\partial \mathcal{L}}{\partial w_{i,t+1}} = 0 \implies E_0\!\left[\beta^t \big(- \lambda_t\, q_{it} + \beta\, \lambda_{t+1} (q_{i,t+1} + d_{i,t+1})\big)\right] = 0. +``` + +By the law of iterated expectations, this becomes + +```{math} +E_0\!\left[\beta^t\, E_t\!\left(- \lambda_t\, q_{it} + \beta\, \lambda_{t+1} (q_{i,t+1} + d_{i,t+1})\right)\right] = 0. +``` + +Hence, + +```{math} +\lambda_t\, q_{it} = \beta\, E_t\!\left[\lambda_{t+1}(q_{i,t+1} + d_{i,t+1})\right]. +``` + +Substituting $\lambda_t = U'(c_t)$ and $\lambda_{t+1} = U'(c_{t+1})$ yields + +```{math} +q_{it}\, U'(c_t) = \beta\, E_t\!\left[U'(c_{t+1})(q_{i,t+1} + d_{i,t+1})\right]. +``` + +Dividing both sides by $q_{it}$ and defining the gross return $r_{it+1} := (q_{i,t+1} + d_{i,t+1})/q_{it}$ yields the stochastic Euler equation: + +```{math} +:label: hs83-foc + +U'(c_t) = \beta E_t\!\left[U'(c_{t+1})\, r_{it+1}\right], \quad i = 1, \ldots, N, +``` + +where $r_{it+1}$ is the gross real return on asset $i$. + +Substituting the CRRA marginal utility $U'(c_t) = c_t^{\gamma-1} = c_t^{\alpha}$ with $\alpha := \gamma - 1$ into {eq}`hs83-foc` and rearranging gives + +```{math} +:label: hs83-euler + +E_t\!\left[\beta \left(\frac{c_{t+1}}{c_t}\right)^{\alpha} r_{it+1}\right] = 1, \quad i = 1, \ldots, N. +``` + +The coefficient of relative risk aversion is $-\alpha$. + +## The Euler equation under lognormality + +Using the Euler equation {eq}`hs83-euler` derived above, we now impose the distributional assumptions of {cite:t}`hansen1983stochastic`. + +Let $x_t = c_t / c_{t-1}$ denote the consumption ratio, and define $u_{it} = x_t^\alpha r_{it}$ where $r_{it}$ is the gross real return on asset $i$. 
+ +The Euler equation {eq}`hs83-euler` states $E_{t-1}[u_{it}] = 1/\beta$. + +Define log variables $X_t = \log x_t$, $R_{it} = \log r_{it}$, and $U_{it} = \log u_{it}$, so that + +```{math} +:label: hs83-u-def + +U_{i,t}= \alpha X_{t}+R_{i,t}. +``` + +Assume the vector process $\{Y_t\} = \{(X_t, R_{1t}, \ldots, R_{nt})^\top\}$ is jointly stationary and Gaussian. + +Under this assumption, $U_{it}$ conditional on the information set $\psi_{t-1}$ is normal with constant variance $\sigma_i^2$ and a conditional mean $\mu_{i,t-1}$ that is a linear function of past $Y$'s. + +Because $u_{it} = \exp(U_{it})$ is conditionally lognormal, + +```{math} +:label: hs83-lognormal-identity + +E_{t-1}[u_{it}] = \exp\left(\mu_{i,t-1} + \tfrac{1}{2}\sigma_i^2\right). +``` + +Setting $E_{t-1}[u_{it}] = 1/\beta$ and taking logs gives $\mu_{i,t-1} + \sigma_i^2/2 = -\log\beta$. + +Now define the innovation + +```{math} +:label: hs83-v-it + +V_{i,t} := U_{i,t} - \mu_{i,t-1} = \alpha X_{t}+R_{i,t}+\log\beta+\frac{\sigma_i^2}{2}, +\quad i = 1, \ldots, N. +``` + +Then $E_{t-1}[V_{i,t}]=0$, where $\sigma_i^2=\operatorname{Var}_{t-1}(\alpha X_t + R_{it})$ is constant under the stationarity and Gaussian assumptions. + +Setting $E_{t-1}[V_{i,t}] = 0$ gives the conditional mean restriction + +```{math} +:label: hs83-cond-mean + +E_{t-1}[R_{i,t}] = -\alpha\, E_{t-1}[X_{t}] - \log\beta - \frac{\sigma_i^2}{2}. +``` + +Equation {eq}`hs83-cond-mean` is the central result of {cite:t}`hansen1983stochastic`. + +The predictable component of each asset's log return is proportional to the predictable component of log consumption growth, with proportionality factor $-\alpha$. + +The intercept absorbs the discount factor $\beta$ and the lognormal variance term $\sigma_i^2 / 2$. 
+ +Let's consider three special cases to better understand the implications of {eq}`hs83-cond-mean`: + +- Risk neutrality ($\alpha = 0$): Each asset's log return equals a constant plus a serially uncorrelated error, so returns are serially uncorrelated. +- Log utility ($\alpha = -1$): The difference $R_{it} - X_t$ has no time-varying predictable component, so returns and consumption growth share the same predictable component. +- Risk aversion ($\alpha < 0$): The time-varying part of $E_{t-1}[R_{it}]$ equals $-\alpha$ times $E_{t-1}[X_t]$, so a larger $|\alpha|$ amplifies the sensitivity of expected returns to expected consumption growth. + +For a given consumption–return covariance structure, higher relative risk aversion ($-\alpha$) widens the gap between risky-asset returns and the risk-free return. + +The equity premium puzzle arises because the observed spread is large but estimated $|\alpha|$ is small as we will see soon in our estimation. + +## The restricted system and its likelihood + +To build a likelihood, we need to parameterize the conditional expectation $E_{t-1}[X_t]$. + +In the single-return case, write $\mathbf{Y}_t = (X_t, R_t)^\top$ and assume that the predictable component of $X_t$ is a finite-order linear function of past observations: + +```{math} +:label: hs83-x-forecast + +E(X_t\mid\psi_{t-1})=\mathbf{a}(L)^\top \mathbf{Y}_{t-1}+\mu_x, +``` + +where $\mathbf{a}(L)$ is a vector of lag polynomial coefficients in past $(X, R)$ and $\mu_x$ is a constant. + +The consumption-growth equation is unrestricted, so $X_t$ depends freely on its own lags and on lagged returns. + +The return equation, however, is restricted by the Euler equation. + +Combining {eq}`hs83-cond-mean` with {eq}`hs83-x-forecast` forces the predictable part of $R_t$ to be $-\alpha$ times the predictable part of $X_t$, plus a constant. 
+ +This gives the restricted system + +```{math} +:label: hs83-restricted + +\mathbf{A}_0\mathbf{Y}_t=\mathbf{A}_1(L)\mathbf{Y}_{t-1}+\boldsymbol{\mu}+\mathbf{V}_t, +``` + +with + +```{math} +:label: hs83-a0a1 + +\mathbf{A}_0=\begin{bmatrix}1&0\\\alpha&1\end{bmatrix}, +\quad +\mathbf{A}_1(L)=\begin{bmatrix}\mathbf{a}(L)^\top\\0\end{bmatrix}, +\quad +\boldsymbol{\mu}=\begin{bmatrix}\mu_x\\-\log\beta-\sigma_U^2/2\end{bmatrix}, +``` + +where $\sigma_U^2 := \operatorname{Var}_{t-1}(\alpha X_t + R_t) = \alpha^2 \sigma_{XX} + \sigma_{RR} + 2\alpha \sigma_{XR}$ under conditional homoskedasticity. + +The sign in the second element of $\boldsymbol{\mu}$ follows directly from {eq}`hs83-cond-mean`. + +The innovations $\mathbf{V}_t$ are assumed to be Gaussian with density $f_V(\mathbf{v})$. + +Since $\mathbf{V}_t = \mathbf{A}_0 \mathbf{Y}_t - \mathbf{A}_1(L)\mathbf{Y}_{t-1} - \boldsymbol{\mu}$, the observables $\mathbf{Y}_t$ are a linear transformation of $\mathbf{V}_t$ with $\mathbf{Y}_t = \mathbf{A}_0^{-1}(\mathbf{V}_t + \mathbf{A}_1(L)\mathbf{Y}_{t-1} + \boldsymbol{\mu})$. + +The change-of-variables formula for densities gives + +$$ +f_Y(\mathbf{y}_t \mid \mathbf{Y}_{t-1}) = f_V(\mathbf{A}_0 \mathbf{y}_t - \mathbf{A}_1(L)\mathbf{Y}_{t-1} - \boldsymbol{\mu})\;\left|\det\!\left(\frac{\partial \mathbf{V}_t}{\partial \mathbf{Y}_t}\right)\right| = f_V(\mathbf{v}_t)\;|\det(\mathbf{A}_0)|. +$$ + +Because $\mathbf{A}_0$ is unit lower triangular, $\det(\mathbf{A}_0) = 1$, so the Jacobian term disappears and the log-likelihood of $\mathbf{Y}_t$ equals the log-likelihood of $\mathbf{V}_t$ evaluated at the residuals. + +Since the Jacobian is unity, the conditional density of $\mathbf{Y}_t$ is the Gaussian density of $\mathbf{V}_t$ evaluated at $\mathbf{v}_t = \mathbf{A}_0 \mathbf{Y}_t - \mathbf{A}_1(L)\mathbf{Y}_{t-1} - \boldsymbol{\mu}$. 
+ +For a single observation, $\mathbf{V}_t \sim N(\mathbf{0}, \boldsymbol{\Sigma})$ has density + +$$ +f_V(\mathbf{v}_t) = (2\pi)^{-1} |\boldsymbol{\Sigma}|^{-1/2} \exp\!\left(-\tfrac{1}{2}\,\mathbf{v}_t^\top \boldsymbol{\Sigma}^{-1} \mathbf{v}_t\right). +$$ + +Taking logarithms gives + +$$ +\ell_t(\theta) = -\log(2\pi) - \tfrac{1}{2}\log|\boldsymbol{\Sigma}| - \tfrac{1}{2}\,\mathbf{v}_t^\top \boldsymbol{\Sigma}^{-1}\,\mathbf{v}_t. +$$ + +Summing over $T$ observations and dropping the constant $-T\log(2\pi)$ yields the log-likelihood function + +```{math} +:label: hs83-loglik + +L(\theta) = -\frac{T}{2}\log|\boldsymbol{\Sigma}| - \frac{1}{2}\sum_{t=1}^{T}(\mathbf{A}_0 \mathbf{Y}_t - \mathbf{A}_1(L)\mathbf{Y}_{t-1} - \boldsymbol{\mu})^\top\boldsymbol{\Sigma}^{-1}(\mathbf{A}_0 \mathbf{Y}_t - \mathbf{A}_1(L)\mathbf{Y}_{t-1} - \boldsymbol{\mu}), +``` + +where $\boldsymbol{\Sigma}$ is the covariance matrix of the innovation $\mathbf{V}_t$ and $\theta$ collects all free parameters including $\alpha$, $\beta$, the covariance parameters, the first-row intercept $\mu_x$, and the first-row lag coefficients. + +The restrictions imposed by {eq}`hs83-euler` enter {eq}`hs83-loglik` through the structure of $\mathbf{A}_0$, $\mathbf{A}_1(L)$, and $\boldsymbol{\mu}$ in {eq}`hs83-a0a1`. + +The second row of {eq}`hs83-restricted` has no free lag coefficients and its intercept is determined by $\alpha$, $\beta$, and $\boldsymbol{\Sigma}$. + +An alternative unrestricted bivariate VAR($p$) would estimate both rows of {eq}`hs83-restricted` freely. + +Each row has 1 intercept plus $2p$ lag coefficients ($p$ lags $\times$ 2 variables), giving $2(1 + 2p)$ mean parameters. + +Adding 3 free covariance parameters ($\sigma_{XX}, \sigma_{RR}, \sigma_{XR}$) gives a total of $5 + 4p$. 
+ +The restricted system {eq}`hs83-restricted` has only $6 + 2p$ free parameters: the first row contributes $1 + 2p$ (its intercept $\mu_x$ and $2p$ lag coefficients), plus $\alpha$, $\beta$, and the 3 covariance parameters. + +The second row adds nothing because its lag structure and intercept are pinned down by $\alpha$, $\beta$, and $\boldsymbol{\Sigma}$ via {eq}`hs83-cond-mean`. + +The difference $(\smash{5 + 4p}) - (\smash{6 + 2p}) = 2p - 1$ gives the degrees of freedom for the likelihood ratio tests we will perform soon. + +## Likelihood implementation + +Now let's implement the likelihood {eq}`hs83-loglik`. + +The building blocks are: + +- a function to construct lagged data matrices $(\mathbf{Y}_t, \mathbf{Y}_{t-1}, \ldots, \mathbf{Y}_{t-p})$, + +- a function to map the parameter vector into the matrices $\mathbf{A}_0$, $\mathbf{A}_1$, $\boldsymbol{\mu}$, $\boldsymbol{\Sigma}$, + +- a function to compute the restricted-system residuals $\mathbf{V}_t = \mathbf{A}_0 \mathbf{Y}_t - \mathbf{A}_1(L) \mathbf{Y}_{t-1} - \boldsymbol{\mu}$, and + +- the Gaussian log-likelihood itself. + +Since we are working with log-transformed data, we define a helper for the transformation below + +```{code-cell} ipython3 +def to_mle_array(data): + valid = (data[:, 0] > 0.0) & (data[:, 1] > 0.0) + return np.column_stack( + [np.log(data[valid, 1]), np.log(data[valid, 0])]) +``` + + +First we build the lagged data matrices, which are the inputs to the likelihood function + +```{code-cell} ipython3 +def build_lagged_data(data, n_lags): + """ + Build Y_t and lag stacks [Y_{t-1}, ..., Y_{t-p}] for bivariate data. 
+ """ + if data.ndim != 2 or data.shape[1] != 2: + raise ValueError("data must be T x 2.") + if data.shape[0] <= n_lags: + raise ValueError("Sample size must exceed n_lags.") + + t_obs = data.shape[0] + n_obs = t_obs - n_lags + y_t = data[n_lags:, :] + y_lag = np.empty((n_obs, 2 * n_lags)) + + for lag in range(1, n_lags + 1): + y_lag[:, 2 * (lag - 1) : 2 * lag] = data[n_lags - lag : t_obs - lag, :] + + return y_t, y_lag +``` + +Next, we validate and unpack the parameter vector while enforcing feasibility conditions + +```{code-cell} ipython3 +def unpack_parameters(params, n_lags): + """ + Validate and unpack parameter vector. + """ + if len(params) != 6 + 2 * n_lags: + return None + + α, β, σ_x, σ_r, cov_xr, μ_x = params[:6] + a_lags = params[6:] + + tol = 1e-8 + if not np.isfinite(α): + return None + if not (tol < β): + return None + if not (σ_x > tol and σ_r > tol): + return None + + Σ = np.array( + [ + [σ_x ** 2, cov_xr], + [cov_xr, σ_r ** 2], + ] + ) + + try: + cholesky(Σ, lower=True) + except (LinAlgError, ValueError): + return None + + return { + "α": np.array(α), + "β": np.array(β), + "σ_x": np.array(σ_x), + "σ_r": np.array(σ_r), + "cov_xr": np.array(cov_xr), + "μ_x": np.array(μ_x), + "a_lags": a_lags, + } +``` + +The next step maps parameters and lagged data into restricted-system residuals + +```{code-cell} ipython3 +def restricted_residuals( + params, + y_t, + y_lag, + n_lags, +): + """ + Compute V_t implied by A0 Y_t - A1 Y_{t-1} - mu. 
+ """ + parsed = unpack_parameters(params, n_lags) + if parsed is None: + return None + + α = float(parsed["α"]) + β = float(parsed["β"]) + σ_x = float(parsed["σ_x"]) + σ_r = float(parsed["σ_r"]) + cov_xr = float(parsed["cov_xr"]) + μ_x = float(parsed["μ_x"]) + a_lags = np.asarray(parsed["a_lags"]) + + A0 = np.array([[1.0, 0.0], [α, 1.0]]) + A1 = np.zeros((2, 2 * n_lags)) + A1[0, :] = a_lags + σ_u2 = α ** 2 * σ_x ** 2 + σ_r ** 2 + 2.0 * α * cov_xr + μ = np.array([μ_x, -np.log(β) - 0.5 * σ_u2]) + + resid = y_t @ A0.T - y_lag @ A1.T - μ[None, :] + if np.any(np.abs(resid) > 1e10): + return None + return resid +``` + +The recursive structure also lets us simulate data from the model. + +We can generate data from the model by drawing innovations $\mathbf{V}_t \sim N(\mathbf{0}, \boldsymbol{\Sigma})$ and compute $\mathbf{Y}_t = \mathbf{A}_0^{-1}(\mathbf{A}_1(L) \mathbf{Y}_{t-1} + \boldsymbol{\mu} + \mathbf{V}_t)$ forward in time. + +This is useful for checking the likelihood implementation through Monte Carlo + +```{code-cell} ipython3 +def simulate_restricted_var( + params, + n_obs, + n_lags, + burn_in=200, + seed=0, +): + """ + Simulate [log consumption growth, log return] from the restricted model. 
+ """ + if seed is not None: + np.random.seed(seed) + + if len(params) != 6 + 2 * n_lags: + raise ValueError("Parameter vector length must be 6 + 2 * n_lags.") + + α, β, σ_x, σ_r, cov_xr, μ_x = params[:6] + a_lags = params[6:] + + Σ_e = np.array( + [ + [σ_x ** 2, cov_xr], + [cov_xr, σ_r ** 2], + ] + ) + + A0 = np.array([[1.0, 0.0], [α, 1.0]]) + Σ_v = A0 @ Σ_e @ A0.T + + eigvals = np.linalg.eigvals(Σ_v) + if np.min(eigvals) <= 0.0: + Σ_v += np.eye(2) * 1e-6 + + A1 = np.zeros((2, 2 * n_lags)) + A1[0, :] = a_lags + σ_u2 = α ** 2 * σ_x ** 2 + σ_r ** 2 + 2.0 * α * cov_xr + μ = np.array([μ_x, -np.log(β) - 0.5 * σ_u2]) + + total_n = n_obs + burn_in + y = np.zeros((total_n, 2)) + + for t in range(n_lags, total_n): + lag_stack = [] + for lag in range(1, n_lags + 1): + lag_stack.append(y[t - lag, :]) + lag_vec = np.concatenate(lag_stack) + shock = np.random.multivariate_normal(np.zeros(2), Σ_v) + y[t, :] = np.linalg.solve(A0, A1 @ lag_vec + μ + shock) + + return y[burn_in:, :] +``` + +Next, we encode the Gaussian log-likelihood implied by the residual covariance matrix + +```{code-cell} ipython3 +def log_likelihood_mle( + params, + y_t, + y_lag, + n_lags, + include_const=True, +): + """ + Evaluate Gaussian log-likelihood for the restricted system. 
+ """ + parsed = unpack_parameters(params, n_lags) + if parsed is None: + return -np.inf + + resid = restricted_residuals(params, y_t, y_lag, n_lags) + if resid is None: + return -np.inf + + α = float(parsed["α"]) + σ_x = float(parsed["σ_x"]) + σ_r = float(parsed["σ_r"]) + cov_xr = float(parsed["cov_xr"]) + + Σ_e = np.array( + [ + [σ_x ** 2, cov_xr], + [cov_xr, σ_r ** 2], + ] + ) + + A0 = np.array([[1.0, 0.0], [α, 1.0]]) + Σ_v = A0 @ Σ_e @ A0.T + + try: + chol = cholesky(Σ_v, lower=True) + log_det = 2.0 * np.sum(np.log(np.diag(chol) + 1e-16)) + std_resid = solve_triangular(chol, resid.T, lower=True).T + quad_form = np.sum(std_resid ** 2) + except (LinAlgError, ValueError): + return -np.inf + + sample_size = y_t.shape[0] + ll = -0.5 * sample_size * log_det - 0.5 * quad_form + if include_const: + ll -= sample_size * np.log(2.0 * np.pi) + + if np.isnan(ll) or np.isinf(ll): + return -np.inf + return float(ll) +``` + +To pass this into a numerical optimizer, we wrap the log-likelihood as a minimization objective + +```{code-cell} ipython3 +def negative_log_likelihood( + params, + y_t, + y_lag, + n_lags, +): + """ + Return negative log-likelihood for minimization. + """ + ll = log_likelihood_mle(params, y_t, y_lag, n_lags, include_const=False) + if np.isfinite(ll): + return -ll + return 1e20 +``` + +We set parameter bounds and generate data-driven starting values for multi-start optimization. + +We keep $\beta$ positive but do not force $\beta < 1$ in estimation + +```{code-cell} ipython3 +def parameter_bounds(n_lags): + """ + Bounds for optimization. 
+ """ + bounds = [ + (-200.0, 200.0), # α (= -risk aversion) + (1e-8, 2.0), # β (discount factor) + (1e-8, None), # σ_x (std dev of consumption innovation) + (1e-8, None), # σ_r (std dev of return innovation) + (None, None), # cov_xr (covariance) + (None, None), # μ_x (consumption growth intercept) + ] + bounds += [(-0.99, 0.99)] * (2 * n_lags) # VAR lag coefficients + return bounds +``` + +We use multiple starting vectors to help local solvers escape poor initializations + +```{code-cell} ipython3 +def starting_values(y_t, y_lag, n_lags, n_starts=10): + """ + Generate multiple starting values. + """ + rng = np.random.default_rng(123) + starts = [] + n_params = 6 + 2 * n_lags + + base = np.zeros(n_params) + base[0] = -0.5 + base[1] = 0.999 + base[2] = max(float(np.std(y_t[:, 0])), 1e-3) + base[3] = max(float(np.std(y_t[:, 1])), 1e-3) + base[4] = float(np.cov(y_t.T)[0, 1]) + base[5] = float(np.mean(y_t[:, 0])) + base[6:] = 0.1 + starts.append(base.copy()) + + # OLS seed from unrestricted VAR + n_obs = y_t.shape[0] + x = np.column_stack([np.ones(n_obs), y_lag]) + coef = np.linalg.lstsq(x, y_t, rcond=None)[0] + resid = y_t - x @ coef + Σ_e = resid.T @ resid / max(1, n_obs) + + a_lags_ols = coef[1:, 0] + r_lags_ols = coef[1:, 1] + denom = float(a_lags_ols @ a_lags_ols) + if denom > 1e-10: + α_ols = -float((a_lags_ols @ r_lags_ols) / denom) + else: + α_ols = -0.5 + + μ_x_ols = float(coef[0, 0]) + μ_r_ols = float(coef[0, 1]) + σ_x_ols = float(np.sqrt(max(Σ_e[0, 0], 1e-8))) + σ_r_ols = float(np.sqrt(max(Σ_e[1, 1], 1e-8))) + cov_xr_ols = float(Σ_e[0, 1]) + σ_u2_ols = ( + α_ols ** 2 * σ_x_ols ** 2 + + σ_r_ols ** 2 + + 2.0 * α_ols * cov_xr_ols + ) + β_ols = float(np.exp(-(μ_r_ols + α_ols * μ_x_ols + 0.5 * σ_u2_ols))) + β_ols = float(np.clip(β_ols, 1e-6, 2.0)) + + ols_seed = np.zeros(n_params) + ols_seed[0] = α_ols + ols_seed[1] = β_ols + ols_seed[2] = σ_x_ols + ols_seed[3] = σ_r_ols + ols_seed[4] = cov_xr_ols + ols_seed[5] = μ_x_ols + ols_seed[6:] = a_lags_ols + 
starts.append(ols_seed.copy()) + + seeds = [base, ols_seed] + while len(starts) < n_starts: + seed = seeds[len(starts) % len(seeds)] + trial = seed.copy() + trial[:2] += rng.normal(0.0, 0.2, 2) + trial[2:6] *= 1.0 + rng.normal(0.0, 0.15, 4) + trial[6:] += rng.normal(0.0, 0.08, 2 * n_lags) + trial[1] = max(trial[1], 1e-6) + trial[2] = max(trial[2], 1e-6) + trial[3] = max(trial[3], 1e-6) + starts.append(trial) + + return starts +``` + +Standard errors come from an outer-product-of-gradients (OPG) approximation to the +information matrix, computed by finite differences of per-observation +log-likelihood contributions. + +This tends to be more numerically stable than finite-difference Hessians in this +application. + +```{code-cell} ipython3 +def log_likelihood_contributions( + params, + y_t, + y_lag, + n_lags, + include_const=False, +): + """ + Vector of per-observation Gaussian log-likelihood contributions. + + Returns an array of length T, or None if the parameter vector is infeasible. + """ + parsed = unpack_parameters(params, n_lags) + if parsed is None: + return None + + resid = restricted_residuals(params, y_t, y_lag, n_lags) + if resid is None: + return None + + α = float(parsed["α"]) + σ_x = float(parsed["σ_x"]) + σ_r = float(parsed["σ_r"]) + cov_xr = float(parsed["cov_xr"]) + + Σ_e = np.array( + [ + [σ_x ** 2, cov_xr], + [cov_xr, σ_r ** 2], + ] + ) + A0 = np.array([[1.0, 0.0], [α, 1.0]]) + Σ_v = A0 @ Σ_e @ A0.T + + try: + chol = cholesky(Σ_v, lower=True) + log_det = 2.0 * np.sum(np.log(np.diag(chol) + 1e-16)) + std_resid = solve_triangular(chol, resid.T, lower=True).T + except (LinAlgError, ValueError): + return None + + quad = np.sum(std_resid ** 2, axis=1) + ll_t = -0.5 * log_det - 0.5 * quad + if include_const: + ll_t -= np.log(2.0 * np.pi) + if not np.all(np.isfinite(ll_t)): + return None + return ll_t + + +def opg_standard_errors( + params, + y_t, + y_lag, + n_lags, + step=1e-6, + max_step_shrink=12, + eig_floor=1e-12, +): + """ + Standard errors via OPG 
approximation to the information matrix. + """ + n = len(params) + ll0 = log_likelihood_contributions(params, y_t, y_lag, n_lags, include_const=False) + if ll0 is None: + return np.full(n, np.nan) + + n_obs = int(ll0.shape[0]) + scores = np.empty((n_obs, n)) + + for i in range(n): + base_step = step * (abs(params[i]) + 1.0) + hi = base_step + ll_plus = None + ll_minus = None + + for _ in range(max_step_shrink + 1): + p_plus = params.copy() + p_minus = params.copy() + p_plus[i] += hi + p_minus[i] -= hi + ll_plus = log_likelihood_contributions( + p_plus, y_t, y_lag, n_lags, include_const=False + ) + ll_minus = log_likelihood_contributions( + p_minus, y_t, y_lag, n_lags, include_const=False + ) + if ll_plus is not None and ll_minus is not None: + break + hi *= 0.5 + + if ll_plus is None or ll_minus is None: + return np.full(n, np.nan) + + scores[:, i] = (ll_plus - ll_minus) / (2.0 * hi) + + if not np.all(np.isfinite(scores)): + return np.full(n, np.nan) + + # Center scores to mitigate numerical drift away + scores = scores - scores.mean(axis=0, keepdims=True) + + opg = scores.T @ scores + if not np.all(np.isfinite(opg)): + return np.full(n, np.nan) + opg = 0.5 * (opg + opg.T) + + try: + eigvals, eigvecs = np.linalg.eigh(opg) + except (LinAlgError, ValueError): + return np.full(n, np.nan) + + floor = float(eig_floor) * max(1.0, float(np.max(eigvals))) + eigvals = np.clip(eigvals, floor, None) + cov = eigvecs @ np.diag(1.0 / eigvals) @ eigvecs.T + se = np.sqrt(np.maximum(np.diag(cov), 0.0)) + se[~np.isfinite(se)] = np.nan + return se +``` + +The multi-start MLE estimator below combines these pieces and returns parameters, fit criteria, and residuals + +```{code-cell} ipython3 +def estimate_mle(data, n_lags, verbose=False): + """ + Estimate the restricted model by multi-start local optimization. 
+ """ + y_t, y_lag = build_lagged_data(data, n_lags) + bounds = parameter_bounds(n_lags) + starts = starting_values(y_t, y_lag, n_lags) + + best_result = None + best_ll = -np.inf + + for i, x0 in enumerate(starts): + try: + result = minimize( + negative_log_likelihood, + x0=x0, + args=(y_t, y_lag, n_lags), + method="L-BFGS-B", + bounds=bounds, + ) + except Exception: + continue + + if np.isfinite(result.fun): + ll_val = log_likelihood_mle(result.x, y_t, y_lag, n_lags) + if np.isfinite(ll_val) and ll_val > best_ll: + best_ll = ll_val + best_result = result + if verbose: + print( + f"start={i}, success={result.success}, " + f"status={result.status}, loglike={ll_val:.2f}" + ) + + n_params = 6 + 2 * n_lags + + if best_result is None: + return { + "params": np.full(n_params, np.nan), + "se": np.full(n_params, np.nan), + "loglike": -np.inf, + "converged": False, + "optimizer_success": False, + "residuals": None, + "n_obs": int(y_t.shape[0]), + } + + params = best_result.x + se = opg_standard_errors(params, y_t, y_lag, n_lags) + resid = restricted_residuals(params, y_t, y_lag, n_lags) + ll_val = log_likelihood_mle(params, y_t, y_lag, n_lags) + + return { + "params": params, + "se": se, + "loglike": ll_val, + "converged": bool(np.isfinite(ll_val)), + "optimizer_success": bool(best_result.success), + "residuals": resid, + "n_obs": int(y_t.shape[0]), + } +``` + +Residual diagnostics below summarize normality and serial-correlation checks + +```{code-cell} ipython3 +def residual_diagnostics(resid): + """ + Compute basic residual diagnostics. 
+ """ + out = {} + + for i, label in enumerate(["consumption", "return"]): + jb_stat, jb_pval = stats.jarque_bera(resid[:, i]) + out[f"{label}_jb_stat"] = float(jb_stat) + out[f"{label}_jb_pval"] = float(jb_pval) + out[f"{label}_dw"] = float(durbin_watson(resid[:, i])) + + return out +``` + +Finally, a lag-loop wrapper runs MLE across different lag lengths and collects results in a DataFrame + +```{code-cell} ipython3 +def run_mle_by_lag( + data, + lags=(2, 4, 6), + verbose=False, +): + """ + Estimate restricted MLE models by lag length. + """ + rows = [] + fits = {} + + for lag in lags: + fit = estimate_mle(data, n_lags=lag, verbose=verbose) + fits[lag] = fit + + rows.append( + { + "NLAG": lag, + "α_hat": fit["params"][0], + "se_α": fit["se"][0], + "β_hat": fit["params"][1], + "se_β": fit["se"][1], + "loglike": fit["loglike"], + "n_obs": fit["n_obs"], + } + ) + + table = pd.DataFrame(rows).set_index("NLAG") + return table, fits +``` + +### Simulation + +Before applying the likelihood to real data, we verify that it recovers known parameters from simulated data generated by the restricted system. + +For `n_lags = 1`, the parameter vector is + +```{math} +\theta = (\alpha,\ \beta,\ \sigma_x,\ \sigma_r,\ \sigma_{xr},\ \mu_x,\ a_{x,1},\ a_{r,1}), +``` + +where the last two entries are the coefficients on $X_{t-1}$ and $R_{t-1}$ in the first-row regression for $X_t$. + +More generally, for `n_lags = p` we pack `a_lags` in the order +$[a_{x,1}, a_{r,1}, \ldots, a_{x,p}, a_{r,p}]$, so the full parameter vector has length $6 + 2p$. + +The covariance parameters $(\sigma_x, \sigma_r, \sigma_{xr})$ describe the reduced-form shocks to $(X_t, R_t)$: if $\boldsymbol{\varepsilon}_t = (\varepsilon_{x,t}, \varepsilon_{r,t})^\top \sim N(0, \boldsymbol{\Sigma}_\varepsilon)$, then +$\boldsymbol{\Sigma}_\varepsilon = \begin{bmatrix}\sigma_x^2 & \sigma_{xr}\\ \sigma_{xr} & \sigma_r^2\end{bmatrix}$. 
+ +The restricted-system innovation is $\mathbf{V}_t = \mathbf{A}_0 \boldsymbol{\varepsilon}_t$, so its covariance is $\boldsymbol{\Sigma}_V = \mathbf{A}_0 \boldsymbol{\Sigma}_\varepsilon \mathbf{A}_0^\top$, which is what enters the likelihood. + +In the simulation recursion we draw $\mathbf{V}_t \sim N(0, \boldsymbol{\Sigma}_V)$ and compute $\mathbf{Y}_t = \mathbf{A}_0^{-1}(\mathbf{A}_1(L)\mathbf{Y}_{t-1} + \boldsymbol{\mu} + \mathbf{V}_t)$ forward. + +The following table compares the true parameters to the MLE estimates from a large simulated sample + +```{code-cell} ipython3 +α_true = -1.00 +β_true = 0.993 +σ_x_true = 0.015 +σ_r_true = 0.020 +cov_xr_true = 0.0001 +μ_x_true = 0.002 +a_x1_true = 0.40 +a_r1_true = 0.10 + +true_params = np.array( + [ + α_true, + β_true, + σ_x_true, + σ_r_true, + cov_xr_true, + μ_x_true, + a_x1_true, + a_r1_true, + ] +) + +sim_mle_data = simulate_restricted_var( + params=true_params, + n_obs=50000, + n_lags=1, + burn_in=5000, + seed=0, +) + +fit_sim = estimate_mle(sim_mle_data, n_lags=1, verbose=False) +``` + +```{code-cell} ipython3 +:tags: [hide-input] + +sim_results = pd.DataFrame({ + "true": true_params[:2], + "estimate": fit_sim["params"][:2], + "se": fit_sim["se"][:2], +}, index=[r"α", r"β"]) +sim_results[r"t\ (H_0{:}\ \text{true})"] = ( + (sim_results["estimate"] - sim_results["true"]) / sim_results["se"] +) +display_table(sim_results, fmt={ + "true": "{:.4f}", "estimate": "{:.4f}", "se": "{:.6f}", r"t\ (H_0{:}\ \text{true})": "{:.2f}", +}) +``` + +The point estimates are close to the true parameters, and the t-statistics for the null hypothesis that the true value is correct are small in magnitude, consistent with sampling variation. + + +## Preference parameters and likelihood ratio tests + +The restricted system embeds preference parameters through cross-equation restrictions. + +The parameter $\alpha$ links predictable variation in returns to predictable variation in consumption growth. 
+
+Under the model, the return equation's dependence on lagged variables is entirely determined by $-\alpha$ times the consumption equation's lag coefficients.
+
+The parameter $\beta$ shifts the return-equation intercept through $-\log\beta - \sigma_U^2/2$.
+
+{cite:t}`hansen1983stochastic` test these restrictions by comparing the restricted system to an unrestricted bivariate VAR estimated on the same sample.
+
+If the restrictions imposed by the Euler equation are correct, the restricted model should fit nearly as well as the unrestricted model.
+
+The standard test is the likelihood ratio statistic
+
+```{math}
+:label: hs83-lr-test
+
+LR = 2(\ell_u - \ell_r) \Rightarrow \chi^2_d,
+```
+
+where $\ell_u$ and $\ell_r$ are the maximized log-likelihoods of the unrestricted and restricted models, and $d$ is the difference in the number of free parameters.
+
+Both likelihoods must be evaluated on the same effective sample for the LR distribution to be valid.
+
+{cite:t}`hansen1983stochastic` report that for the value-weighted aggregate stock return, the $\chi^2$ test statistics provide little evidence against the model.
+
+In the tables below we report both `chi2.cdf(LR, df)` (corresponding to what HS report in parentheses) and the usual right-tail `p(LR) = 1 - chi2.cdf`
+
+```{code-cell} ipython3
+def likelihood_ratio_test(
+    fit_restricted,
+    fit_unrestricted,
+    df_diff,
+):
+    """
+    Compare nested specifications with an LR test.
+ """ + if not (fit_restricted["converged"] and fit_unrestricted["converged"]): + return {"lr_stat": np.nan, "p_value": np.nan, "chi2_cdf": np.nan} + if fit_restricted.get("n_obs") != fit_unrestricted.get("n_obs"): + return {"lr_stat": np.nan, "p_value": np.nan, "chi2_cdf": np.nan} + + lr_stat = 2.0 * (fit_unrestricted["loglike"] - fit_restricted["loglike"]) + chi2_cdf = stats.chi2.cdf(lr_stat, df=df_diff) + p_value = 1.0 - chi2_cdf + return { + "lr_stat": float(lr_stat), + "p_value": float(p_value), + "chi2_cdf": float(chi2_cdf), + } +``` + +The unrestricted benchmark is a Gaussian VAR for $\mathbf{Y}_t = (X_t, R_t)^\top$ with free coefficients on all lags in both equations. + +```{code-cell} ipython3 +def estimate_unrestricted_var(data, n_lags): + """ + Estimate an unrestricted Gaussian VAR for Y_t = [X_t, R_t]. + """ + y_t, y_lag = build_lagged_data(data, n_lags) + n_obs = y_t.shape[0] + x = np.column_stack([np.ones(n_obs), y_lag]) + coef = np.linalg.lstsq(x, y_t, rcond=None)[0] + resid = y_t - x @ coef + Σ = resid.T @ resid / n_obs + + try: + chol = cholesky(Σ, lower=True) + log_det = 2.0 * np.sum(np.log(np.diag(chol) + 1e-16)) + std_resid = solve_triangular(chol, resid.T, lower=True).T + quad_form = np.sum(std_resid ** 2) + except (LinAlgError, ValueError): + return { + "coef": coef, + "Σ": np.full((2, 2), np.nan), + "residuals": resid, + "loglike": -np.inf, + "converged": False, + "n_obs": int(n_obs), + } + + d = y_t.shape[1] + loglike = float(-0.5 * n_obs * d * np.log(2.0 * np.pi) + - 0.5 * n_obs * log_det - 0.5 * quad_form) + + return { + "coef": coef, + "Σ": Σ, + "residuals": resid, + "loglike": loglike, + "converged": True, + "n_obs": int(n_obs), + } +``` + +```{code-cell} ipython3 +def run_unrestricted_var_by_lag(data, lags=(2, 4, 6)): + """ + Estimate unrestricted VAR models by lag length. 
+ """ + rows = [] + fits = {} + + for lag in lags: + fit = estimate_unrestricted_var(data, n_lags=lag) + fits[lag] = fit + rows.append( + { + "NLAG": lag, + "loglike": fit["loglike"], + "n_obs": fit["n_obs"], + } + ) + + table = pd.DataFrame(rows).set_index("NLAG") + return table, fits +``` + +The following function computes the LR statistic at each lag length, replicating the testing strategy of Table 1 in {cite:t}`hansen1983stochastic`. + +```{code-cell} ipython3 +def restricted_vs_unrestricted_lr(mle_fits, unrestricted_fits, lags=(2, 4, 6)): + """ + Compute LR tests of restricted model versus unrestricted VAR. + """ + rows = [] + + for lag in lags: + fit_r = mle_fits[lag] + fit_u = unrestricted_fits[lag] + df_diff = (2 * (1 + 2 * lag) + 3) - (6 + 2 * lag) + lr = likelihood_ratio_test(fit_restricted=fit_r, fit_unrestricted=fit_u, df_diff=df_diff) + rows.append( + { + "NLAG": lag, + "lr_stat": lr["lr_stat"], + "p_value": lr["p_value"], + "chi2_cdf": lr["chi2_cdf"], + "df": df_diff, + "T": fit_r.get("n_obs", np.nan), + } + ) + + return pd.DataFrame(rows).set_index("NLAG") +``` + +## Predictability and the R-squared restriction + +Section II of {cite:t}`hansen1983stochastic` emphasizes an implication of the restricted system for return predictability. + +From {eq}`hs83-cond-mean` and {eq}`hs83-x-forecast`, the predictable component of the log return is + +```{math} +:label: hs83-predictable-return + +E(R_t \mid \psi_{t-1}) = -\alpha\, E(X_t \mid \psi_{t-1}) - \log\beta - \frac{\sigma_U^2}{2}. +``` + +Since the predictable return is a linear function of the predictable consumption growth, the variance of the predictable return component is exactly $\alpha^2$ times the variance of the predictable consumption-growth component: + +```{math} +:label: hs83-var-pred + +\operatorname{Var}[E(R_t \mid \psi_{t-1})] = \alpha^2 \operatorname{Var}[E(X_t \mid \psi_{t-1})]. 
+``` + +{cite:t}`hansen1983stochastic` derive the implied $R^2$ of the return projection onto $\psi_{t-1}$: + +```{math} +:label: hs83-r2 + +R_R^2 = \frac{\alpha^2 \operatorname{Var}[E(X_t \mid \psi_{t-1})]}{\operatorname{Var}(R_t \mid \psi_{t-1}) + \alpha^2 \operatorname{Var}[E(X_t \mid \psi_{t-1})]}. +``` + +If $\alpha = 0$ (risk neutrality), then $R_R^2 = 0$ and asset returns are unpredictable. + +If $\alpha = -1$ (log utility), then $R_t - X_t$ is unpredictable, so returns and consumption growth share the same predictable component. + +More generally, the $R_R^2$ for returns will be small whenever the variance of the unpredictable return component $\operatorname{Var}(R_t \mid \psi_{t-1})$ is large relative to the predictable variance, which is the case for stock returns. + +The function below reports: +- restriction-side predictable-variance terms implied by the Euler equation, and +- $R_X^2$ and $R_R^2$ from the unrestricted VAR. + +```{code-cell} ipython3 +def predictability_metrics(data, restricted_fit, unrestricted_fit, n_lags): + """ + Compute predictable-component metrics and unrestricted VAR R^2 values. 
+ """ + y_t, y_lag = build_lagged_data(data, n_lags) + parsed = unpack_parameters(restricted_fit["params"], n_lags) + α = float(parsed["α"]) + β = float(parsed["β"]) + σ_x = float(parsed["σ_x"]) + σ_r = float(parsed["σ_r"]) + cov_xr = float(parsed["cov_xr"]) + μ_x = float(parsed["μ_x"]) + a_lags = np.asarray(parsed["a_lags"]) + + pred_x = y_lag @ a_lags + μ_x + σ_u2 = α ** 2 * σ_x ** 2 + σ_r ** 2 + 2.0 * α * cov_xr + pred_r = -α * pred_x - np.log(β) - 0.5 * σ_u2 + + x = y_t[:, 0] + r = y_t[:, 1] + resid_x = x - pred_x + resid_r = r - pred_r + + r2_x = 1.0 - np.var(resid_x) / np.var(x) + r2_r = 1.0 - np.var(resid_r) / np.var(r) + + if unrestricted_fit["converged"] and unrestricted_fit.get("coef") is not None: + n_obs = y_t.shape[0] + x_u = np.column_stack([np.ones(n_obs), y_lag]) + pred_u = x_u @ unrestricted_fit["coef"] + resid_u = y_t - pred_u + var_x = np.var(y_t[:, 0]) + var_r = np.var(y_t[:, 1]) + r2_x_unres = np.nan if var_x <= 0.0 else 1.0 - np.var(resid_u[:, 0]) / var_x + r2_r_unres = np.nan if var_r <= 0.0 else 1.0 - np.var(resid_u[:, 1]) / var_r + else: + r2_x_unres = np.nan + r2_r_unres = np.nan + + return { + "alpha_hat": α, + "var_pred_x": float(np.var(pred_x)), + "var_pred_r": float(np.var(pred_r)), + "alpha2_var_pred_x": float(α ** 2 * np.var(pred_x)), + "r2_x_restricted": float(r2_x), + "r2_r_restricted": float(r2_r), + "r2_x_unrestricted": float(r2_x_unres), + "r2_r_unrestricted": float(r2_r_unres), + "T": int(y_t.shape[0]), + } +``` + +## Return-difference tests + +{cite:t}`hansen1983stochastic` also propose tests based on differences in log returns across assets. + +From {eq}`hs83-cond-mean`, the conditional mean of asset $i$'s log return is $E_{t-1}[R_{it}] = -\alpha\, E_{t-1}[X_t] - \log\beta - \sigma_i^2/2$. 
+ +The term $-\alpha\, E_{t-1}[X_t] - \log\beta$ is common to all assets, so it cancels in the difference: + +$$ +E_{t-1}[R_{it} - R_{jt}] = \frac{\sigma_j^2 - \sigma_i^2}{2}, +$$ + +which is a constant that does not depend on time-$(t-1)$ information. + +Return differences should therefore be unpredictable if the model is correct, regardless of the values of $\alpha$ and $\beta$. + +These tests avoid the need to measure consumption, at the cost of losing the ability to identify $\alpha$ and $\beta$. + +{cite:t}`hansen1983stochastic` report that the return-difference restrictions are strongly rejected for models with multiple stock returns, providing substantial evidence against the CRRA-lognormal specification even when consumption measurement problems are eliminated. + +The code below is an illustration of this logic on simulated data. + +Reproducing the paper's empirical return-difference tables requires estimating multi-asset systems that are outside this lecture's scope + +```{code-cell} ipython3 +def simulate_multi_asset_nominal_returns( + n_obs, + n_assets=3, + α_true=-1.0, + β_true=0.993, + seed=0, +): + """ + Simulate log nominal returns satisfying E_t[beta * exp(alpha X) * r_i] = 1. + """ + if n_assets < 2: + raise ValueError("n_assets must be at least 2.") + + rng = np.random.default_rng(seed) + x = np.empty(n_obs) + x[0] = 0.001 + for t in range(1, n_obs): + x[t] = 0.001 + 0.4 * (x[t - 1] - 0.001) + 0.006 * rng.standard_normal() + + sigmas = np.linspace(0.03, 0.06, n_assets) + eps = rng.standard_normal((n_obs, n_assets)) * sigmas + log_returns = -np.log(β_true) - α_true * x[:, None] + eps \ + - 0.5 * sigmas[None, :] ** 2 + return x, log_returns + + +def return_difference_test(log_returns, n_lags=2): + """ + Test predictability of pairwise log-return differences. 
+ """ + if log_returns.ndim != 2 or log_returns.shape[1] < 2: + raise ValueError("log_returns must be T x m with m >= 2.") + if log_returns.shape[0] <= n_lags + 1: + raise ValueError("Sample size must exceed n_lags + 1.") + + t_obs, n_assets = log_returns.shape + pairs = list(combinations(range(n_assets), 2)) + n_obs = t_obs - n_lags - 1 + z = np.empty((n_obs, 1 + n_assets * n_lags)) + z[:, 0] = 1.0 + + for j in range(n_lags): + z[:, 1 + j * n_assets : 1 + (j + 1) * n_assets] = log_returns[ + n_lags - j : t_obs - 1 - j, : + ] + + rows = [] + for i, j in pairs: + y = log_returns[n_lags + 1 :, i] - log_returns[n_lags + 1 :, j] + coef = np.linalg.lstsq(z, y, rcond=None)[0] + resid = y - z @ coef + sigma2 = float((resid @ resid) / max(1, n_obs - z.shape[1])) + cov = sigma2 * np.linalg.pinv(z.T @ z) + slopes = coef[1:] + cov_slopes = cov[1:, 1:] + stat = float(slopes @ np.linalg.pinv(cov_slopes) @ slopes) + p_value = float(1.0 - stats.chi2.cdf(stat, df=slopes.shape[0])) + rows.append( + { + "pair": f"{i+1}-{j+1}", + "wald_chi2": stat, + "p_value": p_value, + "mean_spread": float(np.mean(y)), + } + ) + + return pd.DataFrame(rows).set_index("pair") +``` + +We run `return_difference_test` on simulated data with $m = 3$ assets, giving $\binom{3}{2} = 3$ pairs. 
+ +For each pair, the function regresses the return spread on a constant and `n_lags` lags of all asset returns, then tests whether the slope coefficients are jointly zero using a Wald $\chi^2$ statistic + +```{code-cell} ipython3 +_, sim_log_returns = simulate_multi_asset_nominal_returns( + n_obs=1500, + n_assets=3, + α_true=-1.0, + β_true=0.993, + seed=0, +) + +spread_test = return_difference_test(sim_log_returns, n_lags=2) +spread_pretty = spread_test.rename(columns={ + "wald_chi2": r"\chi^2", "p_value": "p", "mean_spread": r"\overline{\Delta R}", +}) +display_table(spread_pretty, fmt={ + r"\chi^2": "{:.3f}", + "p": "{:.3f}", + r"\overline{\Delta R}": "{:.5f}", +}) +``` + +Large $p$-values confirm that return differences are unpredictable in this simulation, exactly as the model predicts when $\alpha = -1$. + +## Empirical MLE estimation + +We now apply the maximum likelihood estimator of {cite:t}`hansen1983stochastic` to real data. + + +### Data + +Both this lecture and the companion lecture {doc}`hansen_singleton_1982` use the same data construction. + +{cite:t}`hansen1982generalized` and {cite:t}`hansen1983stochastic` use monthly data on real per capita consumption (nondurables) and stock returns from CRSP for the period 1959:2 through 1978:12. + +To align with the paper, we set the default sample to 1959:2--1978:12. + +You can pass different `start` and `end` dates to study later periods. + +This lecture pulls stock-market and one-month bill returns from the Ken French data library (`F-F_Research_Data_Factors`) and constructs gross nominal returns as `1 + (Mkt-RF + RF)/100` for the market and `1 + RF/100` for T-bills. + +`Mkt-RF` is the value-weighted return on all CRSP firms minus the risk-free rate, and `RF` is the one-month Treasury bill return (see [here](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html) for details). 
+ +While Hansen-Singleton use CRSP value-weighted NYSE returns, we use the Ken French market factor as the closest open-access alternative. + +The consumption series is constructed from consumption of nondurables (`ND`) with the nondurables deflator. + +The hidden cell below pulls the relevant FRED series, constructs per capita real consumption, and joins with the Ken French returns + +```{code-cell} ipython3 +:tags: [hide-cell] + +fred_codes = { + "population_16plus": "CNP16OV", + "cons_nd_real_index": "DNDGRA3M086SBEA", + "cons_nd_price_index": "DNDGRG3M086SBEA", +} + +def to_month_end(index): + """ + Convert a date index to month-end timestamps. + """ + return pd.PeriodIndex(pd.DatetimeIndex(index), freq="M").to_timestamp("M") + + +def load_hs_monthly_data( + start="1959-02-01", + end="1978-12-01", +): + """ + Build monthly gross real return and gross consumption-growth series. + """ + start_period = pd.Timestamp(start).to_period("M") + end_period = pd.Timestamp(end).to_period("M") + + # Pull one extra month to build the first in-sample growth rate + fetch_start = (start_period - 1).to_timestamp(how="start") + fetch_end = end_period.to_timestamp("M") + sample_start = start_period.to_timestamp("M") + sample_end = end_period.to_timestamp("M") + + fred = web.DataReader( + list(fred_codes.values()), "fred", fetch_start, fetch_end) + fred = fred.rename(columns={v: k for k, v in fred_codes.items()}) + fred.index = to_month_end(fred.index) + fred["cons_real_level"] = fred["cons_nd_real_index"] + fred["cons_price_index"] = fred["cons_nd_price_index"] + fred["consumption_per_capita"] = fred["cons_real_level"] \ + / fred["population_16plus"] + fred["gross_cons_growth"] = ( + fred["consumption_per_capita"] / fred["consumption_per_capita"].shift(1) + ) + fred["gross_inflation_cons"] = ( + fred["cons_price_index"] / fred["cons_price_index"].shift(1) + ) + + ff = web.DataReader( + "F-F_Research_Data_Factors", "famafrench", + fetch_start, fetch_end)[0].copy() + ff.columns = 
[str(col).strip() for col in ff.columns] + if ("Mkt-RF" not in ff.columns) or ("RF" not in ff.columns): + raise KeyError( + "Fama-French data missing required columns: 'Mkt-RF' and 'RF'.") + + # Mkt-RF and RF are reported in percent per month + ff["gross_nom_return"] = 1.0 + (ff["Mkt-RF"] + ff["RF"]) / 100.0 + ff["gross_nom_tbill"] = 1.0 + ff["RF"] / 100.0 + ff.index = ff.index.to_timestamp(how="end") + ff.index = to_month_end(ff.index) + market = ff[["gross_nom_return", "gross_nom_tbill"]] + + out = fred.join(market, how="inner") + out["gross_real_return"] = out["gross_nom_return"] \ + / out["gross_inflation_cons"] + out["gross_real_tbill"] = out["gross_nom_tbill"] \ + / out["gross_inflation_cons"] + out = out.loc[sample_start:sample_end].dropna() + + required_cols = [ + "gross_real_return", + "gross_cons_growth", + "gross_inflation_cons", + "consumption_per_capita", + "gross_real_tbill", + ] + return out[required_cols].copy() + + +def get_estimation_data( + start="1959-02-01", + end="1978-12-01", +): + """ + Return (dataframe, array) using observed data. + """ + frame = load_hs_monthly_data(start=start, end=end) + data = frame[["gross_real_return", "gross_cons_growth"]].to_numpy() + return frame, data + + +def get_tbill_estimation_data( + start="1959-02-01", + end="1978-12-01", +): + """ + Return (dataframe, array) using Treasury bill data. + """ + frame = load_hs_monthly_data(start=start, end=end) + data = frame[["gross_real_tbill", "gross_cons_growth"]].to_numpy() + return frame, data +``` + +### MLE estimation and predictability summaries + +With the data in hand, we can run the MLE estimation and compute the predictability summaries. + +Lognormality makes the MLE tractable when the assumption is correct. + +As {cite:t}`hansen1983stochastic` stress, however, $\alpha$ is estimated with relatively large standard errors, and precise inference on risk aversion cannot be expected from aggregate monthly data alone. 
+ +```{code-cell} ipython3 +lags = (2, 4, 6) + +emp_frame, emp_data = get_estimation_data() +``` + +The following table reports the restricted MLE estimates of $\hat\alpha$ and $\hat\beta$ by lag length + +```{code-cell} ipython3 +emp_log_data = to_mle_array(emp_data) +mle_table, mle_fits = run_mle_by_lag( + emp_log_data, lags=lags, verbose=False +) +mle_pretty = mle_table.rename(columns={ + "α_hat": r"\hat{\alpha}", "se_α": r"\mathrm{se}(\hat{\alpha})", + "β_hat": r"\hat{\beta}", "se_β": r"\mathrm{se}(\hat{\beta})", + "loglike": "logL", "n_obs": "T", +}) +display_table(mle_pretty, fmt={ + r"\hat{\alpha}": "{:.4f}", r"\mathrm{se}(\hat{\alpha})": "{:.4f}", + r"\hat{\beta}": "{:.4f}", r"\mathrm{se}(\hat{\beta})": "{:.4f}", + "logL": "{:.1f}", "T": "{:.0f}", +}) +``` + +The table reports $\hat\alpha$ and $\hat\beta$ by lag length for the sample used in the code cell above. + +For comparison, {cite:t}`hansen1983stochastic` report $\hat\alpha$ values of $-0.32$ to $-1.25$ (standard errors $0.65$ to $0.83$) for the value-weighted return with nondurables consumption. + +Our numbers fall into those ranges. + +In risk-aversion units, this corresponds to $-\hat\alpha$ between $0.32$ and $1.25$. 
+ +We now compute the predictability summaries + +```{code-cell} ipython3 +unres_table, unres_fits = run_unrestricted_var_by_lag( + emp_log_data, + lags=lags, +) +``` + +```{code-cell} ipython3 +:tags: [hide-input] + +pred_rows = [] +for lag in lags: + metrics = predictability_metrics( + emp_log_data, + restricted_fit=mle_fits[lag], + unrestricted_fit=unres_fits[lag], + n_lags=lag, + ) + pred_rows.append({"NLAG": lag, **metrics}) + +pred_df = pd.DataFrame(pred_rows).set_index("NLAG") +pred_pretty = pred_df[ + [ + "alpha_hat", + "var_pred_x", + "var_pred_r", + "alpha2_var_pred_x", + "r2_x_unrestricted", + "r2_r_unrestricted", + "T", + ] +].rename(columns={ + "alpha_hat": r"\hat{\alpha}", + "var_pred_x": r"\mathrm{Var}(\hat E[X_t\mid\psi_{t-1}])", + "var_pred_r": r"\mathrm{Var}(\hat E[R_t\mid\psi_{t-1}])", + "alpha2_var_pred_x": r"\hat{\alpha}^2\,\mathrm{Var}(\hat E[X_t\mid\psi_{t-1}])", + "r2_x_unrestricted": r"R_X^2", + "r2_r_unrestricted": r"R_R^2", + "T": "T", +}) +display_table(pred_pretty, fmt={ + r"\hat{\alpha}": "{:.4f}", + r"\mathrm{Var}(\hat E[X_t\mid\psi_{t-1}])": "{:.6f}", + r"\mathrm{Var}(\hat E[R_t\mid\psi_{t-1}])": "{:.6f}", + r"\hat{\alpha}^2\,\mathrm{Var}(\hat E[X_t\mid\psi_{t-1}])": "{:.6f}", + r"R_X^2": "{:.4f}", + r"R_R^2": "{:.4f}", + "T": "{:.0f}", +}) +``` + +The column $\hat\alpha^2 \operatorname{Var}(\hat E[X_t \mid \psi_{t-1}])$ should equal $\operatorname{Var}(\hat E[R_t \mid \psi_{t-1}])$ if the restriction {eq}`hs83-var-pred` holds. + +The columns $R_X^2$ and $R_R^2$ match the paper convention of reporting the values from the unrestricted VAR projections. + +In {cite:t}`hansen1983stochastic`, $R_R^2$ is small ($0.02$ to $0.06$) even when $R_X^2$ is nontrivial: most stock-return variation is unpredictable. + +Our estimates show the same pattern. + +We now test the Euler-equation restrictions by comparing the restricted system to an unrestricted VAR. 
+ +```{code-cell} ipython3 +lr_hs83 = restricted_vs_unrestricted_lr(mle_fits, unres_fits, lags=lags) +lr_pretty = lr_hs83.rename(columns={ + "lr_stat": "LR", + "chi2_cdf": "chi2.cdf(LR,df)", + "p_value": "p(LR)", + "df": "df", + "T": "T", +}) +display_table(lr_pretty, fmt={ + "LR": "{:.3f}", + "chi2.cdf(LR,df)": "{:.3f}", + "p(LR)": "{:.3f}", + "df": "{:.0f}", + "T": "{:.0f}", +}) + +sig_level = 0.05 +rejected_lags = [int(lag) for lag in lr_hs83.index if lr_hs83.loc[lag, "p_value"] < sig_level] +not_rejected_lags = [int(lag) for lag in lr_hs83.index if lr_hs83.loc[lag, "p_value"] >= sig_level] +print(f"Not rejected at 5%: {not_rejected_lags if not_rejected_lags else 'none'}") +``` + +The LR test does not reject the model for the value-weighted return, consistent with {cite:t}`hansen1983stochastic`. + +### Treasury bill estimation + +We now repeat the estimation using the 1-month Treasury bill return in place of the stock return. + +{cite:t}`hansen1983stochastic` find that the model is strongly rejected for Treasury bills (Table 4 of their paper). + +Because the nominally risk-free T-bill return is much more predictable than stock returns, the proportionality restriction has more bite, and the LR test has greater power to detect violations + +```{code-cell} ipython3 +tbill_frame, tbill_data = get_tbill_estimation_data() +``` + +The following table reports the restricted MLE estimates for Treasury bill returns by lag length. 
+ +```{code-cell} ipython3 +tbill_log_data = to_mle_array(tbill_data) +tbill_mle_table, tbill_mle_fits = run_mle_by_lag( + tbill_log_data, lags=lags, verbose=False +) +``` + +```{code-cell} ipython3 +:tags: [hide-input] + +tbill_mle_pretty = tbill_mle_table.rename(columns={ + "α_hat": r"\hat{\alpha}", "se_α": r"\mathrm{se}(\hat{\alpha})", + "β_hat": r"\hat{\beta}", "se_β": r"\mathrm{se}(\hat{\beta})", + "loglike": "logL", "n_obs": "T", +}) +display_table(tbill_mle_pretty, fmt={ + r"\hat{\alpha}": "{:.4f}", r"\mathrm{se}(\hat{\alpha})": "{:.4f}", + r"\hat{\beta}": "{:.4f}", r"\mathrm{se}(\hat{\beta})": "{:.4f}", + "logL": "{:.1f}", "T": "{:.0f}", +}) +``` + +The LR test below remains the main specification check in the paper + +```{code-cell} ipython3 +tbill_unres_table, tbill_unres_fits = run_unrestricted_var_by_lag( + tbill_log_data, + lags=lags, +) + +tbill_unres_pretty = tbill_unres_table.rename(columns={"loglike": "logL", "n_obs": "T"}) +display_table(tbill_unres_pretty, fmt={ + "logL": "{:.1f}", "T": "{:.0f}", +}) +``` + +```{code-cell} ipython3 +:tags: [hide-input] + +tbill_pred_rows = [] +for lag in lags: + metrics = predictability_metrics( + tbill_log_data, + restricted_fit=tbill_mle_fits[lag], + unrestricted_fit=tbill_unres_fits[lag], + n_lags=lag, + ) + tbill_pred_rows.append({"NLAG": lag, **metrics}) + +tbill_pred_df = pd.DataFrame(tbill_pred_rows).set_index("NLAG") +tbill_pred_pretty = tbill_pred_df[ + [ + "alpha_hat", + "var_pred_x", + "var_pred_r", + "alpha2_var_pred_x", + "r2_x_unrestricted", + "r2_r_unrestricted", + "T", + ] +].rename(columns={ + "alpha_hat": r"\hat{\alpha}", + "var_pred_x": r"\mathrm{Var}(\hat E[X_t\mid\psi_{t-1}])", + "var_pred_r": r"\mathrm{Var}(\hat E[R_t\mid\psi_{t-1}])", + "alpha2_var_pred_x": r"\hat{\alpha}^2\,\mathrm{Var}(\hat E[X_t\mid\psi_{t-1}])", + "r2_x_unrestricted": r"R_X^2", + "r2_r_unrestricted": r"R_R^2", + "T": "T", +}) +display_table(tbill_pred_pretty, fmt={ + r"\hat{\alpha}": "{:.4f}", + r"\mathrm{Var}(\hat 
E[X_t\mid\psi_{t-1}])": "{:.6f}", + r"\mathrm{Var}(\hat E[R_t\mid\psi_{t-1}])": "{:.6f}", + r"\hat{\alpha}^2\,\mathrm{Var}(\hat E[X_t\mid\psi_{t-1}])": "{:.6f}", + r"R_X^2": "{:.4f}", + r"R_R^2": "{:.4f}", + "T": "{:.0f}", +}) +``` + +The following table reports the likelihood ratio tests for the Treasury bill model. + +```{code-cell} ipython3 +tbill_lr = restricted_vs_unrestricted_lr(tbill_mle_fits, tbill_unres_fits, lags=lags) +``` + +```{code-cell} ipython3 +:tags: [hide-input] +tbill_lr_pretty = tbill_lr.rename(columns={ + "lr_stat": "LR", + "chi2_cdf": "chi2.cdf(LR,df)", + "p_value": "p(LR)", + "df": "df", + "T": "T", +}) +display_table(tbill_lr_pretty, fmt={ + "LR": "{:.3f}", + "chi2.cdf(LR,df)": "{:.3f}", + "p(LR)": "{:.3f}", + "df": "{:.0f}", + "T": "{:.0f}", +}) + +tbill_rejected_lags = [int(lag) for lag in tbill_lr.index if tbill_lr.loc[lag, "p_value"] < 0.05] +print(f"T-bill model rejected at 5% for lags: {tbill_rejected_lags if tbill_rejected_lags else 'none'}") +``` + +{cite:t}`hansen1983stochastic` find the same qualitative pattern in their 1959--1978 sample: the Treasury bill model is rejected much more strongly than the value-weighted stock-return model. + +Why does the LR test not reject the model for stocks? + +As we hinted earlier, one reason may be limited test power when return predictability is small (as reflected in the low $R_R^2$ for stocks). + +When aggregate stock returns are nearly unpredictable, there is almost no predictable variation to constrain. + +## Residual diagnostics + +As a final check, let's inspect the residual paths, histograms, and diagnostic statistics from the restricted model to assess whether the normality and serial independence assumptions hold. 
+ +```{code-cell} ipython3 +--- +mystnb: + figure: + caption: Restricted-model residual diagnostics + name: fig-hs83-residual-diagnostics +--- +diag_lag = 2 +diag_fit = mle_fits[diag_lag] +resid = diag_fit["residuals"] + +if diag_fit["converged"] and resid is not None: + fig, axes = plt.subplots(2, 2, figsize=(12, 8)) + axes[0, 0].plot(resid[:, 0], lw=2) + axes[0, 0].axhline(0.0, color="black", lw=2) + axes[0, 0].set_ylabel("consumption residual") + axes[0, 0].set_xlabel("observation") + + axes[0, 1].plot(resid[:, 1], lw=2) + axes[0, 1].axhline(0.0, color="black", lw=2) + axes[0, 1].set_ylabel("return residual") + axes[0, 1].set_xlabel("observation") + + axes[1, 0].hist(resid[:, 0], bins=30, edgecolor="white") + axes[1, 0].set_xlabel("consumption residual") + axes[1, 0].set_ylabel("count") + + axes[1, 1].hist(resid[:, 1], bins=30, edgecolor="white") + axes[1, 1].set_xlabel("return residual") + axes[1, 1].set_ylabel("count") + plt.tight_layout() + plt.show() +``` + +The following table reports Jarque-Bera normality tests and Durbin-Watson serial correlation statistics for the restricted-model residuals. + +```{code-cell} ipython3 +if diag_fit["converged"] and resid is not None: + diag = residual_diagnostics(resid) + diag_df = pd.DataFrame({ + "JB stat": [diag["consumption_jb_stat"], diag["return_jb_stat"]], + "JB p-val": [diag["consumption_jb_pval"], diag["return_jb_pval"]], + "DW": [diag["consumption_dw"], diag["return_dw"]], + }, index=pd.Index(["consumption", "return"], name="series")) + display_table(diag_df, fmt={ + "JB stat": "{:.2f}", "JB p-val": "{:.4f}", "DW": "{:.3f}", + }) +``` + +The residual time-series plots reveal periods of volatility clustering, while the histograms show departures from bell curve with fatter tails. + +The Jarque-Bera statistics are large for both series, rejecting the normality assumption that underlies the likelihood. + +The Durbin-Watson statistics are close to 2 for both series, so serial correlation is not a concern. 
+ +This motivates the companion lecture {doc}`hansen_singleton_1982` where we discuss how GMM avoids the normality assumption. + +## Connection to the equity premium puzzle + +Our estimates reproduce the pattern that {cite:t}`MehraPrescott1985` later called the **equity premium puzzle**. + +- *Low estimated risk aversion:* The estimated $\hat\alpha$ values (and thus risk aversion $-\hat\alpha$) from the table above are similar to those in {cite:t}`hansen1983stochastic`, who report $\hat\alpha$ between $-0.32$ and $-1.25$. + +- *Tiny return predictability:* The unrestricted-VAR $R_R^2$ values are comparable to the 0.02 to 0.06 range in {cite:t}`hansen1983stochastic` — the predictable component of stock returns is small relative to the unpredictable component. + +- *Strong rejection for Treasury bills:* The Euler-equation restrictions are decisively rejected for the nominally risk-free Treasury bill return, just as in Table 4 of {cite:t}`hansen1983stochastic`. + +The restrictions have more bite when the return series is more predictable (the unrestricted-VAR $R_R^2$ for bills is much larger than for stocks), a precursor to what {cite:t}`Weil_1989` would later call the **risk-free rate puzzle**. + +{cite:t}`MehraPrescott1985` presented closely related findings. + +In a calibrated version of the consumption-based model with CRRA utility, they showed that for relative risk aversion $\gamma_{\text{MP}}$ in the range of 0 to 10, the model could not simultaneously match: + +1. the average annual equity premium of about 6\%, +2. the average annual risk-free rate of about 1\%. + +Matching the equity premium requires $\gamma_{\text{MP}}$ well above 10, while matching the risk-free rate requires low $\gamma_{\text{MP}}$. + +The same difficulty appears in our estimates: the implied risk aversion $-\hat\alpha$ is too low to generate a large equity premium. 
+
+## Another approach
+
+A companion lecture {doc}`hansen_singleton_1982` describes a closely related paper in which
+Hansen and Singleton specified less about the joint distribution of returns and consumption growth.
+
+They formulated an incomplete probability model that stops short of specifying a likelihood function.
+
+To proceed, they used a generalized method of moments (GMM) estimator to estimate key parameters that appear in the Euler equation.
\ No newline at end of file
diff --git a/lectures/measurement_models.md b/lectures/measurement_models.md
index b2d6e42e0..ea4133a81 100644
--- a/lectures/measurement_models.md
+++ b/lectures/measurement_models.md
@@ -11,7 +11,7 @@ kernelspec:
   name: python3
 ---
 
-(sargent_measurement_models)=
+(measurement_models)=
 
 ```{raw} jupyter