Fama–MacBeth Regressions

Jun He May 31, 2026

The Fama–MacBeth procedure estimates how characteristics are priced in the cross-section while handling the correlation structure of panel returns. Run a cross-sectional regression each period, then average the coefficients over time — the average is the risk premium and its time-series variation gives the standard error. Use the R | Python toggle to switch.

library(tidyverse)
library(tidyfinance)

import pandas as pd
import numpy as np
import sqlite3
import statsmodels.formula.api as smf

Data preparation

We assemble a panel of returns plus characteristics: beta (from the beta chapter), book-to-market, and log market cap. The characteristics are lagged six months and forward-filled so that the regression predicting next month's return uses only past information.

crsp_monthly <- tbl(tidy_finance, "crsp_monthly") |> collect()
compustat <- tbl(tidy_finance, "compustat") |> collect()
beta <- tbl(tidy_finance, "beta") |> collect()

characteristics <- compustat |>
  mutate(date = floor_date(ymd(datadate), "month")) |>
  left_join(crsp_monthly, join_by(gvkey, date)) |>
  left_join(beta, join_by(permno, date)) |>
  mutate(
    bm = be / mktcap,
    sorting_date = date %m+% months(6),
    log_mktcap = log(mktcap)
  ) |>
  select(permno, sorting_date, beta = beta_monthly, bm, log_mktcap) |>
  drop_na()

data_fama_macbeth <- crsp_monthly |>
  left_join(characteristics, join_by(permno, date == sorting_date)) |>
  group_by(permno) |>
  arrange(date) |>
  fill(c(beta, bm, log_mktcap), .direction = "down") |>
  ungroup() |>
  left_join(
    crsp_monthly |>
      select(permno, date, ret_excess_lead = ret_excess) |>
      mutate(date = date %m-% months(1)),
    join_by(permno, date)
  ) |>
  select(permno, date, ret_excess_lead, beta, log_mktcap, bm) |>
  drop_na()

tidy_finance = sqlite3.connect(database="data/tidy_finance_python.sqlite")
crsp_monthly = pd.read_sql_query("SELECT * FROM crsp_monthly", tidy_finance, parse_dates={"date"})
compustat = pd.read_sql_query("SELECT * FROM compustat", tidy_finance, parse_dates={"datadate"})
beta = pd.read_sql_query("SELECT * FROM beta", tidy_finance, parse_dates={"date"})

characteristics = (compustat
  .assign(date=lambda x: x["datadate"].dt.to_period("M").dt.to_timestamp())
  .merge(crsp_monthly, how="left", on=["gvkey", "date"])
  .merge(beta, how="left", on=["permno", "date"])
  .assign(
    bm=lambda x: x["be"] / x["mktcap"],
    sorting_date=lambda x: x["date"] + pd.DateOffset(months=6),
    log_mktcap=lambda x: np.log(x["mktcap"]))
  .get(["permno", "sorting_date", "beta_monthly", "bm", "log_mktcap"])
  .rename(columns={"beta_monthly": "beta", "sorting_date": "date"})
  .dropna())

data_fama_macbeth = (crsp_monthly
  .merge(characteristics, how="left", on=["permno", "date"])
  .sort_values(["permno", "date"])
  .groupby("permno")
  .apply(lambda x: x.assign(
    beta=x["beta"].ffill(), bm=x["bm"].ffill(), log_mktcap=x["log_mktcap"].ffill()))
  .reset_index(drop=True))

data_fama_macbeth = (data_fama_macbeth
  .merge(crsp_monthly.get(["permno", "date", "ret_excess"])
    .rename(columns={"ret_excess": "ret_excess_lead"})
    .assign(date=lambda x: x["date"] - pd.DateOffset(months=1)),
    how="left", on=["permno", "date"])
  .get(["permno", "date", "ret_excess_lead", "beta", "log_mktcap", "bm"])
  .dropna())

Step one: cross-sectional regressions

For each month, regress next month's excess return on the characteristics across all stocks. This yields a time series of slope coefficients — one estimate of each characteristic's premium per period.

risk_premiums <- data_fama_macbeth |>
  nest(data = -date) |>
  mutate(estimates = map(
    data,
    ~ tidy(lm(ret_excess_lead ~ beta + log_mktcap + bm, data = .x))
  )) |>
  unnest(estimates)

risk_premiums = (data_fama_macbeth
  .groupby("date")
  .apply(lambda x: smf.ols(
      formula="ret_excess_lead ~ beta + log_mktcap + bm", data=x
    ).fit().params)
  .reset_index()
)

Step two: time-series aggregation

The premium for each characteristic is the time-series average of its period-by-period coefficients; the t-statistic is that mean divided by its standard error, which comes from how the estimates vary across periods. Because the coefficients can be autocorrelated, a Newey–West correction is the standard refinement.

price_of_risk <- risk_premiums |>
  group_by(factor = term) |>
  summarize(
    risk_premium = mean(estimate) * 100,
    t_statistic = mean(estimate) / sd(estimate) * sqrt(n())
  )

regressions_for_newey_west <- risk_premiums |>
  select(date, factor = term, estimate) |>
  nest(data = c(date, estimate)) |>
  mutate(
    model = map(data, ~ lm(estimate ~ 1, .)),
    mean = map(model, tidy)
  )

price_of_risk_newey_west <- regressions_for_newey_west |>
  mutate(newey_west_se = map_dbl(model, ~ sqrt(NeweyWest(.)))) |>
  unnest(mean) |>
  mutate(t_statistic_newey_west = estimate / newey_west_se) |>
  select(factor, risk_premium = estimate, t_statistic_newey_west)

price_of_risk = (risk_premiums
  .melt(id_vars="date", var_name="factor", value_name="estimate")
  .groupby("factor")
  .apply(lambda x: pd.Series({
    "risk_premium": x["estimate"].mean() * 100,
    "t_statistic": x["estimate"].mean() / x["estimate"].std() * np.sqrt(len(x))
  }))
  .reset_index())

def newey_west_tstat(x):
    fit = smf.ols("estimate ~ 1", data=x).fit(
        cov_type="HAC", cov_kwds={"maxlags": 6})
    return fit.params["Intercept"] / fit.bse["Intercept"]

price_of_risk_newey_west = (risk_premiums
  .melt(id_vars="date", var_name="factor", value_name="estimate")
  .groupby("factor")
  .apply(newey_west_tstat)
  .reset_index(name="t_statistic_newey_west"))

Study notes following the Tidy Finance curriculum by Scheuch, Voigt, Weiss, and Frey. Prose is my own; the R/Python code is reproduced from the book's open-source source, licensed CC BY-NC-SA 4.0.