Fama–MacBeth Regressions
The Fama–MacBeth procedure estimates how characteristics are priced in the cross-section while handling the correlation structure of panel returns. Run a cross-sectional regression each period, then average the coefficients over time — the average is the risk premium and its time-series variation gives the standard error. Use the R | Python toggle to switch.
library(tidyverse)
library(tidyfinance)
import pandas as pd
import numpy as np
import sqlite3
import statsmodels.formula.api as smf
Data preparation
We assemble a panel of returns plus characteristics: beta (from the beta chapter), book-to-market, and log market cap. The characteristics are lagged six months and forward-filled so that the regression predicting next month's return uses only past information.
crsp_monthly <- tbl(tidy_finance, "crsp_monthly") |> collect()
compustat <- tbl(tidy_finance, "compustat") |> collect()
beta <- tbl(tidy_finance, "beta") |> collect()
characteristics <- compustat |>
mutate(date = floor_date(ymd(datadate), "month")) |>
left_join(crsp_monthly, join_by(gvkey, date)) |>
left_join(beta, join_by(permno, date)) |>
mutate(
bm = be / mktcap,
sorting_date = date %m+% months(6),
log_mktcap = log(mktcap)
) |>
select(permno, sorting_date, beta = beta_monthly, bm, log_mktcap) |>
drop_na()
data_fama_macbeth <- crsp_monthly |>
left_join(characteristics, join_by(permno, date == sorting_date)) |>
group_by(permno) |>
arrange(date) |>
fill(c(beta, bm, log_mktcap), .direction = "down") |>
ungroup() |>
left_join(
crsp_monthly |>
select(permno, date, ret_excess_lead = ret_excess) |>
mutate(date = date %m-% months(1)),
join_by(permno, date)
) |>
select(permno, date, ret_excess_lead, beta, log_mktcap, bm) |>
drop_na()
tidy_finance = sqlite3.connect(database="data/tidy_finance_python.sqlite")
crsp_monthly = pd.read_sql_query("SELECT * FROM crsp_monthly", tidy_finance, parse_dates={"date"})
compustat = pd.read_sql_query("SELECT * FROM compustat", tidy_finance, parse_dates={"datadate"})
beta = pd.read_sql_query("SELECT * FROM beta", tidy_finance, parse_dates={"date"})
characteristics = (compustat
.assign(date=lambda x: x["datadate"].dt.to_period("M").dt.to_timestamp())
.merge(crsp_monthly, how="left", on=["gvkey", "date"])
.merge(beta, how="left", on=["permno", "date"])
.assign(
bm=lambda x: x["be"] / x["mktcap"],
sorting_date=lambda x: x["date"] + pd.DateOffset(months=6),
log_mktcap=lambda x: np.log(x["mktcap"]))
.get(["permno", "sorting_date", "beta_monthly", "bm", "log_mktcap"])
.rename(columns={"beta_monthly": "beta", "sorting_date": "date"})
.dropna())
data_fama_macbeth = (crsp_monthly
.merge(characteristics, how="left", on=["permno", "date"])
.sort_values(["permno", "date"])
.groupby("permno")
.apply(lambda x: x.assign(
beta=x["beta"].ffill(), bm=x["bm"].ffill(), log_mktcap=x["log_mktcap"].ffill()))
.reset_index(drop=True))
data_fama_macbeth = (data_fama_macbeth
.merge(crsp_monthly.get(["permno", "date", "ret_excess"])
.rename(columns={"ret_excess": "ret_excess_lead"})
.assign(date=lambda x: x["date"] - pd.DateOffset(months=1)),
how="left", on=["permno", "date"])
.get(["permno", "date", "ret_excess_lead", "beta", "log_mktcap", "bm"])
.dropna())
Step one: cross-sectional regressions
For each month, regress next month's excess return on the characteristics across all stocks. This yields a time series of slope coefficients — one estimate of each characteristic's premium per period.
risk_premiums <- data_fama_macbeth |>
nest(data = -date) |>
mutate(estimates = map(
data,
~ tidy(lm(ret_excess_lead ~ beta + log_mktcap + bm, data = .x))
)) |>
unnest(estimates)
risk_premiums = (data_fama_macbeth
.groupby("date")
.apply(lambda x: smf.ols(
formula="ret_excess_lead ~ beta + log_mktcap + bm", data=x
).fit().params)
.reset_index()
)
Step two: time-series aggregation
The premium for each characteristic is the time-series average of its period-by-period coefficients; the t-statistic is that mean divided by its standard error, which comes from how the estimates vary across periods. Because the coefficients can be autocorrelated, a Newey–West correction is the standard refinement.
price_of_risk <- risk_premiums |>
group_by(factor = term) |>
summarize(
risk_premium = mean(estimate) * 100,
t_statistic = mean(estimate) / sd(estimate) * sqrt(n())
)
regressions_for_newey_west <- risk_premiums |>
select(date, factor = term, estimate) |>
nest(data = c(date, estimate)) |>
mutate(
model = map(data, ~ lm(estimate ~ 1, .)),
mean = map(model, tidy)
)
price_of_risk_newey_west <- regressions_for_newey_west |>
mutate(newey_west_se = map_dbl(model, ~ sqrt(NeweyWest(.)))) |>
unnest(mean) |>
mutate(t_statistic_newey_west = estimate / newey_west_se) |>
select(factor, risk_premium = estimate, t_statistic_newey_west)
price_of_risk = (risk_premiums
.melt(id_vars="date", var_name="factor", value_name="estimate")
.groupby("factor")
.apply(lambda x: pd.Series({
"risk_premium": x["estimate"].mean() * 100,
"t_statistic": x["estimate"].mean() / x["estimate"].std() * np.sqrt(len(x))
}))
.reset_index())
def newey_west_tstat(x):
fit = smf.ols("estimate ~ 1", data=x).fit(
cov_type="HAC", cov_kwds={"maxlags": 6})
return fit.params["Intercept"] / fit.bse["Intercept"]
price_of_risk_newey_west = (risk_premiums
.melt(id_vars="date", var_name="factor", value_name="estimate")
.groupby("factor")
.apply(newey_west_tstat)
.reset_index(name="t_statistic_newey_west"))
Study notes following the Tidy Finance curriculum by Scheuch, Voigt, Weiss, and Frey. Prose is my own; the R/Python code is reproduced from the book's open-source source, licensed CC BY-NC-SA 4.0.