Stock Pseudo Data
The core of the data is the cross-section of stocks. We generate 100 fictional stocks with unique permno and gvkey identifiers and associated exchcd, exchange, industry, and siccd values. The industry and exchange shares roughly match the real crsp_monthly distribution, but the identifiers are made up.
Stock identifiers
number_of_stocks <- 100
industries <- tibble(
industry = c(
"Agriculture", "Construction", "Finance", "Manufacturing", "Mining",
"Public", "Retail", "Services", "Transportation", "Utilities", "Wholesale"
),
prob = c(
0.00319, 0.0113, 0.185, 0.339, 0.0508,
0.0779, 0.0620, 0.169, 0.0493, 0.0180, 0.0357
)
)
exchanges <- tibble(
exchange = c("AMEX", "NASDAQ", "NYSE"),
prob = c(0.113, 0.671, 0.216)
)
stock_identifiers <- 1:number_of_stocks |>
map_df(function(x) {
tibble(
permno = x,
gvkey = as.character(x + 10000),
exchange = sample(exchanges$exchange, 1, prob = exchanges$prob),
industry = sample(industries$industry, 1, prob = industries$prob)
) |>
mutate(
exchcd = case_when(
exchange == "NYSE" ~ sample(c(1, 31), n()),
exchange == "AMEX" ~ sample(c(2, 32), n()),
exchange == "NASDAQ" ~ sample(c(3, 33), n())
),
siccd = case_when(
industry == "Agriculture" ~ sample(1:999, n()),
industry == "Mining" ~ sample(1000:1499, n()),
industry == "Construction" ~ sample(1500:1799, n()),
industry == "Manufacturing" ~ sample(1800:3999, n()),
industry == "Transportation" ~ sample(4000:4899, n()),
industry == "Utilities" ~ sample(4900:4999, n()),
industry == "Wholesale" ~ sample(5000:5199, n()),
industry == "Retail" ~ sample(5200:5999, n()),
industry == "Finance" ~ sample(6000:6799, n()),
industry == "Services" ~ sample(7000:8999, n()),
industry == "Public" ~ sample(9000:9999, n())
)
)
})
number_of_stocks = 100
industries = pd.DataFrame({
"industry": [
"Agriculture", "Construction", "Finance", "Manufacturing", "Mining",
"Public", "Retail", "Services", "Transportation", "Utilities", "Wholesale",
],
"prob": [
0.00319, 0.0113, 0.185, 0.339, 0.0508,
0.0779, 0.0620, 0.169, 0.0493, 0.0180, 0.0357,
],
})
exchanges = pd.DataFrame({
"exchange": ["AMEX", "NASDAQ", "NYSE"],
"prob": [0.113, 0.671, 0.216],
})
exchcd_map = {"NYSE": [1, 31], "AMEX": [2, 32], "NASDAQ": [3, 33]}
sic_ranges = {
"Agriculture": (1, 999), "Mining": (1000, 1499), "Construction": (1500, 1799),
"Manufacturing": (1800, 3999), "Transportation": (4000, 4899),
"Utilities": (4900, 4999), "Wholesale": (5000, 5199), "Retail": (5200, 5999),
"Finance": (6000, 6799), "Services": (7000, 8999), "Public": (9000, 9999),
}
rows = []
for x in range(1, number_of_stocks + 1):
exchange = rng.choice(exchanges["exchange"],
p=exchanges["prob"] / exchanges["prob"].sum())
industry = rng.choice(industries["industry"],
p=industries["prob"] / industries["prob"].sum())
lo, hi = sic_ranges[industry]
rows.append({
"permno": x,
"gvkey": str(x + 10000),
"exchange": exchange,
"industry": industry,
"exchcd": rng.choice(exchcd_map[exchange]),
"siccd": rng.integers(lo, hi + 1),
})
stock_identifiers = pd.DataFrame(rows)
The three panels
We expand the cross-section into yearly, monthly, and daily panels.
stock_panel_yearly <- expand_grid(
stock_identifiers,
tibble(year = time_series_years)
) |>
select(gvkey, year)
stock_panel_monthly <- expand_grid(
stock_identifiers,
tibble(date = time_series_months)
) |>
select(permno, gvkey, date, siccd, industry, exchcd, exchange)
stock_panel_daily <- expand_grid(
stock_identifiers,
tibble(date = time_series_days)
) |>
select(permno, date)
stock_panel_yearly = (
stock_identifiers.merge(pd.DataFrame({"year": time_series_years}), how="cross")
[["gvkey", "year"]]
)
stock_panel_monthly = (
stock_identifiers.merge(pd.DataFrame({"date": time_series_months}), how="cross")
[["permno", "gvkey", "date", "siccd", "industry", "exchcd", "exchange"]]
)
stock_panel_daily = (
stock_identifiers.merge(pd.DataFrame({"date": time_series_days}), how="cross")
[["permno", "date"]]
)
beta table
Monthly betas are drawn from a normal with mean and standard deviation 1; daily betas add a small noise term.
beta_pseudo <- stock_panel_monthly |>
mutate(
beta_monthly = rnorm(n(), mean = 1, sd = 1),
beta_daily = beta_monthly + rnorm(n()) / 100
)
write_parquet(beta_pseudo, "data-r/beta.parquet")
n = len(stock_panel_monthly)
beta_pseudo = stock_panel_monthly.copy()
beta_pseudo["beta_monthly"] = rng.normal(1, 1, n)
beta_pseudo["beta_daily"] = beta_pseudo["beta_monthly"] + rng.normal(0, 1, n) / 100
beta_pseudo.to_parquet("data-python/beta.parquet")
compustat_annual table
Every Compustat column is filled with uniform random numbers in [0, 1]. For simplicity, datadate is set to the last day of each year.
relevant_columns <- c(
"seq", "ceq", "at", "lt", "txditc", "txdb", "itcb", "pstkrv", "pstkl",
"pstk", "capx", "oancf", "sale", "cogs", "xint", "xsga", "be", "op",
"at_lag", "inv"
)
commands <- unlist(
map(relevant_columns, ~ rlang::exprs(!!..1 := runif(n())))
)
compustat_pseudo <- stock_panel_yearly |>
mutate(
datadate = ymd(str_c(year, "12", "31")),
!!!commands
)
write_parquet(compustat_pseudo, "data-r/compustat_annual.parquet")
relevant_columns = [
"seq", "ceq", "at", "lt", "txditc", "txdb", "itcb", "pstkrv", "pstkl",
"pstk", "capx", "oancf", "sale", "cogs", "xint", "xsga", "be", "op",
"at_lag", "inv",
]
compustat_pseudo = stock_panel_yearly.copy()
compustat_pseudo["datadate"] = pd.to_datetime(
compustat_pseudo["year"].astype(str) + "-12-31"
)
n = len(compustat_pseudo)
for col in relevant_columns:
compustat_pseudo[col] = rng.uniform(0, 1, n)
compustat_pseudo.to_parquet("data-python/compustat_annual.parquet")
crsp_monthly table
Beyond the monthly panel, CRSP adds returns, excess returns, shares outstanding, the last price, and market cap (with a lagged copy).
crsp_monthly_pseudo <- stock_panel_monthly |>
mutate(
ret = pmax(rnorm(n()), -1),
ret_excess = ret - runif(n(), 0, 0.0025),
shrout = runif(n(), 1, 50) * 1000,
altprc = runif(n(), 0, 1000),
mktcap = shrout * altprc
) |>
group_by(permno) |>
arrange(date) |>
mutate(mktcap_lag = lag(mktcap)) |>
ungroup()
write_parquet(crsp_monthly_pseudo, "data-r/crsp_monthly.parquet")
n = len(stock_panel_monthly)
crsp_monthly_pseudo = stock_panel_monthly.copy()
crsp_monthly_pseudo["ret"] = np.maximum(rng.normal(0, 1, n), -1)
crsp_monthly_pseudo["ret_excess"] = (
crsp_monthly_pseudo["ret"] - rng.uniform(0, 0.0025, n)
)
crsp_monthly_pseudo["shrout"] = rng.uniform(1, 50, n) * 1000
crsp_monthly_pseudo["altprc"] = rng.uniform(0, 1000, n)
crsp_monthly_pseudo["mktcap"] = (
crsp_monthly_pseudo["shrout"] * crsp_monthly_pseudo["altprc"]
)
crsp_monthly_pseudo = crsp_monthly_pseudo.sort_values(["permno", "date"])
crsp_monthly_pseudo["mktcap_lag"] = (
crsp_monthly_pseudo.groupby("permno")["mktcap"].shift(1)
)
crsp_monthly_pseudo.to_parquet("data-python/crsp_monthly.parquet")
crsp_daily table
The daily table just adds daily excess returns to the daily panel.
crsp_daily_pseudo <- stock_panel_daily |>
mutate(
ret_excess = pmax(rnorm(n()), -1)
)
write_parquet(crsp_daily_pseudo, "data-r/crsp_daily.parquet")
n = len(stock_panel_daily)
crsp_daily_pseudo = stock_panel_daily.copy()
crsp_daily_pseudo["ret_excess"] = np.maximum(rng.normal(0, 1, n), -1)
crsp_daily_pseudo.to_parquet("data-python/crsp_daily.parquet")