Stock Pseudo Data

The core of the data is the cross-section of stocks. We generate 100 fictional stocks with unique permno and gvkey identifiers and associated exchcd, exchange, industry, and siccd values. The industry and exchange shares roughly match the real crsp_monthly distribution, but the identifiers are made up.

Stock identifiers

number_of_stocks <- 100

industries <- tibble(
  industry = c(
    "Agriculture", "Construction", "Finance", "Manufacturing", "Mining",
    "Public", "Retail", "Services", "Transportation", "Utilities", "Wholesale"
  ),
  prob = c(
    0.00319, 0.0113, 0.185, 0.339, 0.0508,
    0.0779, 0.0620, 0.169, 0.0493, 0.0180, 0.0357
  )
)

exchanges <- tibble(
  exchange = c("AMEX", "NASDAQ", "NYSE"),
  prob = c(0.113, 0.671, 0.216)
)

stock_identifiers <- 1:number_of_stocks |>
  map_df(function(x) {
    tibble(
      permno = x,
      gvkey = as.character(x + 10000),
      exchange = sample(exchanges$exchange, 1, prob = exchanges$prob),
      industry = sample(industries$industry, 1, prob = industries$prob)
    ) |>
      mutate(
        exchcd = case_when(
          exchange == "NYSE" ~ sample(c(1, 31), n()),
          exchange == "AMEX" ~ sample(c(2, 32), n()),
          exchange == "NASDAQ" ~ sample(c(3, 33), n())
        ),
        siccd = case_when(
          industry == "Agriculture" ~ sample(1:999, n()),
          industry == "Mining" ~ sample(1000:1499, n()),
          industry == "Construction" ~ sample(1500:1799, n()),
          industry == "Manufacturing" ~ sample(1800:3999, n()),
          industry == "Transportation" ~ sample(4000:4899, n()),
          industry == "Utilities" ~ sample(4900:4999, n()),
          industry == "Wholesale" ~ sample(5000:5199, n()),
          industry == "Retail" ~ sample(5200:5999, n()),
          industry == "Finance" ~ sample(6000:6799, n()),
          industry == "Services" ~ sample(7000:8999, n()),
          industry == "Public" ~ sample(9000:9999, n())
        )
      )
  })
number_of_stocks = 100

industries = pd.DataFrame({
    "industry": [
        "Agriculture", "Construction", "Finance", "Manufacturing", "Mining",
        "Public", "Retail", "Services", "Transportation", "Utilities", "Wholesale",
    ],
    "prob": [
        0.00319, 0.0113, 0.185, 0.339, 0.0508,
        0.0779, 0.0620, 0.169, 0.0493, 0.0180, 0.0357,
    ],
})

exchanges = pd.DataFrame({
    "exchange": ["AMEX", "NASDAQ", "NYSE"],
    "prob": [0.113, 0.671, 0.216],
})

exchcd_map = {"NYSE": [1, 31], "AMEX": [2, 32], "NASDAQ": [3, 33]}
sic_ranges = {
    "Agriculture": (1, 999), "Mining": (1000, 1499), "Construction": (1500, 1799),
    "Manufacturing": (1800, 3999), "Transportation": (4000, 4899),
    "Utilities": (4900, 4999), "Wholesale": (5000, 5199), "Retail": (5200, 5999),
    "Finance": (6000, 6799), "Services": (7000, 8999), "Public": (9000, 9999),
}

rows = []
for x in range(1, number_of_stocks + 1):
    exchange = rng.choice(exchanges["exchange"],
                          p=exchanges["prob"] / exchanges["prob"].sum())
    industry = rng.choice(industries["industry"],
                          p=industries["prob"] / industries["prob"].sum())
    lo, hi = sic_ranges[industry]
    rows.append({
        "permno": x,
        "gvkey": str(x + 10000),
        "exchange": exchange,
        "industry": industry,
        "exchcd": rng.choice(exchcd_map[exchange]),
        "siccd": rng.integers(lo, hi + 1),
    })

stock_identifiers = pd.DataFrame(rows)

The three panels

We expand the cross-section into yearly, monthly, and daily panels.

stock_panel_yearly <- expand_grid(
  stock_identifiers,
  tibble(year = time_series_years)
) |>
  select(gvkey, year)

stock_panel_monthly <- expand_grid(
  stock_identifiers,
  tibble(date = time_series_months)
) |>
  select(permno, gvkey, date, siccd, industry, exchcd, exchange)

stock_panel_daily <- expand_grid(
  stock_identifiers,
  tibble(date = time_series_days)
) |>
  select(permno, date)
stock_panel_yearly = (
    stock_identifiers.merge(pd.DataFrame({"year": time_series_years}), how="cross")
    [["gvkey", "year"]]
)

stock_panel_monthly = (
    stock_identifiers.merge(pd.DataFrame({"date": time_series_months}), how="cross")
    [["permno", "gvkey", "date", "siccd", "industry", "exchcd", "exchange"]]
)

stock_panel_daily = (
    stock_identifiers.merge(pd.DataFrame({"date": time_series_days}), how="cross")
    [["permno", "date"]]
)

beta table

Monthly betas are drawn from a normal with mean and standard deviation 1; daily betas add a small noise term.

beta_pseudo <- stock_panel_monthly |>
  mutate(
    beta_monthly = rnorm(n(), mean = 1, sd = 1),
    beta_daily = beta_monthly + rnorm(n()) / 100
  )

write_parquet(beta_pseudo, "data-r/beta.parquet")
n = len(stock_panel_monthly)
beta_pseudo = stock_panel_monthly.copy()
beta_pseudo["beta_monthly"] = rng.normal(1, 1, n)
beta_pseudo["beta_daily"] = beta_pseudo["beta_monthly"] + rng.normal(0, 1, n) / 100

beta_pseudo.to_parquet("data-python/beta.parquet")

compustat_annual table

Every Compustat column is filled with uniform random numbers in [0, 1]. For simplicity, datadate is set to the last day of each year.

relevant_columns <- c(
  "seq", "ceq", "at", "lt", "txditc", "txdb", "itcb", "pstkrv", "pstkl",
  "pstk", "capx", "oancf", "sale", "cogs", "xint", "xsga", "be", "op",
  "at_lag", "inv"
)

commands <- unlist(
  map(relevant_columns, ~ rlang::exprs(!!..1 := runif(n())))
)

compustat_pseudo <- stock_panel_yearly |>
  mutate(
    datadate = ymd(str_c(year, "12", "31")),
    !!!commands
  )

write_parquet(compustat_pseudo, "data-r/compustat_annual.parquet")
relevant_columns = [
    "seq", "ceq", "at", "lt", "txditc", "txdb", "itcb", "pstkrv", "pstkl",
    "pstk", "capx", "oancf", "sale", "cogs", "xint", "xsga", "be", "op",
    "at_lag", "inv",
]

compustat_pseudo = stock_panel_yearly.copy()
compustat_pseudo["datadate"] = pd.to_datetime(
    compustat_pseudo["year"].astype(str) + "-12-31"
)
n = len(compustat_pseudo)
for col in relevant_columns:
    compustat_pseudo[col] = rng.uniform(0, 1, n)

compustat_pseudo.to_parquet("data-python/compustat_annual.parquet")

crsp_monthly table

Beyond the monthly panel, CRSP adds returns, excess returns, shares outstanding, the last price, and market cap (with a lagged copy).

crsp_monthly_pseudo <- stock_panel_monthly |>
  mutate(
    ret = pmax(rnorm(n()), -1),
    ret_excess = ret - runif(n(), 0, 0.0025),
    shrout = runif(n(), 1, 50) * 1000,
    altprc = runif(n(), 0, 1000),
    mktcap = shrout * altprc
  ) |>
  group_by(permno) |>
  arrange(date) |>
  mutate(mktcap_lag = lag(mktcap)) |>
  ungroup()

write_parquet(crsp_monthly_pseudo, "data-r/crsp_monthly.parquet")
n = len(stock_panel_monthly)
crsp_monthly_pseudo = stock_panel_monthly.copy()
crsp_monthly_pseudo["ret"] = np.maximum(rng.normal(0, 1, n), -1)
crsp_monthly_pseudo["ret_excess"] = (
    crsp_monthly_pseudo["ret"] - rng.uniform(0, 0.0025, n)
)
crsp_monthly_pseudo["shrout"] = rng.uniform(1, 50, n) * 1000
crsp_monthly_pseudo["altprc"] = rng.uniform(0, 1000, n)
crsp_monthly_pseudo["mktcap"] = (
    crsp_monthly_pseudo["shrout"] * crsp_monthly_pseudo["altprc"]
)
crsp_monthly_pseudo = crsp_monthly_pseudo.sort_values(["permno", "date"])
crsp_monthly_pseudo["mktcap_lag"] = (
    crsp_monthly_pseudo.groupby("permno")["mktcap"].shift(1)
)

crsp_monthly_pseudo.to_parquet("data-python/crsp_monthly.parquet")

crsp_daily table

The daily table just adds daily excess returns to the daily panel.

crsp_daily_pseudo <- stock_panel_daily |>
  mutate(
    ret_excess = pmax(rnorm(n()), -1)
  )

write_parquet(crsp_daily_pseudo, "data-r/crsp_daily.parquet")
n = len(stock_panel_daily)
crsp_daily_pseudo = stock_panel_daily.copy()
crsp_daily_pseudo["ret_excess"] = np.maximum(rng.normal(0, 1, n), -1)

crsp_daily_pseudo.to_parquet("data-python/crsp_daily.parquet")