Stock Pseudo Data

Jun He May 31, 2026

The core of the data is the cross-section of stocks. We generate 100 fictional stocks with unique permno and gvkey identifiers and associated exchcd, exchange, industry, and siccd values. The industry and exchange shares roughly match the real crsp_monthly distribution, but the identifiers are made up.

Stock identifiers

number_of_stocks <- 100

industries <- tibble(
  industry = c(
    "Agriculture", "Construction", "Finance", "Manufacturing", "Mining",
    "Public", "Retail", "Services", "Transportation", "Utilities", "Wholesale"
  ),
  prob = c(
    0.00319, 0.0113, 0.185, 0.339, 0.0508,
    0.0779, 0.0620, 0.169, 0.0493, 0.0180, 0.0357
  )
)

exchanges <- tibble(
  exchange = c("AMEX", "NASDAQ", "NYSE"),
  prob = c(0.113, 0.671, 0.216)
)

stock_identifiers <- 1:number_of_stocks |>
  map_df(function(x) {
    tibble(
      permno = x,
      gvkey = as.character(x + 10000),
      exchange = sample(exchanges$exchange, 1, prob = exchanges$prob),
      industry = sample(industries$industry, 1, prob = industries$prob)
    ) |>
      mutate(
        exchcd = case_when(
          exchange == "NYSE" ~ sample(c(1, 31), n()),
          exchange == "AMEX" ~ sample(c(2, 32), n()),
          exchange == "NASDAQ" ~ sample(c(3, 33), n())
        ),
        siccd = case_when(
          industry == "Agriculture" ~ sample(1:999, n()),
          industry == "Mining" ~ sample(1000:1499, n()),
          industry == "Construction" ~ sample(1500:1799, n()),
          industry == "Manufacturing" ~ sample(1800:3999, n()),
          industry == "Transportation" ~ sample(4000:4899, n()),
          industry == "Utilities" ~ sample(4900:4999, n()),
          industry == "Wholesale" ~ sample(5000:5199, n()),
          industry == "Retail" ~ sample(5200:5999, n()),
          industry == "Finance" ~ sample(6000:6799, n()),
          industry == "Services" ~ sample(7000:8999, n()),
          industry == "Public" ~ sample(9000:9999, n())
        )
      )
  })

number_of_stocks = 100

industries = pd.DataFrame({
    "industry": [
        "Agriculture", "Construction", "Finance", "Manufacturing", "Mining",
        "Public", "Retail", "Services", "Transportation", "Utilities", "Wholesale",
    ],
    "prob": [
        0.00319, 0.0113, 0.185, 0.339, 0.0508,
        0.0779, 0.0620, 0.169, 0.0493, 0.0180, 0.0357,
    ],
})

exchanges = pd.DataFrame({
    "exchange": ["AMEX", "NASDAQ", "NYSE"],
    "prob": [0.113, 0.671, 0.216],
})

exchcd_map = {"NYSE": [1, 31], "AMEX": [2, 32], "NASDAQ": [3, 33]}
sic_ranges = {
    "Agriculture": (1, 999), "Mining": (1000, 1499), "Construction": (1500, 1799),
    "Manufacturing": (1800, 3999), "Transportation": (4000, 4899),
    "Utilities": (4900, 4999), "Wholesale": (5000, 5199), "Retail": (5200, 5999),
    "Finance": (6000, 6799), "Services": (7000, 8999), "Public": (9000, 9999),
}

rows = []
for x in range(1, number_of_stocks + 1):
    exchange = rng.choice(exchanges["exchange"],
                          p=exchanges["prob"] / exchanges["prob"].sum())
    industry = rng.choice(industries["industry"],
                          p=industries["prob"] / industries["prob"].sum())
    lo, hi = sic_ranges[industry]
    rows.append({
        "permno": x,
        "gvkey": str(x + 10000),
        "exchange": exchange,
        "industry": industry,
        "exchcd": rng.choice(exchcd_map[exchange]),
        "siccd": rng.integers(lo, hi + 1),
    })

stock_identifiers = pd.DataFrame(rows)

The three panels

We expand the cross-section into yearly, monthly, and daily panels.

stock_panel_yearly <- expand_grid(
  stock_identifiers,
  tibble(year = time_series_years)
) |>
  select(gvkey, year)

stock_panel_monthly <- expand_grid(
  stock_identifiers,
  tibble(date = time_series_months)
) |>
  select(permno, gvkey, date, siccd, industry, exchcd, exchange)

stock_panel_daily <- expand_grid(
  stock_identifiers,
  tibble(date = time_series_days)
) |>
  select(permno, date)

stock_panel_yearly = (
    stock_identifiers.merge(pd.DataFrame({"year": time_series_years}), how="cross")
    [["gvkey", "year"]]
)

stock_panel_monthly = (
    stock_identifiers.merge(pd.DataFrame({"date": time_series_months}), how="cross")
    [["permno", "gvkey", "date", "siccd", "industry", "exchcd", "exchange"]]
)

stock_panel_daily = (
    stock_identifiers.merge(pd.DataFrame({"date": time_series_days}), how="cross")
    [["permno", "date"]]
)

`beta` table

Monthly betas are drawn from a normal with mean and standard deviation 1; daily betas add a small noise term.

beta_pseudo <- stock_panel_monthly |>
  mutate(
    beta_monthly = rnorm(n(), mean = 1, sd = 1),
    beta_daily = beta_monthly + rnorm(n()) / 100
  )

write_parquet(beta_pseudo, "data-r/beta.parquet")

n = len(stock_panel_monthly)
beta_pseudo = stock_panel_monthly.copy()
beta_pseudo["beta_monthly"] = rng.normal(1, 1, n)
beta_pseudo["beta_daily"] = beta_pseudo["beta_monthly"] + rng.normal(0, 1, n) / 100

beta_pseudo.to_parquet("data-python/beta.parquet")

`compustat_annual` table

Every Compustat column is filled with uniform random numbers in [0, 1]. For simplicity, datadate is set to the last day of each year.

relevant_columns <- c(
  "seq", "ceq", "at", "lt", "txditc", "txdb", "itcb", "pstkrv", "pstkl",
  "pstk", "capx", "oancf", "sale", "cogs", "xint", "xsga", "be", "op",
  "at_lag", "inv"
)

commands <- unlist(
  map(relevant_columns, ~ rlang::exprs(!!..1 := runif(n())))
)

compustat_pseudo <- stock_panel_yearly |>
  mutate(
    datadate = ymd(str_c(year, "12", "31")),
    !!!commands
  )

write_parquet(compustat_pseudo, "data-r/compustat_annual.parquet")

relevant_columns = [
    "seq", "ceq", "at", "lt", "txditc", "txdb", "itcb", "pstkrv", "pstkl",
    "pstk", "capx", "oancf", "sale", "cogs", "xint", "xsga", "be", "op",
    "at_lag", "inv",
]

compustat_pseudo = stock_panel_yearly.copy()
compustat_pseudo["datadate"] = pd.to_datetime(
    compustat_pseudo["year"].astype(str) + "-12-31"
)
n = len(compustat_pseudo)
for col in relevant_columns:
    compustat_pseudo[col] = rng.uniform(0, 1, n)

compustat_pseudo.to_parquet("data-python/compustat_annual.parquet")

`crsp_monthly` table

Beyond the monthly panel, CRSP adds returns, excess returns, shares outstanding, the last price, and market cap (with a lagged copy).

crsp_monthly_pseudo <- stock_panel_monthly |>
  mutate(
    ret = pmax(rnorm(n()), -1),
    ret_excess = ret - runif(n(), 0, 0.0025),
    shrout = runif(n(), 1, 50) * 1000,
    altprc = runif(n(), 0, 1000),
    mktcap = shrout * altprc
  ) |>
  group_by(permno) |>
  arrange(date) |>
  mutate(mktcap_lag = lag(mktcap)) |>
  ungroup()

write_parquet(crsp_monthly_pseudo, "data-r/crsp_monthly.parquet")

n = len(stock_panel_monthly)
crsp_monthly_pseudo = stock_panel_monthly.copy()
crsp_monthly_pseudo["ret"] = np.maximum(rng.normal(0, 1, n), -1)
crsp_monthly_pseudo["ret_excess"] = (
    crsp_monthly_pseudo["ret"] - rng.uniform(0, 0.0025, n)
)
crsp_monthly_pseudo["shrout"] = rng.uniform(1, 50, n) * 1000
crsp_monthly_pseudo["altprc"] = rng.uniform(0, 1000, n)
crsp_monthly_pseudo["mktcap"] = (
    crsp_monthly_pseudo["shrout"] * crsp_monthly_pseudo["altprc"]
)
crsp_monthly_pseudo = crsp_monthly_pseudo.sort_values(["permno", "date"])
crsp_monthly_pseudo["mktcap_lag"] = (
    crsp_monthly_pseudo.groupby("permno")["mktcap"].shift(1)
)

crsp_monthly_pseudo.to_parquet("data-python/crsp_monthly.parquet")

`crsp_daily` table

The daily table just adds daily excess returns to the daily panel.

crsp_daily_pseudo <- stock_panel_daily |>
  mutate(
    ret_excess = pmax(rnorm(n()), -1)
  )

write_parquet(crsp_daily_pseudo, "data-r/crsp_daily.parquet")

n = len(stock_panel_daily)
crsp_daily_pseudo = stock_panel_daily.copy()
crsp_daily_pseudo["ret_excess"] = np.maximum(rng.normal(0, 1, n), -1)

crsp_daily_pseudo.to_parquet("data-python/crsp_daily.parquet")

Stock Pseudo Data

Stock identifiers

The three panels

beta table

compustat_annual table

crsp_monthly table

crsp_daily table

`beta` table

`compustat_annual` table

`crsp_monthly` table

`crsp_daily` table