Bond Pseudo Data

Finally, the bond data: a Mergent FISD-style cross-section and a TRACE-style transaction table built on top of it.

fisd table

We sample 100 bonds with made-up CUSIPs and issue/issuer IDs, drawing the remaining fields from plausible ranges.

# Size of the fictional bond cross-section.
number_of_bonds <- 100

# Start from one random 12-character, upper-case identifier per bond (drawn
# from the letters a-z and the digits 0-9), then attach the remaining fields.
fisd_pseudo <- tibble(
  complete_cusip = vapply(
    seq_len(number_of_bonds),
    \(i) {
      drawn <- sample(c(letters, 0:9), 12, replace = TRUE)
      str_to_upper(str_c(drawn, collapse = ""))
    },
    character(1)
  )
) |>
  mutate(
    # Maturity is drawn from `time_series_days`, which is defined elsewhere.
    maturity = sample(time_series_days, n(), replace = TRUE),
    # Offering amounts are multiples of 100,000 between 100,000 and 10,000,000.
    offering_amt = sample((1:100) * 100000, n(), replace = TRUE),
    # The offering lies a whole number (1 to 25) of 365-day years before
    # maturity; the dated date is within ten days of the offering date.
    offering_date = maturity - sample((1:25) * 365, n(), replace = TRUE),
    dated_date = offering_date - sample(-10:10, n(), replace = TRUE),
    interest_frequency = sample(c(0, 1, 2, 4, 12), n(), replace = TRUE),
    coupon = sample(seq(0, 2, by = 0.1), n(), replace = TRUE),
    # Latest of the three dates generated above.
    last_interest_date = pmax(maturity, offering_date, dated_date),
    # Running issue number, random issuer number, and a four-digit SIC code
    # (1000, 2000, ..., 9000) stored as text.
    issue_id = seq_len(n()),
    issuer_id = sample(1:250, n(), replace = TRUE),
    sic_code = as.character(sample((1:9) * 1000, n(), replace = TRUE))
  )

write_parquet(fisd_pseudo, "data-r/fisd.parquet")
# Size of the fictional bond cross-section.
number_of_bonds = 100
n = number_of_bonds

# Characters an identifier may contain: a-z followed by the digits 0-9.
cusip_chars = list(string.ascii_lowercase) + [str(d) for d in range(10)]

# One random 12-character, upper-case identifier per bond. The generator is
# consumed immediately, so these draws still come first in the random stream.
cusip_draws = (rng.choice(cusip_chars, 12) for _ in range(number_of_bonds))
fisd_pseudo = pd.DataFrame(
    {"complete_cusip": ["".join(draw).upper() for draw in cusip_draws]}
)

# Maturity is drawn from `time_series_days`, which is defined elsewhere.
fisd_pseudo["maturity"] = rng.choice(time_series_days, n)
# Offering amounts are multiples of 100,000 between 100,000 and 10,000,000.
fisd_pseudo["offering_amt"] = rng.choice(np.arange(1, 101) * 100000, n)

# The offering lies a whole number (1 to 25) of 365-day years before maturity.
time_to_maturity = pd.to_timedelta(
    rng.choice(np.arange(1, 26) * 365, n), unit="D"
)
fisd_pseudo["offering_date"] = fisd_pseudo["maturity"] - time_to_maturity

# The dated date is within ten days of the offering date.
dated_shift = pd.to_timedelta(rng.integers(-10, 11, n), unit="D")
fisd_pseudo["dated_date"] = fisd_pseudo["offering_date"] - dated_shift

fisd_pseudo["interest_frequency"] = rng.choice([0, 1, 2, 4, 12], n)
fisd_pseudo["coupon"] = rng.choice(np.round(np.arange(0, 2.01, 0.1), 1), n)

# Row-wise latest of the three dates generated above.
date_columns = ["maturity", "offering_date", "dated_date"]
fisd_pseudo["last_interest_date"] = fisd_pseudo[date_columns].max(axis=1)

# Running issue number, random issuer number, and a four-digit SIC code
# (1000, 2000, ..., 9000) stored as text.
fisd_pseudo["issue_id"] = np.arange(1, n + 1)
fisd_pseudo["issuer_id"] = rng.integers(1, 251, n)
fisd_pseudo["sic_code"] = (rng.choice(np.arange(1, 10) * 1000, n)).astype(str)

fisd_pseudo.to_parquet("data-python/fisd.parquet")

trace_enhanced table

For each fictional CUSIP we build a daily panel over the analysis window and stack it five times, so that every bond has exactly five transactions on each day. We then add a transaction time, price, volume, yield, and side/counterparty codes.

# Analysis window covered by the pseudo transaction table.
start_date <- as.Date("2014-01-01")
end_date <- as.Date("2016-11-30")

# One row per (bond, calendar day) inside the window.
bonds_panel <- expand_grid(
  fisd_pseudo |> select(cusip_id = complete_cusip),
  tibble(trd_exctn_dt = seq(start_date, end_date, "1 day"))
)

# Stack the daily panel five times so that every bond has five transactions
# per day, then fill the transaction fields with random values.
trace_enhanced_pseudo <- bind_rows(
  bonds_panel, bonds_panel, bonds_panel, bonds_panel, bonds_panel
) |>
  mutate(
    # Execution time as "h:m:s". Hours are limited to 0-23 and minutes/seconds
    # to 0-59 so that no invalid clock time (e.g. "24:60:60") is generated.
    trd_exctn_tm = str_c(
      sample(0:23, n(), replace = TRUE), ":",
      sample(0:59, n(), replace = TRUE), ":",
      sample(0:59, n(), replace = TRUE)
    ),
    # Reported price, uniform on [10, 200].
    rptd_pr = runif(n(), 10, 200),
    # Entered volume in multiples of 1,000 between 1,000 and 20,000.
    entrd_vol_qt = sample(1:20, n(), replace = TRUE) * 1000,
    # Yield, uniform on [-10, 10].
    yld_pt = runif(n(), -10, 10),
    # Two-valued side and counterparty codes.
    rpt_side_cd = sample(c("B", "S"), n(), replace = TRUE),
    cntra_mp_id = sample(c("C", "D"), n(), replace = TRUE)
  )

write_parquet(trace_enhanced_pseudo, "data-r/trace_enhanced.parquet")
# Analysis window covered by the pseudo transaction table.
start_date = pd.Timestamp("2014-01-01")
end_date = pd.Timestamp("2016-11-30")
trace_dates = pd.date_range(start_date, end_date, freq="D")

# One row per (bond, calendar day) inside the window.
bonds_panel = (
    fisd_pseudo[["complete_cusip"]]
    .rename(columns={"complete_cusip": "cusip_id"})
    .merge(pd.DataFrame({"trd_exctn_dt": trace_dates}), how="cross")
)

# Stack the daily panel five times so that every bond has five transactions
# per day, then fill the transaction fields with random values.
trace_enhanced_pseudo = pd.concat([bonds_panel] * 5, ignore_index=True)
n = len(trace_enhanced_pseudo)

# Execution time as "h:m:s". `rng.integers` excludes its upper bound, so the
# bounds 24 and 60 give hours 0-23 and minutes/seconds 0-59; no invalid clock
# time (e.g. "24:60:60") can be generated.
trace_enhanced_pseudo["trd_exctn_tm"] = [
    f"{h}:{m}:{s}"
    for h, m, s in zip(
        rng.integers(0, 24, n), rng.integers(0, 60, n), rng.integers(0, 60, n)
    )
]
# Reported price, uniform on [10, 200).
trace_enhanced_pseudo["rptd_pr"] = rng.uniform(10, 200, n)
# Entered volume in multiples of 1,000 between 1,000 and 20,000.
trace_enhanced_pseudo["entrd_vol_qt"] = rng.integers(1, 21, n) * 1000
# Yield, uniform on [-10, 10).
trace_enhanced_pseudo["yld_pt"] = rng.uniform(-10, 10, n)
# Two-valued side and counterparty codes.
trace_enhanced_pseudo["rpt_side_cd"] = rng.choice(["B", "S"], n)
trace_enhanced_pseudo["cntra_mp_id"] = rng.choice(["C", "D"], n)

trace_enhanced_pseudo.to_parquet("data-python/trace_enhanced.parquet")

As stated in the overview, this data contains no samples of the original WRDS data — every column is filled with random numbers.