Simulation tutorial
This tutorial walks through examples/simu_test_100n.py. It tunes global
hyperparameters on a synthetic dataset before orchestrating multi-window runs
with genboostgpu.orchestration.run_windows_with_dask().
Setup
Place the simulated PLINK/phenotype assets in
inputs/simulated-data/_mas expected by the script.Export
NUM_SAMPLESto select the cohort size, e.g.NUM_SAMPLES=100.Activate the GPU environment from installation and ensure seeds are set globally (the script also seeds
random,numpy, andcupyto42).
Key stages
Load genotypes with
genboostgpu.data_io.load_genotypes()and phenotypes from cuDF tables.Select representative windows via
genboostgpu.tuning.select_tuning_windows().Run
genboostgpu.tuning.global_tune_params()to derive ElasticNet targets that map tofixed_alpha/fixed_l1_ratio.Launch
genboostgpu.orchestration.run_windows_with_dask()with the tuned parameters, saving Parquet summaries inresults/.
Full script
1import os
2import random
3
4import cupy as cp
5import numpy as np
6import pandas as pd
7import session_info
8from pathlib import Path
9from pyhere import here
10
11random.seed(42)
12np.random.seed(42)
13cp.random.seed(42)
14
15from genboostgpu.vmr_runner import run_single_window
16from genboostgpu.hyperparams import enet_from_targets
17from genboostgpu.orchestration import run_windows_with_dask
18from genboostgpu.snp_processing import count_snps_in_window
19from genboostgpu.data_io import load_genotypes, load_phenotypes
20from genboostgpu.tuning import select_tuning_windows, global_tune_params
21
22def construct_data_path(num_samples, dtype="phen"):
23 base_dir = Path(here("inputs/simulated-data/_m")) / f"sim_{num_samples}_indiv"
24 if dtype.lower() == "plink":
25 return base_dir / "plink_sim" / "simulated.bed"
26 elif dtype.lower() == "phen":
27 return base_dir / "simulated.phen"
28 else:
29 raise ValueError(f"Unknown dtype: {dtype}")
30
31
32def get_pheno_loc(num_samples):
33 base_dir = Path(here("inputs/simulated-data/_m")) / f"sim_{num_samples}_indiv"
34 mapping_file = os.path.join(base_dir, "snp_phenotype_mapping.tsv")
35
36 if not os.path.exists(mapping_file):
37 raise FileNotFoundError(f"Mapping file not found: {mapping_file}")
38
39 mapped_df = pd.read_csv(mapping_file, sep="\t", header=0)
40
41 return mapped_df
42
43
44def build_windows(num_samples):
45 # Load phenotypes
46 pheno_path = construct_data_path(num_samples, "phen")
47
48 # Build windows config list
49 pheno_loc_df = get_pheno_loc(num_samples)
50 windows = []
51 for _, row in pheno_loc_df[0:10].iterrows(): ## Test the first 10
52 windows.append({
53 "chrom": row["chrom"],
54 "start": row["start"],
55 "end": row["end"],
56 "pheno_id": row["phenotype_id"],
57 "pheno_path": pheno_path,
58 })
59 return windows
60
61
62def tune_windows(windows, geno_arr, bim, fam):
63 tw = select_tuning_windows(
64 windows, bim, frac=0.05, n_min=60, n_max=300, window_size=500_000,
65 use_window=False, n_bins=3, per_chrom_min=1, seed=13
66 )
67 # Tune alpha scale only
68 best = global_tune_params(
69 tuning_windows=tw,
70 geno_arr=geno_arr, bim=bim, fam=fam,
71 window_size=500_000, by_hand=False, use_window=False,
72 grid={
73 "c_lambda": [0.5, 0.7, 1.0, 1.4, 2.0],
74 "c_ridge": [1.0], # keep EN balance fixed here
75 "subsample_frac": [0.7], # keep fixed for now
76 "batch_size": [4096], # keep fixed for now
77 },
78 early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
79 batch_size=4096
80 )
81 print("Best alpha scale:", best)
82
83 return best
84
85
86def fixed_params_for_window(w, bim, N, c_lambda=None, c_ridge=None):
87 ## Needs bim move M
88 M = count_snps_in_window(
89 bim, w["chrom"], w["start"], w.get("end", w["start"]),
90 window_size=500_000, use_window=True
91 )
92 # Convert global (c_lambda, c_ridge) -> window-specific (alpha, l1_ratio)
93 alpha, l1r = enet_from_targets(M, N, c_lambda=c_lambda, c_ridge=c_ridge)
94 return alpha, l1r
95
96
97def make_fixed_fn(*, bim, N, c_lambda, c_ridge, window_size=500_000, use_window=True):
98 def _fn(w):
99 M = int(w.get("M_raw") or count_snps_in_window(
100 bim, w["chrom"], w["start"], w.get("end", w["start"]),
101 window_size=window_size, use_window=use_window
102 ))
103 if M <= 1:
104 return {} # skip fixing if no SNPs
105 alpha, l1r = enet_from_targets(M, N, c_lambda=c_lambda, c_ridge=c_ridge)
106 return {"fixed_alpha": alpha, "fixed_l1_ratio": l1r}
107 return _fn
108
109
110def main(SINGLE=False):
111 num_samples = os.environ.get("NUM_SAMPLES")
112
113 if not num_samples:
114 raise ValueError("NUM_SAMPLES environment variable must be set")
115
116 # Load genotypes
117 geno_path = construct_data_path(num_samples, "plink")
118 geno_arr, bim, fam = load_genotypes(str(geno_path))
119 N = len(fam) if fam is not None else int(getattr(geno_arr, "shape", [0])[0])
120
121 # Build and tune windows
122 windows = build_windows(num_samples)
123 best = tune_windows(windows, geno_arr, bim, fam)
124
125 if SINGLE:
126 # Test run_single_window
127 results = []
128 for w in windows:
129 alpha, l1r = fixed_params_for_window(w, bim, N,
130 c_lambda=best["c_lambda"],
131 c_ridge=best["c_ridge"])
132 r = run_single_window(
133 chrom=w["chrom"], start=w["start"], end=w.get("end", w["start"]),
134 geno_arr=geno_arr, bim=bim, fam=fam,
135 pheno_path=w["pheno_path"], pheno_id=w["pheno_id"],
136 window_size=500_000, use_window=True,
137 # fixed hyperparams -> skip per-window tuning
138 fixed_alpha=alpha, fixed_l1_ratio=l1r,
139 fixed_subsample=best["subsample_frac"],
140 batch_size=best["batch_size"], n_trials=1, n_iter=100,
141 early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
142 save_full=True
143 )
144 if r is not None:
145 results.append(r)
146 df = pd.DataFrame([r for r in results if r is not None])
147 else:
148 # Run with dask orchestration
149 fixed_fn = make_fixed_fn(
150 bim=bim, N=N, c_lambda=best["c_lambda"],
151 c_ridge=best["c_ridge"], window_size=500_000, use_window=False
152 )
153
154 df = run_windows_with_dask(
155 windows, geno_arr=geno_arr, bim=bim, fam=fam,
156 outdir="results", batch_size=best["batch_size"],
157 window_size=500_000, use_window=False,
158 fixed_params=fixed_fn, fixed_subsample=best["subsample_frac"],
159 early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
160 working_set={"K": 1024, "refresh": 10},
161 prefix="simu_100"
162 )
163
164 print(f"Completed {len(df)} phenotypes")
165 print(df.head())
166
167 # Session information
168 session_info.show()
169
170if __name__ == "__main__":
171 main()
Next steps
Inspect the output Parquet files to confirm the tuned hyperparameters are being reused.
Modify
working_setorearly_stopin the script to explore runtime trade-offs.Compare the per-window metrics with tutorials/vmr_caudate to see how real VMR data differs from simulations.