Simulation tutorial

This tutorial walks through examples/simu_test_100n.py. It tunes global hyperparameters on a synthetic dataset before orchestrating multi-window runs with genboostgpu.orchestration.run_windows_with_dask().

Setup

  1. Place the simulated PLINK/phenotype assets in inputs/simulated-data/_m as expected by the script.

  2. Export NUM_SAMPLES to select the cohort size, e.g. NUM_SAMPLES=100.

  3. Activate the GPU environment from installation and ensure seeds are set globally (the script also seeds random, numpy, and cupy to 42).

Key stages

  • Load genotypes with genboostgpu.data_io.load_genotypes() and phenotypes from cuDF tables.

  • Select representative windows via genboostgpu.tuning.select_tuning_windows().

  • Run genboostgpu.tuning.global_tune_params() to derive ElasticNet targets that map to fixed_alpha/fixed_l1_ratio.

  • Launch genboostgpu.orchestration.run_windows_with_dask() with the tuned parameters, saving Parquet summaries in results/.

Full script

  1import os
  2import random
  3
  4import cupy as cp
  5import numpy as np
  6import pandas as pd
  7import session_info
  8from pathlib import Path
  9from pyhere import here
 10
 11random.seed(42)
 12np.random.seed(42)
 13cp.random.seed(42)
 14
 15from genboostgpu.vmr_runner import run_single_window
 16from genboostgpu.hyperparams import enet_from_targets
 17from genboostgpu.orchestration import run_windows_with_dask
 18from genboostgpu.snp_processing import count_snps_in_window
 19from genboostgpu.data_io import load_genotypes, load_phenotypes
 20from genboostgpu.tuning import select_tuning_windows, global_tune_params
 21
 22def construct_data_path(num_samples, dtype="phen"):
 23    base_dir = Path(here("inputs/simulated-data/_m")) / f"sim_{num_samples}_indiv"
 24    if dtype.lower() == "plink":
 25        return base_dir / "plink_sim" / "simulated.bed"
 26    elif dtype.lower() == "phen":
 27        return base_dir / "simulated.phen"
 28    else:
 29        raise ValueError(f"Unknown dtype: {dtype}")
 30
 31
 32def get_pheno_loc(num_samples):
 33    base_dir = Path(here("inputs/simulated-data/_m")) / f"sim_{num_samples}_indiv"
 34    mapping_file = os.path.join(base_dir, "snp_phenotype_mapping.tsv")
 35
 36    if not os.path.exists(mapping_file):
 37        raise FileNotFoundError(f"Mapping file not found: {mapping_file}")
 38
 39    mapped_df = pd.read_csv(mapping_file, sep="\t", header=0)
 40
 41    return mapped_df
 42
 43
 44def build_windows(num_samples):
 45    # Load phenotypes
 46    pheno_path = construct_data_path(num_samples, "phen")
 47
 48    # Build windows config list
 49    pheno_loc_df = get_pheno_loc(num_samples)
 50    windows = []
 51    for _, row in pheno_loc_df[0:10].iterrows(): ## Test the first 10
 52        windows.append({
 53            "chrom": row["chrom"],
 54            "start": row["start"],
 55            "end": row["end"],
 56            "pheno_id": row["phenotype_id"],
 57            "pheno_path": pheno_path,
 58        })
 59    return windows
 60
 61
 62def tune_windows(windows, geno_arr, bim, fam):
 63    tw = select_tuning_windows(
 64        windows, bim, frac=0.05, n_min=60, n_max=300, window_size=500_000,
 65        use_window=False, n_bins=3, per_chrom_min=1, seed=13
 66    )
 67    # Tune alpha scale only
 68    best = global_tune_params(
 69        tuning_windows=tw,
 70        geno_arr=geno_arr, bim=bim, fam=fam,
 71        window_size=500_000, by_hand=False, use_window=False,
 72        grid={
 73            "c_lambda":       [0.5, 0.7, 1.0, 1.4, 2.0],
 74            "c_ridge":        [1.0],      # keep EN balance fixed here
 75            "subsample_frac": [0.7],      # keep fixed for now
 76            "batch_size":     [4096],     # keep fixed for now
 77        },
 78        early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
 79        batch_size=4096
 80    )
 81    print("Best alpha scale:", best)
 82
 83    return best
 84
 85
 86def fixed_params_for_window(w, bim, N, c_lambda=None, c_ridge=None):
 87    ## Needs bim move M
 88    M = count_snps_in_window(
 89        bim, w["chrom"], w["start"], w.get("end", w["start"]),
 90        window_size=500_000, use_window=True
 91    )
 92    # Convert global (c_lambda, c_ridge) -> window-specific (alpha, l1_ratio)
 93    alpha, l1r = enet_from_targets(M, N, c_lambda=c_lambda, c_ridge=c_ridge)
 94    return alpha, l1r
 95
 96
 97def make_fixed_fn(*, bim, N, c_lambda, c_ridge, window_size=500_000, use_window=True):
 98    def _fn(w):
 99        M = int(w.get("M_raw") or count_snps_in_window(
100            bim, w["chrom"], w["start"], w.get("end", w["start"]),
101            window_size=window_size, use_window=use_window
102        ))
103        if M <= 1:
104            return {}  # skip fixing if no SNPs
105        alpha, l1r = enet_from_targets(M, N, c_lambda=c_lambda, c_ridge=c_ridge)
106        return {"fixed_alpha": alpha, "fixed_l1_ratio": l1r}
107    return _fn
108
109
110def main(SINGLE=False):
111    num_samples = os.environ.get("NUM_SAMPLES")
112
113    if not num_samples:
114        raise ValueError("NUM_SAMPLES environment variable must be set")
115
116    # Load genotypes
117    geno_path = construct_data_path(num_samples, "plink")
118    geno_arr, bim, fam = load_genotypes(str(geno_path))
119    N = len(fam) if fam is not None else int(getattr(geno_arr, "shape", [0])[0])
120    
121    # Build and tune windows
122    windows = build_windows(num_samples)
123    best = tune_windows(windows, geno_arr, bim, fam)
124
125    if SINGLE:
126        # Test run_single_window
127        results = []
128        for w in windows:
129            alpha, l1r = fixed_params_for_window(w, bim, N,
130                                                 c_lambda=best["c_lambda"],
131                                                 c_ridge=best["c_ridge"])
132            r = run_single_window(
133                chrom=w["chrom"], start=w["start"], end=w.get("end", w["start"]),
134                geno_arr=geno_arr, bim=bim, fam=fam,
135                pheno_path=w["pheno_path"], pheno_id=w["pheno_id"],
136                window_size=500_000, use_window=True,
137                # fixed hyperparams -> skip per-window tuning
138                fixed_alpha=alpha, fixed_l1_ratio=l1r,
139                fixed_subsample=best["subsample_frac"],
140                batch_size=best["batch_size"], n_trials=1, n_iter=100,
141                early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
142                save_full=True
143            )
144            if r is not None:
145                results.append(r)
146        df = pd.DataFrame([r for r in results if r is not None])
147    else:
148        # Run with dask orchestration
149        fixed_fn = make_fixed_fn(
150        bim=bim, N=N, c_lambda=best["c_lambda"],
151        c_ridge=best["c_ridge"], window_size=500_000, use_window=False
152        )
153
154        df = run_windows_with_dask(
155            windows, geno_arr=geno_arr, bim=bim, fam=fam,
156            outdir="results", batch_size=best["batch_size"],
157            window_size=500_000, use_window=False,
158            fixed_params=fixed_fn, fixed_subsample=best["subsample_frac"],
159            early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
160            working_set={"K": 1024, "refresh": 10},
161            prefix="simu_100"
162        )
163
164    print(f"Completed {len(df)} phenotypes")
165    print(df.head())
166
167    # Session information
168    session_info.show()
169
170if __name__ == "__main__":
171    main()

Next steps

  • Inspect the output Parquet files to confirm the tuned hyperparameters are being reused.

  • Modify working_set or early_stop in the script to explore runtime trade-offs.

  • Compare the per-window metrics with tutorials/vmr_caudate to see how real VMR data differs from simulations.