VMR caudate tutorial
examples/vmr_test_caudate.py demonstrates running GENBoostGPU on variable
methylated regions (VMRs) derived from brain tissue. It mirrors the simulation
tutorial by selecting representative windows, tuning global hyperparameters, and
then dispatching the full workload with genboostgpu.orchestration.run_windows_with_dask().
Setup
Mirror the file layout under
heritability/gcta/<region>/_mexpected by the helper functions.Export
REGION=caudate(or another region present in your dataset).Confirm GPU availability and seed all RNGs (the script seeds
random,numpy, andcupyto42).Ensure the RAPIDS/CuPy versions installed in your environment satisfy the pins published in
pyproject.toml.
Tuning and orchestration
get_error_list()loads blacklisted windows so they can be skipped.build_windows()now reads each window’s PLINK trio, assembles an aggregatebimtable, and records per-windowM_rawcounts so the tuning heuristics can stratify by SNP density.genboostgpu.tuning.select_tuning_windows()draws a 5% sample with at least one window per chromosome. Those candidates feed intogenboostgpu.tuning.global_tune_params(), which evaluates elastic net targets across GPUs using the same search grid described in the simulation tutorial.make_fixed_fnconverts the chosenc_lambda/c_ridgepair into per- windowfixed_alphaandfixed_l1_ratiovalues viagenboostgpu.hyperparams.enet_from_targets(). The tuned subsample fraction and batch size are passed directly togenboostgpu.orchestration.run_windows_with_dask(), which fans the remaining windows across the cluster and writes summaries toresults/with thevmrprefix.
Full script
1import random, os
2from pathlib import Path
3
4import cupy as cp
5import numpy as np
6import pandas as pd
7from pyhere import here
8
9from genboostgpu.hyperparams import enet_from_targets
10from genboostgpu.orchestration import run_windows_with_dask
11from genboostgpu.snp_processing import count_snps_in_window
12from genboostgpu.tuning import global_tune_params, select_tuning_windows
13
14random.seed(42)
15np.random.seed(42)
16cp.random.seed(42)
17
18WINDOW_SIZE = 500_000
19EARLY_STOP = {"patience": 5, "min_delta": 1e-4, "warmup": 5}
20
21def get_error_list(error_file="../_h/snp-error-window.tsv"):
22 error_path = Path(error_file)
23 if error_path.exists():
24 df = pd.read_csv(error_path, sep="\t")
25 df["Chrom"] = df["Chr"].str.replace("chr", "", regex=False).astype(int)
26 return df[["Chrom", "Start", "End"]]
27 print(f"Warning: Error regions file not found: {error_file}")
28 return pd.DataFrame(columns=["Chrom", "Start", "End"])
29
30
31def get_vmr_list(region: str) -> pd.DataFrame:
32 base_dir = Path(here("heritability/gcta")) / region.lower() / "_m"
33 vmr_file = base_dir / "vmr_list.txt"
34 if not vmr_file.exists():
35 raise FileNotFoundError(f"VMR list file not found: {vmr_file}")
36 return pd.read_csv(vmr_file, sep="\t", header=None)
37
38
39def load_bim(geno_path):
40 bim_path = Path(geno_path).with_suffix(".bim")
41 if not bim_path.exists():
42 raise FileNotFoundError(f"Missing BIM file for window: {bim_path}")
43 cols = ["chrom", "snp", "cm", "pos", "a1", "a2"]
44 bim = pd.read_csv(bim_path, sep=r"\s+", header=None, names=cols)
45 bim["chrom"] = bim["chrom"].astype(str)
46 bim["pos"] = bim["pos"].astype(int)
47 return bim[["chrom", "snp", "pos"]]
48
49
50def load_fam(geno_path):
51 fam_path = Path(geno_path).with_suffix(".fam")
52 if not fam_path.exists():
53 raise FileNotFoundError(f"Missing FAM file for window: {fam_path}")
54 cols = ["fid", "iid", "father", "mother", "sex", "phenotype"]
55 fam = pd.read_csv(fam_path, sep=r"\s+", header=None, names=cols)
56 return fam
57
58
59def construct_data_path(chrom, start, end, region, dtype="plink"):
60 chrom_dir = f"chr_{chrom}"
61 base_dir = Path(here("heritability/gcta")) / region.lower() / "_m"
62 if dtype.lower() == "plink":
63 fn = f"subset_TOPMed_LIBD.AA.{start}_{end}.bed"
64 return base_dir / "plink_format" / chrom_dir / fn
65 if dtype.lower() == "vmr":
66 fn = f"{start}_{end}_meth.phen"
67 return base_dir / "vmr" / chrom_dir / fn
68 raise ValueError(f"Unknown dtype: {dtype}")
69
70
71def read_bim(geno_path: Path) -> pd.DataFrame:
72 bim_path = geno_path.with_suffix(".bim")
73 if not bim_path.exists():
74 raise FileNotFoundError(f"Missing BIM file for {geno_path}")
75 cols = ["chrom", "snp", "cm", "pos", "a1", "a2"]
76 bim = pd.read_csv(bim_path, sep=r"\s+", header=None, names=cols)
77 bim["chrom"] = bim["chrom"].astype(int)
78 bim["pos"] = bim["pos"].astype(int)
79 return bim
80
81
82def read_fam(geno_path: Path) -> pd.DataFrame:
83 fam_path = geno_path.with_suffix(".fam")
84 if not fam_path.exists():
85 raise FileNotFoundError(f"Missing FAM file for {geno_path}")
86 cols = ["family_id", "individual_id", "paternal_id", "maternal_id", "sex", "phenotype"]
87 fam = pd.read_csv(fam_path, sep=r"\s+", header=None, names=cols)
88 return fam
89
90
91def build_windows(region: str, vmr_list: pd.DataFrame):
92 windows = []
93 bim_tables = []
94 fam = None
95
96 for i, row in vmr_list.iterrows():
97 chrom, start, end = int(row[0]), int(row[1]), int(row[2])
98 geno_path = Path(construct_data_path(chrom, start, end, region, "plink"))
99 pheno_path = Path(construct_data_path(chrom, start, end, region, "vmr"))
100
101 bim = read_bim(geno_path)
102 bim_tables.append(bim[["chrom", "snp", "pos"]])
103
104 if fam is None:
105 fam = read_fam(geno_path)
106
107 windows.append({
108 "chrom": chrom,
109 "start": start,
110 "end": end,
111 "pheno_id": f"vmr_{i + 1}",
112 "geno_path": str(geno_path),
113 "pheno_path": str(pheno_path),
114 "has_header": False,
115 "y_pos": 2,
116 "M_raw": int(len(bim)),
117 })
118
119 if not windows:
120 raise ValueError("No VMR windows were discovered—check input files.")
121
122 bim_df = pd.concat(bim_tables, axis=0, ignore_index=True).drop_duplicates()
123 return windows, bim_df, fam
124
125
126def make_fixed_fn(*, bim, N, c_lambda, c_ridge, window_size=500_000, use_window=True):
127 if N <= 0:
128 raise ValueError("Unable to infer sample size from the FAM file.")
129
130 def _fn(window):
131 M = int(window.get("M_raw") or count_snps_in_window(
132 bim,
133 window["chrom"],
134 window["start"],
135 window.get("end", window["start"]),
136 window_size=window_size,
137 use_window=use_window,
138 ))
139 if M <= 1:
140 return {}
141 alpha, l1_ratio = enet_from_targets(M, N, c_lambda=c_lambda, c_ridge=c_ridge)
142 return {"fixed_alpha": alpha, "fixed_l1_ratio": l1_ratio}
143
144 return _fn
145
146
147def main():
148 region = os.environ.get("REGION")
149 if not region:
150 raise ValueError("REGION environment variable must be set")
151
152 error_regions = get_error_list()
153 vmr_list = get_vmr_list(region)
154 windows, bim, fam = build_windows(region, vmr_list)
155
156 tuning_windows = select_tuning_windows(
157 windows, bim, frac=0.05, n_min=40, n_max=200,
158 window_size=500_000, use_window=True,
159 n_bins=3, per_chrom_min=1, seed=13,
160 )
161
162 best = global_tune_params(
163 tuning_windows=tuning_windows, bim=bim, fam=fam,
164 window_size=500_000, use_window=True,
165 grid={
166 "c_lambda": [0.5, 0.7, 1.0, 1.4, 2.0],
167 "c_ridge": [0.5, 1.0, 2.0],
168 "subsample_frac": [0.6, 0.7, 0.8],
169 "batch_size": [4096, 8192],
170 },
171 early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
172 use_window=True,
173 )
174
175 print("Best global parameters:", best)
176
177 fixed_fn = make_fixed_fn(
178 bim=bim,
179 N=len(fam) if fam is not None else 0,
180 c_lambda=best["c_lambda"],
181 c_ridge=best["c_ridge"],
182 window_size=500_000,
183 use_window=True,
184 )
185
186 df = run_windows_with_dask(
187 windows, error_regions=error_regions, outdir="results",
188 window_size=500_000, use_window=True,
189 batch_size=best["batch_size"], fixed_params=fixed_fn,
190 fixed_subsample=best["subsample_frac"],
191 early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
192 working_set={"K": 1024, "refresh": 10}, save=True,
193 prefix="vmr",
194 )
195
196 print(f"Completed {len(df)} VMRs")
197 print(df.head())
198
199
200if __name__ == "__main__":
201 main()
Troubleshooting
Missing imports or file paths will raise
FileNotFoundErrorbefore any GPU work begins—double-check the directory layout.global_tune_paramswill read PLINK/phenotype files for the sampled windows during tuning. If I/O is a bottleneck, lowerfracorn_maxinselect_tuning_windows.Reduce
n_trialspassed togenboostgpu.orchestration.run_windows_with_dask()if VMR windows are extremely numerous; the tunedfixed_*parameters let you drop to one trial safely.Compare the saved results with the simulation outputs to validate that real phenotypes lead to the expected spread of
final_r2values.