VMR caudate tutorial

examples/vmr_test_caudate.py demonstrates running GENBoostGPU on variable methylated regions (VMRs) derived from brain tissue. It mirrors the simulation tutorial by selecting representative windows, tuning global hyperparameters, and then dispatching the full workload with genboostgpu.orchestration.run_windows_with_dask().

Setup

  1. Mirror the file layout under heritability/gcta/<region>/_m expected by the helper functions.

  2. Export REGION=caudate (or another region present in your dataset).

  3. Confirm GPU availability and seed all RNGs (the script seeds random, numpy, and cupy to 42).

  4. Ensure the RAPIDS/CuPy versions installed in your environment satisfy the pins published in pyproject.toml.

Tuning and orchestration

  • get_error_list() loads blacklisted windows so they can be skipped.

  • build_windows() now reads each window’s PLINK trio, assembles an aggregate bim table, and records per-window M_raw counts so the tuning heuristics can stratify by SNP density.

  • genboostgpu.tuning.select_tuning_windows() draws a 5% sample with at least one window per chromosome. Those candidates feed into genboostgpu.tuning.global_tune_params(), which evaluates elastic net targets across GPUs using the same search grid described in the simulation tutorial.

  • make_fixed_fn converts the chosen c_lambda/c_ridge pair into per- window fixed_alpha and fixed_l1_ratio values via genboostgpu.hyperparams.enet_from_targets(). The tuned subsample fraction and batch size are passed directly to genboostgpu.orchestration.run_windows_with_dask(), which fans the remaining windows across the cluster and writes summaries to results/ with the vmr prefix.

Full script

  1import random, os
  2from pathlib import Path
  3
  4import cupy as cp
  5import numpy as np
  6import pandas as pd
  7from pyhere import here
  8
  9from genboostgpu.hyperparams import enet_from_targets
 10from genboostgpu.orchestration import run_windows_with_dask
 11from genboostgpu.snp_processing import count_snps_in_window
 12from genboostgpu.tuning import global_tune_params, select_tuning_windows
 13
 14random.seed(42)
 15np.random.seed(42)
 16cp.random.seed(42)
 17
 18WINDOW_SIZE = 500_000
 19EARLY_STOP = {"patience": 5, "min_delta": 1e-4, "warmup": 5}
 20
 21def get_error_list(error_file="../_h/snp-error-window.tsv"):
 22    error_path = Path(error_file)
 23    if error_path.exists():
 24        df = pd.read_csv(error_path, sep="\t")
 25        df["Chrom"] = df["Chr"].str.replace("chr", "", regex=False).astype(int)
 26        return df[["Chrom", "Start", "End"]]
 27    print(f"Warning: Error regions file not found: {error_file}")
 28    return pd.DataFrame(columns=["Chrom", "Start", "End"])
 29
 30
 31def get_vmr_list(region: str) -> pd.DataFrame:
 32    base_dir = Path(here("heritability/gcta")) / region.lower() / "_m"
 33    vmr_file = base_dir / "vmr_list.txt"
 34    if not vmr_file.exists():
 35        raise FileNotFoundError(f"VMR list file not found: {vmr_file}")
 36    return pd.read_csv(vmr_file, sep="\t", header=None)
 37
 38
 39def load_bim(geno_path):
 40    bim_path = Path(geno_path).with_suffix(".bim")
 41    if not bim_path.exists():
 42        raise FileNotFoundError(f"Missing BIM file for window: {bim_path}")
 43    cols = ["chrom", "snp", "cm", "pos", "a1", "a2"]
 44    bim = pd.read_csv(bim_path, sep=r"\s+", header=None, names=cols)
 45    bim["chrom"] = bim["chrom"].astype(str)
 46    bim["pos"] = bim["pos"].astype(int)
 47    return bim[["chrom", "snp", "pos"]]
 48
 49
 50def load_fam(geno_path):
 51    fam_path = Path(geno_path).with_suffix(".fam")
 52    if not fam_path.exists():
 53        raise FileNotFoundError(f"Missing FAM file for window: {fam_path}")
 54    cols = ["fid", "iid", "father", "mother", "sex", "phenotype"]
 55    fam = pd.read_csv(fam_path, sep=r"\s+", header=None, names=cols)
 56    return fam
 57
 58
 59def construct_data_path(chrom, start, end, region, dtype="plink"):
 60    chrom_dir = f"chr_{chrom}"
 61    base_dir = Path(here("heritability/gcta")) / region.lower() / "_m"
 62    if dtype.lower() == "plink":
 63        fn = f"subset_TOPMed_LIBD.AA.{start}_{end}.bed"
 64        return base_dir / "plink_format" / chrom_dir / fn
 65    if dtype.lower() == "vmr":
 66        fn = f"{start}_{end}_meth.phen"
 67        return base_dir / "vmr" / chrom_dir / fn
 68    raise ValueError(f"Unknown dtype: {dtype}")
 69
 70
 71def read_bim(geno_path: Path) -> pd.DataFrame:
 72    bim_path = geno_path.with_suffix(".bim")
 73    if not bim_path.exists():
 74        raise FileNotFoundError(f"Missing BIM file for {geno_path}")
 75    cols = ["chrom", "snp", "cm", "pos", "a1", "a2"]
 76    bim = pd.read_csv(bim_path, sep=r"\s+", header=None, names=cols)
 77    bim["chrom"] = bim["chrom"].astype(int)
 78    bim["pos"] = bim["pos"].astype(int)
 79    return bim
 80
 81
 82def read_fam(geno_path: Path) -> pd.DataFrame:
 83    fam_path = geno_path.with_suffix(".fam")
 84    if not fam_path.exists():
 85        raise FileNotFoundError(f"Missing FAM file for {geno_path}")
 86    cols = ["family_id", "individual_id", "paternal_id", "maternal_id", "sex", "phenotype"]
 87    fam = pd.read_csv(fam_path, sep=r"\s+", header=None, names=cols)
 88    return fam
 89
 90
 91def build_windows(region: str, vmr_list: pd.DataFrame):
 92    windows = []
 93    bim_tables = []
 94    fam = None
 95
 96    for i, row in vmr_list.iterrows():
 97        chrom, start, end = int(row[0]), int(row[1]), int(row[2])
 98        geno_path = Path(construct_data_path(chrom, start, end, region, "plink"))
 99        pheno_path = Path(construct_data_path(chrom, start, end, region, "vmr"))
100
101        bim = read_bim(geno_path)
102        bim_tables.append(bim[["chrom", "snp", "pos"]])
103
104        if fam is None:
105            fam = read_fam(geno_path)
106
107        windows.append({
108            "chrom": chrom,
109            "start": start,
110            "end": end,
111            "pheno_id": f"vmr_{i + 1}",
112            "geno_path": str(geno_path),
113            "pheno_path": str(pheno_path),
114            "has_header": False,
115            "y_pos": 2,
116            "M_raw": int(len(bim)),
117        })
118
119    if not windows:
120        raise ValueError("No VMR windows were discovered—check input files.")
121
122    bim_df = pd.concat(bim_tables, axis=0, ignore_index=True).drop_duplicates()
123    return windows, bim_df, fam
124
125
126def make_fixed_fn(*, bim, N, c_lambda, c_ridge, window_size=500_000, use_window=True):
127    if N <= 0:
128        raise ValueError("Unable to infer sample size from the FAM file.")
129
130    def _fn(window):
131        M = int(window.get("M_raw") or count_snps_in_window(
132            bim,
133            window["chrom"],
134            window["start"],
135            window.get("end", window["start"]),
136            window_size=window_size,
137            use_window=use_window,
138        ))
139        if M <= 1:
140            return {}
141        alpha, l1_ratio = enet_from_targets(M, N, c_lambda=c_lambda, c_ridge=c_ridge)
142        return {"fixed_alpha": alpha, "fixed_l1_ratio": l1_ratio}
143
144    return _fn
145
146
147def main():
148    region = os.environ.get("REGION")
149    if not region:
150        raise ValueError("REGION environment variable must be set")
151
152    error_regions = get_error_list()
153    vmr_list = get_vmr_list(region)
154    windows, bim, fam = build_windows(region, vmr_list)
155
156    tuning_windows = select_tuning_windows(
157        windows, bim, frac=0.05, n_min=40, n_max=200,
158        window_size=500_000, use_window=True,
159        n_bins=3, per_chrom_min=1, seed=13,
160    )
161
162    best = global_tune_params(
163        tuning_windows=tuning_windows, bim=bim, fam=fam,
164        window_size=500_000, use_window=True,
165        grid={
166            "c_lambda": [0.5, 0.7, 1.0, 1.4, 2.0],
167            "c_ridge": [0.5, 1.0, 2.0],
168            "subsample_frac": [0.6, 0.7, 0.8],
169            "batch_size": [4096, 8192],
170        },
171        early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
172        use_window=True,
173    )
174
175    print("Best global parameters:", best)
176
177    fixed_fn = make_fixed_fn(
178        bim=bim,
179        N=len(fam) if fam is not None else 0,
180        c_lambda=best["c_lambda"],
181        c_ridge=best["c_ridge"],
182        window_size=500_000,
183        use_window=True,
184    )
185
186    df = run_windows_with_dask(
187        windows, error_regions=error_regions, outdir="results",
188        window_size=500_000, use_window=True,
189        batch_size=best["batch_size"], fixed_params=fixed_fn,
190        fixed_subsample=best["subsample_frac"],
191        early_stop={"patience": 5, "min_delta": 1e-4, "warmup": 5},
192        working_set={"K": 1024, "refresh": 10}, save=True,
193        prefix="vmr",
194    )
195
196    print(f"Completed {len(df)} VMRs")
197    print(df.head())
198
199
200if __name__ == "__main__":
201    main()

Troubleshooting

  • Missing imports or file paths will raise FileNotFoundError before any GPU work begins—double-check the directory layout.

  • global_tune_params will read PLINK/phenotype files for the sampled windows during tuning. If I/O is a bottleneck, lower frac or n_max in select_tuning_windows.

  • Reduce n_trials passed to genboostgpu.orchestration.run_windows_with_dask() if VMR windows are extremely numerous; the tuned fixed_* parameters let you drop to one trial safely.

  • Compare the saved results with the simulation outputs to validate that real phenotypes lead to the expected spread of final_r2 values.