Source code for genvarloader.data_registry

"""Data registry for GVL tutorial datasets."""

from pathlib import Path
from typing import Literal

import pooch

N_RETRIES = 3


[docs] def fetch(name: Literal["geuvadis_ebi", "1kgp"]) -> dict[str, Path]: """Download and cache data for constructing/opening a GVL dataset. Files are cached in the user's home directory under :code:`~/.cache/genvarloader`. Parameters ---------- name The name of the dataset to fetch. Can be one of: - "geuvadis_ebi": Geuvadis data for the original analyses by Lappalainen et al. 2013. Phased, normalized, and split into biallelic variants. - "1kgp": 1000 Genomes Project, all 3,202 individuals. Phased, normalized, and split into biallelic variants. Returns ------- A dictionary of paths to the fetched data. """ if name == "geuvadis_ebi": return _geuvadis_ebi() elif name == "1kgp": return _1kgp() raise ValueError(f"Unknown dataset: {name}")
def _geuvadis_ebi(): huang = pooch.Pooch( pooch.os_cache("genvarloader/geuvadis_ebi"), "https://github.com/ni-lab/personalized-expression-benchmark/raw/refs/heads/main/", registry={ "data/gene_list.csv": "md5:ed1b207b14313dbddef520413cbf9e40", "consensus/samples.txt": "md5:0ba048e6dbb39dc6a6a9835e4528621f", }, retry_if_failed=N_RETRIES, ) basenji = pooch.Pooch( pooch.os_cache("genvarloader/geuvadis_ebi"), "https://github.com/calico/basenji/raw/refs/heads/master/manuscripts/cross2020/", registry={"targets_human.txt": "md5:61c7fe3aa9da2f309124830ddb282ce3"}, retry_if_failed=N_RETRIES, ) zenodo = pooch.Pooch( pooch.os_cache("genvarloader/geuvadis_ebi"), "https://zenodo.org/records/15596289/files/", registry={ "GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz": "md5:500bffeed8e0f770c157e0189e9e50ae", "geuvadis.pgen": "md5:2815f8e20f8a0acbdead2b0c0d6a00dc", "geuvadis.psam": "md5:3c33d631dceccb43222341485b8658f8", "geuvadis.pvar.zst": "md5:51aad81267254695191d6284352b0523", }, retry_if_failed=N_RETRIES, ) genes = Path(huang.fetch("data/gene_list.csv", progressbar=True)) samples = Path(huang.fetch("consensus/samples.txt", progressbar=True)) basenji2_targets = Path(basenji.fetch("targets_human.txt", progressbar=True)) expr = Path( zenodo.fetch( "GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz", progressbar=True ) ) pgen = Path(zenodo.fetch("geuvadis.pgen", progressbar=True)) _ = zenodo.fetch("geuvadis.psam", progressbar=True) _ = zenodo.fetch("geuvadis.pvar.zst", progressbar=True) return { "genes": genes, "expr": expr, "pgen": pgen, "basenji2_targets": basenji2_targets, "samples": samples, } def _1kgp(): zenodo = pooch.Pooch( pooch.os_cache("genvarloader/1kgp"), "https://zenodo.org/records/15596480/files/", registry={ "1kGP.snp_indel.split_multiallelics.pgen.xz": "md5:3e07b63b48e5a205a2b2ae5022e01871", "1kGP.snp_indel.split_multiallelics.psam": "md5:cb8ea444a41cceb27483df26bb76ad1b", "1kGP.snp_indel.split_multiallelics.pvar.zst": "md5:cb00b271504713dbbdbbf17ee0b1825e", }, retry_if_failed=N_RETRIES, ) pgen = Path( zenodo.fetch( "1kGP.snp_indel.split_multiallelics.pgen.xz", pooch.Decompress(), progressbar=True, ) ) _ = zenodo.fetch("1kGP.snp_indel.split_multiallelics.psam", progressbar=True) _ = zenodo.fetch("1kGP.snp_indel.split_multiallelics.pvar.zst", progressbar=True) return {"pgen": pgen}