diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py
index 7d6a65f16..e680f9dc9 100644
--- a/pyhealth/datasets/__init__.py
+++ b/pyhealth/datasets/__init__.py
@@ -55,6 +55,7 @@ def __init__(self, *args, **kwargs):
 from .dreamt import DREAMTDataset
 from .ehrshot import EHRShotDataset
 from .eicu import eICUDataset
+from .gdsc import GDSCDataset
 from .isruc import ISRUCDataset
 from .medical_transcriptions import MedicalTranscriptionsDataset
 from .mimic3 import MIMIC3Dataset
diff --git a/pyhealth/datasets/configs/gdsc.yaml b/pyhealth/datasets/configs/gdsc.yaml
new file mode 100644
--- /dev/null
+++ b/pyhealth/datasets/configs/gdsc.yaml
@@ -0,0 +1,15 @@
+version: "1.0"
+tables:
+  drug_info:
+    file_path: "drug_info_gdsc.csv"
+    patient_id: null
+    timestamp: null
+    attributes:
+      - "drug_id"
+      - "Name"
+      - "Synonyms"
+      - "Targets"
+      - "Target pathway"
+      - "PubCHEM"
+      - "Sample Size"
+      - "Count"
diff --git a/pyhealth/datasets/gdsc.py b/pyhealth/datasets/gdsc.py
new file mode 100644
--- /dev/null
+++ b/pyhealth/datasets/gdsc.py
@@ -0,0 +1,64 @@
+import logging
+from pathlib import Path
+from typing import List, Optional
+
+import polars as pl
+
+from .base_dataset import BaseDataset
+
+logger = logging.getLogger(__name__)
+
+
+class GDSCDataset(BaseDataset):
+    """Dataset for GDSC (Genomics of Drug Sensitivity in Cancer) tables.
+
+    The GDSC ``drug_info`` table is a drug-centric metadata table that
+    describes compounds screened across the GDSC cell-line
+    drug-sensitivity project. Typical columns include unique drug
+    identifiers, canonical names, alternate names/synonyms, molecular or
+    protein targets, higher-level pathways targeted, external chemical
+    identifiers (e.g., PubChem CID), and bookkeeping counts such as sample
+    sizes or number of experiments.
+
+    The broader GDSC resource pairs these drug metadata with measured drug
+    response (e.g., IC50) across hundreds to thousands of cancer cell
+    lines, enabling pharmacogenomic analyses.
+
+    This class only fills in the default dataset name and the bundled
+    ``configs/gdsc.yaml`` schema; everything else is forwarded to
+    ``BaseDataset.__init__``.
+    """
+
+    def __init__(
+        self,
+        root: str,
+        tables: List[str],
+        dataset_name: Optional[str] = None,
+        config_path: Optional[str] = None,
+        **kwargs
+    ) -> None:
+        """Initialize the GDSC dataset.
+
+        Args:
+            root (str): The root directory where the dataset is stored.
+            tables (List[str]): Names of the tables to load, as declared in
+                the config file (the bundled config declares "drug_info").
+            dataset_name (Optional[str]): The name of the dataset.
+                Defaults to "gdsc".
+            config_path (Optional[str]): The path to the configuration
+                file. If not provided, the bundled ``configs/gdsc.yaml``,
+                located next to this module, is used.
+            **kwargs: Extra keyword arguments forwarded unchanged to
+                ``BaseDataset.__init__``.
+        """
+        if config_path is None:
+            logger.info("No config path provided, using default config")
+            # Keep the value a str so it matches the declared parameter type.
+            config_path = str(Path(__file__).parent / "configs" / "gdsc.yaml")
+        super().__init__(
+            root=root,
+            tables=tables,
+            dataset_name=dataset_name or "gdsc",
+            config_path=config_path,
+            **kwargs
+        )
diff --git a/tests/todo/test_datasets/test_gdsc.py b/tests/todo/test_datasets/test_gdsc.py
new file mode 100644
--- /dev/null
+++ b/tests/todo/test_datasets/test_gdsc.py
@@ -0,0 +1,80 @@
+import os
+import sys
+import unittest
+
+import polars as pl
+
+# The repository root must be on sys.path before pyhealth is imported below.
+current = os.path.dirname(os.path.realpath(__file__))
+repo_root = os.path.dirname(os.path.dirname(os.path.dirname(current)))
+sys.path.append(repo_root)
+
+from pyhealth.datasets import GDSCDataset  # noqa: E402
+
+
+class TestsGDSCDataset(unittest.TestCase):
+    """Tests loading of the GDSC ``drug_info`` table."""
+
+    DATASET_NAME = "gdsc-demo"
+    # raw.githubusercontent.com serves the file itself; a github.com "/blob/"
+    # URL returns the HTML viewer page instead of the CSV.
+    ROOT = (
+        "https://raw.githubusercontent.com/svshah4/extending-cadre/"
+        "main/data/input/"
+    )
+    TABLES = ["drug_info"]
+    REFRESH_CACHE = True  # NOTE(review): not referenced by this test.
+
+    @classmethod
+    def setUpClass(cls):
+        # Build the dataset once, at test time rather than import time, so a
+        # load failure is reported as a test error instead of breaking
+        # collection of the whole module.
+        cls.dataset = GDSCDataset(
+            dataset_name=cls.DATASET_NAME,
+            root=cls.ROOT,
+            tables=cls.TABLES,
+        )
+
+    def test_drug_info(self):
+        """Tests that a drug entry from drug_info_gdsc.csv is parsed correctly."""
+
+        # Pick a deterministic row that should always exist
+        selected_drug_id = "1242"
+
+        expected_name = "(5Z)-7-Oxozeaenol"
+        expected_synonyms = "5Z-7-Oxozeaenol, LL-Z1640-2"
+        expected_targets = "TAK1"
+        expected_pathway = "Other, kinases"
+        expected_pubchem = "9863776"
+        expected_sample_size = "945"
+        expected_count = "266"
+
+        # NOTE(review): assumes dataset.tables maps a table name to a Polars
+        # DataFrame -- confirm against BaseDataset.
+        drug_df = self.dataset.tables["drug_info"]
+
+        # Basic checks
+        self.assertGreater(len(drug_df), 0)
+        self.assertIn("drug_id", drug_df.columns)
+        self.assertIn("Name", drug_df.columns)
+
+        # Row lookup
+        row = drug_df.filter(pl.col("drug_id") == selected_drug_id)
+
+        self.assertEqual(1, len(row), "Expected exactly one matched drug entry.")
+
+        row = row.to_dicts()[0]
+
+        # Field-level checks
+        self.assertEqual(expected_name, row["Name"])
+        self.assertEqual(expected_synonyms, row["Synonyms"])
+        self.assertEqual(expected_targets, row["Targets"])
+        self.assertEqual(expected_pathway, row["Target pathway"])
+        self.assertEqual(expected_pubchem, row["PubCHEM"])
+        self.assertEqual(expected_sample_size, row["Sample Size"])
+        self.assertEqual(expected_count, row["Count"])
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)