sunlabuiuc · anumala2 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py
@@ -55,6 +55,7 @@ def __init__(self, *args, **kwargs):
 from .dreamt import DREAMTDataset
 from .ehrshot import EHRShotDataset
 from .eicu import eICUDataset
+from .gdsc import GDSCDataset
 from .isruc import ISRUCDataset
 from .medical_transcriptions import MedicalTranscriptionsDataset
 from .mimic3 import MIMIC3Dataset

diff --git a/pyhealth/datasets/configs/gdsc.yaml b/pyhealth/datasets/configs/gdsc.yaml
@@ -0,0 +1,15 @@
+version: "1.0"
+tables:
+  drug_info:  # the only table declared in this config
+    file_path: "drug_info_gdsc.csv"
+    patient_id: null  # no patient-id column is configured for this table
+    timestamp: null  # no timestamp column is configured for this table
+    attributes:  # column names as listed here (mixed case, embedded spaces)
+    - "drug_id"
+    - "Name"
+    - "Synonyms"
+    - "Targets"
+    - "Target pathway"
+    - "PubCHEM"
+    - "Sample Size"
+    - "Count"
diff --git a/pyhealth/datasets/gdsc.py b/pyhealth/datasets/gdsc.py
@@ -0,0 +1,47 @@
+import logging
+from pathlib import Path
+from typing import List, Optional
+from .base_dataset import BaseDataset
+
+import polars as pl
+
+logger = logging.getLogger(__name__)
+
+
+class GDSCDataset(BaseDataset):
+    """Genomics of Drug Sensitivity in Cancer (GDSC) drug metadata dataset.
+
+    The bundled config declares one table, ``drug_info`` (file
+    ``drug_info_gdsc.csv``): per-drug identifiers, names, synonyms, targets,
+    target pathway, PubCHEM id and sample-size/count bookkeeping columns.
+    """
+
+    def __init__(
+        self,
+        root: str,
+        tables: List[str],
+        dataset_name: Optional[str] = None,
+        config_path: Optional[str] = None,
+        **kwargs
+    ) -> None:
+        """Initialize the dataset, defaulting to the bundled ``gdsc.yaml``.
+
+        Args:
+            root (str): The root directory where the dataset is stored.
+            tables (List[str]): Names of the config tables to load, e.g.
+                ``["drug_info"]``; forwarded to ``BaseDataset`` as given.
+            dataset_name (Optional[str]): The name of the dataset. Defaults to "gdsc".
+            config_path (Optional[str]): Path to the YAML config. If None, the
+                ``configs/gdsc.yaml`` file next to this module is used.
+            **kwargs: Extra keyword arguments forwarded to ``BaseDataset``.
+        """
+        if config_path is None:
+            logger.info("No config path provided, using default config")
+            config_path = Path(__file__).parent / "configs" / "gdsc.yaml"
+        super().__init__(
+            root=root,
+            tables=tables,
+            dataset_name=dataset_name or "gdsc",
+            config_path=config_path,
+            **kwargs
+        )
diff --git a/tests/todo/test_datasets/test_gdsc.py b/tests/todo/test_datasets/test_gdsc.py
@@ -0,0 +1,68 @@
+import os
+import sys
+import unittest
+
+# Extend sys.path before importing pyhealth; a later append cannot help it.
+current = os.path.dirname(os.path.realpath(__file__))
+repo_root = os.path.dirname(os.path.dirname(os.path.dirname(current)))
+sys.path.append(repo_root)
+import polars as pl  # noqa: E402
+from pyhealth.datasets import GDSCDataset  # noqa: E402
+
+
+class TestsGDSCDataset(unittest.TestCase):
+    """Checks that GDSCDataset loads the ``drug_info`` table of a demo copy."""
+
+    DATASET_NAME = "gdsc-demo"
+    # Raw-content host: the github.com ".../blob/..." URL serves an HTML page,
+    # not the CSV bytes.
+    ROOT = "https://raw.githubusercontent.com/svshah4/extending-cadre/main/data/input/"
+    TABLES = ["drug_info"]
+    REFRESH_CACHE = True  # not referenced by the tests in this class
+
+    @classmethod
+    def setUpClass(cls):
+        # Build once at run time rather than in the class body, so a load
+        # failure is a test error instead of an import/collection failure.
+        cls.dataset = GDSCDataset(
+            dataset_name=cls.DATASET_NAME,
+            root=cls.ROOT,
+            tables=cls.TABLES,
+        )
+
+    def test_drug_info(self):
+        """Tests that a drug entry from drug_info_gdsc.csv is parsed correctly."""
+        # Pick a deterministic row that should always exist
+        selected_drug_id = "1242"
+        expected = {
+            "Name": "(5Z)-7-Oxozeaenol",
+            "Synonyms": "5Z-7-Oxozeaenol, LL-Z1640-2",
+            "Targets": "TAK1",
+            "Target pathway": "Other, kinases",
+            "PubCHEM": "9863776",
+            "Sample Size": "945",
+            "Count": "266",
+        }
+
+        # NOTE(review): assumes dataset.tables maps table name -> Polars
+        # DataFrame; confirm against BaseDataset.
+        drug_df = self.dataset.tables["drug_info"]
+
+        # Basic checks
+        self.assertGreater(len(drug_df), 0)
+        self.assertIn("drug_id", drug_df.columns)
+        self.assertIn("Name", drug_df.columns)
+
+        # Row lookup
+        matches = drug_df.filter(pl.col("drug_id") == selected_drug_id)
+        self.assertEqual(1, len(matches), "Expected exactly one matched drug entry.")
+        row = matches.to_dicts()[0]
+
+        # Field-level checks; subTest reports every mismatching column
+        for column, value in expected.items():
+            with self.subTest(column=column):
+                self.assertEqual(value, row[column])
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)  # verbosity=2: list each test and its result