Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyhealth/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def __init__(self, *args, **kwargs):
from .dreamt import DREAMTDataset
from .ehrshot import EHRShotDataset
from .eicu import eICUDataset
from .gdsc import GDSCDataset
from .isruc import ISRUCDataset
from .medical_transcriptions import MedicalTranscriptionsDataset
from .mimic3 import MIMIC3Dataset
Expand Down
15 changes: 15 additions & 0 deletions pyhealth/datasets/configs/gdsc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Default table configuration for GDSCDataset; gdsc.py falls back to this file
# when no config_path is supplied.
version: "1.0"
tables:
  # Drug-centric metadata table.
  drug_info:
    # NOTE(review): presumably resolved relative to the dataset root - confirm.
    file_path: "drug_info_gdsc.csv"
    # No patient identifier or timestamp column is declared for this table.
    # NOTE(review): confirm that BaseDataset accepts null for both fields.
    patient_id: null
    timestamp: null
    # Column names listed for this table.
    attributes:
      - "drug_id"
      - "Name"
      - "Synonyms"
      - "Targets"
      - "Target pathway"
      - "PubCHEM"
      - "Sample Size"
      - "Count"
47 changes: 47 additions & 0 deletions pyhealth/datasets/gdsc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import logging
from pathlib import Path
from typing import List, Optional
from .base_dataset import BaseDataset

import polars as pl

logger = logging.getLogger(__name__)


class GDSCDataset(BaseDataset):
    """Dataset wrapper for GDSC (Genomics of Drug Sensitivity in Cancer).

    The bundled default configuration describes a single ``drug_info``
    table: drug-centric metadata for the compounds screened in the GDSC
    cell-line drug-sensitivity project (identifiers, names and synonyms,
    targets, target pathway, PubChem id, and bookkeeping counts).
    """

    def __init__(
        self,
        root: str,
        tables: List[str],
        dataset_name: Optional[str] = None,
        config_path: Optional[str] = None,
        **kwargs
    ) -> None:
        """Set up the dataset and delegate loading to ``BaseDataset``.

        Args:
            root (str): Root location where the dataset files are stored.
            tables (List[str]): Names of the tables to load; forwarded
                unchanged to ``BaseDataset``.
            dataset_name (Optional[str]): Name of the dataset. A falsy
                value (``None`` or ``""``) is replaced by ``"gdsc"``.
            config_path (Optional[str]): Path to a configuration file. When
                ``None``, the ``configs/gdsc.yaml`` file located next to
                this module is used.
            **kwargs: Extra keyword arguments forwarded to ``BaseDataset``.
        """
        resolved_config = config_path
        if resolved_config is None:
            logger.info("No config path provided, using default config")
            # Default config ships alongside this module.
            resolved_config = Path(__file__).parent.joinpath(
                "configs", "gdsc.yaml"
            )

        resolved_name = dataset_name if dataset_name else "gdsc"

        super().__init__(
            root=root,
            tables=tables,
            dataset_name=resolved_name,
            config_path=resolved_config,
            **kwargs
        )
68 changes: 68 additions & 0 deletions tests/todo/test_datasets/test_gdsc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import unittest

from pyhealth.datasets import GDSCDataset
import polars as pl
import os
import sys

# Directory that holds this test module, with symlinks resolved.
current = os.path.dirname(os.path.realpath(__file__))

# Climb three directory levels, one named step at a time, to reach the
# repository root.
_one_level_up = os.path.dirname(current)
_two_levels_up = os.path.dirname(_one_level_up)
repo_root = os.path.dirname(_two_levels_up)

# Make the repository root importable.
sys.path.append(repo_root)


class TestsGDSCDataset(unittest.TestCase):
    """Checks that GDSCDataset parses the ``drug_info`` table of the demo data."""

    DATASET_NAME = "gdsc-demo"
    # NOTE(review): this looks like a GitHub "blob" page URL rather than a
    # raw-file URL - confirm that the dataset loader can read CSVs from it.
    ROOT = "https://github.com/svshah4/extending-cadre/blob/main/data/input/"
    TABLES = ["drug_info"]
    # NOTE(review): not passed to the dataset anywhere in this class.
    REFRESH_CACHE = True

    # Populated by setUpClass; declared here so the attribute always exists.
    dataset = None

    @classmethod
    def setUpClass(cls):
        """Build the dataset once for every test in this class.

        Construction is deferred to this hook (instead of the class body) so
        that importing or collecting this module does not trigger dataset
        loading, and a loading failure is reported as an error of this test
        class rather than as an import failure of the whole module.
        """
        super().setUpClass()
        cls.dataset = GDSCDataset(
            dataset_name=cls.DATASET_NAME,
            root=cls.ROOT,
            tables=cls.TABLES,
        )

    def test_drug_info(self):
        """Tests that a drug entry from drug_info_gdsc.csv is parsed correctly."""

        # Pick a deterministic row that should always exist
        selected_drug_id = "1242"

        expected_name = "(5Z)-7-Oxozeaenol"
        expected_synonyms = "5Z-7-Oxozeaenol, LL-Z1640-2"
        expected_targets = "TAK1"
        expected_pathway = "Other, kinases"
        expected_pubchem = "9863776"
        expected_sample_size = "945"
        expected_count = "266"

        # NOTE(review): assumes dataset.tables maps a table name to a Polars
        # DataFrame - confirm against BaseDataset.
        drug_df = self.dataset.tables["drug_info"]

        # Basic checks
        self.assertGreater(len(drug_df), 0)
        self.assertIn("drug_id", drug_df.columns)
        self.assertIn("Name", drug_df.columns)

        # Row lookup
        row = drug_df.filter(pl.col("drug_id") == selected_drug_id)

        self.assertEqual(1, len(row), "Expected exactly one matched drug entry.")

        row = row.to_dicts()[0]

        # Field-level checks (all values are compared as strings)
        self.assertEqual(expected_name, row["Name"])
        self.assertEqual(expected_synonyms, row["Synonyms"])
        self.assertEqual(expected_targets, row["Targets"])
        self.assertEqual(expected_pathway, row["Target pathway"])
        self.assertEqual(expected_pubchem, row["PubCHEM"])
        self.assertEqual(expected_sample_size, row["Sample Size"])
        self.assertEqual(expected_count, row["Count"])


# Allow running this test module directly as a script, with verbose
# per-test output.
if __name__ == "__main__":
    unittest.main(verbosity=2)