PavlidisLab · arteymix · Nov 12, 2025 · Apr 28, 2025 · Apr 28, 2025 · Sep 16, 2025
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -7,20 +7,16 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       max-parallel: 5
+    defaults:
+      run:
+        shell: bash -el {0}
 
     steps:
     - uses: actions/checkout@v4
-    - name: Set up Python 3.10
-      uses: actions/setup-python@v5
+    - uses: conda-incubator/setup-miniconda@v3
       with:
-        python-version: '3.9'
-    - name: Add conda to system path
-      run: |
-        # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo $CONDA/bin >> $GITHUB_PATH
-    - name: Setup Conda environment
-      run: |
-        conda env update --file environment.yml --name base
+        activate-environment: rnaseq-pipeline
+        environment-file: environment.yml
     - name: Install package
       run: |
         pip install .[gsheet,webviewer]

diff --git a/README.md b/README.md
@@ -102,13 +102,15 @@ The output is organized as follow:
 
 ```
 pipeline-output/
-    genomes/<reference_id>/                 # Genomic references
-    references/<reference_id>/              # RSEM/STAR indexes
-    data/<source>                           # FASTQs (note that GEO source uses SRA)
-    data-qc/<experiment_id>/<sample_id>/    # FastQC reports
-    aligned/<reference_id>/<experiment_id>/ # alignments and quantification results
-    quantified/<reference_id>               # quantification matrices for isoforms and genes
-    report/<reference_id>/<experiment_id>/  # MultiQC reports for reads and alignments
+    genomes/<reference_id>/                       # Genomic references
+    references/<reference_id>/                    # RSEM/STAR indexes
+    data/<source>/                                # FASTQs (organization is source-specific; note that GEO source uses SRA)
+    data-qc/<experiment_id>/<sample_id>/          # FastQC reports
+    data-single-cell/<experiment_id>/<sample_id>/ # Single-cell data (hard links to files from data/)
+    aligned/<reference_id>/<experiment_id>/       # alignments and quantification results
+    quantified/<reference_id>                     # quantification matrices for isoforms and genes
+    quantified-single-cell/<reference_id>         # quantified single-cell data (Cell Ranger outputs)
+    report/<reference_id>/<experiment_id>/        # MultiQC reports for reads and alignments
 ```
 
 You can adjust the pipeline output directory by setting `OUTPUT_DIR` under

diff --git a/environment.yml b/environment.yml
@@ -14,3 +14,5 @@ dependencies:
 - star==2.7.3a
 - entrez-direct
 - perl # rsem expects this
+- samtools
+- curl
diff --git a/example.luigi.cfg b/example.luigi.cfg
@@ -27,9 +27,12 @@ submit_data_jobs=1
 submit_batch_info_jobs=2
 
 [bioluigi]
-scheduler=slurm
+scheduler=local
 scheduler_partition=
 scheduler_extra_args=[]
+# Default tools, override as needed
+#cutadapt_bin=cutadapt
+#cell_ranger_bin=cellranger
 
 #
 # This section contains the necessary variables for the pipeline execution
@@ -40,6 +43,7 @@ scheduler_extra_args=[]
 OUTPUT_DIR=pipeline-output
 GENOMES=genomes
 REFERENCES=references
+SINGLE_CELL_REFERENCES=references-single-cell
 METADATA=metadata
 DATA=data
 DATAQCDIR=data-qc
@@ -53,6 +57,13 @@ RSEM_DIR=contrib/RSEM
 
 SLACK_WEBHOOK_URL=
 
+[rnaseq_pipeline.sources.sra]
+# location where tools like prefetch and fastq-dump will store downloaded SRA files
+# you can get this value with vdb-config -p
+ncbi_public_dir=/cosmos/scratch/ncbi/public
+samtools_bin=samtools
+bamtofastq_bin=bamtofastq
+
 [rnaseq_pipeline.gemma]
 cli_bin=gemma-cli
 # values for $JAVA_HOME and $JAVA_OPTS environment variables
@@ -63,3 +74,6 @@ appdata_dir=/space/gemmaData
 human_reference_id=hg38_ncbi
 mouse_reference_id=mm10_ncbi
 rat_reference_id=rn7_ncbi
+human_single_cell_reference_id=refdata-gex-GRCh38-2024-A
+mouse_single_cell_reference_id=refdata-gex-GRCm39-2024-A
+rat_single_cell_reference_id=refdata-gex-mRatBN7-2-2024-A
diff --git a/find-samples-with-multiple-lanes.py b/find-samples-with-multiple-lanes.py
@@ -0,0 +1,81 @@
+import tarfile
+import tempfile
+from io import StringIO
+from urllib.parse import urlparse, parse_qs
+
+import pandas as pd
+import requests
+from tqdm import tqdm
+
+from rnaseq_pipeline.miniml_utils import collect_geo_samples_info
+
+ns = {'miniml': 'http://www.ncbi.nlm.nih.gov/geo/info/MINiML'}
+
+def retrieve_geo_series_miniml_from_ftp(gse):
+    res = requests.get(f'https://ftp.ncbi.nlm.nih.gov/geo/series/{gse[:-3]}nnn/{gse}/miniml/{gse}_family.xml.tgz',
+                       stream=True)
+    res.raise_for_status()
+    # we need to use a temporary file because Response.raw does not allow seeking
+    with tempfile.TemporaryFile() as tmp:
+        for chunk in res.iter_content(chunk_size=1024):
+            tmp.write(chunk)
+        tmp.seek(0)
+        with tarfile.open(fileobj=tmp, mode='r:gz') as fin:
+            reader = fin.extractfile(f'{gse}_family.xml')
+            return reader.read().decode('utf-8')
+
+def retrieve_geo_series_miniml_from_geo_query(gse):
+    res = requests.get('https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi', params=dict(acc=gse, form='xml', targ='gsm'))
+    res.raise_for_status()
+    return res.text
+
+def fetch_sra_metadata(gse):
+    try:
+        miniml = retrieve_geo_series_miniml_from_ftp(gse)
+    except Exception as e:
+        print(
+            f'Failed to retrieve MINiML metadata for {gse} from NCBI FTP server will attempt to use GEO directly.',
+            e)
+        try:
+            miniml = retrieve_geo_series_miniml_from_geo_query(gse)
+        except Exception as e:
+            print(f'Failed to retrieve MINiML metadata for {gse} from GEO query.', e)
+            return []
+    try:
+        meta = collect_geo_samples_info(StringIO(miniml))
+    except Exception as e:
+        print('Failed to parse MINiML from input: ' + miniml[:100], e)
+        return []
+    results = []
+    for gsm in meta:
+        platform, srx_url = meta[gsm]
+        srx = parse_qs(urlparse(srx_url).query)['term'][0]
+        results.append((gse, gsm, srx))
+    return results
+
+with open('geo-sample-to-sra-experiment.tsv', 'wt') as f:
+    print('geo_series', 'geo_sample', 'sra_experiment', file=f, sep='\t', flush=True)
+    df = pd.read_table('gemma-rnaseq-datasets.tsv')
+    batch = []
+    for gse in tqdm(df.geo_accession):
+        samples = fetch_sra_metadata(gse)
+        for sample in samples:
+            print(*sample, file=f, sep='\t', flush=True)
+
+# def fetch_runinfo(srx_ids):
+#     # fetch the SRX metadata
+#     return pd.read_csv(StringIO(retrieve_runinfo(srx_ids)))
+#
+# def print_results(samples):
+#     srx_ids = [s[2] for s in samples]
+#     try:
+#         results = fetch_runinfo(srx_ids)
+#     except Exception as e:
+#         print('Failed to retrieve runinfo for the following SRX IDs:', srx_ids, e)
+#         return
+#     for sample in samples:
+#         runs = results[results['Experiment'] == sample[2]]['Run']
+#         r = sample + ('|'.join(runs), len(runs))
+#         print(*r, sep='\t', flush=True)
+#
+# BATCH_SIZE = 100
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[tool.mypy]
+plugins = ["luigi.mypy"]
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+log_cli=1
+log_cli_level=warning
diff --git a/rnaseq_pipeline/config.py b/rnaseq_pipeline/config.py
@@ -1,21 +1,24 @@
+from typing import Optional
+
 import luigi
 
 # see luigi.cfg for details
 class rnaseq_pipeline(luigi.Config):
     task_namespace = ''
 
-    GENOMES = luigi.Parameter()
+    GENOMES: str = luigi.Parameter()
 
-    OUTPUT_DIR = luigi.Parameter()
-    REFERENCES = luigi.Parameter()
-    METADATA = luigi.Parameter()
-    DATA = luigi.Parameter()
-    DATAQCDIR = luigi.Parameter()
-    ALIGNDIR = luigi.Parameter()
-    ALIGNQCDIR = luigi.Parameter()
-    QUANTDIR = luigi.Parameter()
-    BATCHINFODIR = luigi.Parameter()
+    OUTPUT_DIR: str = luigi.Parameter()
+    REFERENCES: str = luigi.Parameter()
+    SINGLE_CELL_REFERENCES: str = luigi.Parameter()
+    METADATA: str = luigi.Parameter()
+    DATA: str = luigi.Parameter()
+    DATAQCDIR: str = luigi.Parameter()
+    ALIGNDIR: str = luigi.Parameter()
+    ALIGNQCDIR: str = luigi.Parameter()
+    QUANTDIR: str = luigi.Parameter()
+    BATCHINFODIR: str = luigi.Parameter()
 
-    RSEM_DIR = luigi.Parameter()
+    RSEM_DIR: str = luigi.Parameter()
 
-    SLACK_WEBHOOK_URL = luigi.OptionalParameter(default=None)
+    SLACK_WEBHOOK_URL: Optional[str] = luigi.OptionalParameter(default=None)
diff --git a/rnaseq_pipeline/gemma.py b/rnaseq_pipeline/gemma.py
@@ -1,3 +1,5 @@
+import enum
+import logging
 import os
 import subprocess
 from getpass import getpass
@@ -8,16 +10,21 @@
 from luigi.contrib.external_program import ExternalProgramTask
 from requests.auth import HTTPBasicAuth
 
+logger = logging.getLogger(__name__)
+
 class gemma(luigi.Config):
     task_namespace = 'rnaseq_pipeline'
-    baseurl = luigi.Parameter()
-    appdata_dir = luigi.Parameter()
-    cli_bin = luigi.Parameter()
-    cli_JAVA_HOME = luigi.Parameter()
-    cli_JAVA_OPTS = luigi.Parameter()
-    human_reference_id = luigi.Parameter()
-    mouse_reference_id = luigi.Parameter()
-    rat_reference_id = luigi.Parameter()
+    baseurl: str = luigi.Parameter()
+    appdata_dir: str = luigi.Parameter()
+    cli_bin: str = luigi.Parameter()
+    cli_JAVA_HOME: str = luigi.Parameter()
+    cli_JAVA_OPTS: str = luigi.Parameter()
+    human_reference_id: str = luigi.Parameter()
+    mouse_reference_id: str = luigi.Parameter()
+    rat_reference_id: str = luigi.Parameter()
+    human_single_cell_reference_id: str = luigi.Parameter()
+    mouse_single_cell_reference_id: str = luigi.Parameter()
+    rat_single_cell_reference_id: str = luigi.Parameter()
 
 cfg = gemma()
 
@@ -45,6 +52,9 @@ def _query_api(self, endpoint):
     def datasets(self, experiment_id):
         return self._query_api(join('datasets', experiment_id))
 
+    def dataset_annotations(self, experiment_id):
+        return self._query_api(join('datasets', experiment_id, 'annotations'))
+
     def dataset_has_batch(self, experiment_id):
         return self._query_api(join('datasets', experiment_id, 'hasbatch'))
 
@@ -54,7 +64,7 @@ def samples(self, experiment_id):
     def platforms(self, experiment_id):
         return self._query_api(join('datasets', experiment_id, 'platforms'))
 
-class GemmaTaskMixin:
+class GemmaTaskMixin(luigi.Task):
     experiment_id = luigi.Parameter()
 
     def __init__(self, *kwargs, **kwds):
@@ -98,10 +108,70 @@ def reference_id(self):
         except KeyError:
             raise ValueError('Unsupported Gemma taxon {}.'.format(self.taxon))
 
+    @property
+    def single_cell_reference_id(self):
+        try:
+            return {'human': cfg.human_single_cell_reference_id, 'mouse': cfg.mouse_single_cell_reference_id,
+                    'rat': cfg.rat_single_cell_reference_id}[
+                self.taxon]
+        except KeyError:
+            raise ValueError('Unsupported Gemma taxon {}.'.format(self.taxon))
+
     @property
     def platform_short_name(self):
         return f'Generic_{self.taxon}_ncbiIds'
 
+    @property
+    def assay_type(self):
+        # Possible values:
+        # bulk RNA-seq assay	http://purl.obolibrary.org/obo/OBI_0003090	11345
+        # transcription profiling by array assay	http://purl.obolibrary.org/obo/OBI_0001463	10788
+        # transcription profiling by high throughput sequencing	http://www.ebi.ac.uk/efo/EFO_0002770	262
+        # single-cell RNA sequencing assay	http://purl.obolibrary.org/obo/OBI_0002631	248
+        # single-nucleus RNA sequencing assay	http://purl.obolibrary.org/obo/OBI_0003109	150
+        # transcription profiling by array	http://www.ebi.ac.uk/efo/EFO_0002768	79
+        # single-cell RNA sequencing	http://www.ebi.ac.uk/efo/EFO_0008913	5
+        # single nucleus RNA sequencing	http://www.ebi.ac.uk/efo/EFO_0009809	4
+        # fluorescence-activated cell sorting	http://www.ebi.ac.uk/efo/EFO_0009108	2
+        # RIP-seq	http://www.ebi.ac.uk/efo/EFO_0005310	1
+        # These were pulled from Gemma on October 1st, 2025
+
+        assay_type_class_uri = 'http://purl.obolibrary.org/obo/OBI_0000070'
+        microarray_uris = ['http://purl.obolibrary.org/obo/OBI_0001463', 'http://www.ebi.ac.uk/efo/EFO_0002768']
+        bulk_rnaseq_uris = ['http://purl.obolibrary.org/obo/OBI_0003090']
+        sc_rnaseq_uris = ['http://purl.obolibrary.org/obo/OBI_0002631', 'http://www.ebi.ac.uk/efo/EFO_0008913',
+                          'http://www.ebi.ac.uk/efo/EFO_0009809', 'http://purl.obolibrary.org/obo/OBI_0003109']
+        fac_sorted_uri = 'http://www.ebi.ac.uk/efo/EFO_0009108'
+
+        annotations = self._gemma_api.dataset_annotations(self.experiment_id)
+        fac_sorted = any(annotation['classUri'] == assay_type_class_uri and annotation['termUri'] == fac_sorted_uri
+                         for annotation in annotations)
+        for annotation in annotations:
+            if annotation['classUri'] == assay_type_class_uri:
+                value_uri = annotation['termUri']
+                if value_uri in microarray_uris:
+                    return GemmaAssayType.MICROARRAY
+                elif value_uri in bulk_rnaseq_uris:
+                    return GemmaAssayType.BULK_RNA_SEQ
+                elif value_uri in sc_rnaseq_uris:
+                    if fac_sorted:
+                        # fac-sorted scRNA-Seq is treated as bulk
+                        logger.info('%s: Dataset is a FAC-sorted single-cell RNA-Seq, will treat as bulk RNA-Seq.',
+                                    self.dataset_short_name)
+                        return GemmaAssayType.BULK_RNA_SEQ
+                    else:
+                        return GemmaAssayType.SINGLE_CELL_RNA_SEQ
+
+        # assume bulk
+        logger.warning('%s: No suitable experiment tag to determine the assay type, will assume bulk RNA-Seq.',
+                       self.dataset_short_name)
+        return GemmaAssayType.BULK_RNA_SEQ
+
+class GemmaAssayType(enum.Enum):
+    MICROARRAY = 0
+    BULK_RNA_SEQ = 1
+    SINGLE_CELL_RNA_SEQ = 2
+
 class GemmaCliTask(GemmaTaskMixin, ExternalProgramTask):
     """
     Base class for tasks that wraps Gemma CLI.

diff --git a/rnaseq_pipeline/gsheet.py b/rnaseq_pipeline/gsheet.py
@@ -1,5 +1,4 @@
 import logging
-import logging
 import os
 import os.path
 import pickle
@@ -15,7 +14,7 @@
 SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
 CREDENTIALS_FILE = resource_filename('rnaseq_pipeline', 'credentials.json')
 
-logger = logging.getLogger('luigi-interface')
+logger = logging.getLogger(__name__)
 
 def _authenticate():
     # authentication
@@ -34,14 +33,14 @@ def _authenticate():
         else:
             flow = InstalledAppFlow.from_client_secrets_file(
                 CREDENTIALS_FILE, SCOPES)
-            creds = flow.run_local_server(port=0)
+            creds = flow.run_local_server(open_browser=False, port=0)
         # Save the credentials for the next run
         with open(token_path, 'wb') as token:
             pickle.dump(creds, token)
             logger.info(f'Created Google Sheets API token under {token_path}.')
     return creds
 
-def retrieve_spreadsheet(spreadsheet_id, sheet_name):
+def retrieve_spreadsheet(spreadsheet_id: str, sheet_name: str):
     service = build('sheets', 'v4', credentials=_authenticate(), cache_discovery=None)
 
     # Retrieve the documents contents from the Docs service.

diff --git a/rnaseq_pipeline/miniml_utils.py b/rnaseq_pipeline/miniml_utils.py
@@ -36,9 +36,9 @@ def collect_geo_samples(f):
             continue
         if sample_type.text == 'SRA':
             library_source = x.find('miniml:Library-Source', ns)
-            if library_source is not None and library_source.text == 'transcriptomic':
+            if library_source is not None and library_source.text in ['transcriptomic', 'transcriptomic single cell']:
                 library_strategy = x.find('miniml:Library-Strategy', ns)
-                if library_strategy is not None and library_strategy.text in ['RNA-Seq', 'ssRNA-seq', 'OTHER']:
+                if library_strategy is not None and library_strategy.text in ['RNA-Seq', 'ssRNA-seq', 'scRNA-Seq', 'snRNA-Seq', 'OTHER']:
                     gsm_identifiers.add(gsm_id.text)
 
     return gsm_identifiers