From bf7884b6146fa9d442d5064353eb6279d77339d5 Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Tue, 13 Feb 2024 09:29:54 -0500 Subject: [PATCH 1/9] Snakefiles --- demos/salcher-2022/Snakefile | 36 ++++++++++++++++++++ demos/salcher-2022/config.yml | 2 ++ demos/salcher-2022/src/convert_to_zarr.py | 41 +++++++++++++++++++++++ demos/sikkema-2023/Snakefile | 36 ++++++++++++++++++++ demos/sikkema-2023/config.yml | 2 ++ demos/sikkema-2023/src/convert_to_zarr.py | 41 +++++++++++++++++++++++ 6 files changed, 158 insertions(+) create mode 100644 demos/salcher-2022/Snakefile create mode 100644 demos/salcher-2022/config.yml create mode 100644 demos/salcher-2022/src/convert_to_zarr.py create mode 100644 demos/sikkema-2023/Snakefile create mode 100644 demos/sikkema-2023/config.yml create mode 100644 demos/sikkema-2023/src/convert_to_zarr.py diff --git a/demos/salcher-2022/Snakefile b/demos/salcher-2022/Snakefile new file mode 100644 index 00000000..8b4b783f --- /dev/null +++ b/demos/salcher-2022/Snakefile @@ -0,0 +1,36 @@ +include: "../common.smk" +configfile: "config.yml" + +# May need to get new URLs from https://cellxgene.cziscience.com/collections/edb893ee-4066-4128-9aec-5eb2b03f8287 + +# The single-cell lung cancer atlas (LuCA) -- extended atlas +H5AD_URL = "https://datasets.cellxgene.cziscience.com/6e5e887d-96f7-40af-908c-9b4fc5057ef9.h5ad" + +rule all: + input: + [ (PROCESSED_DIR / f) for f in config['output'] ] + +rule convert_to_zarr: + input: + (RAW_DIR / "6e5e887d-96f7-40af-908c-9b4fc5057ef9.h5ad") + output: + directory(PROCESSED_DIR / "salcher_2022_extended.h5ad.zarr") + params: + script=(SRC_DIR / "convert_to_zarr.py") + shell: + ''' + python {params.script} \ + -i {input} \ + -o {output} + ''' + +# Download raw h5ad file. +rule download_adata: + output: + (RAW_DIR / "6e5e887d-96f7-40af-908c-9b4fc5057ef9.h5ad") + params: + file_url=H5AD_URL + shell: + ''' + curl -L --retry 999 --retry-delay 3 -C - -o {output} "{params.file_url}" + ''' diff --git a/demos/salcher-2022/config.yml b/demos/salcher-2022/config.yml new file mode 100644 index 00000000..674fa553 --- /dev/null +++ b/demos/salcher-2022/config.yml @@ -0,0 +1,2 @@ +output: +- salcher_2022_extended.h5ad.zarr \ No newline at end of file diff --git a/demos/salcher-2022/src/convert_to_zarr.py b/demos/salcher-2022/src/convert_to_zarr.py new file mode 100644 index 00000000..10c82425 --- /dev/null +++ b/demos/salcher-2022/src/convert_to_zarr.py @@ -0,0 +1,41 @@ +import argparse +from anndata import read_h5ad +from scipy import sparse +from vitessce.data_utils import ( + to_uint8, +) + + +def convert_h5ad_to_zarr(input_path, output_path): + adata = read_h5ad(input_path) + + adata.layers['X_uint8'] = to_uint8(adata.X, norm_along="global") + + # Vitessce plays nicely with csc matrices + # TODO: automate conversion to csc in optimize_adata function + if isinstance(adata.X, sparse.spmatrix): + adata.X = adata.X.tocsc() + adata.write_zarr(output_path, chunks=[adata.shape[0], 10]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '-i', + '--input', + type=str, + required=True, + help='Input H5AD file' + ) + parser.add_argument( + '-o', + '--output', + type=str, + required=True, + help='Output Zarr store' + ) + args = parser.parse_args() + convert_h5ad_to_zarr( + args.input, + args.output + ) diff --git a/demos/sikkema-2023/Snakefile b/demos/sikkema-2023/Snakefile new file mode 100644 index 00000000..c3551bbf --- /dev/null +++ b/demos/sikkema-2023/Snakefile @@ -0,0 +1,36 @@ +include: "../common.smk" +configfile: "config.yml" + +# May need to get new URLs from https://cellxgene.cziscience.com/collections/edb893ee-4066-4128-9aec-5eb2b03f8287 + +# The single-cell lung cancer atlas (LuCA) -- extended atlas +H5AD_URL = "https://datasets.cellxgene.cziscience.com/3ab47484-a3eb-4f6a-beea-670e1a8fc1e8.h5ad" + +rule all: + input: + [ (PROCESSED_DIR / f) for f in config['output'] ] + +rule convert_to_zarr: + input: + (RAW_DIR / "3ab47484-a3eb-4f6a-beea-670e1a8fc1e8.h5ad") + output: + directory(PROCESSED_DIR / "sikkema_2023_full.h5ad.zarr") + params: + script=(SRC_DIR / "convert_to_zarr.py") + shell: + ''' + python {params.script} \ + -i {input} \ + -o {output} + ''' + +# Download raw h5ad file. +rule download_adata: + output: + (RAW_DIR / "3ab47484-a3eb-4f6a-beea-670e1a8fc1e8.h5ad") + params: + file_url=H5AD_URL + shell: + ''' + curl -L --retry 999 --retry-delay 3 -C - -o {output} "{params.file_url}" + ''' diff --git a/demos/sikkema-2023/config.yml b/demos/sikkema-2023/config.yml new file mode 100644 index 00000000..baeb72df --- /dev/null +++ b/demos/sikkema-2023/config.yml @@ -0,0 +1,2 @@ +output: +- sikkema_2023_full.h5ad.zarr \ No newline at end of file diff --git a/demos/sikkema-2023/src/convert_to_zarr.py b/demos/sikkema-2023/src/convert_to_zarr.py new file mode 100644 index 00000000..10c82425 --- /dev/null +++ b/demos/sikkema-2023/src/convert_to_zarr.py @@ -0,0 +1,41 @@ +import argparse +from anndata import read_h5ad +from scipy import sparse +from vitessce.data_utils import ( + to_uint8, +) + + +def convert_h5ad_to_zarr(input_path, output_path): + adata = read_h5ad(input_path) + + adata.layers['X_uint8'] = to_uint8(adata.X, norm_along="global") + + # Vitessce plays nicely with csc matrices + # TODO: automate conversion to csc in optimize_adata function + if isinstance(adata.X, sparse.spmatrix): + adata.X = adata.X.tocsc() + adata.write_zarr(output_path, chunks=[adata.shape[0], 10]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '-i', + '--input', + type=str, + required=True, + help='Input H5AD file' + ) + parser.add_argument( + '-o', + '--output', + type=str, + required=True, + help='Output Zarr store' + ) + args = parser.parse_args() + convert_h5ad_to_zarr( + args.input, + args.output + ) From c62616371b4dff56258cfc2d4598e784ca5e59b5 Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Tue, 13 Feb 2024 12:40:00 -0500 Subject: [PATCH 2/9] O2 scratch space --- demos/common.smk | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/demos/common.smk b/demos/common.smk index 5929ad96..1ff89474 100644 --- a/demos/common.smk +++ b/demos/common.smk @@ -1,8 +1,17 @@ +import platform from pathlib import Path +import os + +# Check if this is running on O2 +IS_O2 = (platform.system() == "Linux") + +if IS_O2: + O2_USER = os.environ["USER"] + O2_SCRATCH_DIR = f"/n/scratch/users/{O2_USER[0]}/{O2_USER}/vitessce-python/demos" # Directory / file constants SRC_DIR = Path("src") -DATA_DIR = Path("data") +DATA_DIR = Path("data" if not IS_O2 else O2_SCRATCH_DIR) RAW_DIR = DATA_DIR / "raw" PROCESSED_DIR = DATA_DIR / "processed" From c76da1cebf82ba58a83fd522af43848d51fe8208 Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Thu, 15 Feb 2024 15:59:02 -0500 Subject: [PATCH 3/9] Update --- demos/salcher-2022/src/convert_to_zarr.py | 2 +- demos/sikkema-2023/src/convert_to_zarr.py | 2 +- demos/toy/Snakefile-2 | 89 +++++++++++++++++++++ demos/toy/Untitled.ipynb | 96 +++++++++++++++++++++++ demos/toy/normalize_data.py | 19 +++++ 5 files changed, 206 insertions(+), 2 deletions(-) create mode 100644 demos/toy/Snakefile-2 create mode 100644 demos/toy/Untitled.ipynb create mode 100644 demos/toy/normalize_data.py diff --git a/demos/salcher-2022/src/convert_to_zarr.py b/demos/salcher-2022/src/convert_to_zarr.py index 10c82425..aba2a980 100644 --- a/demos/salcher-2022/src/convert_to_zarr.py +++ b/demos/salcher-2022/src/convert_to_zarr.py @@ -9,7 +9,7 @@ def convert_h5ad_to_zarr(input_path, output_path): adata = read_h5ad(input_path) - adata.layers['X_uint8'] = to_uint8(adata.X, norm_along="global") + adata.layers['X_uint8'] = to_uint8(adata.X, norm_along="var") # Vitessce plays nicely with csc matrices # TODO: automate conversion to csc in optimize_adata function diff --git a/demos/sikkema-2023/src/convert_to_zarr.py b/demos/sikkema-2023/src/convert_to_zarr.py index 10c82425..aba2a980 100644 --- a/demos/sikkema-2023/src/convert_to_zarr.py +++ b/demos/sikkema-2023/src/convert_to_zarr.py @@ -9,7 +9,7 @@ def convert_h5ad_to_zarr(input_path, output_path): adata = read_h5ad(input_path) - adata.layers['X_uint8'] = to_uint8(adata.X, norm_along="global") + adata.layers['X_uint8'] = to_uint8(adata.X, norm_along="var") # Vitessce plays nicely with csc matrices # TODO: automate conversion to csc in optimize_adata function diff --git a/demos/toy/Snakefile-2 b/demos/toy/Snakefile-2 new file mode 100644 index 00000000..8b06276f --- /dev/null +++ b/demos/toy/Snakefile-2 @@ -0,0 +1,89 @@ +include: "../common.smk" +configfile: "config.yml" + +# TODO: start from the real raw files +BASE_URL = "https://s3.embl.de/spatialdata/spatialdata-sandbox/{dataset}.zip" + + +rule all: + input: + [ (PROCESSED_DIR / f) for f in config['output'] ] + + +wildcard_constraints: + dataset="\w+" # a-z 0-9 + +# Normalize the matrix +rule normalize_matrix: + input: + PROCESSED_DIR / "{dataset}.zarr" + output: + directory(PROCESSED_DIR / "normalized" / "{dataset}.zarr"), + PROCESSED_DIR / "heatmap_before_{dataset}.png", + PROCESSED_DIR / "heatmap_after_{dataset}.png" + run: + import os + import scanpy as sc + import spatialdata as sd + import matplotlib.pyplot as plt + + #toy_folder_path = 'C:/Users/darkr/Projects/vitessce-python/demos/toy/data/processed' + #zarr_files = [os.path.join(toy_folder_path, f) for f in os.listdir(toy_folder_path) if f.endswith('.zarr')] + #print(f"Found {len(zarr_files)} zarr files in the 'toy' directory.") + + def process_and_plot_zarr(zarr_path, before_path, after_path): + # Read zarr file + print(f"Processed {zarr_path}.") + sdata = sd.read_zarr(zarr_path) + adata = sdata.table + #print(sdata.__dict__) + #print(adata) + sc.pp.normalize_total(adata, target_sum=1e6, inplace=True)\ + + + #zarr_basename = os.path.splitext(os.path.basename(zarr_path))[0] + + with plt.rc_context(): + # Initial heatmap before normalization + sc.pl.heatmap(adata, var_names=adata.var_names, groupby='fov', use_raw=False, show=False) + plt.savefig(before_path) + + + # Normalize + sc.pp.normalize_total(adata, target_sum=1e4) + sc.pp.log1p(adata) + + + # Heatmap after normalization + sc.pl.heatmap(adata, var_names=adata.var_names, groupby='fov', use_raw=False, show=False) + plt.savefig(after_path) + + + print(f"Processed {zarr_path}.") + # sdata.table = adata + sdata.write(output[0]) + process_and_plot_zarr(input[0], output[1], output[2]) + + +# Unzip the downloaded zip files +rule unzip_file: + input: + (RAW_DIR / "{dataset}.zip") + output: + directory(PROCESSED_DIR / "{dataset}.zarr") + shell: + """ + unzip {input} -d data/processed &&\ + mv data/processed/data.zarr data/processed/{wildcards.dataset}.zarr + """ + +# Download visium .zip file containing single-cell data. +rule download_data: + output: + (RAW_DIR / "{dataset}.zip") + params: + file_url=BASE_URL + shell: + ''' + curl -L -o {output} {params.file_url} + ''' diff --git a/demos/toy/Untitled.ipynb b/demos/toy/Untitled.ipynb new file mode 100644 index 00000000..7c866506 --- /dev/null +++ b/demos/toy/Untitled.ipynb @@ -0,0 +1,96 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "24ec3c29-9a1e-4a97-8584-8c6f6f29a9a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/mkeller/research/dbmi/vitessce/vitessce-python/demos/toy\n" + ] + } + ], + "source": [ + "!pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "65a58c0a-96c7-40a8-bfba-0d40244817e2", + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'BaseStore' from 'zarr.storage' (/Users/mkeller/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/zarr/storage.py)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msd\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mscanpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msc\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# Read in the data as a SpatialData object\u001b[39;00m\n", + "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/spatialdata/__init__.py:30\u001b[0m\n\u001b[1;32m 9\u001b[0m _check_geopandas_using_shapely()\n\u001b[1;32m 12\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransformations\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msave_transformations\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 28\u001b[0m ]\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dataloader, models, transformations\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconcatenate\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m concatenate\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01moperations\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01maggregate\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m aggregate\n", + "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/spatialdata/models/__init__.py:3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m__future__\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m annotations\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 4\u001b[0m C,\n\u001b[1;32m 5\u001b[0m SpatialElement,\n\u001b[1;32m 6\u001b[0m X,\n\u001b[1;32m 7\u001b[0m Y,\n\u001b[1;32m 8\u001b[0m Z,\n\u001b[1;32m 9\u001b[0m get_axes_names,\n\u001b[1;32m 10\u001b[0m get_spatial_axes,\n\u001b[1;32m 11\u001b[0m points_dask_dataframe_to_geopandas,\n\u001b[1;32m 12\u001b[0m points_geopandas_to_dask_dataframe,\n\u001b[1;32m 13\u001b[0m validate_axes,\n\u001b[1;32m 14\u001b[0m validate_axis_name,\n\u001b[1;32m 15\u001b[0m )\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 17\u001b[0m Image2DModel,\n\u001b[1;32m 18\u001b[0m Image3DModel,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 24\u001b[0m get_model,\n\u001b[1;32m 25\u001b[0m )\n\u001b[1;32m 27\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLabels2DModel\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 29\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLabels3DModel\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpoints_dask_dataframe_to_geopandas\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 47\u001b[0m ]\n", + "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/spatialdata/models/_utils.py:11\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataframe\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataFrame \u001b[38;5;28;01mas\u001b[39;00m DaskDataFrame\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgeopandas\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GeoDataFrame\n\u001b[0;32m---> 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmultiscale_spatial_image\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MultiscaleSpatialImage\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatial_image\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SpatialImage\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_logging\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m logger\n", + "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/multiscale_spatial_image/__init__.py:15\u001b[0m\n\u001b[1;32m 6\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMultiscaleSpatialImage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMethods\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__version__\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 12\u001b[0m ]\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m__about__\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmultiscale_spatial_image\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MultiscaleSpatialImage\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mto_multiscale\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Methods, to_multiscale, itk_image_to_multiscale\n", + "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/multiscale_spatial_image/multiscale_spatial_image.py:9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcollections\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mabc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MutableMapping\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpathlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Path\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mzarr\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstorage\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseStore\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mMultiscaleSpatialImage\u001b[39;00m(DataTree):\n\u001b[1;32m 13\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"A multi-scale representation of a spatial image.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \n\u001b[1;32m 15\u001b[0m \u001b[38;5;124;03m This is an xarray DataTree, with content compatible with the Open Microscopy Environment-\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;124;03m scale2\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'BaseStore' from 'zarr.storage' (/Users/mkeller/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/zarr/storage.py)" + ] + } + ], + "source": [ + "import spatialdata as sd\n", + "import scanpy as sc\n", + "\n", + "# Read in the data as a SpatialData object\n", + "sdata_orig = sd.read_zarr(\"data/processed/original.toy.zarr\")\n", + "#sdata = sd.read_zarr(\"data/processed/normalized.toy.zarr\")\n", + "\n", + "# Get the AnnData object that the SpatialData object contains\n", + "adata = sdata.table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "923166f3-4315-46f2-8c28-9992b5bcfe62", + "metadata": {}, + "outputs": [], + "source": [ + "sc.pl.heatmap(adata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a71752e-341d-4128-973c-3a7c4696e848", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demos/toy/normalize_data.py b/demos/toy/normalize_data.py new file mode 100644 index 00000000..c6ed9f5e --- /dev/null +++ b/demos/toy/normalize_data.py @@ -0,0 +1,19 @@ +import spatialdata as sd +import scanpy as sc + +# Read in the data as a SpatialData object +sdata = sd.read_zarr(snakemake.input[0]) + +# Get the AnnData object that the SpatialData object contains +adata = sdata.table + +# The ScanPy package is compatible with AnnData objects +# References: +# - https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html +# - https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.normalize_total.html +sc.pp.normalize_total(adata, target_sum=1e6, inplace=True) + +sdata.table = adata + +# Write the normalized data back to the SpatialData object +sdata.write_zarr(snakemake.output[0]) From 95548355da4b88149a6861bba6afb6180becf138 Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Thu, 15 Feb 2024 16:05:37 -0500 Subject: [PATCH 4/9] Delete --- demos/toy/Snakefile-2 | 89 ---------------------------------- demos/toy/Untitled.ipynb | 96 ------------------------------------- demos/toy/normalize_data.py | 19 -------- 3 files changed, 204 deletions(-) delete mode 100644 demos/toy/Snakefile-2 delete mode 100644 demos/toy/Untitled.ipynb delete mode 100644 demos/toy/normalize_data.py diff --git a/demos/toy/Snakefile-2 b/demos/toy/Snakefile-2 deleted file mode 100644 index 8b06276f..00000000 --- a/demos/toy/Snakefile-2 +++ /dev/null @@ -1,89 +0,0 @@ -include: "../common.smk" -configfile: "config.yml" - -# TODO: start from the real raw files -BASE_URL = "https://s3.embl.de/spatialdata/spatialdata-sandbox/{dataset}.zip" - - -rule all: - input: - [ (PROCESSED_DIR / f) for f in config['output'] ] - - -wildcard_constraints: - dataset="\w+" # a-z 0-9 - -# Normalize the matrix -rule normalize_matrix: - input: - PROCESSED_DIR / "{dataset}.zarr" - output: - directory(PROCESSED_DIR / "normalized" / "{dataset}.zarr"), - PROCESSED_DIR / "heatmap_before_{dataset}.png", - PROCESSED_DIR / "heatmap_after_{dataset}.png" - run: - import os - import scanpy as sc - import spatialdata as sd - import matplotlib.pyplot as plt - - #toy_folder_path = 'C:/Users/darkr/Projects/vitessce-python/demos/toy/data/processed' - #zarr_files = [os.path.join(toy_folder_path, f) for f in os.listdir(toy_folder_path) if f.endswith('.zarr')] - #print(f"Found {len(zarr_files)} zarr files in the 'toy' directory.") - - def process_and_plot_zarr(zarr_path, before_path, after_path): - # Read zarr file - print(f"Processed {zarr_path}.") - sdata = sd.read_zarr(zarr_path) - adata = sdata.table - #print(sdata.__dict__) - #print(adata) - sc.pp.normalize_total(adata, target_sum=1e6, inplace=True)\ - - - #zarr_basename = os.path.splitext(os.path.basename(zarr_path))[0] - - with plt.rc_context(): - # Initial heatmap before normalization - sc.pl.heatmap(adata, var_names=adata.var_names, groupby='fov', use_raw=False, show=False) - plt.savefig(before_path) - - - # Normalize - sc.pp.normalize_total(adata, target_sum=1e4) - sc.pp.log1p(adata) - - - # Heatmap after normalization - sc.pl.heatmap(adata, var_names=adata.var_names, groupby='fov', use_raw=False, show=False) - plt.savefig(after_path) - - - print(f"Processed {zarr_path}.") - # sdata.table = adata - sdata.write(output[0]) - process_and_plot_zarr(input[0], output[1], output[2]) - - -# Unzip the downloaded zip files -rule unzip_file: - input: - (RAW_DIR / "{dataset}.zip") - output: - directory(PROCESSED_DIR / "{dataset}.zarr") - shell: - """ - unzip {input} -d data/processed &&\ - mv data/processed/data.zarr data/processed/{wildcards.dataset}.zarr - """ - -# Download visium .zip file containing single-cell data. -rule download_data: - output: - (RAW_DIR / "{dataset}.zip") - params: - file_url=BASE_URL - shell: - ''' - curl -L -o {output} {params.file_url} - ''' diff --git a/demos/toy/Untitled.ipynb b/demos/toy/Untitled.ipynb deleted file mode 100644 index 7c866506..00000000 --- a/demos/toy/Untitled.ipynb +++ /dev/null @@ -1,96 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "24ec3c29-9a1e-4a97-8584-8c6f6f29a9a8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/Users/mkeller/research/dbmi/vitessce/vitessce-python/demos/toy\n" - ] - } - ], - "source": [ - "!pwd" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "65a58c0a-96c7-40a8-bfba-0d40244817e2", - "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "cannot import name 'BaseStore' from 'zarr.storage' (/Users/mkeller/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/zarr/storage.py)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msd\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mscanpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01msc\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# Read in the data as a SpatialData object\u001b[39;00m\n", - "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/spatialdata/__init__.py:30\u001b[0m\n\u001b[1;32m 9\u001b[0m _check_geopandas_using_shapely()\n\u001b[1;32m 12\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 13\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodels\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 14\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtransformations\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msave_transformations\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 28\u001b[0m ]\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m dataloader, models, transformations\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconcatenate\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m concatenate\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_core\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01moperations\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01maggregate\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m aggregate\n", - "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/spatialdata/models/__init__.py:3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m__future__\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m annotations\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 4\u001b[0m C,\n\u001b[1;32m 5\u001b[0m SpatialElement,\n\u001b[1;32m 6\u001b[0m X,\n\u001b[1;32m 7\u001b[0m Y,\n\u001b[1;32m 8\u001b[0m Z,\n\u001b[1;32m 9\u001b[0m get_axes_names,\n\u001b[1;32m 10\u001b[0m get_spatial_axes,\n\u001b[1;32m 11\u001b[0m points_dask_dataframe_to_geopandas,\n\u001b[1;32m 12\u001b[0m points_geopandas_to_dask_dataframe,\n\u001b[1;32m 13\u001b[0m validate_axes,\n\u001b[1;32m 14\u001b[0m validate_axis_name,\n\u001b[1;32m 15\u001b[0m )\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 17\u001b[0m Image2DModel,\n\u001b[1;32m 18\u001b[0m Image3DModel,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 24\u001b[0m get_model,\n\u001b[1;32m 25\u001b[0m )\n\u001b[1;32m 27\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 28\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLabels2DModel\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 29\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLabels3DModel\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpoints_dask_dataframe_to_geopandas\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 47\u001b[0m ]\n", - "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/spatialdata/models/_utils.py:11\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataframe\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DataFrame \u001b[38;5;28;01mas\u001b[39;00m DaskDataFrame\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgeopandas\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GeoDataFrame\n\u001b[0;32m---> 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmultiscale_spatial_image\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MultiscaleSpatialImage\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatial_image\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SpatialImage\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspatialdata\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_logging\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m logger\n", - "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/multiscale_spatial_image/__init__.py:15\u001b[0m\n\u001b[1;32m 6\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMultiscaleSpatialImage\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMethods\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__version__\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 12\u001b[0m ]\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m__about__\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmultiscale_spatial_image\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MultiscaleSpatialImage\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mto_multiscale\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Methods, to_multiscale, itk_image_to_multiscale\n", - "File \u001b[0;32m~/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/multiscale_spatial_image/multiscale_spatial_image.py:9\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcollections\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mabc\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MutableMapping\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpathlib\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Path\n\u001b[0;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mzarr\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstorage\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseStore\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mMultiscaleSpatialImage\u001b[39;00m(DataTree):\n\u001b[1;32m 13\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"A multi-scale representation of a spatial image.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \n\u001b[1;32m 15\u001b[0m \u001b[38;5;124;03m This is an xarray DataTree, with content compatible with the Open Microscopy Environment-\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;124;03m scale2\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'BaseStore' from 'zarr.storage' (/Users/mkeller/software/miniconda3/envs/vitessce-python-notebooks/lib/python3.9/site-packages/zarr/storage.py)" - ] - } - ], - "source": [ - "import spatialdata as sd\n", - "import scanpy as sc\n", - "\n", - "# Read in the data as a SpatialData object\n", - "sdata_orig = sd.read_zarr(\"data/processed/original.toy.zarr\")\n", - "#sdata = sd.read_zarr(\"data/processed/normalized.toy.zarr\")\n", - "\n", - "# Get the AnnData object that the SpatialData object contains\n", - "adata = sdata.table" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "923166f3-4315-46f2-8c28-9992b5bcfe62", - "metadata": {}, - "outputs": [], - "source": [ - "sc.pl.heatmap(adata)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4a71752e-341d-4128-973c-3a7c4696e848", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.0" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/demos/toy/normalize_data.py b/demos/toy/normalize_data.py deleted file mode 100644 index c6ed9f5e..00000000 --- a/demos/toy/normalize_data.py +++ /dev/null @@ -1,19 +0,0 @@ -import spatialdata as sd -import scanpy as sc - -# Read in the data as a SpatialData object -sdata = sd.read_zarr(snakemake.input[0]) - -# Get the AnnData object that the SpatialData object contains -adata = sdata.table - -# The ScanPy package is compatible with AnnData objects -# References: -# - https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html -# - https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.normalize_total.html -sc.pp.normalize_total(adata, target_sum=1e6, inplace=True) - -sdata.table = adata - -# Write the normalized data back to the SpatialData object -sdata.write_zarr(snakemake.output[0]) From d25ea5664c2a765fdf8ee3858f4b81c4f3fc3fa2 Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Fri, 16 Feb 2024 11:12:15 -0500 Subject: [PATCH 5/9] Dask not working --- demos/salcher-2022/src/convert_to_zarr.py | 60 ++++++++++++++++++----- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/demos/salcher-2022/src/convert_to_zarr.py b/demos/salcher-2022/src/convert_to_zarr.py index aba2a980..22b621c5 100644 --- a/demos/salcher-2022/src/convert_to_zarr.py +++ b/demos/salcher-2022/src/convert_to_zarr.py @@ -1,24 +1,42 @@ import argparse from anndata import read_h5ad -from scipy import sparse -from vitessce.data_utils import ( - to_uint8, -) +import scipy +import numpy as np +import pandas as pd +import sparse +import dask +import dask.array as da +from dask.distributed import Client, LocalCluster, progress +import platform +import os - -def convert_h5ad_to_zarr(input_path, output_path): +def convert_h5ad_to_zarr(input_path, output_path, client): adata = read_h5ad(input_path) - adata.layers['X_uint8'] = to_uint8(adata.X, norm_along="var") + # Clear X so that we can write it ourselves manually + X = adata.X #.copy() + + adata.X = None + adata.write_zarr(output_path) + + assert isinstance(X, scipy.sparse.spmatrix) - # Vitessce plays nicely with csc matrices - # TODO: automate conversion to csc in optimize_adata function - if isinstance(adata.X, sparse.spmatrix): - adata.X = adata.X.tocsc() - adata.write_zarr(output_path, chunks=[adata.shape[0], 10]) + # Use dask to write the X matrix as a dense matrix + X_sparse = sparse.GCXS.from_scipy_sparse(X) + print("X_sparse") + X_dask = da.from_array(X_sparse, chunks=(3000, 1)) + print("X_dask") + X_dask = X_dask.map_blocks(lambda block: block.todense()) + print("map_blocks") + X_delayed = X_dask.to_zarr(url=output_path, component="/X", overwrite=True, compute=False) + print("to_zarr") + X_future = client.persist(X_delayed) + + progress(X_future) if __name__ == '__main__': + # Argparse parser = argparse.ArgumentParser() parser.add_argument( '-i', @@ -35,7 +53,23 @@ def convert_h5ad_to_zarr(input_path, output_path): help='Output Zarr store' ) args = parser.parse_args() + + # Dask + # Check if this is running on O2 + IS_O2 = (platform.system() == "Linux") + + if IS_O2: + O2_USER = os.environ["USER"] + DASK_TEMP_DIR = f"/n/scratch/users/{O2_USER[0]}/{O2_USER}/vitessce-python-temp" + os.makedirs(DASK_TEMP_DIR, exist_ok=True) + dask.config.set({ "temporary_directory": DASK_TEMP_DIR }) + + # Should request at least 96GB of memory for this job. + cluster = LocalCluster(n_workers=2, threads_per_worker=2, memory_limit='4GB') + client = Client(cluster) + convert_h5ad_to_zarr( args.input, - args.output + args.output, + client, ) From 4830fd8385d7843049ec0fdec4df3c8a02a8af67 Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Fri, 16 Feb 2024 16:11:50 -0500 Subject: [PATCH 6/9] Update --- demos/salcher-2022/src/convert_to_zarr.py | 53 +++++++++++------------ 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/demos/salcher-2022/src/convert_to_zarr.py b/demos/salcher-2022/src/convert_to_zarr.py index 22b621c5..1ad7b0dc 100644 --- a/demos/salcher-2022/src/convert_to_zarr.py +++ b/demos/salcher-2022/src/convert_to_zarr.py @@ -9,8 +9,10 @@ from dask.distributed import Client, LocalCluster, progress import platform import os +import zarr +import math -def convert_h5ad_to_zarr(input_path, output_path, client): +def convert_h5ad_to_zarr(input_path, output_path): adata = read_h5ad(input_path) # Clear X so that we can write it ourselves manually @@ -21,19 +23,31 @@ def convert_h5ad_to_zarr(input_path, output_path, client): assert isinstance(X, scipy.sparse.spmatrix) - # Use dask to write the X matrix as a dense matrix - X_sparse = sparse.GCXS.from_scipy_sparse(X) - print("X_sparse") - X_dask = da.from_array(X_sparse, chunks=(3000, 1)) - print("X_dask") - X_dask = X_dask.map_blocks(lambda block: block.todense()) - print("map_blocks") - X_delayed = X_dask.to_zarr(url=output_path, component="/X", overwrite=True, compute=False) - print("to_zarr") - X_future = client.persist(X_delayed) + print(output_path) - progress(X_future) + store = zarr.DirectoryStore(output_path) + z = zarr.zeros(shape=X.shape, chunks=(X.shape[0], 10), dtype=X.dtype, store = store, path = "/X", overwrite=True) + chunk_shape = (10000, 10000) + x_chunks = math.ceil(X.shape[0] / chunk_shape[0]) + y_chunks = math.ceil(X.shape[1] / chunk_shape[1]) + + + for i in range(x_chunks): + for j in range(y_chunks): + x_start = i * chunk_shape[0] + x_end = min((i + 1) * chunk_shape[0], X.shape[0]) + y_start = j * chunk_shape[1] + y_end = min((j + 1) * chunk_shape[1], X.shape[1]) + + X_chunk = X[x_start:x_end, y_start:y_end].tocoo(copy=False) + z.set_coordinate_selection( + # Add x_start and y_start as offsets to the row/chunk coordinates + ([cx+x_start for cx in X_chunk.row], [cy+y_start for cy in X_chunk.col]), + X_chunk.data + ) + + print("done") if __name__ == '__main__': # Argparse @@ -54,22 +68,7 @@ def convert_h5ad_to_zarr(input_path, output_path, client): ) args = parser.parse_args() - # Dask - # Check if this is running on O2 - IS_O2 = (platform.system() == "Linux") - - if IS_O2: - O2_USER = os.environ["USER"] - DASK_TEMP_DIR = f"/n/scratch/users/{O2_USER[0]}/{O2_USER}/vitessce-python-temp" - os.makedirs(DASK_TEMP_DIR, exist_ok=True) - dask.config.set({ "temporary_directory": DASK_TEMP_DIR }) - - # Should request at least 96GB of memory for this job. - cluster = LocalCluster(n_workers=2, threads_per_worker=2, memory_limit='4GB') - client = Client(cluster) - convert_h5ad_to_zarr( args.input, args.output, - client, ) From 3176305b691a347534d2f9e01ef36c62f3f922ce Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Mon, 19 Feb 2024 09:30:38 -0500 Subject: [PATCH 7/9] Remove unused imports --- demos/salcher-2022/src/convert_to_zarr.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/demos/salcher-2022/src/convert_to_zarr.py b/demos/salcher-2022/src/convert_to_zarr.py index 1ad7b0dc..37be86e6 100644 --- a/demos/salcher-2022/src/convert_to_zarr.py +++ b/demos/salcher-2022/src/convert_to_zarr.py @@ -3,10 +3,6 @@ import scipy import numpy as np import pandas as pd -import sparse -import dask -import dask.array as da -from dask.distributed import Client, LocalCluster, progress import platform import os import zarr From 07a7797968b1e684012b46b4e84b019c2c9de028 Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Mon, 1 Apr 2024 17:24:02 -0400 Subject: [PATCH 8/9] Add spatialdata demos --- demos/spatialdata-2024/Snakefile | 35 +++++++++++++++++++++++++++++++ demos/spatialdata-2024/config.yml | 10 +++++++++ 2 files changed, 45 insertions(+) create mode 100644 demos/spatialdata-2024/Snakefile create mode 100644 demos/spatialdata-2024/config.yml diff --git a/demos/spatialdata-2024/Snakefile b/demos/spatialdata-2024/Snakefile new file mode 100644 index 00000000..d21fa464 --- /dev/null +++ b/demos/spatialdata-2024/Snakefile @@ -0,0 +1,35 @@ + +include: "../common.smk" +configfile: "config.yml" + +BASE_URL = "https://s3.embl.de/spatialdata/spatialdata-sandbox/{dataset}.zip" + + +rule all: + input: + [ (PROCESSED_DIR / f) for f in config['output'] ] + + +# Unzip the downloaded zip files +rule unzip_file: + input: + (RAW_DIR / "{dataset}.zip") + output: + directory(PROCESSED_DIR / "{dataset}.zarr") + shell: + """ + mkdir -p data/{wildcards.dataset} &&\ + unzip {input} -d data/{wildcards.dataset} &&\ + mv data/{wildcards.dataset}/data.zarr data/processed/{wildcards.dataset}.zarr + """ + +# Download visium .zip file containing single-cell data. +rule download_data: + output: + (RAW_DIR / "{dataset}.zip") + params: + file_url=BASE_URL + shell: + ''' + curl -L -o {output} {params.file_url} + ''' \ No newline at end of file diff --git a/demos/spatialdata-2024/config.yml b/demos/spatialdata-2024/config.yml new file mode 100644 index 00000000..4f75ebc2 --- /dev/null +++ b/demos/spatialdata-2024/config.yml @@ -0,0 +1,10 @@ +output: +- toy.zarr +- visium_hd_3.0.0_io.zarr +- visium_associated_xenium_io.zarr +- xenium_rep1_io.zarr +- xenium_rep2_io.zarr +- mcmicro_io.zarr +- merfish.zarr +- mibitof.zarr +- steinbock_io.zarr \ No newline at end of file From f29ee99ba40fa887a7295ed8de53d75523d404b8 Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Mon, 1 Apr 2024 17:32:41 -0400 Subject: [PATCH 9/9] Use constants --- demos/spatialdata-2024/Snakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/demos/spatialdata-2024/Snakefile b/demos/spatialdata-2024/Snakefile index d21fa464..7f4d9a16 100644 --- a/demos/spatialdata-2024/Snakefile +++ b/demos/spatialdata-2024/Snakefile @@ -18,9 +18,9 @@ rule unzip_file: directory(PROCESSED_DIR / "{dataset}.zarr") shell: """ - mkdir -p data/{wildcards.dataset} &&\ - unzip {input} -d data/{wildcards.dataset} &&\ - mv data/{wildcards.dataset}/data.zarr data/processed/{wildcards.dataset}.zarr + mkdir -p {RAW_DIR}/{wildcards.dataset} &&\ + unzip {input} -d {RAW_DIR}/{wildcards.dataset} &&\ + mv {RAW_DIR}/{wildcards.dataset}/data.zarr {PROCESSED_DIR}/{wildcards.dataset}.zarr """ # Download visium .zip file containing single-cell data.