diff --git a/course_files/clustering.Rmd b/course_files/clustering.Rmd index 8f6d05339..d0c0cff2d 100644 --- a/course_files/clustering.Rmd +++ b/course_files/clustering.Rmd @@ -60,7 +60,7 @@ Now we are ready to run `SC3` (we also ask it to calculate biological properties deng <- sc3(deng, ks = 10, biology = TRUE, n_cores = 1) ``` -`SC3` result consists of several different outputs (please look in [@Kiselev2016-bq] and [SC3 vignette](http://bioconductor.org/packages/release/bioc/vignettes/SC3/inst/doc/my-vignette.html) for more details). Here we show some of them: +`SC3` result consists of several different outputs (please look in [@Kiselev2016-bq] and [SC3 vignette](https://bioconductor.org/packages/release/bioc/vignettes/SC3/inst/doc/SC3.html) for more details). Here we show some of them: Consensus matrix: ```{r, fig.height=6} @@ -123,7 +123,7 @@ Now we are going to apply _k_-means clustering algorithm to the cloud of points We will start with $k=8$: ```{r clust-tsne-kmeans2, fig.cap = "tSNE map of the patient data with 8 colored clusters, identified by the k-means clustering algorithm"} -colData(deng)$tSNE_kmeans <- as.character(kmeans(deng@reducedDims$TSNE, centers = 8)$clust) +colData(deng)$tSNE_kmeans <- as.character(kmeans(reducedDim(deng), centers = 8)$clust) plotTSNE(deng, colour_by = "tSNE_kmeans") ``` diff --git a/course_files/dropouts.Rmd b/course_files/dropouts.Rmd index 98ed7ddf3..376323636 100644 --- a/course_files/dropouts.Rmd +++ b/course_files/dropouts.Rmd @@ -61,7 +61,7 @@ expression matrix. This can be extracted from our SingleCellExperiment object us command below. ```{r} -expr_matrix <- M3Drop::M3DropConvertData(deng) +expr_matrix <- M3Drop::M3DropConvertData(counts(deng)) ``` This function is compatible with most single-cell RNA-seq analysis packages including: @@ -272,7 +272,7 @@ far too many genes will be called as significant. Thus we will take the top 1500 by effect size. ```{r, fig.width=8, fig.height=5} -deng_int <- NBumiConvertData(deng) +deng_int <- NBumiConvertData(counts(deng)) DANB_fit <- NBumiFitModel(deng_int) # DANB is fit to the raw count matrix # Perform DANB feature selection DropFS <- NBumiFeatureSelectionCombinedDrop(DANB_fit, method="fdr", qval.thresh=0.01, suppress.plot=FALSE) diff --git a/course_files/exprs-norm-reads.Rmd b/course_files/exprs-norm-reads.Rmd index 085566999..c1b021826 100644 --- a/course_files/exprs-norm-reads.Rmd +++ b/course_files/exprs-norm-reads.Rmd @@ -31,7 +31,7 @@ plotPCA( ``` ```{r norm-pca-cpm-reads, fig.cap = "PCA plot of the tung data after CPM normalisation"} -logcounts(reads.qc) <- log2(calculateCPM(reads.qc, use_size_factors = FALSE) + 1) +logcounts(reads.qc) <- log2(calculateCPM(reads.qc, use_size_factors = NULL) + 1) plotPCA( reads.qc[endog_genes, ], colour_by = "batch", @@ -55,7 +55,7 @@ plotRLE( ```{r norm-pca-lsf-umi, fig.cap = "PCA plot of the tung data after LSF normalisation"} qclust <- quickCluster(reads.qc, min.size = 30) reads.qc <- computeSumFactors(reads.qc, sizes = 15, clusters = qclust) -reads.qc <- normalize(reads.qc) +reads.qc <- logNormCounts(reads.qc) plotPCA( reads.qc[endog_genes, ], colour_by = "batch", diff --git a/course_files/exprs-norm.Rmd b/course_files/exprs-norm.Rmd index db2dde91c..02852bb04 100644 --- a/course_files/exprs-norm.Rmd +++ b/course_files/exprs-norm.Rmd @@ -131,7 +131,7 @@ plotPCA( ### CPM ```{r norm-pca-cpm, fig.cap = "PCA plot of the tung data after CPM normalisation"} -logcounts(umi.qc) <- log2(calculateCPM(umi.qc, use_size_factors = FALSE) + 1) +logcounts(umi.qc) <- log2(calculateCPM(umi.qc, use_size_factors = NULL) + 1) plotPCA( umi.qc[endog_genes, ], colour_by = "batch", @@ -156,7 +156,7 @@ plotRLE( ```{r norm-pca-lsf, fig.cap = "PCA plot of the tung data after LSF normalisation"} qclust <- quickCluster(umi.qc, min.size = 30) umi.qc <- computeSumFactors(umi.qc, sizes = 15, clusters = qclust) -umi.qc <- normalize(umi.qc) +umi.qc <- logNormCounts(umi.qc) plotPCA( umi.qc[endog_genes, ], colour_by = "batch", diff --git a/course_files/exprs-qc-reads.Rmd b/course_files/exprs-qc-reads.Rmd index 774657249..ba9e93207 100644 --- a/course_files/exprs-qc-reads.Rmd +++ b/course_files/exprs-qc-reads.Rmd @@ -38,13 +38,18 @@ reads <- reads[keep_feature, ] ``` ```{r} -isSpike(reads, "ERCC") <- grepl("^ERCC-", rownames(reads)) -isSpike(reads, "MT") <- rownames(reads) %in% - c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", +is.spike=grepl("^ERCC-", rownames(reads)) +reads <- splitAltExps(reads, ifelse(is.spike, "ERCC", "gene")) +altExpNames(reads) + +is.mt <- rownames(reads) %in% + c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", "ENSG00000198886", "ENSG00000212907", "ENSG00000198786", "ENSG00000198695", "ENSG00000198712", "ENSG00000198804", "ENSG00000198763", "ENSG00000228253", "ENSG00000198938", "ENSG00000198840") +reads <- splitAltExps(reads, ifelse(is.mt, "MT", "gene")) +altExpNames(reads) ``` ```{r} @@ -148,7 +153,7 @@ table(reads$outlier) ```{r} plotReducedDim( reads, - use_dimred = "PCA_coldata", + use_dimred = "PCA", size_by = "total_features_by_counts", shape_by = "use", colour_by = "outlier" diff --git a/course_files/exprs-qc.Rmd b/course_files/exprs-qc.Rmd index 8dfcc09e4..a38b0a3c5 100644 --- a/course_files/exprs-qc.Rmd +++ b/course_files/exprs-qc.Rmd @@ -64,13 +64,18 @@ umi <- umi[keep_feature, ] Define control features (genes) - ERCC spike-ins and mitochondrial genes ([provided](http://jdblischak.github.io/singleCellSeq/analysis/qc-filter-ipsc.html) by the authors): ```{r} -isSpike(umi, "ERCC") <- grepl("^ERCC-", rownames(umi)) -isSpike(umi, "MT") <- rownames(umi) %in% - c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", +is.spike <- grepl("^ERCC-", rownames(umi)) +umi <- splitAltExps(umi, ifelse(is.spike, "ERCC", "gene")) +altExpNames(umi) + +is.mt <- rownames(umi) %in% + c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", "ENSG00000198886", "ENSG00000212907", "ENSG00000198786", "ENSG00000198695", "ENSG00000198712", "ENSG00000198804", "ENSG00000198763", "ENSG00000228253", "ENSG00000198938", "ENSG00000198840") +umi <- splitAltExps(umi, ifelse(is.mt, "MT", "gene")) +altExpNames(umi) ``` Calculate the quality metrics: @@ -253,7 +258,7 @@ Then, we can use a PCA plot to see a 2D representation of the cells ordered by t ```{r} plotReducedDim( umi, - use_dimred = "PCA_coldata", + use_dimred = "PCA", size_by = "total_features_by_counts", shape_by = "use", colour_by = "outlier" diff --git a/course_files/figures/barcode_UMI_read1_adapter_fastqc.png b/course_files/figures/barcode_UMI_read1_adapter_fastqc.png new file mode 100644 index 000000000..8e60b3394 Binary files /dev/null and b/course_files/figures/barcode_UMI_read1_adapter_fastqc.png differ diff --git a/course_files/imputation.Rmd b/course_files/imputation.Rmd index 50a3ee650..3415b27a0 100644 --- a/course_files/imputation.Rmd +++ b/course_files/imputation.Rmd @@ -142,7 +142,7 @@ enhances trajectory-like structure in a dataset, in contrast to scImpute and DrImpute which assume a cluster-like structure to the underlying data. ```{r, eval=FALSE} -res <- magic(t(deng@assays[["logcounts"]]), genes="all_genes", k=10, t="auto") +res <- magic(t(deng@assays@data$logcounts), genes="all_genes", k=10, t="auto") ``` ```{r, eval=FALSE} diff --git a/course_files/intro.Rmd b/course_files/intro.Rmd index 751f1e67b..5536006bb 100644 --- a/course_files/intro.Rmd +++ b/course_files/intro.Rmd @@ -26,7 +26,7 @@ opts_chunk$set(fig.align = "center", echo=FALSE) * Allows to study new biological questions in which __cell-specific changes in transcriptome are important__, e.g. cell type identification, heterogeneity of cell responses, stochasticity of gene expression, inference of gene regulatory networks across the cells. * Datasets range __from $10^2$ to $10^6$ cells__ and increase in size every year * Currently there are several different protocols in use, e.g. SMART-seq2 [@Picelli2013-sb], CELL-seq [@Hashimshony2012-kd] and Drop-seq [@Macosko2015-ix] -* There are also commercial platforms available, including the [Fluidigm C1](https://www.fluidigm.com/products/c1-system), [Wafergen ICELL8](https://www.wafergen.com/products/icell8-single-cell-system) and the [10X Genomics Chromium](https://www.10xgenomics.com/single-cell/) +* There are also commercial platforms available, including the [Fluidigm C1](https://www.fluidigm.com/products/c1-system), [Wafergen ICELL8](https://www.wafergen.com/products/icell8-single-cell-system), the [10X Genomics Chromium](https://www.10xgenomics.com/single-cell/) and even a platform developed in Cambridgeshire: [Nadia from Dolomite Bio](https://www.dolomite-bio.com/product/nadia-instrument). * Several computational analysis methods from bulk RNA-seq __can__ be used * __In most cases__ computational analysis requires adaptation of the existing methods or development of new ones @@ -56,6 +56,7 @@ Today, there are also several different platforms available for carrying out one * [Seurat](http://satijalab.org/seurat/) is an R package designed for QC, analysis, and exploration of single cell RNA-seq data. * [ASAP](https://asap.epfl.ch/) (Automated Single-cell Analysis Pipeline) is an interactive web-based platform for single-cell analysis. * [Bioconductor](https://master.bioconductor.org/packages/release/workflows/html/simpleSingleCell.html) is a open-source, open-development software project for the analysis of high-throughput genomics data, including packages for the analysis of single-cell data. +* [dropSeqPipe](https://github.com/Hoohm/dropSeqPipe) is dropSeq pre-processing pipeline based on the popular [snakmake](https://snakemake.readthedocs.io/en/stable/) framework combining some of the above software (co-developed by Sebastian Mueller, one of the instructors). ## Challenges diff --git a/course_files/umis.Rmd b/course_files/umis.Rmd index 537068bab..82ca0f96b 100644 --- a/course_files/umis.Rmd +++ b/course_files/umis.Rmd @@ -38,6 +38,9 @@ After processing the reads from a UMI experiment, the following conventions are knitr::include_graphics("figures/UMI-Seq-reads.png") ``` +```{r intro-umi-fastqc-adapter, out.width = '90%', fig.cap="FastQC adapter content plot of too long Read 1 for a DropSeq sample. Barcode (12bp) and UMI (8bp) make up first 20 bp, the remaining sequence is ininformative poly-T."} +knitr::include_graphics("figures/barcode_UMI_read1_adapter_fastqc.png") +``` ### Counting Barcodes In theory, every unique UMI-transcript pair should represent all reads originating from a single RNA molecule. However, in practice this is frequently not the case and the most common reasons are: