bioinformatics-core-shared-training
diff --git a/‎Markdowns/10_MultiSplComp.Rmd‎
Lines changed: 108 additions & 183 deletions b/‎Markdowns/10_MultiSplComp.Rmd‎
Lines changed: 108 additions & 183 deletions
diff --git a/‎Markdowns/10_MultiSplComp.html‎
Lines changed: 2349 additions & 569 deletions b/‎Markdowns/10_MultiSplComp.html‎
Lines changed: 2349 additions & 569 deletions
diff --git a/‎Markdowns/10_MultiSplComp_Abbi.Rmd‎
Lines changed: 0 additions & 472 deletions b/‎Markdowns/10_MultiSplComp_Abbi.Rmd‎
Lines changed: 0 additions & 472 deletions
diff --git a/‎Markdowns/10_MultiSplComp_exercise1.Rmd‎
Lines changed: 4 additions & 136 deletions b/‎Markdowns/10_MultiSplComp_exercise1.Rmd‎
Lines changed: 4 additions & 136 deletions
diff --git a/‎Markdowns/10_MultiSplComp_exercise1.html‎
Lines changed: 1811 additions & 0 deletions b/‎Markdowns/10_MultiSplComp_exercise1.html‎
Lines changed: 1811 additions & 0 deletions
diff --git a/‎Markdowns/10_MultiSplComp_exercise1.nb.html‎
Lines changed: 3067 additions & 0 deletions b/‎Markdowns/10_MultiSplComp_exercise1.nb.html‎
Lines changed: 3067 additions & 0 deletions
diff --git a/‎Markdowns/10_MultiSplComp_exercise1_solutions.Rmd‎
Lines changed: 8 additions & 140 deletions b/‎Markdowns/10_MultiSplComp_exercise1_solutions.Rmd‎
Lines changed: 8 additions & 140 deletions
diff --git a/‎Markdowns/10_MultiSplComp_exercise1_solutions.html‎
Lines changed: 1567 additions & 174 deletions b/‎Markdowns/10_MultiSplComp_exercise1_solutions.html‎
Lines changed: 1567 additions & 174 deletions
diff --git a/‎Markdowns/10_MultiSplComp_exercise1_solutions.nb.html‎
Lines changed: 3331 additions & 0 deletions b/‎Markdowns/10_MultiSplComp_exercise1_solutions.nb.html‎
Lines changed: 3331 additions & 0 deletions
@@ -1,7 +1,7 @@
 ---
 title: "Introduction to single-cell RNA-seq analysis"
 subtitle: 'Multi-sample comparisons - Exercise 1'
-author: "Stephane Ballereau"
+author: "Abbi Edwards, Stephane Ballereau"
 output:
   html_document:
     df_print: paged
@@ -15,27 +15,6 @@ output:
     number_sections: true
 ---
 
-# Differential expression and abundance between conditions
-
-```{r multiSplComp_setup, include=FALSE, echo=FALSE}
-# First, set some variables:
-require(knitr)
-
-opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
-opts_chunk$set(echo = TRUE)
-opts_chunk$set(eval = TRUE) 
-options(stringsAsFactors = FALSE)
-opts_chunk$set(fig.width=7, fig.height=7)
-set.seed(123) # for reproducibility
-```
-
-```{r}
-splSetToGet <- "PBMMC,ETV6-RUNX1"
-splSetVec <- unlist(strsplit(splSetToGet, ","))
-splSetToGet2 <- gsub(",", "_", splSetToGet)
-nbPcToComp <- 50
-figSize <- 7
-```
 
 ```{r, message=FALSE, warning=FALSE}
 library(scater)
@@ -52,128 +31,17 @@ Source: [Multi-sample comparisons](https://osca.bioconductor.org/multi-sample-co
 
 ## Exercise 1 - differential expression
 
-Identify label-specific DE genes that are significant in 'c10' yet not DE in any other label.
+Identify label-specific DE genes that are significant in cluster 4 yet not DE in any other label.
 
 Plot the top-ranked gene for inspection.
 
-## Setting up the data
-
-Load the SCE object (with 1200 cells per sample):
-
-```{r}
-# Read object in:
-merged <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged.Rds")
-# also get raw counts that were written to a separate file
-# (to help file sharing)
-merged_counts <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged_counts.Rds")
-# put raw counts back:
-counts(merged) <- merged_counts
-# tidy:
-rm(merged_counts)
-```
-
-A brief inspection of the results shows clusters contain varying contributions from samples:
-
-```{r}
-colLabels(merged) <- merged$clusters.mnn
-tab <- table(colLabels(merged), merged$SampleName)
-tab
-```
-
-On the t-SNE plots below, cells are coloured by type or sample ('batch of origin'). Cluster numbers are superimposed based on the median coordinate of cells assigned to that cluster. 
-
-```{r, fig.height=0.5*figSize, fig.width=1.2*figSize}
-p1 <- plotTSNE(merged, colour_by="SampleGroup", text_by="label", point_size=0.3)
-p2 <- plotTSNE(merged, colour_by="SampleName", point_size=0.3) +
-  facet_wrap(~colData(merged)$SampleName)
-p1
-p2
-```
-
-## Creating pseudo-bulk samples
-
-Sum counts together for all cells with the same combination of label and sample,
-with `aggregateAcrossCells`.
-
-```{r}
-# Using 'label' and 'sample' as our two factors; each column of the output
-# corresponds to one unique combination of these two factors.
-columnsToUse <- c("batch", "SampleName", "SampleGroup", "clusters.mnn")
-colData(merged) <- colData(merged) %>% data.frame() %>% select(all_of(columnsToUse)) %>% DataFrame
-summed <- aggregateAcrossCells(merged, 
-    				id = DataFrame(
-    					label = merged$clusters.mnn,
-    					sample = merged$SampleName
-					)
-)
-colData(summed) %>% head(3)
-```
-
-## Performing the DE analysis
-
-Filter out all sample-label combinations with insufficient cells, 20 cells or less.
-
-```{r}
-summed.filt <- summed[,summed$ncells >= 20]
-```
-
-Construct a common design matrix that will be used in the analysis for each label.
-
-```{r}
-# Pulling out a sample-level 'targets' data.frame:
-targets <- colData(merged)[!duplicated(merged$SampleName),] %>%
-  data.frame() %>%
-  select(-clusters.mnn)
-
-# Constructing the design matrix:
-design <- model.matrix(~factor(SampleGroup), data=targets)
-rownames(design) <- targets$SampleName
-```
-
-Apply the `pseudoBulkDGE` function to obtain a list of DE genes for each label.
-
-```{r}
-summed.filt$SampleGroup <- factor(summed.filt$SampleGroup)
-
-de.results <- pseudoBulkDGE(summed.filt, 
-    label = summed.filt$label,
-    design = ~SampleGroup,
-    coef = "SampleGroupPBMMC",
-    condition = summed.filt$SampleGroup
-)
-```
-
-Examine the numbers of DEGs at a FDR of 5% for each label using the `decideTestsPerLabel` function.
-
 ```{r}
-is.de <- decideTestsPerLabel(de.results, threshold=0.05)
-summarizeTestsPerLabel(is.de)
-```
-
-For each gene, we compute the percentage of cell types in which that gene is upregulated or downregulated. (Here, we consider a gene to be non-DE if it is not retained after filtering.).
+# load RObjects until this point
 
-<!-- TODO: add gene names -->
-
-```{r}
-# Upregulated across most cell types.
-up.de <- is.de > 0 & !is.na(is.de)
-head(sort(rowMeans(up.de), decreasing=TRUE), 10)
-```
+load("../Robjects/10_exercise1.RData")
 
-```{r}
-# Downregulated across cell types.
-down.de <- is.de < 0 & !is.na(is.de)
-head(sort(rowMeans(down.de), decreasing=TRUE), 10)
 ```
 
-Identify label-specific DE genes that are significant in 'c10' yet not DE in any other label (FDR threshold of 50%).
-
-Plot the top-ranked gene for inspection.
-
-```{r}
-remotely.de <- decideTestsPerLabel(de.results, threshold=0.5)
-not.de <- remotely.de==0 | is.na(remotely.de)
-```
 
 ```{r}
 # get c10's 'unique.degs':
 
@@ -1,7 +1,7 @@
 ---
 title: "Introduction to single-cell RNA-seq analysis"
 subtitle: 'Multi-sample comparisons - Exercise 1'
-author: "Stephane Ballereau"
+author: "Abbi Edwards, Stephane Ballereau"
 output:
   html_document:
     df_print: paged
@@ -15,27 +15,6 @@ output:
     number_sections: true
 ---
 
-# Differential expression and abundance between conditions
-
-```{r multiSplComp_setup, include=FALSE, echo=FALSE}
-# First, set some variables:
-require(knitr)
-
-opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
-opts_chunk$set(echo = TRUE)
-opts_chunk$set(eval = TRUE) 
-options(stringsAsFactors = FALSE)
-opts_chunk$set(fig.width=7, fig.height=7)
-set.seed(123) # for reproducibility
-```
-
-```{r}
-splSetToGet <- "PBMMC,ETV6-RUNX1"
-splSetVec <- unlist(strsplit(splSetToGet, ","))
-splSetToGet2 <- gsub(",", "_", splSetToGet)
-nbPcToComp <- 50
-figSize <- 7
-```
 
 ```{r, message=FALSE, warning=FALSE}
 library(scater)
@@ -52,141 +31,30 @@ Source: [Multi-sample comparisons](https://osca.bioconductor.org/multi-sample-co
 
 ## Exercise 1 - differential expression
 
-Identify label-specific DE genes that are significant in 'c10' yet not DE in any other label.
+Identify label-specific DE genes that are significant in cluster 4 yet not DE in any other label.
 
 Plot the top-ranked gene for inspection.
 
-## Setting up the data
-
-Load the SCE object (with 1200 cells per sample):
-
-```{r}
-# Read object in:
-merged <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged.Rds")
-# also get raw counts that were written to a separate file
-# (to help file sharing)
-merged_counts <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged_counts.Rds")
-# put raw counts back:
-counts(merged) <- merged_counts
-# tidy:
-rm(merged_counts)
-```
-
-A brief inspection of the results shows clusters contain varying contributions from samples:
-
-```{r}
-colLabels(merged) <- merged$clusters.mnn
-tab <- table(colLabels(merged), merged$SampleName)
-tab
-```
-
-On the t-SNE plots below, cells are coloured by type or sample ('batch of origin'). Cluster numbers are superimposed based on the median coordinate of cells assigned to that cluster. 
-
-```{r}
-p1 <- plotTSNE(merged, colour_by="SampleGroup", text_by="label", point_size=0.3)
-p2 <- plotTSNE(merged, colour_by="SampleName", point_size=0.3) +
-  facet_wrap(~colData(merged)$SampleName)
-p1
-p2
-```
-
-## Creating pseudo-bulk samples
-
-Sum counts together for all cells with the same combination of label and sample,
-with `aggregateAcrossCells`.
-
-```{r}
-# Using 'label' and 'sample' as our two factors; each column of the output
-# corresponds to one unique combination of these two factors.
-columnsToUse <- c("batch", "SampleName", "SampleGroup", "clusters.mnn")
-colData(merged) <- colData(merged) %>% data.frame() %>% select(all_of(columnsToUse)) %>% DataFrame
-summed <- aggregateAcrossCells(merged, 
-    				id = DataFrame(
-    					label = merged$clusters.mnn,
-    					sample = merged$SampleName
-					)
-)
-colData(summed) %>% head(3)
-```
-
-## Performing the DE analysis
-
-Filter out all sample-label combinations with insufficient cells, 20 cells or less.
-
 ```{r}
-summed.filt <- summed[,summed$ncells >= 20]
-```
-
-Construct a common design matrix that will be used in the analysis for each label.
-
-```{r}
-# Pulling out a sample-level 'targets' data.frame:
-targets <- colData(merged)[!duplicated(merged$SampleName),] %>%
-  data.frame() %>%
-  select(-clusters.mnn)
-
-# Constructing the design matrix:
-design <- model.matrix(~factor(SampleGroup), data=targets)
-rownames(design) <- targets$SampleName
-```
-
-Apply the `pseudoBulkDGE` function to obtain a list of DE genes for each label.
-
-```{r}
-summed.filt$SampleGroup <- factor(summed.filt$SampleGroup)
-
-de.results <- pseudoBulkDGE(summed.filt, 
-    label = summed.filt$label,
-    design = ~SampleGroup,
-    coef = "SampleGroupPBMMC",
-    condition = summed.filt$SampleGroup
-)
-```
-
-Examine the numbers of DEGs at a FDR of 5% for each label using the `decideTestsPerLabel` function.
-
-```{r}
-is.de <- decideTestsPerLabel(de.results, threshold=0.05)
-summarizeTestsPerLabel(is.de)
-```
+# load RObjects until this point
 
-For each gene, we compute the percentage of cell types in which that gene is upregulated or downregulated. (Here, we consider a gene to be non-DE if it is not retained after filtering.).
+load("../Robjects/10_exercise1.RData")
 
-<!-- TODO: add gene names -->
-
-```{r}
-# Upregulated across most cell types.
-up.de <- is.de > 0 & !is.na(is.de)
-head(sort(rowMeans(up.de), decreasing=TRUE), 10)
-```
-
-```{r}
-# Downregulated across cell types.
-down.de <- is.de < 0 & !is.na(is.de)
-head(sort(rowMeans(down.de), decreasing=TRUE), 10)
 ```
 
-Identify label-specific DE genes that are significant in 'c10' yet not DE in any other label (FDR threshold of 50%).
-
-Plot the top-ranked gene for inspection.
-
-```{r}
-remotely.de <- decideTestsPerLabel(de.results, threshold=0.5)
-not.de <- remotely.de==0 | is.na(remotely.de)
-```
 
 ```{r}
-# get c10's 'unique.degs':
+# get cluster 4's 'unique.degs':
 
 # 2nd cluster in is.de
-cx <- "c10"
+cx <- "4"
 other.labels <- setdiff(colnames(not.de), cx)
 unique.degs <- is.de[,cx]!=0 & rowMeans(not.de[,other.labels])==1
 unique.degs <- names(which(unique.degs))
 head(unique.degs)
 ```
 
-```{r}
+```{r, warning=FALSE}
 # Choosing the top-ranked gene for inspection:
 de.inspec <- list()
 de.inspec[[cx]] <- de.results[[cx]] 
@@ -198,7 +66,7 @@ de.inspec[[cx]] <- de.inspec[[cx]][rownames(de.inspec[[cx]]) %in% unique.degs,]
 sizeFactors(summed.filt) <- NULL
 plotExpression(logNormCounts(summed.filt), 
     features=rownames(de.inspec[[cx]])[1],
-    x="SampleGroup", colour_by="SampleGroup", 
+    x="Sample", colour_by="Sample", 
     other_fields="label") + 
     facet_wrap(~label) +
   ggtitle(glue::glue("{cx}: {rownames(de.inspec[[cx]])[1]}"))