As a second RNA-seq dataset, we will test for differences in gene expression upon the knockout of the microRNA mir-200c (Kim et al., 2013). The raw fastq files can be found under the accession number SRP030475. As the number of samples is limited the experiment might be underpowered, as in most RNA-seq analysis. This is an experimental scenario that could benefit from power gained using modern FDR control methods.
library(dplyr)
library(ggplot2)
library(DESeq2)
library(SummarizedBenchmark)
library(BiocParallel)
library(recount)
## load helper functions
for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) {
source(f)
}
## project data/results folders
datdir <- "data"
resdir <- "results"
sbdir <- "../../results/RNAseq"
dir.create(datdir, showWarnings = FALSE)
dir.create(resdir, showWarnings = FALSE)
dir.create(sbdir, showWarnings = FALSE)
## intermediary files we create below
count_file <- file.path(datdir, "rse_gene.Rdata")
result_file <- file.path(resdir, "mir200c-results.rds")
bench_file <- file.path(sbdir, "mir200c-benchmark.rds")
bench_file_uninf <- file.path(sbdir, "mir200c-uninf-benchmark.rds")
## set up parallel backend
cores <- as.numeric(Sys.getenv("SLURM_NTASKS"))
multicoreParam <- SerialParam()We will download the pre-processed gene level counts available through recount2.
if (!file.exists(count_file)) {
download_study('SRP030475', outdir = datdir)
}
load(count_file)
dsd <- scale_counts(rse_gene)We next subset for samples containing the control samples and the samples where mir200c was knocked down. recount2 downloads data as a RangeSummarizedExperiment object, so we convert this into a DESeqDataSet object.
dsd <- dsd[, grepl("WT|200c", colData(dsd)$title)]
colData(dsd)$mir200c <- factor(ifelse(grepl("WT", colData(dsd)$title), "WT", "KO"))
dsd <- as(dsd, "DESeqDataSet")
storage.mode(assays(dsd)[["counts"]]) <- "integer"Then, we set the design parameter to test for differences in expression upon mir200c knockout and run DESeq2. Similarly to the previous dataset, we set the parameter independentFiltering=FALSE.
if (file.exists(result_file)) {
res <- readRDS(result_file)
} else {
design(dsd) <- ~ mir200c
dds <- DESeq(dsd)
res <- results(dds, independentFiltering = FALSE) %>%
as.data.frame() %>%
na.omit() %>%
dplyr::select(pvalue, baseMean, log2FoldChange, lfcSE, stat) %>%
dplyr::rename(pval = pvalue,
ind_covariate = baseMean,
effect_size = log2FoldChange,
SE = lfcSE,
test_statistic = stat)
saveRDS(res, file = result_file)
}Add random (uninformative) covariate.
set.seed(4719)
res$rand_covar <- rnorm(nrow(res))As with the GTEx example, the mean counts is used as the informative covariate.
rank_scatter(res, pvalue = "pval", covariate = "ind_covariate") +
ggtitle("Mean coverage as independent covariate") +
xlab("Mean Expression")Similar to the GTEx dataset, keeping all the tests results in a strange discreteness. This is removed once we filter very lowly expressed genes. For the first covariate bin, however, there is a strange behaviour in which the distribution seems a bit skewed towards larger p-values.
strat_hist(res, pvalue = "pval",
covariate = "ind_covariate", maxy = 7.5)##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:rlang':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
res <- filter(res, ind_covariate > 1)
strat_hist(res, pvalue = "pval",
covariate = "ind_covariate", maxy = 3)rank_scatter(res, pvalue = "pval", covariate = "rand_covar") +
ggtitle("Random independent covariate")strat_hist(res, pvalue = "pval",
covariate = "rand_covar", maxy = 7.5)We use the common BenchDesign with the set of multiple testing correction methods already included. We also add in Scott’s FDR Regression (both nulltype = "empirical" and nulltype = "theoretical") since our test statistics are t-distributed.
if (file.exists(bench_file)) {
sb <- readRDS(bench_file)
} else {
bd <- initializeBenchDesign()
bd <- addBMethod(bd, "fdrreg-t",
FDRreg::FDRreg,
function(x) { x$FDR },
z = test_statistic,
features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
nulltype = 'theoretical',
control = list(lambda = 0.01))
bd <- addBMethod(bd, "fdrreg-e",
FDRreg::FDRreg,
function(x) { x$FDR },
z = test_statistic,
features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
nulltype = 'empirical',
control = list(lambda = 0.01))
sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
saveRDS(sb, file = bench_file)
}There are some warnings from both BH and fdrreg-empirical. However, they do return results. I tried to increase the nmids parameter for Scott’s FDR Regression but this does not appear to make a difference.
head(assays(sb)[["bench"]])## unadjusted bonf bh qvalue ihw-a01 ihw-a02 ihw-a03
## [1,] 0.6289835 1 0.9988806 0.9988806 0.9084923 0.9256538 0.9538953
## [2,] 0.7276542 1 0.9988806 0.9988806 1.0000000 1.0000000 1.0000000
## [3,] 0.9856426 1 0.9988806 0.9988806 1.0000000 1.0000000 1.0000000
## [4,] 0.5658651 1 0.9988806 0.9988806 0.9574543 1.0000000 0.9970605
## [5,] 0.7562462 1 0.9988806 0.9988806 1.0000000 1.0000000 1.0000000
## [6,] 0.1831689 1 0.9988806 0.9988806 0.7748361 0.7293789 0.7587335
## ihw-a04 ihw-a05 ihw-a06 ihw-a07 ihw-a08 ihw-a09 ihw-a10
## [1,] 0.9399781 0.9477460 0.9079441 0.8867238 0.8920413 0.8938990 0.8875297
## [2,] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [3,] 0.9115595 0.9805432 1.0000000 1.0000000 0.9829497 0.9995001 1.0000000
## [4,] 0.8728812 0.9211388 0.9273243 0.8693917 0.8743719 0.8877471 0.9233528
## [5,] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [6,] 0.7443460 0.8010331 0.8550321 0.8951808 0.9322806 0.9550085 0.9014612
## ashq bl-df02 bl-df03 bl-df04 bl-df05 lfdr adapt-glm
## [1,] 0.6802749 0.9988806 0.9988806 0.9987813 0.9988806 0.8952599 Inf
## [2,] 0.6901540 0.9988806 0.9988806 0.9987719 0.9988804 0.9068889 Inf
## [3,] 0.6906848 0.9988806 0.9988806 0.9987803 0.9988806 0.9292130 Inf
## [4,] 0.6727982 0.9988806 0.9988806 0.9987793 0.9988806 0.8856500 Inf
## [5,] 0.5948722 0.9988806 0.9988806 0.9987854 0.9988806 0.9097459 Inf
## [6,] 0.5280837 0.9988806 0.9988806 0.9987717 0.9988803 0.5631366 0.6234639
## fdrreg-t fdrreg-e
## [1,] 0.9517631 0.8509272
## [2,] 0.9509362 0.8506307
## [3,] 0.9664806 0.8458035
## [4,] 0.9449457 0.8389268
## [5,] 0.9721130 0.8123192
## [6,] 0.8827712 0.4987177
fdrreg-empirical rejects many more hypothesis than the rest of the methods, followed by lfdr and ihw.
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
filter(performanceMetric == "rejections") %>%
select(blabel, performanceMetric, alpha, value) %>%
mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
arrange(desc(value)) %>%
as_tibble() %>%
print(n = 40)## # A tibble: 23 x 6
## blabel performanceMetric alpha value n prop
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 unadjusted rejections 0.05 1947 28345 0.069
## 2 fdrreg-e rejections 0.05 602 28345 0.021
## 3 lfdr rejections 0.05 338 28345 0.012
## 4 ihw-a04 rejections 0.05 289 28345 0.01
## 5 ihw-a05 rejections 0.05 289 28345 0.01
## 6 ihw-a02 rejections 0.05 287 28345 0.01
## 7 ihw-a03 rejections 0.05 285 28345 0.01
## 8 ihw-a07 rejections 0.05 283 28345 0.01
## 9 ihw-a10 rejections 0.05 281 28345 0.01
## 10 ashq rejections 0.05 281 28345 0.01
## 11 ihw-a08 rejections 0.05 280 28345 0.01
## 12 ihw-a09 rejections 0.05 279 28345 0.01
## 13 ihw-a01 rejections 0.05 278 28345 0.01
## 14 ihw-a06 rejections 0.05 275 28345 0.01
## 15 fdrreg-t rejections 0.05 250 28345 0.009
## 16 bl-df02 rejections 0.05 245 28345 0.009
## 17 bh rejections 0.05 243 28345 0.009
## 18 qvalue rejections 0.05 243 28345 0.009
## 19 bl-df03 rejections 0.05 243 28345 0.009
## 20 bl-df04 rejections 0.05 243 28345 0.009
## 21 bl-df05 rejections 0.05 243 28345 0.009
## 22 adapt-glm rejections 0.05 229 28345 0.008
## 23 bonf rejections 0.05 68 28345 0.002
rejections_scatter(sb, supplementary = FALSE)rejection_scatter_bins(sb, covariate = "ind_covariate",
bins = 4, supplementary = FALSE)plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
order.by = "freq", decreasing = TRUE,
supplementary = FALSE)covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")We use the common BenchDesign with the set of multiple testing correction methods already included. We also add in Scott’s FDR Regression (both nulltype = "empirical" and nulltype = "theoretical") since our test statistics are t-distributed.
if (file.exists(bench_file_uninf)) {
sb <- readRDS(bench_file_uninf)
} else {
bd <- initializeBenchDesign()
bd <- addBMethod(bd, "fdrreg-t",
FDRreg::FDRreg,
function(x) { x$FDR },
z = test_statistic,
features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
nulltype = 'theoretical',
control = list(lambda = 0.01))
bd <- addBMethod(bd, "fdrreg-e",
FDRreg::FDRreg,
function(x) { x$FDR },
z = test_statistic,
features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1),
nulltype = 'empirical',
control = list(lambda = 0.01))
res <- res %>% dplyr::mutate(ind_covariate = rand_covar)
sb <- buildBench(bd, data = res, ftCols = "ind_covariate")
saveRDS(sb, file = bench_file_uninf)
}head(assays(sb)[["bench"]])## unadjusted bonf bh qvalue ihw-a01 ihw-a02 ihw-a03 ihw-a04
## [1,] 0.6289835 1 0.9988806 0.9988806 1 1 1 1.0000000
## [2,] 0.7276542 1 0.9988806 0.9988806 1 1 1 1.0000000
## [3,] 0.9856426 1 0.9988806 0.9988806 1 1 1 1.0000000
## [4,] 0.5658651 1 0.9988806 0.9988806 1 1 1 1.0000000
## [5,] 0.7562462 1 0.9988806 0.9988806 1 1 1 1.0000000
## [6,] 0.1831689 1 0.9988806 0.9988806 1 1 1 0.9993566
## ihw-a05 ihw-a06 ihw-a07 ihw-a08 ihw-a09 ihw-a10 ashq
## [1,] 1.000000 1.0000000 1.000000 0.9988806 1.0000000 1.0000000 0.6802749
## [2,] 1.000000 1.0000000 1.000000 0.9988806 1.0000000 1.0000000 0.6901540
## [3,] 1.000000 1.0000000 1.000000 0.9988806 1.0000000 1.0000000 0.6906848
## [4,] 1.000000 1.0000000 1.000000 0.9988806 1.0000000 1.0000000 0.6727982
## [5,] 1.000000 1.0000000 1.000000 0.9988806 1.0000000 1.0000000 0.5948722
## [6,] 0.981229 0.9963752 0.996976 0.9988806 0.9920812 0.9945771 0.5280837
## bl-df02 bl-df03 bl-df04 bl-df05 lfdr adapt-glm fdrreg-t
## [1,] 0.9988806 0.9988806 0.9987791 0.9988806 0.9389829 Inf 0.9545476
## [2,] 0.9988806 0.9988806 0.9987752 0.9988806 0.9815271 Inf 0.9590348
## [3,] 0.9988806 0.9988806 0.9987878 0.9988806 0.9853928 Inf 0.9738564
## [4,] 0.9988806 0.9988806 0.9987737 0.9988806 0.9780420 Inf 0.9476087
## [5,] 0.9988806 0.9988806 0.9987848 0.9988806 0.9820017 Inf 0.9735677
## [6,] 0.9988806 0.9988806 0.9987812 0.9988806 0.9634040 1.05848 0.8857965
## fdrreg-e
## [1,] 0.8564327
## [2,] 0.8704705
## [3,] 0.8393779
## [4,] 0.8463939
## [5,] 0.8013781
## [6,] 0.5075351
assayNames(sb) <- "qvalue"
sb <- addDefaultMetrics(sb)
estimatePerformanceMetrics(sb, alpha = 0.05, tidy = TRUE) %>%
filter(performanceMetric == "rejections") %>%
select(blabel, performanceMetric, alpha, value) %>%
mutate(n = nrow(sb), prop = round(value / n, 3)) %>%
arrange(desc(value)) %>%
as_tibble() %>%
print(n = 40)## # A tibble: 23 x 6
## blabel performanceMetric alpha value n prop
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 unadjusted rejections 0.05 1947 28345 0.069
## 2 fdrreg-e rejections 0.05 588 28345 0.021
## 3 ashq rejections 0.05 281 28345 0.01
## 4 lfdr rejections 0.05 251 28345 0.009
## 5 fdrreg-t rejections 0.05 249 28345 0.009
## 6 ihw-a10 rejections 0.05 245 28345 0.009
## 7 bh rejections 0.05 243 28345 0.009
## 8 qvalue rejections 0.05 243 28345 0.009
## 9 ihw-a02 rejections 0.05 243 28345 0.009
## 10 ihw-a08 rejections 0.05 243 28345 0.009
## 11 ihw-a09 rejections 0.05 243 28345 0.009
## 12 bl-df02 rejections 0.05 243 28345 0.009
## 13 bl-df03 rejections 0.05 243 28345 0.009
## 14 bl-df04 rejections 0.05 243 28345 0.009
## 15 bl-df05 rejections 0.05 243 28345 0.009
## 16 ihw-a03 rejections 0.05 239 28345 0.008
## 17 ihw-a04 rejections 0.05 239 28345 0.008
## 18 ihw-a05 rejections 0.05 239 28345 0.008
## 19 ihw-a06 rejections 0.05 236 28345 0.008
## 20 ihw-a07 rejections 0.05 236 28345 0.008
## 21 ihw-a01 rejections 0.05 233 28345 0.008
## 22 bonf rejections 0.05 68 28345 0.002
## 23 adapt-glm rejections 0.05 62 28345 0.002
rejections_scatter(sb, supplementary = FALSE)rejection_scatter_bins(sb, covariate = "ind_covariate",
bins = 4, supplementary = FALSE)plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb),
order.by = "freq", decreasing = TRUE,
supplementary = FALSE)covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")Here we compare the method ranks for the different covariates at alpha = 0.10.
plotMethodRanks(c(bench_file, bench_file_uninf),
colLabels = c("mean", "uninf"),
alpha = 0.10, xlab = "Comparison")sessionInfo()## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS: /usr/lib64/libblas.so.3.4.2
## LAPACK: /usr/lib64/liblapack.so.3.4.2
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 magrittr_1.5
## [3] cowplot_0.9.2 hexbin_1.27.2
## [5] recount_1.6.2 SummarizedBenchmark_0.99.1
## [7] mclust_5.4 stringr_1.3.1
## [9] rlang_0.2.1 UpSetR_1.3.3
## [11] tidyr_0.8.1 DESeq2_1.20.0
## [13] SummarizedExperiment_1.10.1 DelayedArray_0.6.1
## [15] BiocParallel_1.14.2 matrixStats_0.53.1
## [17] Biobase_2.40.0 GenomicRanges_1.32.3
## [19] GenomeInfoDb_1.16.0 IRanges_2.14.10
## [21] S4Vectors_0.18.3 BiocGenerics_0.26.0
## [23] ggplot2_3.0.0 dplyr_0.7.5
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 rprojroot_1.3-2
## [3] qvalue_2.12.0 htmlTable_1.12
## [5] XVector_0.20.0 base64enc_0.1-3
## [7] rstudioapi_0.7 bit64_0.9-7
## [9] AnnotationDbi_1.42.1 xml2_1.2.0
## [11] codetools_0.2-15 splines_3.5.0
## [13] geneplotter_1.58.0 knitr_1.20
## [15] jsonlite_1.5 Formula_1.2-3
## [17] Rsamtools_1.32.0 annotate_1.58.0
## [19] cluster_2.0.7-1 rentrez_1.2.1
## [21] readr_1.1.1 compiler_3.5.0
## [23] httr_1.3.1 backports_1.1.2
## [25] assertthat_0.2.0 Matrix_1.2-14
## [27] lazyeval_0.2.1 cli_1.0.0
## [29] limma_3.36.2 acepack_1.4.1
## [31] htmltools_0.3.6 prettyunits_1.0.2
## [33] tools_3.5.0 gtable_0.2.0
## [35] glue_1.2.0 GenomeInfoDbData_1.1.0
## [37] reshape2_1.4.3 doRNG_1.6.6
## [39] Rcpp_0.12.17 bumphunter_1.22.0
## [41] Biostrings_2.48.0 rtracklayer_1.40.3
## [43] iterators_1.0.9 rngtools_1.3.1
## [45] XML_3.98-1.11 zlibbioc_1.26.0
## [47] scales_0.5.0 BSgenome_1.48.0
## [49] VariantAnnotation_1.26.1 hms_0.4.2
## [51] GEOquery_2.48.0 derfinderHelper_1.14.0
## [53] RColorBrewer_1.1-2 yaml_2.1.19
## [55] memoise_1.1.0 gridExtra_2.3
## [57] downloader_0.4 pkgmaker_0.27
## [59] biomaRt_2.36.1 rpart_4.1-13
## [61] latticeExtra_0.6-28 stringi_1.2.2
## [63] RSQLite_2.1.1 genefilter_1.62.0
## [65] foreach_1.4.4 checkmate_1.8.5
## [67] GenomicFeatures_1.32.0 bibtex_0.4.2
## [69] pkgconfig_2.0.1 GenomicFiles_1.16.0
## [71] bitops_1.0-6 evaluate_0.10.1
## [73] lattice_0.20-35 purrr_0.2.5
## [75] bindr_0.1.1 labeling_0.3
## [77] GenomicAlignments_1.16.0 htmlwidgets_1.2
## [79] bit_1.1-14 tidyselect_0.2.4
## [81] plyr_1.8.4 R6_2.2.2
## [83] Hmisc_4.1-1 DBI_1.0.0
## [85] pillar_1.2.3 foreign_0.8-70
## [87] withr_2.1.2 survival_2.41-3
## [89] RCurl_1.95-4.10 nnet_7.3-12
## [91] tibble_1.4.2 crayon_1.3.4
## [93] derfinder_1.14.0 utf8_1.1.4
## [95] rmarkdown_1.10 progress_1.1.2
## [97] locfit_1.5-9.1 grid_3.5.0
## [99] data.table_1.11.4 blob_1.1.1
## [101] digest_0.6.15 xtable_1.8-2
## [103] munsell_0.4.3 registry_0.5