#' Install a CRAN package only when it is not already available.
#'
#' Availability is tested with `requireNamespace()`, which loads the
#' namespace without attaching it. The search path is therefore left
#' exactly as the caller had it: nothing is attached and nothing has to be
#' detached afterwards (detaching could remove a package the caller was
#' using).
#'
#' @param packageName Name of the package, as a single character string.
#' @return `NULL`, invisibly; the function is called for its side effect.
installifnot <- function(packageName) {
  stopifnot(is.character(packageName), length(packageName) == 1L)
  if (!requireNamespace(packageName, quietly = TRUE)) {
    utils::install.packages(packageName)
  }
  invisible(NULL)
}
#' Install a Bioconductor package only when it is not already available.
#'
#' Availability is tested with `requireNamespace()`, so the search path is
#' left untouched. Installation goes through `BiocManager::install()`;
#' BiocManager itself is installed from CRAN first when it is missing.
#' This replaces sourcing the `biocLite.R` script over plain HTTP and
#' calling `biocLite()`.
#'
#' @param packageName Name of the package, as a single character string.
#' @return `NULL`, invisibly; the function is called for its side effect.
installBiocifnot <- function(packageName) {
  stopifnot(is.character(packageName), length(packageName) == 1L)
  if (!requireNamespace(packageName, quietly = TRUE)) {
    # BiocManager is the installer itself, so bootstrap it on demand.
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      utils::install.packages("BiocManager")
    }
    BiocManager::install(packageName)
  }
  invisible(NULL)
}
# Make sure every package used in this document is installed:
# the CRAN packages first, then GEOquery from Bioconductor, then tidyverse.
invisible(lapply(c("knitr", "xlsx"), installifnot))
# installifnot("writexl")
installBiocifnot("GEOquery")

installifnot("tidyverse")

The problem

Solving the problem

ExpressionSets

  • Bioconductor provides a data structure called ExpressionSet intended to store in a single object the distinct data associated with a microarray experiment such as:
    • Expression matrix
    • Information on covariates
    • Information on annotations
# Attach Biobase; the ExpressionSet accessors used below (experimentData,
# pData, exprs) are called after this point.
library(Biobase)
# Load the example object `sample.ExpressionSet` into the workspace
# (presumably shipped with Biobase -- confirm).
data("sample.ExpressionSet")

The distinct components of the dataset can be accessed by name; the names are listed by the function slotNames.

# List the names of the components (slots) of the example object.
slotNames(sample.ExpressionSet)
## [1] "experimentData"    "assayData"         "phenoData"         "featureData"      
## [5] "annotation"        "protocolData"      ".__classVersion__"
# Print the experiment-level description stored in the object
# (experimenter, laboratory, contact, title, abstract, notes).
experimentData(sample.ExpressionSet)
## Experiment data
##   Experimenter name: Pierre Fermat 
##   Laboratory: Francis Galton Lab 
##   Contact information: pfermat@lab.not.exist 
##   Title: Smoking-Cancer Experiment 
##   URL: www.lab.not.exist 
##   PMIDs:  
## 
##   Abstract: A 8 word abstract is available. Use 'abstract' method.
##   notes:
##    notes:     
##       An example object of expression set (exprSet) class

The two most commonly used functions for accessing data are exprs, which provides the expression matrix, and pData, which provides the covariates.

# Print the covariates: one row per sample (A-Z) with the columns
# sex, type and score.
pData(sample.ExpressionSet)
##      sex    type score
## A Female Control  0.75
## B   Male    Case  0.40
## C   Male Control  0.73
## D   Male    Case  0.42
## E Female    Case  0.93
## F   Male Control  0.22
## G   Male    Case  0.96
## H   Male    Case  0.79
## I Female    Case  0.37
## J   Male Control  0.63
## K   Male    Case  0.26
## L Female Control  0.36
## M   Male    Case  0.41
## N   Male    Case  0.80
## O Female    Case  0.10
## P Female Control  0.41
## Q Female    Case  0.16
## R   Male Control  0.72
## S   Male    Case  0.17
## T Female    Case  0.74
## U   Male Control  0.35
## V Female Control  0.77
## W   Male Control  0.27
## X   Male Control  0.98
## Y Female    Case  0.94
## Z Female    Case  0.32
# Extract the expression matrix from the example object.
X <- exprs(sample.ExpressionSet)
# Its dimensions: 500 rows by 26 columns (one column per sample A-Z).
dim(X)
## [1] 500  26
# Preview the first rows of the matrix.
head(X)
##                        A         B        C        D        E       F        G       H       I
## AFFX-MurIL2_at  192.7420  85.75330 176.7570 135.5750 64.49390 76.3569 160.5050 65.9631 56.9039
## AFFX-MurIL10_at  97.1370 126.19600  77.9216  93.3713 24.39860 85.5088  98.9086 81.6932 97.8015
## AFFX-MurIL4_at   45.8192   8.83135  33.0632  28.7072  5.94492 28.2925  30.9694 14.7923 14.2399
## AFFX-MurFAS_at   22.5445   3.60093  14.6883  12.3397 36.86630 11.2568  23.0034 16.2134 12.0375
## AFFX-BioB-5_at   96.7875  30.43800  46.1271  70.9319 56.17440 42.6756  86.5156 30.7927 19.7183
## AFFX-BioB-M_at   89.0730  25.84610  57.2033  69.9766 49.58220 26.1262  75.0083 42.3352 41.1207
##                         J        K       L       M       N       O       P        Q        R
## AFFX-MurIL2_at  135.60800 63.44320 78.2126 83.0943 89.3372 91.0615 95.9377 179.8450 152.4670
## AFFX-MurIL10_at  90.48380 70.57330 94.5418 75.3455 68.5827 87.4050 84.4581  87.6806 108.0320
## AFFX-MurIL4_at   34.48740 20.35210 14.1554 20.6251 15.9231 20.1579 27.8139  32.7911  33.5292
## AFFX-MurFAS_at    4.54978  8.51782 27.2852 10.1616 20.2488 15.7849 14.3276  15.9488  14.6753
## AFFX-BioB-5_at   46.35200 39.13260 41.7698 80.2197 36.4903 36.4021 35.3054  58.6239 114.0620
## AFFX-BioB-M_at   91.53070 39.91360 49.8397 63.4794 24.7007 47.4641 47.3578  58.1331 104.1220
##                         S       T         U        V       W         X       Y         Z
## AFFX-MurIL2_at  180.83400 85.4146 157.98900 146.8000 93.8829 103.85500 64.4340 175.61500
## AFFX-MurIL10_at 134.26300 91.4031  -8.68811  85.0212 79.2998  71.65520 64.2369  78.70680
## AFFX-MurIL4_at   19.81720 20.4190  26.87200  31.1488 22.3420  19.01350 12.1686  17.37800
## AFFX-MurFAS_at   -7.91911 12.8875  11.91860  12.8324 11.1390   7.55564 19.9849   8.96849
## AFFX-BioB-5_at   93.44020 22.5168  48.64620  90.2215 42.0053  57.57380 44.8216  61.70440
## AFFX-BioB-M_at  115.83100 58.1224  73.42210  64.6066 40.3068  41.82090 46.1087  49.41220

The GEOquery package

  • GEOquery is a package that allows downloading a whole study from GEO with a simple instruction.
  • The data is downloaded as an ExpressionSet
  • Help can be obtained by typing ?getGEO after loading the package.
# Attach GEOquery first: getGEO() is not available until the package is
# loaded, and nothing earlier in this document leaves it attached.
library(GEOquery)
# Download the GEO series with accession GSE10.
gse <- getGEO("GSE10")
# Returns a list, so look at first item
eset <- gse[[1]]
# Inspect the downloaded object: its class, its covariates, and the
# dimensions and first rows of its expression matrix.
class(eset)
pData(eset)
x <- exprs(eset)
dim(x)
head(x)

Downloading all datasets at once

  • With these concepts in mind it is possible to write a simple loop that:
    • Downloads each dataset
    • Extracts the covariates from each dataset
    • Writes them into a distinct sheet of an Excel spreadsheet.
# Both packages are hard requirements of the loop below. library() stops
# with an error when a package is missing, whereas require() only returns
# FALSE and lets the script fail later with a less helpful message.
# Attaching once, before the loop, also avoids repeating the call for
# every study.
library(GEOquery)
library(xlsx)
# Numeric parts of the GEO series accessions to download.
listOfStudies <- c("73517", "16476")# c("62564", "3446") # c("45547")
for (studyID in listOfStudies) {
  # Full accession, e.g. "GSE73517"; reused as the sheet name.
  accession <- paste0("GSE", studyID)
  gse <- getGEO(accession, GSEMatrix = TRUE, AnnotGPL = TRUE)
  # getGEO() returns a list; only its first element is used here.
  eset <- gse[[1]]
  phenoDat <- pData(eset)
  # One sheet per study, all in the same workbook.
  # NOTE(review): with append = TRUE, re-running this against an existing
  # phenoData.xlsx presumably fails on the duplicate sheet name -- confirm,
  # and delete the file before re-running if so.
  write.xlsx(
    phenoDat,
    file = "phenoData.xlsx",
    sheetName = accession,
    append = TRUE
  )
}