set.seed(20)
If phased haplotypes are not avaliable, simref can be used to simulate some reference haplotypes. Lag is set to 5 due to the simple structure it provides for the variance/covariance matrix. This can be altered to allow for more or less linkage disequilibrium in simulations. The output is a \(nhaps*nsnps\) matrix of 0s and 1s with an extra columm, ‘Probability’, representing the probability of each haplotype.
# simref generates the reference haplotype
simref <- function(nsnps=100,nhaps=1000,lag=5) {
maf <- runif(nsnps+lag,0.05,0.5) # common SNPs
laghaps <- do.call("cbind", lapply(maf, function(f) rbinom(nhaps,1,f)))
haps <- laghaps[,1:nsnps]
for(j in 1:lag)
haps <- haps + laghaps[,(1:nsnps)+j]
haps <- round(haps/(lag+1))
snps <- colnames(haps) <- paste0("s",1:nsnps)
freq <- as.data.frame(haps+1)
freq$Probability <- 1/nrow(freq)
freq
}
This function uses the output from simref and incorporates functions avaliable in the simGWAS
and coloc
packages to ultimately obtain the posterior probabilities of each SNP being the causal variant. The output is a list of nrep data frames. A flat prior of \(p1=1e-04\) is used.
# simdata generates the posterior probabilities for each SNP being causal
simdata <- function(freq,OR=1.5,nrep=100,N0=1000,N1=1000) {
snps <- colnames(freq)[-ncol(freq)] # snp names
CV=sample(snps,1) # choose a random CV
# calculate genotype probabilties at each SNP relative to the genotype at the CV
FP <- make_GenoProbList(snps=snps,W=CV,freq=freq)
zsim <- simulated_z_score(N0=N0, # number of controls
N1=N1, # number of cases
snps=snps,
W=CV, # causal variants, subset of snps
gamma.W=log(OR), # log odds ratios
freq=freq,# reference haplotypes
GenoProbList=FP,
nrep=nrep)
p <- 2*pnorm(-abs(zsim)) # generate p values from z values
# generate posterior probabilities using finemap.abf function
MAF <- colMeans(freq[,snps]-1) # minor allele frequencies
PP <- matrix(0,nrow(p),ncol(p)) # will hold the posterior probs
results <- lapply(1:nrep, function(i) {
tmp <- subset(finemap.abf(dataset=list(pvalues=p[i,], N=N0+N1, MAF=MAF,
s=N1/(N0+N1), type="cc"), p1=1e-04),snp!="null")
tmp$SNP.PP <- tmp$SNP.PP/sum(tmp$SNP.PP) # replace post probs with the
# ratio of evidence for each variant being causal vrs all the others
colnames(tmp) <- sub("\\.$","",colnames(tmp))
colnames(tmp) <- sub("SNP.PP","PP",colnames(tmp)) # clean up column names
tmp$CV <- sub("SNP.","",tmp$snp)==sub("s","",CV)
tmp[,c("snp","pvalues","MAF","PP","CV")] # include all the rows and the named columns
})
results
}
This example illustrates the use of the expected_z_score
and the simulated_z_score
functions in the simGWAS
package and is taken from the simGWAS vignette. Firstly, the expected z scores are calculated for each SNP, where \[z_E = E(Z).\]
nsnps <- 100
nhaps <- 1000
lag <- 5 # genotypes are correlated between neighbouring variants
maf <- runif(nsnps+lag,0.05,0.5) # common SNPs
laghaps <- do.call("cbind", lapply(maf, function(f) rbinom(nhaps,1,f)))
haps <- laghaps[,1:nsnps]
for(j in 1:lag)
haps <- haps + laghaps[,(1:nsnps)+j]
haps <- round(haps/matrix(apply(haps,2,max),nhaps,nsnps,byrow=TRUE))
snps <- colnames(haps) <- paste0("s",1:nsnps)
freq <- as.data.frame(haps+1)
freq$Probability <- 1/nrow(freq)
sum(freq$Probability)
## [1] 1
CV=sample(snps[which(colMeans(haps)>0.1)],2) # choose two random SNPs as CVs
g1 <- c(1.4,1.2) # assign OR to these SNPs
FP <- make_GenoProbList(snps=snps,W=CV,freq=freq)
zexp <- expected_z_score(N0=10000, # number of controls
N1=10000, # number of cases
snps=snps, # column names in freq of SNPs for which Z scores should be generated
W=CV, # causal variants, subset of snps
gamma.W=log(g1), # odds ratios
freq=freq, # reference haplotypes
GenoProbList=FP) # FP above
plot(1:nsnps,zexp); abline(v=which(snps %in% CV),col="red"); abline(h=0)
The simulated z scores are then calculated for each SNP where, \[z^* \sim N(z_E,\Sigma).\]
Note that the genotype probabilities do not need to be given as input to the function.
zsim <- simulated_z_score(N0=10000, # number of controls
N1=10000, # number of cases
snps=snps, # column names in freq of SNPs for which Z scores should be generated
W=CV, # causal variants, subset of snps
gamma.W=log(g1), # log odds ratios
freq=freq, # reference haplotypes
nrep=3)
The following plots show the simulated z scores plotted over the expected z scores.
par(mfcol=c(3,1))
for(i in 1:3) {
plot(1:nsnps,zexp,xlab="SNP",ylab="Z score"); abline(v=which(snps %in% CV),col="red"); abline(h=0)
title(main=paste("Replication",i))
points(1:nsnps,zsim[i,],col="blue",pch=2)
}
This example illustrates the use of the simdata
function. The output for the first 6 SNPs of the first repetition (100 repetitions in total) is also shown. This gives the p-values, the MAF (mean of each column in ‘freq’) and the posterior probabilities for each SNP. Note that the posterior probability for SNP \(i\) is given by,
\[pp_i=\frac{pp_i}{\sum_k pp}.\]
freq <- simref()
example_simdata <- simdata(freq) # default is nrep=100
head(example_simdata[[1]])
## snp pvalues MAF PP CV
## s1 SNP.1 0.4934488 0.005 0.009714097 FALSE
## s2 SNP.2 0.1867151 0.009 0.011045895 FALSE
## s3 SNP.3 0.9058197 0.008 0.008928453 FALSE
## s4 SNP.4 0.7395401 0.009 0.008911872 FALSE
## s5 SNP.5 0.7851162 0.031 0.007036466 FALSE
## s6 SNP.6 0.8268046 0.078 0.005289268 FALSE
This function uses the posterior probabilities to generate credible sets. The ‘do.order’ input allows users to specify whether the posterior probabilities are ordered prior to forming credible sets. By default this is set to ‘TRUE’ to match current credible set practices.
credset <- function(pp, causal, thr=0.9,do.order=TRUE) {
o <- if(do.order) {
order(pp,decreasing=TRUE)
} else {
sample(seq_along(pp))
}
cumpp <- cumsum(pp[o])
wh <- which(cumpp > thr)[1]
credset <- o[1:wh]
return(list(credset=credset,
thr=thr,
size=sum( pp[credset] ),
contained=causal %in% credset))
}
This example shows how the credset function can be used to generate credible sets using both ordered and non-ordered posterior probabilities.
# Credible set obtained from ordered posterior probabilities
example_ord_credset <- credset(example_simdata[[1]]$PP, which(example_simdata[[1]]$CV))
example_ord_credset
## $credset
## [1] 45 11 100 71 99 72 73 10 44 80 13 95 23 79 29 98 75
## [18] 96 77 70 76 2 36 69 81 43 34 1 46 47 31 50 74 56
## [35] 3 51 4 85 12 8 52 32 37 30 48 97 63 62 33 26 14
## [52] 84 5 92 35 53 78 93 15 87 86 18 49 64 83 90 88 27
## [69] 82 28 89 66 67 16 61 91 54 65 38 94
##
## $thr
## [1] 0.9
##
## $size
## [1] 0.9031255
##
## $contained
## [1] TRUE
# Credible set obtained from non-ordered posterior probabilities
example_noord_credset <- credset(example_simdata[[1]]$PP, which(example_simdata[[1]]$CV),
do.order=FALSE)
example_noord_credset
## $credset
## [1] 29 17 83 18 11 85 80 88 51 96 7 48 45 79 38 32 76
## [18] 22 25 82 62 93 86 42 77 91 2 34 67 99 37 78 46 84
## [35] 47 44 54 60 100 94 87 95 5 26 59 23 6 55 49 52 30
## [52] 41 27 53 9 81 92 68 73 15 98 72 36 71 16 33 12 8
## [69] 35 75 70 39 63 21 20 1 50 90 10 58 65 4 13 31 56
## [86] 14
##
## $thr
## [1] 0.9
##
## $size
## [1] 0.9038037
##
## $contained
## [1] TRUE
The wrapper function incorporates the above functions to generate 2 credible sets (one from ordering and one from not ordering posterior probabilities) from a user specific OR and threshold. It uses the haplotype matrix obtained from the simref
function.
wrapper <- function(thr=0.9,...) {
data <- simdata(freq,...) # data is a list of data.frames
# work out the credsets
cs <- lapply(data, function(d) {
tmp.ord <- credset(d$PP, which(d$CV), thr=thr)
tmp.noord <- credset(d$PP, which(d$CV), do.order=FALSE,thr=thr)
data.frame(order=c(TRUE,FALSE),
thr=tmp.ord$thr,
size=c(tmp.ord$size,tmp.noord$size),
nvar=c(length(tmp.ord$credset),
length(tmp.noord$credset)),
covered=c(tmp.ord$contained,
tmp.noord$contained))
})
cs <- do.call("rbind",cs)
cs.ord <- subset(cs,cs$order==TRUE)
cs.noord <- subset(cs,cs$order==FALSE)
## what was the coverage?
c(size.ord=mean(cs.ord$size), cov.ord=mean(cs.ord$covered),
size.noord=mean(cs.noord$size), cov.noord=mean(cs.noord$covered))
}
This example uses the replicate and wrapper functions to perform 100 simulations on data with an odds ratio of 1.1 and a threshold of 0.5.
# only need one reference haplotype
freq <- simref()
results_0.5 <- replicate(100,wrapper(OR=1.1,thr=0.5))
results_0.5[,1:8] # shows the size and coverage (for ordered and non-ordered pps) for the first 8 replicates
## [,1] [,2] [,3] [,4] [,5] [,6]
## size.ord 0.5072597 0.5103906 0.5055956 0.5058682 0.5065738 0.5053218
## cov.ord 0.3200000 0.2900000 0.3300000 0.4400000 0.3100000 0.3500000
## size.noord 0.5159360 0.5182014 0.5143740 0.5179709 0.5126314 0.5103849
## cov.noord 0.4600000 0.4900000 0.5600000 0.5400000 0.4700000 0.5000000
## [,7] [,8]
## size.ord 0.5050188 0.5056509
## cov.ord 0.4100000 0.3300000
## size.noord 0.5075530 0.5190001
## cov.noord 0.4600000 0.5700000
# confidence interval
se_0.5 <- apply(results_0.5,1,function(x) sd(x)/sqrt(length(x)))
mn_0.5 <- rowMeans(results_0.5)
cbind(average=mn_0.5, lci=mn_0.5 - 1.96*se_0.5, uci=mn_0.5 + 1.96*se_0.5)
## average lci uci
## size.ord 0.506455 0.5060840 0.5068260
## cov.ord 0.373600 0.3561902 0.3910098
## size.noord 0.515921 0.5150483 0.5167936
## cov.noord 0.508800 0.4988476 0.5187524
This example uses the replicate and wrapper functions to perform 100 simulations on data with an odds ratio of 1.1 and a threshold of 0.9.
results_0.9 <- replicate(100,wrapper(OR=1.1,thr=0.9))
# confidence interval
se_0.9 <- apply(results_0.9,1,function(x) sd(x)/sqrt(length(x)))
mn_0.9 <- rowMeans(results_0.9)
cbind(average=mn_0.9, lci=mn_0.9 - 1.96*se_0.9, uci=mn_0.9 + 1.96*se_0.9)
## average lci uci
## size.ord 0.9031003 0.9030590 0.9031417
## cov.ord 0.8902000 0.8600461 0.9203539
## size.noord 0.9092293 0.9089923 0.9094664
## cov.noord 0.9020000 0.8964284 0.9075716
This function generates credible sets simultaneously for different ORs and sample sizes to allow analysis of how disorder affects the number of variants in a credible set. Disorder has been set as \[disorder=-mean(log(pp)).\]
The output is a data.frame with 200 rows (nrep is set to 100 in the simdata
function and each simulation is ran for ordered and non-ordered posterior probabilities).
entropy <- function(p) -mean(log(p))
wrapper2 <- function(thr=0.5,...) {
n <- sample(1:5,1)*1000
or <- sample(c(1,1.05,1.1,1.2,1.3),1)
data <- simdata(freq,N0=n,N1=n,OR=or) #,...) # data is a list of data.frames
cs <- lapply(data, function(d) {
tmp.ord <- credset(d$PP, which(d$CV), thr=thr)
tmp.noord <- credset(d$PP, which(d$CV), do.order=FALSE,thr=thr)
data.frame(order=c(TRUE,FALSE),
thr=tmp.ord$thr,
size=c(tmp.ord$size,tmp.noord$size),
nvar=c(length(tmp.ord$credset),
length(tmp.noord$credset)),
covered=c(tmp.ord$contained,
tmp.noord$contained))
})
cs <- do.call("rbind",cs)
ent <- sapply(data, function(x) entropy(x$PP))
cs$entropy <- rep(ent,each=2)
cs$N <- n
cs$OR <- or
cs
}
Coverage is defined as the proportion of simulations for which the causal variant is in the credible set. It is frequentist in nature as it describes long run behaviour. Coverage is closely related to the threshold of the credible set, which is a value set by the user describing the value of which the cumulative posterior probability of the credible set (“set size”) must exceed. Undercoverage/overcoverage refer to the case where more/less SNPs need to be added to the credible set, in order to achieve appropriate coverage.
In most cases, it is beneficial to sort posterior probabilities in descending order prior to credible set analysis. However, ordering does not always improve coverage. For example, if all SNP posterior probabilities are equal, then ordering does not provide the extra information which it is thought to, and undercoverage occurs.
Hope has shown in her thesis that undercoverage occurs for ordered sets at low OR values (1-1.1) and that overcoverage occurs for ordered sets at high OR values (1.2-1.5). She has also shown that undercoverage occurs for ordered sets with low sample sizes (N=1000) and that overcoverage occurs for ordered sets with high samples sizes (N=5000, 10000).
A new measure, ‘bias in coverage’ (bias), is defined and calculated for each repetition. \[bias=coverage-average\ size.\]
The entropy of a system refers to the lack of order. For this study we have used a related measure, disorder, defined by \[disorder=-mean(log(pp)).\] ‘Disorder’ and ‘entropy’ are used interchangeably throughout this report.
The wrapper2
and data.table
functions have been used to simulate 400,000 credible sets with varying sample sizes and odds ratios to further analyse their relationship with coverage for ordered and non-ordered credible sets.
Nb, for this section different combinations of OR and N were considered X=2000 (replicate(X,wrapper2()
) times. For each of these repetitions, Y=100 (simdata(nrep=Y,...
) different systems were considered and 2 credible sets were produced for each of these, one using ordered posterior probabilities and one using unordered posterior probabilities. Hence, 200,000 systems were considered and 400,000 credible sets produced.
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
setwd("/Users/anna/PhD")
# creds_ <- replicate(2000,wrapper2(), simplify=FALSE)
creds_ <- read.table("Cred sim 1")
creds_ <- as.data.table(creds_)
# creds_ <- data.table::rbindlist(creds_, idcol=TRUE)
creds_ord1 <- creds_[order=="TRUE"] # separate ordered and non-ordered simulations
creds_noord1 <- creds_[order=="FALSE"]
head(creds_ord1) # dim 200,000*9 - a list of all the ordered simulations
## .id order thr size nvar covered entropy N OR
## 1: 1 TRUE 0.5 0.5038964 22 FALSE 4.855843 4000 1.05
## 2: 1 TRUE 0.5 0.5153848 2 FALSE 5.398494 4000 1.05
## 3: 1 TRUE 0.5 0.8251045 1 FALSE 6.820145 4000 1.05
## 4: 1 TRUE 0.5 0.5056507 29 FALSE 4.750554 4000 1.05
## 5: 1 TRUE 0.5 0.5626324 1 FALSE 6.045790 4000 1.05
## 6: 1 TRUE 0.5 0.5033355 17 TRUE 4.954727 4000 1.05
head(creds_noord1) # dim 200,000*9 - a list of all the non-ordered simulations
## .id order thr size nvar covered entropy N OR
## 1: 1 FALSE 0.5 0.5117279 57 FALSE 4.855843 4000 1.05
## 2: 1 FALSE 0.5 0.5669616 21 FALSE 5.398494 4000 1.05
## 3: 1 FALSE 0.5 0.9946177 94 TRUE 6.820145 4000 1.05
## 4: 1 FALSE 0.5 0.5270758 49 TRUE 4.750554 4000 1.05
## 5: 1 FALSE 0.5 0.9935277 98 TRUE 6.045790 4000 1.05
## 6: 1 FALSE 0.5 0.5005062 52 TRUE 4.954727 4000 1.05
Note that the entropy column is the same for ordered and non-ordered sets. This is because entropy describes the system, and the same system was used for ordered and non-ordered sets at each simulation. I.E. row \(i\) in creds_noord
is derived from the same system as row \(i\) in creds_ord
, the only difference is the ordering of the posterior probabilities.
The data.table
function is now used to calculate the average coverage, size and entropy for each repetition (2000).
# use data.table to find average coverage, size and entropy by the groups N, OR and ID
creds_ave_noord1 <- creds_noord1[,list("Coverage"=mean(covered),"Ave. Size"=mean(size),"Entropy"=mean(entropy)),by=c("N","OR",".id")]
creds_ave_ord1 <- creds_ord1[,list("Coverage"=mean(covered),"Ave. Size"=mean(size),"Entropy"=mean(entropy)),by=c("N","OR",".id")]
# calculate the bias for ordered and non-ordered sets
bias_noord1 <- creds_ave_noord1$Coverage-creds_ave_noord1$`Ave. Size`
bias_ord1 <- creds_ave_ord1$Coverage-creds_ave_ord1$`Ave. Size`
# create new data.frames
creds_ave_noord1 <- cbind(creds_ave_noord1,bias_noord1) # dim 2000*7
creds_ave_ord1 <- cbind(creds_ave_ord1,bias_ord1) # dim 2000*7
head(creds_ave_noord1)
## N OR .id Coverage Ave. Size Entropy bias_noord1
## 1: 4000 1.05 1 0.54 0.5395693 4.999050 0.0004307312
## 2: 3000 1.00 2 0.46 0.5161547 4.872654 -0.0561547388
## 3: 2000 1.10 3 0.56 0.5262763 4.892404 0.0337237277
## 4: 3000 1.05 4 0.47 0.5348127 4.928217 -0.0648127142
## 5: 4000 1.05 5 0.55 0.5488284 5.032648 0.0011715799
## 6: 2000 1.30 6 0.63 0.5735745 5.139263 0.0564254781
head(creds_ave_ord1)
## N OR .id Coverage Ave. Size Entropy bias_ord1
## 1: 4000 1.05 1 0.23 0.5158248 4.999050 -0.2858248
## 2: 3000 1.00 2 0.24 0.5070677 4.872654 -0.2670677
## 3: 2000 1.10 3 0.37 0.5092287 4.892404 -0.1392287
## 4: 3000 1.05 4 0.28 0.5150158 4.928217 -0.2350158
## 5: 4000 1.05 5 0.18 0.5206496 5.032648 -0.3406496
## 6: 2000 1.30 6 0.88 0.5266771 5.139263 0.3533229
Coverage has been plotted against average credible set size for non-ordered and ordered sets.
Plotting ordered and non-ordered credible sets together gives the following plot, with \(y=x\) added for reference.
It appears that whilst average set size may be a good predictor of coverage for non-ordered sets, the same is not true for ordered sets.
The following plots show the difference in the relationship between bias in coverage and average set size for ordered and non-ordered sets.
It appears that the bias in coverage is more widespread and there are more instances of overcoverage in ordered sets than non-ordered sets.
Plots are created for bias in coverage against entropy coloured by OR for ordered and non-ordered sets separately.
It appears there is a more structured relationship between bias in coverage and entropy for ordered sets as opposed to non-ordered sets. The range of bias values is clearly larger for ordered sets.
The plot above is faceted by sample size.
Zoom in and match y axes.