simref
function to simulate reference haplotypeslibrary(bindata)
# A function to simulate reference haplotypes, based on more realistic LD data
ref <- function(nsnps=100,nhaps=1000,LD1=0.08,LD2=0.02,maf=0.1){ # LD1 is value
# eitherside of lead diagonal
R<-diag(1,nsnps) # LD2 is next diagonal out
R[cbind(1:(nsnps-1),2:nsnps)] <- LD1
R[cbind(2:nsnps,1:(nsnps-1))] <- LD1
R[cbind(1:(nsnps-2),3:nsnps)] <- LD2
R[cbind(3:nsnps,1:(nsnps-2))] <- LD2
x=rmvbin(nhaps, rep(maf,nsnps), bincorr=R)
x=x+1
x=as.data.frame(x)
snps <- colnames(x) <- paste0("s",1:nsnps)
x$Probability <- 1/nrow(x)
x
}
Notes:
LD1 and LD2 are set very small to try and match the distribution of 1s and 2s in ‘freq’ (using original simref
). If LD1, LD2 or MAF are set any higher then there are way too many 2s (representing the snp), and size is way above threshold.
Later, this can be adapted to make many reference haplotypes from varying LD values in order to investigate the effect of LD and MAF on coverage.
simdata
and wrapper
functions to use these new reference haplotypes# amend simdata function
simdata_x <- function(x,OR=1.5,nrep=100,N0=1000,N1=1000,thr=0.5) {
snps <- colnames(x)[-ncol(x)]
CV=sample(snps,1)
FP <- make_GenoProbList(snps=snps,W=CV,freq=x)
zsim <- simulated_z_score(N0=N0, # number of controls
N1=N1, # number of cases
snps=snps,
W=CV,
gamma.W=log(OR),
freq=x,# reference haplotypes
GenoProbList=FP,
nrep=nrep)
p <- 2*pnorm(-abs(zsim)) # generate p values from z values
# generate posterior probabilities using finemap.abf function
MAF <- colMeans(x[,snps]-1) # minor allele frequencies
PP <- matrix(0,nrow(p),ncol(p)) # will hold the posterior probs
results <- lapply(1:nrep, function(i) {
tmp <- subset(finemap.abf(dataset=list(pvalues=p[i,], N=N0+N1, MAF=MAF,
s=N1/(N0+N1), type="cc"), p1=1e-04),snp!="null")
tmp$SNP.PP <- tmp$SNP.PP/sum(tmp$SNP.PP) # replace post probs with the
# ratio of evidence for each variant being causal vrs all the others
colnames(tmp) <- sub("\\.$","",colnames(tmp))
colnames(tmp) <- sub("SNP.PP","PP",colnames(tmp)) # clean up column names
tmp$CV <- sub("SNP.","",tmp$snp)==sub("s","",CV)
tmp[,c("snp","pvalues","MAF","PP","CV")] # include all the rows and the named columns
})
results
}
# amend wrapper function
wrapper_x <- function(thr=0.9,...) {
data <- simdata_x(x) # data is a list of data.frames
cs <- lapply(data, function(d) {
tmp.ord <- credset(d$PP, which(d$CV), thr=thr)
tmp.noord <- credset(d$PP, which(d$CV), do.order=FALSE,thr=thr)
data.frame(order=c(TRUE,FALSE),
thr=tmp.ord$thr,
size=c(tmp.ord$size,tmp.noord$size),
nvar=c(length(tmp.ord$credset),
length(tmp.noord$credset)),
covered=c(tmp.ord$contained,
tmp.noord$contained))
})
cs <- do.call("rbind",cs)
cs.ord <- subset(cs,cs$order==TRUE)
cs.noord <- subset(cs,cs$order==FALSE)
c(size.ord=mean(cs.ord$size), cov.ord=mean(cs.ord$covered),
size.noord=mean(cs.noord$size), cov.noord=mean(cs.noord$covered))
}
wrapper2
so that it incorporates the new reference haplotypes and varies theshold valuesentropy <- function(p) -mean(log(p))
wrapper2_x <- function(...) {
n <- sample(1:5,1)*1000 # vary sample size
or <- sample(c(1,1.05,1.1,1.2,1.3),1) # vary or
thr <- sample(c(0.5,0.6,0.7,0.8,0.9),1) # vary threshold
data <- simdata_x(x,N0=n,N1=n,OR=or) #,...) # data is a list of data.frames
cs <- lapply(data, function(d) {
tmp.ord <- credset(d$PP, which(d$CV), thr=thr)
tmp.noord <- credset(d$PP, which(d$CV), do.order=FALSE,thr=thr)
data.frame(order=c(TRUE,FALSE),
thr=tmp.ord$thr,
size=c(tmp.ord$size,tmp.noord$size),
nvar=c(length(tmp.ord$credset),
length(tmp.noord$credset)),
covered=c(tmp.ord$contained,
tmp.noord$contained))
})
cs <- do.call("rbind",cs)
ent <- sapply(data, function(x) entropy(x$PP))
cs$entropy <- rep(ent,each=2)
cs$N <- n
cs$OR <- or
cs$thr <- thr
cs
}
# note, still need x <- ref() command prior to use
Wrapper2_x()
algorithm
- Randomly sample values for threshold, OR and N.
- Generate 100 ‘systems’ (posterior probability systems) using these values.
- For each system, generate one credible set using ordered posterior probabilities and one credible set using non-ordered posterior probabilities, obtaining 200 credible sets from 100 systems.
- For each credible set calculate the sum of posterior probabilities of elements in the credible set (size), number of variables in the credible set (nvar), whether the causal variant is contained in the credible set (covered) and the entropy. Note that the entropy is the same for each system, resulting in only 100 values.
- Replicate to repeat the process using different combinations of threshold, OR and N values.
The following code simulates 20,000 credible sets from 100 different combinations of threshold, OR and N values. For each of these combinations, 100 systems are considered and two credible sets derived from each of these (one using ordered pps and one using non-ordered pps).
x <- ref()
example_sim <- replicate(100,wrapper2_x(), simplify=FALSE)
example_sim <- data.table::rbindlist(example_sim)
dim(example_sim)
## [1] 20000 8
# split data into ordered and non-ordered sets
example_sim_ord <- example_sim[order=="TRUE"]
example_sim_noord <- example_sim[order=="FALSE"]
head(example_sim_ord)
## order thr size nvar covered entropy N OR
## 1: TRUE 0.9 0.9005356 80 TRUE 4.859103 5000 1.1
## 2: TRUE 0.9 0.9019405 80 TRUE 4.847719 5000 1.1
## 3: TRUE 0.9 0.9025467 71 TRUE 5.159823 5000 1.1
## 4: TRUE 0.9 0.9031222 72 TRUE 5.102479 5000 1.1
## 5: TRUE 0.9 0.9013108 80 TRUE 4.867697 5000 1.1
## 6: TRUE 0.9 0.9011400 78 FALSE 4.948458 5000 1.1
tail(example_sim_ord)
## order thr size nvar covered entropy N OR
## 1: TRUE 0.8 0.8041393 67 TRUE 4.755711 1000 1.3
## 2: TRUE 0.8 0.8017135 59 TRUE 4.985643 1000 1.3
## 3: TRUE 0.8 0.8017345 59 TRUE 4.938271 1000 1.3
## 4: TRUE 0.8 0.8037415 62 TRUE 4.904680 1000 1.3
## 5: TRUE 0.8 0.8054744 72 TRUE 4.699685 1000 1.3
## 6: TRUE 0.8 0.8010807 73 TRUE 4.684148 1000 1.3
head(example_sim_noord)
## order thr size nvar covered entropy N OR
## 1: FALSE 0.9 0.9038389 86 TRUE 4.859103 5000 1.1
## 2: FALSE 0.9 0.9033891 90 TRUE 4.847719 5000 1.1
## 3: FALSE 0.9 0.9199295 86 TRUE 5.159823 5000 1.1
## 4: FALSE 0.9 0.9020467 88 TRUE 5.102479 5000 1.1
## 5: FALSE 0.9 0.9081359 94 TRUE 4.867697 5000 1.1
## 6: FALSE 0.9 1.0000000 100 TRUE 4.948458 5000 1.1
tail(example_sim_noord)
## order thr size nvar covered entropy N OR
## 1: FALSE 0.8 0.8008385 89 FALSE 4.755711 1000 1.3
## 2: FALSE 0.8 0.9705256 95 TRUE 4.985643 1000 1.3
## 3: FALSE 0.8 0.8588590 84 TRUE 4.938271 1000 1.3
## 4: FALSE 0.8 0.8027812 77 FALSE 4.904680 1000 1.3
## 5: FALSE 0.8 0.8039018 82 TRUE 4.699685 1000 1.3
## 6: FALSE 0.8 0.8038397 86 FALSE 4.684148 1000 1.3