EMPIRICAL COVERAGE: Average coverage across simulations (“true coverage”)

CLAIMED COVERAGE: Size of the credible set (the sum of the posterior probabilities of the variants it contains)

CORRECTED COVERAGE: Coverage predicted by the GAM
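
A toy illustration of the first two quantities (numbers are made up; corrected coverage is what the GAM fitted later predicts from claimed coverage):

pp <- c(0.40, 0.25, 0.15, 0.10, 0.10)  # posterior probabilities for 5 SNPs
cs <- order(pp, decreasing=TRUE)[1:3]  # credible set built to a 0.8 threshold
claimed <- sum(pp[cs])                 # claimed coverage = size of the set (0.8)
covered <- c(1,1,0,1,0,1,1,1,0,1)      # CV-in-set indicator across 10 simulated sets
empirical <- mean(covered)             # empirical ("true") coverage (0.7)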


Empirical and Claimed Coverage Plots From Simulations Using Real-World Data



(Wrong) Simulation method:

  1. Obtain freq, a reference haplotype matrix based on real-world MAF and LD data.

  2. For a fixed OR (odds ratio) and N (sample size), generate nrep (=100) posterior probability systems.
  3. For each of these, form two credible sets: one using ordered posterior probabilities and one using unordered posterior probabilities. Obtain the size (claimed coverage) and whether the CV is in the set for each credible set (covered). The same CV is chosen as causal for these systems.
  4. In the form of a table, report: order (whether the posterior probabilities were ordered), threshold, size, nvar (number of variants in the credible set), covered (whether the CV was in the credible set), N (sample size) and OR.
  5. Replicate steps 2-4 many times for varying thresholds (0.6, 0.8, 0.9, 0.95), ORs (1.001, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3) and sample sizes (N0=N1=2000, 5000, 10000, 50000). The full code implementing this scheme is given at the end of this document.



Correct Simulation Method

  1. Obtain freq, a reference haplotype matrix based on real-world MAF and LD data.

  2. For a given OR and N, generate a posterior probability system.
  3. From this, choose a random CV as causal and generate two credible sets, one using the ordered and one using the unordered method. For each of these, report the size (claimed coverage) and whether the CV is in the set (covered).

Repeat steps 1-3 many times to get a set of simulations using the ordered method and a set of simulations using the unordered method.
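
A minimal sketch of this corrected scheme, assuming simdata(), credset(), freq and maf as defined in the code at the end of this document. Each call to simdata() draws a fresh random CV, and only a single posterior probability system is evaluated per replicate:

one_sim_correct <- function(or=1.1, n=5000, thr=0.9) {
  data <- simdata(freq, OR=or, N0=n, N1=n, MAF=maf)       # fresh random CV inside simdata
  pp <- data$ppsim[1,]                                    # keep a single pp system per replicate
  ord   <- credset(pp, data$iCV, thr=thr)                 # ordered credible set
  noord <- credset(pp, data$iCV, thr=thr, do.order=FALSE) # unordered credible set
  data.frame(order=c(TRUE, FALSE),
             size=c(ord$size, noord$size),
             covered=c(ord$contained, noord$contained),
             thr=thr, N=n, OR=or)
}
# Repeat many times and summarise coverage within each method, e.g.
# sims <- data.table::rbindlist(replicate(1000, one_sim_correct(), simplify=FALSE))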


Predict Empirical Coverage Using Claimed Coverage


Method: Fit different models to try to predict empirical coverage, given claimed coverage (size), for credible sets obtained using the Bayesian approach to fine-mapping.


## Simulate credible sets for a known CV; requires mvtnorm (rmvnorm) and coloc.
cali.func <- function(Zj, CV, Sigma, nrep, N, maf){
  exp.zm=Zj%*%Sigma # expected marginal Z scores given the joint Z scores
  zstar=rmvnorm(nrep,exp.zm,Sigma) # simulate marginal z scores from mvnorm
  pstar=pnorm(abs(zstar),lower.tail=FALSE)*2 # calculate two-sided p values
  
  ppstar <- lapply(1:nrow(zstar), function(i) {
    tmp <- coloc:::approx.bf.p(p = pstar[i,],
                               f=maf, type = "cc", N=N, s=0.5)[,"lABF"]
    p1 <- 1e-04 # hard coded - bad code - can fix later
    nsnps <- length(tmp)
    prior <- c(1-nsnps*p1,rep(p1,nsnps))
    tmp <- c(1,tmp) # add on extra for null model
    my.denom <- coloc:::logsum(tmp + prior)
    tmp1 <- exp(tmp - my.denom)[-1] # drop null again
    tmp1/sum(tmp1) # ensure pps sum to 1
  }) # simulated posterior probabilities - list of numeric vectors
  
  lapply(ppstar, function(pp){ # CV is taken from the cali.func argument
    o = order(pp,decreasing=TRUE)
    x = cumsum(pp[o]) # claimed coverage (size) as the set grows
    y = cumsum(o==CV) # 0 until CV reached, then 1
    return(data.frame(x,y))
  }) 
}
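
A self-contained toy call, to show the expected inputs and output. The 5-SNP LD matrix, MAFs and joint Z scores below are made up for illustration:

library(mvtnorm)
set.seed(1)
toy.Sigma <- 0.5^abs(outer(1:5, 1:5, "-")) # small AR(1)-style LD matrix
toy.maf <- rep(0.3, 5)
toy.Zj <- c(0, 0, 5, 0, 0)                 # joint Z scores: SNP 3 causal with Z = 5
toy.d <- cali.func(Zj=toy.Zj, CV=3, Sigma=toy.Sigma, nrep=10, N=10000, maf=toy.maf)
str(toy.d[[1]])  # data.frame of cumulative size (x) and CV-covered indicator (y)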

Finding: Sigmoid calibration curves obtained from logistic regressions do not allow us to estimate empirical coverage from claimed coverage.


library(magrittr)
library(data.table)
d <- lapply(d, as.data.table) # d is the list of data.frames returned by cali.func above

# make a size/coverage summary at a specific threshold
f <- function(dd,thr) {
    wh <- which(dd$x>thr)[1] # first point at which the cumulative pp exceeds thr
    dd[wh,] # claimed coverage (x) and covered indicator (y) for this cred set
}
# a1 is a data.frame of size and covered for the cred sets from each rep of cali.func
a1 <- lapply(d,f,thr=0.5) %>% rbindlist()
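
For reference, a minimal sketch of the plain logistic calibration referred to in the finding above (an ordinary GLM of covered on claimed coverage; the model originally fitted may have differed):

m0 <- glm(y ~ x, data=a1, family="binomial") # gives a sigmoid calibration curve
newx <- data.frame(x=seq(0, 1, by=0.01))
plot(newx$x, predict(m0, newdata=newx, type="response"), type="l",
     xlab="claimed coverage", ylab="predicted empirical coverage")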

# gam model fitted to LOO data
library(mgcv)

invlogit <- function(x) exp(x)/(1+exp(x))
pred <- numeric(nrow(a1)) # empty vector to fill with predictions
for(i in 1:nrow(a1)) {
    m1 <- gam(y ~ s(x), data=a1[-i,], family="binomial") # GAM model
    pred[i] <- invlogit(predict(m1,newdata=a1[i,]))
}
mean(pred) # mean corrected (GAM-predicted) coverage
mean(a1$y) # empirical coverage
##    emp.cov corrected.cov       low       upp
## 1     0.73     0.7267016 0.6883424 0.7662405
## 2     0.79     0.7890582 0.7600271 0.8442997
## 3     0.88     0.8669479 0.7381942 0.9933964
## 4     0.98     0.9796385 0.8794017 1.0000000
## 5     1.00     1.0000000 1.0000000 1.0000000
## 6     0.68     0.6869433 0.6189254 0.7479444
## 7     0.74     0.7390439 0.7142329 0.8218740
## 8     0.82     0.8218930 0.4853095 0.9003595
## 9     0.88     0.8792153 0.6961825 0.9549436
## 10    0.96     0.9600961 0.9003163 0.9793048
## 11    0.66     0.6552073 0.4955468 0.8264000
## 12    0.68     0.6855999 0.4748193 0.7999557
## 13    0.77     0.7696999 0.6088765 0.8560195
## 14    0.90     0.8999006 0.8639899 0.9396835
## 15    0.96     0.9599950 0.9536561 0.9715357
## 16    0.77     0.7712628 0.6274897 0.8594822
## 17    0.88     0.8770935 0.7981624 0.9801001
## 18    0.92     0.9200367 0.7799826 0.9814758
## 19    1.00     1.0000000 1.0000000 1.0000000
## 20    0.69     0.6901173 0.5029995 0.7988606
## 21    0.77     0.7724668 0.7569691 0.7867546
## 22    0.85     0.8520123 0.7372359 0.9118072
## 23    0.98     0.9798236 0.8281203 1.0000000
## 24    0.90     0.9039690 0.7909850 0.9330777
## 25    0.91     0.9037485 0.6879339 0.9539291
## 26    0.97     0.9685837 0.9364247 0.9993127
## 27    1.00     1.0000000 1.0000000 1.0000000
## 28    0.89     0.8863868 0.7047639 0.9995277
## 29    0.94     0.9403532 0.8190942 0.9999612
## 30    0.96     0.9599342 0.9478756 0.9999070
## 31    0.99     0.9899598 0.9884072 0.9956029
## 32    0.80     0.8001461 0.6070607 0.9182208
## 33    0.84     0.8372153 0.6267709 0.9938822
## 34    0.88     0.8791885 0.7285369 0.9399655
## 35    0.95     0.9481236 0.8699557 0.9741238
## 36    0.87     0.8700090 0.6822501 0.9970053
## 37    0.87     0.8697797 0.8149642 0.9298802
## 38    0.90     0.8977303 0.4664082 1.0000000
## 39    0.92     0.9211589 0.6380639 0.9999448
## 40    0.75     0.7500198 0.6302131 0.7975957
## 41    0.80     0.7981308 0.6710703 0.8949047
## 42    0.90     0.9000282 0.7973305 0.9958819
## 43    0.92     0.9184210 0.8865253 0.9378464
## 44    0.95     0.9493961 0.9286318 0.9604349
## 45    0.75     0.7520045 0.5494102 0.8770837
## 46    0.79     0.7902817 0.6567325 0.8552064
## 47    0.84     0.8401636 0.6373166 0.9086459
## 48    0.90     0.9044164 0.6797263 0.9999026
## 49    0.94     0.9352515 0.7055147 0.9912558
## 50    0.83     0.8349345 0.7832600 0.9409287
## 51    0.90     0.8801534 0.7410670 0.9512469
## 52    0.95     0.9502111 0.8581542 0.9915073
## 53    0.96     0.9602695 0.8856794 0.9999177
## 54    0.98     0.9718760 0.9114949 0.9902742
## 55    0.83     0.8287708 0.8044122 0.8532579
## 56    0.88     0.8798262 0.8326240 0.9922831
## 57    0.98     0.9742586 0.9675946 0.9986737
## 58    1.00     1.0000000 1.0000000 1.0000000
## 59    0.76     0.7627122 0.6957246 0.8038657
## 60    0.89     0.8897985 0.7532988 0.9727991
## 61    0.96     0.9599910 0.8590078 0.9966100
## 62    0.99     0.9898103 0.9478120 0.9998386
## 63    0.87     0.8702256 0.6812976 0.9932610
## 64    0.93     0.9284074 0.8972788 0.9714348
## 65    0.95     0.9498080 0.8740143 0.9974588
## 66    0.99     0.9895641 0.9100962 1.0000000
## 67    1.00     1.0000000 1.0000000 1.0000000
## 68    0.74     0.7397521 0.6897168 0.8514377
## 69    0.80     0.7998445 0.7441432 0.9708145
## 70    0.92     0.9201237 0.8841044 0.9877452
## 71    0.99     0.9897640 0.9621992 1.0000000
## 72    0.69     0.6913588 0.6592340 0.7481722
## 73    0.79     0.7926151 0.7725941 0.8209343
## 74    0.88     0.8782651 0.8465491 0.8984008
## 75    0.92     0.9136723 0.8304219 0.9963282
## 76    0.98     0.9747403 0.9835237 1.0000000

NB: error bars are not shown because several points lie on top of each other.



Application of Approach When True CV Unknown



  1. Assume there are 100 SNPs in the system under consideration. We want to obtain the marginal given the joint for each SNP being causal. The joint effect vector for SNP \(i\) being causal is a vector of 100 zeros, except entry \(i\), which is the observed effect of that SNP.
### Code to get the joint effect vectors for each SNP being causal
temp <- diag(x = 4, nrow = 100, ncol = 100) # 4 is a placeholder for the observed Z score (see question below)
zj <- do.call(c, apply(temp, 1, list)) # list of 100 joint effect vectors

You’ve said to set this to the observed Z score from one simulated dataset whose credible set needs adjusting. Is this from the simulated_z_score function in simGWAS? If so, this requires specification of the CV. Do I do this and then just pretend I don’t know it?

  2. We obtain 100 \(E(Z_m)\) vectors by multiplying each \(Z_J\) by \(\Sigma\).

  3. From each of these we can simulate \(Z_m^*\sim MVN(E(Z_m),\Sigma)\). Each row of \(Z_m^*\) is a simulation in which the columns represent the marginal effects of the SNPs. At this stage, we have 100 lists (one for each SNP being causal) of \(100 \times 100\) matrices (rows = 100 simulations, columns = 100 SNPs).

  4. For each simulation in each list, obtain the posterior probability system.

pp.func <- function(Zj){ # Sigma and maf are taken from the global environment
  exp.zm=Zj%*%Sigma # expected marginal Z scores given the joint Z scores
  zstar=rmvnorm(100,exp.zm,Sigma) # 100 simulated marginal Z score vectors
  pstar=pnorm(abs(zstar),lower.tail=FALSE)*2 # calculate p values
  
  ppstar <- lapply(1:nrow(zstar), function(i) {
    tmp <- coloc:::approx.bf.p(p = pstar[i,],
                               f=maf, type = "cc", N=5000, s=0.5)[,"lABF"]
    p1 <- 1e-04 # hard coded - bad code - can fix later
    nsnps <- length(tmp)
    prior <- c(1-nsnps*p1,rep(p1,nsnps))
    tmp <- c(1,tmp) # add on extra for null model
    my.denom <- coloc:::logsum(tmp + prior)
    tmp1 <- exp(tmp - my.denom)[-1] # drop null again
    tmp1/sum(tmp1) # ensure pps sum to 1
  })
  ppstar # list of 100 posterior probability vectors
}

pps.tmp <- lapply(zj,pp.func) # a list of 100 elements (one per assumed CV), each a list of 100 pp vectors
pps <- lapply(pps.tmp, function(sub_list) {
  do.call(rbind, sub_list) # stack each CV's 100 pp vectors into a matrix
})
# pps is a list of 100 matrices, each 100*100 (rows = simulations, columns = SNPs)
  5. For a given threshold, obtain the credible set using the usual (ordered) method and report the size (claimed coverage) and whether the CV is present. At this stage, we have 100 lists of \(100 \times 2\) data frames, where the claimed.cov column is the size of the credible set obtained from the simulation and the covered column is whether the CV is in the credible set. There are 100 rows, one for each \(Z_m^*\) sample.
credset <- function(pp, CV, thr=0.6) {
  o <- order(pp,decreasing=TRUE) # order index for true pp
  cumpp <- cumsum(pp[o]) # cum sums of ordered pps
  wh <- which(cumpp > thr)[1] # how many needed to exceed thr
  size <- cumpp[wh] 
  contained=as.numeric(CV %in% o[1:wh])
  data.frame(claimed.cov=size, covered=contained)
}

n_pps <- length(pps)
args <- 1:100 # index of the SNP assumed causal for each element of pps

d5 <- lapply(1:n_pps, function(x){
  apply(pps[[x]],1,credset,args[x]) %>% rbindlist()
}) 
# d5 is a list of 100 data.tables giving the cred set size (claimed.cov) and covered for each simulation
  6. Use these claimed coverage values as the predictor in the leave-one-out GAM model approach to predict coverage, obtaining a corrected coverage value.
invlogit <- function(x) exp(x)/(1+exp(x))

mean.func <- function(x){
  pred <- numeric(nrow(x)) # leave-one-out predictions
  for(i in 1:nrow(x)) {
    m5 <- gam(covered ~ s(claimed.cov), data=x[-i,], family="binomial")
    pred[i] <- invlogit(predict(m5,newdata=x[i,]))
  }
  data.frame(corrected.ave.cov=mean(pred),true.cov=mean(x$covered))
} 

means <- lapply(d5,mean.func)
  7. Save the GAM model output. We now have 100 GAM model outputs, for 100 SNPs each being considered as causal. Weight these by the posterior probability of that SNP being causal to obtain a final calibration curve.
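
A minimal sketch of this weighting step. pp.obs below is a placeholder for the observed posterior probabilities of the 100 SNPs (not defined above); means is the list produced in the previous step:

corrected <- sapply(means, function(m) m$corrected.ave.cov) # corrected coverage per assumed CV
pp.obs <- rep(1/100, 100) # placeholder: replace with the observed posterior probabilities
weighted.corrected <- sum(pp.obs * corrected) / sum(pp.obs) # pp-weighted corrected coverage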

Questions

library(magrittr)
library(data.table)
library(simGWAS)
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("snpStats", version = "3.8")
library(snpStats)
library(coloc)


cor2 <- function (x) # Pearson correlation matrix via the scaled cross-product
{
  1/(NROW(x) - 1) * crossprod(scale(x, TRUE, TRUE))
}
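
A quick, illustrative sanity check that cor2() matches stats::cor():

set.seed(1)
m <- matrix(rnorm(50), nrow=10)
all.equal(cor2(m), cor(m)) # TRUE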

args <- list(ncse=5000,nctl=5000,hyp="h3",nreps=5)

## real data from UK10K
file.ldd="/home/cew54/share/Data/reference/lddetect/EUR/fourier_ls-chr22.bed"
file.vcf="/home/cew54/share/Data/reference/UK10K/BCF/chr22.bcf.gz"

## ldblocks
ldd <- fread(file.ldd)

## split bcf by ldblocks
ldd[,blocknum:=1:.N]
ldd[,dist:=stop-start]
ldd[,comm:=paste0("/home/cew54/share/bin/bcftools view ",file.vcf,
                  " --min-af 0.02:minor --max-alleles 2 --min-alleles 2 ",
                  " -r chr",22,":",start,"-",stop," -Ov ")] # -o ",tmp)]
gethap <- function(i) {
  y=fread(ldd$comm[i])
  ha <- simGWAS:::vcf2haps(as.matrix(y[,-c(1:9)]))
  rownames(ha) <- paste0("pos",y$POS)
  t(ha)
}

block <- sample(which(ldd$dist < 1.2e+6),1) # sample one of the smaller LD blocks (<1.2 Mb) to keep things fast
h <- gethap(block) # rows=samples, cols=snps
use <- apply(h,2,var)>0 & colMeans(h) > 0.01 & colMeans(h)<0.99 # drop monomorphic and very rare SNPs
h <- h[,use,drop=FALSE]
nmax <- 1000 # keep this small to make simulations fast
if(ncol(h)>nmax) {
  start <- sample(1:(ncol(h)-nmax),1)
  h <- h[,start:(start+nmax-1)]
} ## max 1000 snps, for speed in simulations
maf <- colMeans(h)
LD <- cor2(h)

freq <- as.data.frame(h+1)
freq$Probability <- 1/nrow(freq)
snps <- colnames(freq)[-ncol(freq)]

simdata <- function(freq,OR=1.1,nrep=100,N0=2000,N1=2000,MAF=maf) {
  snps <- colnames(freq)[-ncol(freq)] # snp names
  iCV=sample(length(MAF),1) # randomly choose the index of the causal variant
  CV=snps[iCV]
  
  FP <- make_GenoProbList(snps=snps,W=CV,freq=freq)
  zsim <- simulated_z_score(N0=N0, # number of controls
                            N1=N1, # number of cases
                            snps=snps,
                            W=CV, # causal variants, subset of snps
                            gamma.W=log(OR), # log odds ratios
                            freq=freq, # reference haplotypes
                            nrep=nrep)
  psim <- 2*pnorm(-abs(zsim)) # generate p values from z values
  
  # Generate posterior probabilities (mirroring the internals of coloc's finemap.abf)
  ppsim <- lapply(1:nrow(zsim), function(i) {
    tmp <- coloc:::approx.bf.p(p = psim[i,],
                               f=MAF, type = "cc", N=N0+N1, s=N1/(N0+N1))[,"lABF"]
    p1 <- 1e-04 # hard coded - bad code - can fix later
    nsnps <- length(tmp)
    prior <- c(1-nsnps*p1,rep(p1,nsnps))
    tmp <- c(1,tmp) # add on extra for null model
    my.denom <- coloc:::logsum(tmp + prior)
    tmp1 <- exp(tmp - my.denom)[-1] # drop null again
    tmp1/sum(tmp1)
  })  %>% do.call("rbind",.) # simulated posterior probabilities
  return(list(ppsim=ppsim,CV=CV,iCV=iCV))
}

credset <- function(pp, causal, thr=0.9,do.order=TRUE) {
  o <- if(do.order) {
    order(pp,decreasing=TRUE)
  } else {
    seq_along(pp)
  }
  
  cumpp <- cumsum(pp[o])
  
  ## index of the first snp s.t. the cumulative sum exceeds thr
  wh <- which(cumpp > thr)[1]
  
  ## these are the variants in the cred set
  credset <- o[1:wh]
  
  return(list(credset=credset,
              thr=thr,
              size=sum( pp[credset] ),
              contained=causal %in% credset))
}

wrapper2_x <- function(...) {
  n <- 5000 # or can vary sample size too
  or <- sample(c(1.001,1.05,1.1,1.15,1.2,1.25,1.3),1) # vary or
  thr <- sample(c(0.6,0.8,0.9,0.95),1) # vary threshold 
  data <- simdata(freq, OR=or, N0=n, N1=n, MAF=maf) # data is a list: ppsim matrix, CV name, iCV index
  ppsim <- lapply(1:nrow(data$ppsim), function(i) data$ppsim[i,]) # split the pp matrix into a list of rows
  
  cs <- lapply(ppsim, function(d) {
    tmp.ord <- credset(d, data$iCV, thr=thr)
    tmp.noord <- credset(d, data$iCV, thr=thr, do.order=FALSE)
    
    data.frame(order=c(TRUE,FALSE),
               thr=tmp.ord$thr,
               size=c(tmp.ord$size,tmp.noord$size),
               nvar=c(length(tmp.ord$credset),
                      length(tmp.noord$credset)),
               covered=c(tmp.ord$contained,
                         tmp.noord$contained))
  })  
  cs <- do.call("rbind",cs)
  cs$N <- n
  cs$OR <- or
  cs$thr <- thr
  cs
}

test1 <- replicate(100,wrapper2_x(),simplify = FALSE)
sims1 <- data.table::rbindlist(test1, idcol=FALSE)

sims_ord <- sims1[order==TRUE] # separate ordered and unordered simulations
sims_noord <- sims1[order==FALSE]

saveRDS(sims_ord, "sims.ord.RDS")
saveRDS(sims_noord, "sims.noord.RDS")
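
For reference, a sketch of how these saved simulations can be summarised into claimed vs empirical coverage (the grouping variables here are my choice):

summ_ord <- sims_ord[, .(claimed.cov = mean(size), emp.cov = mean(covered)), by = .(thr, OR, N)]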