Simulating credible sets (sup. meeting 3)

1) Update the `simref` function to simulate reference haplotypes

library(bindata)

# A function to simulate reference haplotypes, based on more realistic LD data

ref <- function(nsnps=100,nhaps=1000,LD1=0.08,LD2=0.02,maf=0.1){ # LD1 is value 
                                                                # eitherside of lead diagonal
  R<-diag(1,nsnps)                                              # LD2 is next diagonal out
  R[cbind(1:(nsnps-1),2:nsnps)] <- LD1
  R[cbind(2:nsnps,1:(nsnps-1))] <- LD1
  R[cbind(1:(nsnps-2),3:nsnps)] <- LD2
  R[cbind(3:nsnps,1:(nsnps-2))] <- LD2
  x=rmvbin(nhaps, rep(maf,nsnps), bincorr=R) 
  x=x+1
  x=as.data.frame(x)
  snps <- colnames(x) <- paste0("s",1:nsnps)
  x$Probability <- 1/nrow(x)
  x
}

Notes:

LD1 and LD2 are set very small to try and match the distribution of 1s and 2s in ‘freq’ (using original simref). If LD1, LD2 or MAF are set any higher then there are way too many 2s (representing the snp), and size is way above threshold.
Later, this can be adapted to make many reference haplotypes from varying LD values in order to investigate the effect of LD and MAF on coverage.

2) Update the `simdata` and `wrapper` functions to use these new reference haplotypes

# amend simdata function 

simdata_x <- function(x,OR=1.5,nrep=100,N0=1000,N1=1000,thr=0.5) {
  snps <- colnames(x)[-ncol(x)] 
  CV=sample(snps,1)
  
  FP <- make_GenoProbList(snps=snps,W=CV,freq=x) 
  zsim <- simulated_z_score(N0=N0, # number of controls
                            N1=N1, # number of cases
                            snps=snps,
                            W=CV, 
                            gamma.W=log(OR), 
                            freq=x,# reference haplotypes
                            GenoProbList=FP, 
                            nrep=nrep)
  p <- 2*pnorm(-abs(zsim)) # generate p values from z values
  
  # generate posterior probabilities using finemap.abf function
  MAF <- colMeans(x[,snps]-1) # minor allele frequencies
  PP <- matrix(0,nrow(p),ncol(p)) # will hold the posterior probs
  results <- lapply(1:nrep, function(i) {
    tmp <- subset(finemap.abf(dataset=list(pvalues=p[i,], N=N0+N1, MAF=MAF,
                                           s=N1/(N0+N1), type="cc"), p1=1e-04),snp!="null")
    tmp$SNP.PP <- tmp$SNP.PP/sum(tmp$SNP.PP) # replace post probs with the 
    # ratio of evidence for each variant being causal vrs all the others
    colnames(tmp) <- sub("\\.$","",colnames(tmp))
    colnames(tmp) <- sub("SNP.PP","PP",colnames(tmp)) # clean up column names
    tmp$CV <- sub("SNP.","",tmp$snp)==sub("s","",CV)
    tmp[,c("snp","pvalues","MAF","PP","CV")] # include all the rows and the named columns
  })
  results
}

# amend wrapper function

wrapper_x <- function(thr=0.9,...) {
  data <- simdata_x(x) # data is a list of data.frames
  
  cs <- lapply(data, function(d) {
    tmp.ord <- credset(d$PP, which(d$CV), thr=thr)
    tmp.noord <- credset(d$PP, which(d$CV), do.order=FALSE,thr=thr)
    data.frame(order=c(TRUE,FALSE),
               thr=tmp.ord$thr,
               size=c(tmp.ord$size,tmp.noord$size),
               nvar=c(length(tmp.ord$credset),
                      length(tmp.noord$credset)),
               covered=c(tmp.ord$contained,
                         tmp.noord$contained))
  })
  cs <- do.call("rbind",cs)
  cs.ord <- subset(cs,cs$order==TRUE)
  cs.noord <- subset(cs,cs$order==FALSE)
  
  c(size.ord=mean(cs.ord$size), cov.ord=mean(cs.ord$covered),
    size.noord=mean(cs.noord$size), cov.noord=mean(cs.noord$covered))
}

3) Update `wrapper2` so that it incorporates the new reference haplotypes and varies theshold values

entropy <- function(p) -mean(log(p))

wrapper2_x <- function(...) {
  n <- sample(1:5,1)*1000 # vary sample size
  or <- sample(c(1,1.05,1.1,1.2,1.3),1) # vary or
  thr <- sample(c(0.5,0.6,0.7,0.8,0.9),1) # vary threshold
  data <- simdata_x(x,N0=n,N1=n,OR=or) #,...) # data is a list of data.frames
  
  cs <- lapply(data, function(d) {
    tmp.ord <- credset(d$PP, which(d$CV), thr=thr)
    tmp.noord <- credset(d$PP, which(d$CV), do.order=FALSE,thr=thr)
    data.frame(order=c(TRUE,FALSE),
               thr=tmp.ord$thr,
               size=c(tmp.ord$size,tmp.noord$size),
               nvar=c(length(tmp.ord$credset),
                      length(tmp.noord$credset)),
               covered=c(tmp.ord$contained,
                         tmp.noord$contained))
  })  
  cs <- do.call("rbind",cs)
  ent <- sapply(data, function(x) entropy(x$PP))
  cs$entropy <- rep(ent,each=2)
  cs$N <- n
  cs$OR <- or
  cs$thr <- thr
  cs
}
# note, still need x <- ref() command prior to use

`Wrapper2_x()` algorithm

Randomly sample values for threshold, OR and N.

Generate 100 ‘systems’ (posterior probability systems) using these values.

For each system, generate one credible set using ordered posterior probabilities and one credible set using non-ordered posterior probabilities, obtaining 200 credible sets from 100 systems.

For each credible set calculate the sum of posterior probabilities of elements in the credible set (size), number of variables in the credible set (nvar), whether the causal variant is contained in the credible set (covered) and the entropy. Note that the entropy is the same for each system, resulting in only 100 values.

Replicate to repeat the process using different combinations of threshold, OR and N values.

Example simulation

The following code simulates 20,000 credible sets from 100 different combinations of threshold, OR and N values. For each of these combinations, 100 systems are considered and two credible sets derived from each of these (one using ordered pps and one using non-ordered pps).

x <- ref()
example_sim <- replicate(100,wrapper2_x(), simplify=FALSE)
example_sim <- data.table::rbindlist(example_sim)
dim(example_sim)

## [1] 20000     8

# split data into ordered and non-ordered sets
example_sim_ord <- example_sim[order=="TRUE"]
example_sim_noord <- example_sim[order=="FALSE"]
head(example_sim_ord)

##    order thr      size nvar covered  entropy    N  OR
## 1:  TRUE 0.9 0.9005356   80    TRUE 4.859103 5000 1.1
## 2:  TRUE 0.9 0.9019405   80    TRUE 4.847719 5000 1.1
## 3:  TRUE 0.9 0.9025467   71    TRUE 5.159823 5000 1.1
## 4:  TRUE 0.9 0.9031222   72    TRUE 5.102479 5000 1.1
## 5:  TRUE 0.9 0.9013108   80    TRUE 4.867697 5000 1.1
## 6:  TRUE 0.9 0.9011400   78   FALSE 4.948458 5000 1.1

tail(example_sim_ord)

##    order thr      size nvar covered  entropy    N  OR
## 1:  TRUE 0.8 0.8041393   67    TRUE 4.755711 1000 1.3
## 2:  TRUE 0.8 0.8017135   59    TRUE 4.985643 1000 1.3
## 3:  TRUE 0.8 0.8017345   59    TRUE 4.938271 1000 1.3
## 4:  TRUE 0.8 0.8037415   62    TRUE 4.904680 1000 1.3
## 5:  TRUE 0.8 0.8054744   72    TRUE 4.699685 1000 1.3
## 6:  TRUE 0.8 0.8010807   73    TRUE 4.684148 1000 1.3

head(example_sim_noord)

##    order thr      size nvar covered  entropy    N  OR
## 1: FALSE 0.9 0.9038389   86    TRUE 4.859103 5000 1.1
## 2: FALSE 0.9 0.9033891   90    TRUE 4.847719 5000 1.1
## 3: FALSE 0.9 0.9199295   86    TRUE 5.159823 5000 1.1
## 4: FALSE 0.9 0.9020467   88    TRUE 5.102479 5000 1.1
## 5: FALSE 0.9 0.9081359   94    TRUE 4.867697 5000 1.1
## 6: FALSE 0.9 1.0000000  100    TRUE 4.948458 5000 1.1

tail(example_sim_noord)

##    order thr      size nvar covered  entropy    N  OR
## 1: FALSE 0.8 0.8008385   89   FALSE 4.755711 1000 1.3
## 2: FALSE 0.8 0.9705256   95    TRUE 4.985643 1000 1.3
## 3: FALSE 0.8 0.8588590   84    TRUE 4.938271 1000 1.3
## 4: FALSE 0.8 0.8027812   77   FALSE 4.904680 1000 1.3
## 5: FALSE 0.8 0.8039018   82    TRUE 4.699685 1000 1.3
## 6: FALSE 0.8 0.8038397   86   FALSE 4.684148 1000 1.3

Simulating credible sets (sup. meeting 3)

Anna Hutchinson

16/10/2018

1) Update the `simref` function to simulate reference haplotypes

2) Update the `simdata` and `wrapper` functions to use these new reference haplotypes

3) Update `wrapper2` so that it incorporates the new reference haplotypes and varies theshold values

`Wrapper2_x()` algorithm

Example simulation

Simulating credible sets (sup. meeting 3)

Anna Hutchinson

16/10/2018

1) Update the simref function to simulate reference haplotypes

2) Update the simdata and wrapper functions to use these new reference haplotypes

3) Update wrapper2 so that it incorporates the new reference haplotypes and varies theshold values

Wrapper2_x() algorithm

Example simulation

1) Update the `simref` function to simulate reference haplotypes

2) Update the `simdata` and `wrapper` functions to use these new reference haplotypes

3) Update `wrapper2` so that it incorporates the new reference haplotypes and varies theshold values

`Wrapper2_x()` algorithm