1. Speeding up functions

ERR = mvtnorm:::rmvnorm(nrep, rep(0, ncol(Sigma)), Sigma)
pp_ERR = function(Zj, nrep, Sigma) {
    exp.zm = Zj %*% Sigma
    mexp.zm = matrix(exp.zm, nrep, length(Zj), byrow = TRUE)  # matrix of Zj replicated in each row
    zstar = mexp.zm + ERR
    bf = 0.5 * (log(1 - r) + (r * zstar^2))
    denom = coloc:::logsum(bf)  
    pp.tmp = exp(bf - denom)  
    pp.tmp/rowSums(pp.tmp)
}

ERR = rmvnorm(nrep, rep(0, ncol(Sigma)), Sigma)
simzstar <- function(Zj) {
    mZj = matrix(Zj, nrep, length(Zj), byrow = TRUE)  # matrix of Zj replicated in each row
    ii = which(Zj != 0)  # currently considered causal snp
    mZj[, ii] = mZj[, ii] - ERR[, ii]
    exp.zm = mZj %*% Sigma
    exp.zm + ERR
}

2. Compare corrcov and corrcov_bhat functions


3. Obtaining required threshold

#' corrected_cs
#'
#' @param z Z-scores
#' @param f Minor allele frequencies
#' @param N0 Number of controls
#' @param N1 Number of cases
#' @param Sigma Correlation matrix of SNPs
#' @param lower Lower threshold
#' @param upper Upper threshold
#' @param desired.cov The desired coverage of the causal variant in the credible set
#' @param acc Accuracy of corrected coverage to desired coverage (default = 0.005)
#' @param max.iter Maximum iterations (default = 20)
#'
#' @return list of variants in credible set, required threshold, the corrected coverage and the size of the credible set
#' @export
corrected_cs <- function(z, f, N0, N1, Sigma, lower, upper, desired.cov, acc = 0.005, max.iter = 20){
  # lower <- 2*desired.cov - 1
  # upper <- min(1,desired.cov + 0.05)
  s = N1/(N0+N1) # proportion of cases
  V = 1/(2 * (N0+N1) * f * (1 - f) * s * (1 - s))
  W = 0.2
  r <- W^2/(W^2 + V)
  pp <- ppfunc(z, V = V) # pp of system in question
  muhat = est_mu(z, f, N0, N1)
  nsnps = length(pp)
  temp = diag(x = muhat, nrow = nsnps, ncol = nsnps)
  zj = lapply(seq_len(nrow(temp)), function(i) temp[i,]) # nsnp zj vectors for each snp considered causal
  nrep = 1000
  
  # simulate ERR matrix
  ERR = mvtnorm:::rmvnorm(nrep, rep(0,ncol(Sigma)), Sigma)
  pp_ERR = function(Zj){
    exp.zm = Zj %*% Sigma
    mexp.zm = matrix(exp.zm, nrep, length(Zj), byrow=TRUE) # matrix of Zj replicated in each row
    zstar = mexp.zm+ERR
    bf = 0.5 * (log(1 - r) + (r * zstar^2))
    denom = coloc:::logsum(bf)  # logsum(x) = max(x) + log(sum(exp(x - max(x)))) so sum is not inf
    pp.tmp = exp(bf - denom)  # convert back from log scale
    pp.tmp / rowSums(pp.tmp)
  }
  # simulate pp systems
  pps <- mapply(pp_ERR, zj, SIMPLIFY = FALSE)

  n_pps <- length(pps)
  args <- 1:length(pp)

  f <- function(thr){ # finds the difference between corrcov and desired.cov
    d5 <- lapply(1:n_pps, function(x) {
      credsetC(pps[[x]], CV = rep(args[x], dim(pps[[x]])[1]), thr = thr)
    })
    prop_cov <- lapply(d5, prop_cov) %>% unlist()
    sum(prop_cov * pp) - desired.cov
  }

  # initalize
  N=1
  fa = f(lower)
  fb = f(upper)

  if (fa * fb > 0) {
    stop("No root in range, increase window")
  } else {
    fc = min(fa, fb)
    while (N <= max.iter & !dplyr::between(fc, 0, acc)) {
      c = lower + (upper-lower)/2
      fc = f(c)
      print(paste("thr: ", c, ", cov: ", desired.cov + fc))

      if (fa * fc < 0) {
        upper = c
        fb = fc
      } else if (f(upper) * fc < 0) {
        lower = c
        fa = fc
      }
      N = N + 1
    }
    # df <- data.frame(req.thr = c, corr.cov = desired.cov + fc)
  }
  o <- order(pp, decreasing = TRUE)  # order index for true pp
  cumpp <- cumsum(pp[o])  # cum sums of ordered pps
  wh <- which(cumpp > c)[1]  # how many needed to exceed thr
  list(credset = names(pp)[o[1:wh]], req.thr = c, corr.cov = desired.cov + fc, size = cumpp[wh])
}

Is the true coverage of the new credible set approx. 0.8?

  • I ran some simulations using this corrected_cs function. The first plot shows the true empirical coverage of the simulated credible sets grouped by true \(\mu\) value (approx. 500-1000 simulations in each group).

    • Left: The original 80% credible sets obtained using the standard Bayesian approach
    • Right: The new credible sets obtained using the corrected_cs function (which finds the required threshold and uses the standard Bayesian approach with this new threshold).

  • The function works well at providing a new credible set with nearer to the desired coverage, however I am wary of how many are below 0.8 and the variability of the true coverage estimates. These are artefacts of our correction (since all the corrected coverages values for the points on the right plot are between 0.8 and 0.8005)

  • Side note: Why is there a ‘dip’ in the left plot?


Relative Error

  • The following plots are to visualise the relative error of the corrected and claimed coverage estimates of the original credible sets and the new ones obtained using the corrected_cs function.

  • Firstly, for the original credible sets:

  • Secondly, for the new credible sets:


Differences in nvar

  • The table below shows the summary statistics for the difference between the number of variants in the original 80% credible set and the new 80% credible set (using the corrected_cs function). I.e. how many variants have been removed.

  • Obviously fewer variants are removed in high power scenarios as the original credible set often only has 1 variant in it.

  • Infact, where we would use fine-mapping there is often only one variant in the credible set anyway so most of the time the credible set will stay the same? I think that our correction method will work best in high linkage disequilibrium cases where there are many variants in the set.

##         bin       mean median low.error high.error
## 1:    (0,4] 11.4607724      8         1         22
## 2: (4,4.75]  4.5208597      1         1          3
## 3: (4.75,6]  2.2918862      1         1          1
## 4:    (6,8]  0.8317152      1         0          1
## 5:   (8,10]  0.6227390      1         0          1
## 6:  (10,13]  0.6385542      1         0          1
## 7: (13,Inf]  0.5674419      1         0          1

4. T1D


Method (case-control)

  1. Single-SNP logistic regression for T1D versus additive genotype score (0, 1 or 2 risk alleles), adjusted for sex and regions in the UK.

  2. Used forward stepwise analyses to investigate secondary independent associations in each region. I.e. fit a new logistic regression model with the most significantly associated SNP as a covariate. Repeat, building up the covariates in the model until no more SNPs are significant.

  3. For each index SNP (\(i\)), consider all SNPs within a 50-kb window and calculate the ABFs comparing models containing the index SNP \(i\), and each alternative SNP \(j\). Where, \[-2log(ABF_{ij})=BIC_i-BIC_j\]
  4. Estimate the probability that any individual SNP \(j\) is the causal variant responsible for that signal by, \[PP_j=\frac{BIC_j}{sum(BIC_j)}\]
  5. Create 99% credible sets of SNPs by ordering these and cumulatively summing until the sum exceeds 0.99.

  6. Further modified these credible sets by
    • Expanding the set: Include neighbouring SNPs which did not pass genotyping on the Immunochip and are in high LD with a SNP in the credible set.
    • Filtering: Based on those which overlapped with enhancers in tissues that showed enrichment and those that were nonsynonymous.

Problems:

  • My calculated posterior probabilities do not match those in the paper. Perhaps due to cutting out SNPs not in LD matrix, rounding or the method used to calculate pps in the paper. Why are they looking at ABF comparing index SNP causal vrs each of the others rather than ABF comparing each SNP causal vrs the null?

  • Getting this error: In mvtnorm:::rmvnorm(nrep, rep(0, ncol(Sigma)), Sigma) : sigma is numerically not positive semidefinite.


  • Anyhow, I have ran an array job that finds the corrected coverage of the 99% credible sets from the T1D data and (if possible) finds a new credible set with corrected coverage >= 0.99 using the corrcoverage package.

    • their_cs_nvar: The number of variants in the credible set reported in the paper
    • our_cs_nvar: The number of variants in the credible set I found by following the standard Bayesian method
    • our_cs_corrcov: The corrected coverage estimate of the credible set using the standard Bayesian approach
    • new_cs_nvar: The number of variants in the new credible set found using the corrected_cs function
    • new_corrcov: The corrected coverage estimate of the new credible set
##     index_SNP their_cs_nvar our_cs_nvar our_cs_corrcov new_cs_nvar new_corrcov
## 1  rs10277986            26          11      0.9732795          17   0.9903420
## 2  rs11203202             5           4      0.9960134           4   0.9904442
## 3  rs11954020           136          83      0.9941234          74   0.9904929
## 4  rs12416116             7           6      0.9902935           5   0.9900865
## 5  rs12453507           112          65      0.9921685          63   0.9904078
## 6  rs12927355            41          20      0.9693932          24   0.9904093
## 7  rs13415583           139          92      0.9889096          97   0.9900088
## 8   rs1456988            40          26      0.9915964          26   0.9904314
## 9    rs151234             0          10      0.9848938          20   0.9904777
## 10  rs1538171           154          87      0.9701159         101   0.9904821
## 11  rs1615504            40          26      0.9920699          25   0.9904854
## 12  rs1893217            35          11      0.9478510          32   0.9900666
## 13   rs193778           102          23      0.9932024          24   0.9903892
## 14  rs2111485             4           4      0.9822731           4   0.9901479
## 15   rs229533            15           8      0.9937547           5   0.9902300
## 16  rs2476601             2           2      0.7986403           2   0.8968245
## 17  rs3024505             9           7      0.9965116           5   0.9901786
## 18  rs3087243            30          21      0.9523659          44   0.9902696
## 19 rs34593439             3           2      0.9934904           2   0.9903248
## 20   rs402072            25          14      0.9948048          12   0.9904155
## 21  rs4820830           117          59      0.9812119          67   0.9900985
## 22   rs516246            36          12      0.9863866          12   0.9903131
## 23 rs56994090            11           8      0.9974704           8   0.9902434
## 24  rs6043409            39          16      0.9956430          10   0.9901013
## 25 rs61839660             5           4      0.9512246           7   0.9900678
## 26 rs62447205            56          40      0.9824468          55   0.9901434
## 27  rs6476839            21          18      0.9947390          17   0.9903600
## 28  rs6518350            68          41      0.9975402          26   0.9900527
## 29   rs653178             5           2      0.8339680           5   0.9805488
## 30   rs705705            22           9      0.9297544          19   0.9903935
## 31 rs72727394            32          19      0.9946555          19   0.9902028
## 32 rs72928038            56          36      0.9934878          34   0.9904087
## 33   rs757411            82          54      0.9904337          53   0.9900654
## 34 rs75793288            34          22      0.9695837          35   0.9901022
## 35  rs8056814            22           8      0.9671638          14   0.9902707
## 36   rs917911           135         133      0.9950358          48   0.9902130
## 37  rs9585056             7           3      0.9969532           1   0.9898862
  • It seems that our method works well at increasing the number of variants in the set when the corrected coverage estimate is too low (e.g. rs61839660) and removing variants from the set when the corrected coverage estimate is too high (e.g. rs11954020).

5. Next Steps + Qs