cFDR


I derive a cFDR estimator for a continuous auxillary covariate (q):

\[\begin{equation} \begin{split} \widehat{cFDR}(p,q) &= P(H_0^p\text{ | }P\leq p, Q\leq q)\\[2ex] &= \dfrac{P(P\leq p\text{ | }H_0^p, Q\leq q) P(H_0^p\text{ | }Q\leq q)}{P(P\leq p \text{ | }Q\leq q)}\\[2ex] &= \dfrac{P(P\leq p\text{ | }H_0^p)}{\dfrac{P(P\leq p, Q\leq q)}{P(Q\leq q)}} P(H_0^p\text{ | }Q\leq q)\\[2ex] &= \dfrac{p P(Q\leq q)}{P(P\leq p, Q\leq q)} P(H_0^p\text{ | }Q\leq q)\\[2ex] &= \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} P(H_0^p\text{ | }Q\leq q) \end{split} \end{equation}\]

So that,

\[\begin{equation} \widehat{cFDR}_{unadj}(p,q) = \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} \end{equation}\]

and

\[\begin{equation} \widehat{cFDR}_{adj}(p,q) = \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} \widehat{P(H_0^p\text{ | }Q\leq q)} \end{equation}\]

where \(\widehat{P(H_0^p\text{ | }Q\leq q)}=\dfrac{P(Q\leq q|p>1/2)}{P(Q\leq q)}\).


pq_full <- readRDS("realdata.RDS")

par(mfrow = c(1,2))
hist(pq_full$p, breaks = 100, main = "Histogram of P")
hist(pq_full$q, breaks = 100, main = "Histogram of Q")

chr1 = which(pq_full$chr == 1)
candidate_indices = which(pq_full$p < 0.0005 | pq_full$q > 1)

indices = intersect(chr1, candidate_indices)

fold = chr1 # leave out these indices

mode = 2 # leave one-chromosome-out

adj = TRUE # new method does adjustment separately

nt = 5000 # resolution of x
nv = 1000 # resolution of L curves (q)

res_p = 500 # resolution for KDE estimates
res_q = 200

p <- pq_full$p
q <- pq_full$q

gx <- 10^-30

zp <- -qnorm(pq_full$p/2)

maxx = max(c(10, zp))

# zp[which(zp > maxx)] = maxx

maxy = max(q)
miny = min(q)

if (mode==2) sub = setdiff(1:length(zp), fold) else sub = 1:length(zp)

# define limits of q by adding 20% either side
q_20 <- (max(q[sub]) - min(q[sub])) * 0.2

# define limits of p similar to james
lims=c(0, max(12, maxx), miny-q_20, maxy+q_20) # c(xl, xu, yl, yu)

kpq = kde2d(zp[sub], q[sub], n=c(res_p, res_q), lims=lims) 

df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(kpq$z))

ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('KDE of P,Q')

# need Fdr rather than fdr to sum to get P(P\leq p, Q\leq q)
# think carefully about this bit because James' code looks for ZQ>zq (we need Q<q)
# int2_kpq=t(apply(apply(kpq$z[res:1,res:1],2,cumsum),1,cumsum))[res:1,res:1]

# int2_kpq estimates P(ZP\geq zp, Q\leq q)
int2_kpq = t(apply(apply(kpq$z[res_p:1,],2,cumsum),1,cumsum))[res_p:1,]

df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(int2_kpq))

ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('"Integration" (summing) for P(ZP>=zp, Q<= q)')

int2_kpq[which(int2_kpq==0)] = min(int2_kpq[which(int2_kpq>0)]) # avoid 0/0 error

# int_kpq estimates P(ZP\geq 0, Q\leq q) i.e. P(Q\leq q)
# this is replicated in each row
int_kpq = t(outer(int2_kpq[1,], rep(1,res_p))) 
par(mfrow = c(1,2))

plot(kpq$y, int_kpq[1,], type = "l", xlab = "q", ylab = "row of int_kpq", main = "KDE est of P(Q<= q) (int_kpq)")

# check with that from original method
qs  = q[sub]

kq = density(qs, from=lims[3], to=lims[4], n=res_q)
plot(kq$x, cumsum(kq$y), xlab = "q", ylab = "Density", main = "KDE est of P(Q<= q) (density func)", type = "l")

# why are y axis scales different?

# int_p is P(P\leq p)
int_p = outer(2*pnorm(-kpq$x), rep(1,res_q)) # rep P vals in all cols (biggest to smallest)

# kgrid is denominator of cfdr 
# P(ZP\geq zp | Q\leq q) = P(ZP\geq zp, Q\leq q)/P(Q\leq q)
kgrid = kpq
kgrid$z = int2_kpq/int_kpq

df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(kgrid$z))

ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kgrid$z") + ggtitle('kgrid estimates P(ZP>= zp | Q<= q) = P(ZP>= zp, Q<= q)/P(Q<= q)')

# not sure why this is here
#kgrid$z[which(kgrid$z < 1/length(sub))] = 1/length(sub)

# df <- data.frame(ZP = rep(kpq$x, times = 200), Q = rep(kpq$y, each = 200), z = as.vector(kgrid$z))

#ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kgrid$z") + ggtitle('kgrid estimates P(ZP>= zp | Q<= q) = P(ZP>= zp, Q<= q)/P(Q<= q)')

# cgrid is unadjusted cFDR = ( f(Q\leq q) * p ) / f(P\leq p, Q\leq q)
cgrid = kgrid
cgrid$z = int_p/kgrid$z
cgrid$z = apply(cgrid$z, 2, cummin) # make non-increasing
cgrid$z = apply(cgrid$z, 2, function(x) x-gx*kpq$x+gx*maxx) # make strictly decreasing

apply(cgrid$z, 2, function(x) all(diff(x)))
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [23]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [34]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [45]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [56]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [67]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [78]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [89]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [100]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [122]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [144]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [155]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [166]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [177]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [188]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [199]  TRUE  TRUE
#par(mfrow = c(1,2))
#plot(cgrid$z[,1])
#all(diff(cgrid$z[,1]) < 0)
#plot(cgrid$z[,1]-gx*kpq$x+gx*maxx)
#all(diff(cgrid$z[,1]-gx*kpq$x+gx*maxx) < 0)

df <- data.frame(ZP = rep(cgrid$x, times = res_q), ZQ = rep(cgrid$y, each = res_p), z = as.vector(cgrid$z))

ggplot(data = df, aes(x = ZP, y = z, group = ZQ, col = ZQ)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('Unadjusted cFdr')

# find unadjusted cFdr value for each p,q pair
if (!is.null(indices)) ccut = interp.surface(cgrid, cbind(pmin(zp[indices],maxx), pmin(q[indices],maxy))) 
out=rep(0,length(ccut))

# check unadjusted cfdr<=1
max(ccut)
## [1] 0.9994525
yval2 = seq(miny, maxy, length.out=nv+1)[1:nv]
xval2 = outer(rep(1,length(ccut)), yval2)
xtest = seq(0, maxx, length.out=nt)
ptest = 2*pnorm(-xtest)


# adjustment for P(H0|Q\leq q)
if (adj) {
  correct = cummin((1 + ecdf(q[sub][which(p[sub]>0.5)])(yval2)*length(p[sub]))/
                     (1 + ecdf(q[sub])(yval2)*length(p[sub])))
  if (!is.null(indices)) correct_ccut = approx(yval2, correct, q[indices],rule=2)$y #cummin((1+ecdf(pc[which(p>0.5)])(pc)*length(p))/(1+ rank(pc)))
} else {
  correct=rep(1,length(pval2)) # adjustment factor for q[i]
  correct_ccut=rep(1,length(ccut))
}

if (!is.null(indices)) ccut = ccut*correct_ccut

# check adjusted cfdr<=1
max(ccut)
## [1] 0.9954825
for (i in 1:length(yval2)) {
  w = which(q < yval2[i])
  xdenom = interp.surface(kgrid, cbind(xtest, rep(yval2[i], length(xtest))))
  if (length(w) > 2) {
    cfx = cummin(ptest/xdenom)
    xval2[,i] = approx(cfx-gx*xtest  + gx*maxx, xtest, ccut/correct[i], rule=2, method="const", f=1)$y
    #} else xval2[,i]=-qnorm(ccut/2)    
  } else if (i>1) xval2[,i]=xval2[,i-1] 
  else xval2[,i]=-qnorm(ccut/2)    
}

plot(cfx-gx*xtest  + gx*maxx, xtest, xlab = "adjusted cfdr (red line at ccut)")
abline(v = ccut[length(indices)]/correct[i], lty = "dashed", col = "red")

scale = "p"

if (scale[1]=="p") {
  X=2*pnorm(-abs(xval2))
} else {
  X=xval2
}

Y=yval2

L = list(x=X,y=Y)

plotL <- function(L,xlab="p",ylab="q",...) {
  plot(0,xlim=c(min(L$x),max(L$x)),ylim=c(min(L$y),max(L$y)),xlab=xlab,ylab=ylab,...)
  for(i in 1:nrow(L$x))
    lines(L$x[i,],L$y)
}

par(mfrow = c(1,1))

plotL(L, main = "L curves")

plot(L$x[199,], L$y, cex = 0.4, pch = 16, type = "l", main = "Example L curve", xlab = "p", ylab = "q")

######## now I have my L curves I need to find the joint null to integrate (the PDF not the CDF)

# ZP|H0
fzph0 = function(x) dnorm(x)

# P | H0 
fph0 = function(x) dunif(x)

# Q|H0
qs  = q[sub][which(p[sub]>0.5)]
kq = density(qs, from=lims[3], to=lims[4], n=res_q)

fqh0 <- approxfun(kq$x, kq$y)

integrate(fqh0, lims[3], lims[4])
## 1.000979 with absolute error < 6.1e-05
fpzqh0 <- function(s){
  fph0(s[,1])*fqh0(s[,2])
}

# check I'm defining the L regions correctly
plot(owin(poly=list(x = c(L$x[2,nv], 0,0, L$x[2,]), y = c(L$y[nv], max(L$y), min(L$y), L$y))))

# find v values
v_p <- apply(L$x, 1, function(x) {
  return(tryCatch(polyCub(owin(poly=list(x = c(x[nv], 0,0, x), y = c(L$y[nv], max(L$y), min(L$y), L$y)), check = FALSE, fix  = FALSE), fpzqh0), error = function(e) NA))
}) %>% unlist()

df <- data.frame(P = p[indices], v_p, Q = q[indices])

ggplot(df, aes(x = -log10(P) , y = -log10(v_p), col = Q)) + geom_point() + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + geom_abline(intercept = 0, slope = 1,  linetype="dashed") + xlab("P (-log10)") + ylab("V (-log10)") + coord_cartesian(xlim = c(0,30))


Other things


Full genome

I’ve ran my method for all 121,000 SNPs.:


To do