I derive a cFDR estimator for a continuous auxillary covariate (q):
\[\begin{equation} \begin{split} \widehat{cFDR}(p,q) &= P(H_0^p\text{ | }P\leq p, Q\leq q)\\[2ex] &= \dfrac{P(P\leq p\text{ | }H_0^p, Q\leq q) P(H_0^p\text{ | }Q\leq q)}{P(P\leq p \text{ | }Q\leq q)}\\[2ex] &= \dfrac{P(P\leq p\text{ | }H_0^p)}{\dfrac{P(P\leq p, Q\leq q)}{P(Q\leq q)}} P(H_0^p\text{ | }Q\leq q)\\[2ex] &= \dfrac{p P(Q\leq q)}{P(P\leq p, Q\leq q)} P(H_0^p\text{ | }Q\leq q)\\[2ex] &= \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} P(H_0^p\text{ | }Q\leq q) \end{split} \end{equation}\]
So that,
\[\begin{equation} \widehat{cFDR}_{unadj}(p,q) = \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} \end{equation}\]
and
\[\begin{equation} \widehat{cFDR}_{adj}(p,q) = \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} \widehat{P(H_0^p\text{ | }Q\leq q)} \end{equation}\]
where \(\widehat{P(H_0^p\text{ | }Q\leq q)}=\dfrac{P(Q\leq q|p>1/2)}{P(Q\leq q)}\).
pq_full <- readRDS("realdata.RDS")
par(mfrow = c(1,2))
hist(pq_full$p, breaks = 100, main = "Histogram of P")
hist(pq_full$q, breaks = 100, main = "Histogram of Q")
chr1 = which(pq_full$chr == 1)
candidate_indices = which(pq_full$p < 0.0005 | pq_full$q > 1)
indices = intersect(chr1, candidate_indices)
fold = chr1 # leave out these indices
mode = 2 # leave one-chromosome-out
adj = TRUE # new method does adjustment separately
nt = 5000 # resolution of x
nv = 1000 # resolution of L curves (q)
res_p = 500 # resolution for KDE estimates
res_q = 200
p <- pq_full$p
q <- pq_full$q
gx <- 10^-30
zp <- -qnorm(pq_full$p/2)
maxx = max(c(10, zp))
# zp[which(zp > maxx)] = maxx
maxy = max(q)
miny = min(q)
if (mode==2) sub = setdiff(1:length(zp), fold) else sub = 1:length(zp)
# define limits of q by adding 20% either side
q_20 <- (max(q[sub]) - min(q[sub])) * 0.2
# define limits of p similar to james
lims=c(0, max(12, maxx), miny-q_20, maxy+q_20) # c(xl, xu, yl, yu)
kpq = kde2d(zp[sub], q[sub], n=c(res_p, res_q), lims=lims)
df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(kpq$z))
ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('KDE of P,Q')
# need Fdr rather than fdr to sum to get P(P\leq p, Q\leq q)
# think carefully about this bit because James' code looks for ZQ>zq (we need Q<q)
# int2_kpq=t(apply(apply(kpq$z[res:1,res:1],2,cumsum),1,cumsum))[res:1,res:1]
# int2_kpq estimates P(ZP\geq zp, Q\leq q)
int2_kpq = t(apply(apply(kpq$z[res_p:1,],2,cumsum),1,cumsum))[res_p:1,]
df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(int2_kpq))
ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('"Integration" (summing) for P(ZP>=zp, Q<= q)')
int2_kpq[which(int2_kpq==0)] = min(int2_kpq[which(int2_kpq>0)]) # avoid 0/0 error
# int_kpq estimates P(ZP\geq 0, Q\leq q) i.e. P(Q\leq q)
# this is replicated in each row
int_kpq = t(outer(int2_kpq[1,], rep(1,res_p)))
par(mfrow = c(1,2))
plot(kpq$y, int_kpq[1,], type = "l", xlab = "q", ylab = "row of int_kpq", main = "KDE est of P(Q<= q) (int_kpq)")
# check with that from original method
qs = q[sub]
kq = density(qs, from=lims[3], to=lims[4], n=res_q)
plot(kq$x, cumsum(kq$y), xlab = "q", ylab = "Density", main = "KDE est of P(Q<= q) (density func)", type = "l")
# why are y axis scales different?
# int_p is P(P\leq p)
int_p = outer(2*pnorm(-kpq$x), rep(1,res_q)) # rep P vals in all cols (biggest to smallest)
# kgrid is denominator of cfdr
# P(ZP\geq zp | Q\leq q) = P(ZP\geq zp, Q\leq q)/P(Q\leq q)
kgrid = kpq
kgrid$z = int2_kpq/int_kpq
df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(kgrid$z))
ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kgrid$z") + ggtitle('kgrid estimates P(ZP>= zp | Q<= q) = P(ZP>= zp, Q<= q)/P(Q<= q)')
# not sure why this is here
#kgrid$z[which(kgrid$z < 1/length(sub))] = 1/length(sub)
# df <- data.frame(ZP = rep(kpq$x, times = 200), Q = rep(kpq$y, each = 200), z = as.vector(kgrid$z))
#ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kgrid$z") + ggtitle('kgrid estimates P(ZP>= zp | Q<= q) = P(ZP>= zp, Q<= q)/P(Q<= q)')
# cgrid is unadjusted cFDR = ( f(Q\leq q) * p ) / f(P\leq p, Q\leq q)
cgrid = kgrid
cgrid$z = int_p/kgrid$z
cgrid$z = apply(cgrid$z, 2, cummin) # make non-increasing
cgrid$z = apply(cgrid$z, 2, function(x) x-gx*kpq$x+gx*maxx) # make strictly decreasing
apply(cgrid$z, 2, function(x) all(diff(x)))
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [45] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [56] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [67] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [89] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [100] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [111] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [122] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [144] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [177] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [188] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [199] TRUE TRUE
#par(mfrow = c(1,2))
#plot(cgrid$z[,1])
#all(diff(cgrid$z[,1]) < 0)
#plot(cgrid$z[,1]-gx*kpq$x+gx*maxx)
#all(diff(cgrid$z[,1]-gx*kpq$x+gx*maxx) < 0)
df <- data.frame(ZP = rep(cgrid$x, times = res_q), ZQ = rep(cgrid$y, each = res_p), z = as.vector(cgrid$z))
ggplot(data = df, aes(x = ZP, y = z, group = ZQ, col = ZQ)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('Unadjusted cFdr')
# find unadjusted cFdr value for each p,q pair
if (!is.null(indices)) ccut = interp.surface(cgrid, cbind(pmin(zp[indices],maxx), pmin(q[indices],maxy)))
out=rep(0,length(ccut))
# check unadjusted cfdr<=1
max(ccut)
## [1] 0.9994525
yval2 = seq(miny, maxy, length.out=nv+1)[1:nv]
xval2 = outer(rep(1,length(ccut)), yval2)
xtest = seq(0, maxx, length.out=nt)
ptest = 2*pnorm(-xtest)
# adjustment for P(H0|Q\leq q)
if (adj) {
correct = cummin((1 + ecdf(q[sub][which(p[sub]>0.5)])(yval2)*length(p[sub]))/
(1 + ecdf(q[sub])(yval2)*length(p[sub])))
if (!is.null(indices)) correct_ccut = approx(yval2, correct, q[indices],rule=2)$y #cummin((1+ecdf(pc[which(p>0.5)])(pc)*length(p))/(1+ rank(pc)))
} else {
correct=rep(1,length(pval2)) # adjustment factor for q[i]
correct_ccut=rep(1,length(ccut))
}
if (!is.null(indices)) ccut = ccut*correct_ccut
# check adjusted cfdr<=1
max(ccut)
## [1] 0.9954825
for (i in 1:length(yval2)) {
w = which(q < yval2[i])
xdenom = interp.surface(kgrid, cbind(xtest, rep(yval2[i], length(xtest))))
if (length(w) > 2) {
cfx = cummin(ptest/xdenom)
xval2[,i] = approx(cfx-gx*xtest + gx*maxx, xtest, ccut/correct[i], rule=2, method="const", f=1)$y
#} else xval2[,i]=-qnorm(ccut/2)
} else if (i>1) xval2[,i]=xval2[,i-1]
else xval2[,i]=-qnorm(ccut/2)
}
plot(cfx-gx*xtest + gx*maxx, xtest, xlab = "adjusted cfdr (red line at ccut)")
abline(v = ccut[length(indices)]/correct[i], lty = "dashed", col = "red")
scale = "p"
if (scale[1]=="p") {
X=2*pnorm(-abs(xval2))
} else {
X=xval2
}
Y=yval2
L = list(x=X,y=Y)
plotL <- function(L,xlab="p",ylab="q",...) {
plot(0,xlim=c(min(L$x),max(L$x)),ylim=c(min(L$y),max(L$y)),xlab=xlab,ylab=ylab,...)
for(i in 1:nrow(L$x))
lines(L$x[i,],L$y)
}
par(mfrow = c(1,1))
plotL(L, main = "L curves")
plot(L$x[199,], L$y, cex = 0.4, pch = 16, type = "l", main = "Example L curve", xlab = "p", ylab = "q")
######## now I have my L curves I need to find the joint null to integrate (the PDF not the CDF)
# ZP|H0
fzph0 = function(x) dnorm(x)
# P | H0
fph0 = function(x) dunif(x)
# Q|H0
qs = q[sub][which(p[sub]>0.5)]
kq = density(qs, from=lims[3], to=lims[4], n=res_q)
fqh0 <- approxfun(kq$x, kq$y)
integrate(fqh0, lims[3], lims[4])
## 1.000979 with absolute error < 6.1e-05
fpzqh0 <- function(s){
fph0(s[,1])*fqh0(s[,2])
}
# check I'm defining the L regions correctly
plot(owin(poly=list(x = c(L$x[2,nv], 0,0, L$x[2,]), y = c(L$y[nv], max(L$y), min(L$y), L$y))))
# find v values
v_p <- apply(L$x, 1, function(x) {
return(tryCatch(polyCub(owin(poly=list(x = c(x[nv], 0,0, x), y = c(L$y[nv], max(L$y), min(L$y), L$y)), check = FALSE, fix = FALSE), fpzqh0), error = function(e) NA))
}) %>% unlist()
df <- data.frame(P = p[indices], v_p, Q = q[indices])
ggplot(df, aes(x = -log10(P) , y = -log10(v_p), col = Q)) + geom_point() + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + geom_abline(intercept = 0, slope = 1, linetype="dashed") + xlab("P (-log10)") + ylab("V (-log10)") + coord_cartesian(xlim = c(0,30))
I’ve looked at the impact of increasing nt
(resolution of x on L curves) from 5000 to 10000 and increasing nv
(resolution of y on L curves) from 1000 to 2000 but both of these had no impact. Note that these are used in the last stage where we find the xval2[i,j]
s.t. cfdr(xval2[i,j],yval2[i,j])=ccut[i]
.
My Q values now range from approximately -1.5 to 3 whereas the ZQ vary from 0 to 30. It therefore doesn’t make sense to have one res
(res=200
) parameter which determines the number of grid points in each direction for the KDE estimation. For now, I set res_p=500
and res_q=200
. This also had little impact on the results.
I’ve introduced check = FALSE, fix = FALSE
into the owin
function, which helped previously when owin was reporting “empty polygons” (and indeed helps with tailing off problem here).
I’ve ran my method for all 121,000 SNPs.:
Look into bandwidths for KDE estimation.
Figure out some formula to decide the number of grid points in x (Z scores) and y (Q values) directions
Investigate deinflation (see this earlier summary where I discuss this: https://annahutch.github.io/PhD/12feb2020.html)