24th April

cFDR

I derive a cFDR estimator for a continuous auxillary covariate (q):

\[\begin{equation} \begin{split} \widehat{cFDR}(p,q) &= P(H_0^p\text{ | }P\leq p, Q\leq q)\\[2ex] &= \dfrac{P(P\leq p\text{ | }H_0^p, Q\leq q) P(H_0^p\text{ | }Q\leq q)}{P(P\leq p \text{ | }Q\leq q)}\\[2ex] &= \dfrac{P(P\leq p\text{ | }H_0^p)}{\dfrac{P(P\leq p, Q\leq q)}{P(Q\leq q)}} P(H_0^p\text{ | }Q\leq q)\\[2ex] &= \dfrac{p P(Q\leq q)}{P(P\leq p, Q\leq q)} P(H_0^p\text{ | }Q\leq q)\\[2ex] &= \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} P(H_0^p\text{ | }Q\leq q) \end{split} \end{equation}\]

So that,

\[\begin{equation} \widehat{cFDR}_{unadj}(p,q) = \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} \end{equation}\]

and

\[\begin{equation} \widehat{cFDR}_{adj}(p,q) = \dfrac{p \int_{-\infty}^q \int_0^{\infty} f(x,y)dxdy}{\int_{-\infty}^q \int_{z_p}^{\infty} f(x,y)dxdy} \widehat{P(H_0^p\text{ | }Q\leq q)} \end{equation}\]

where \(\widehat{P(H_0^p\text{ | }Q\leq q)}=\dfrac{P(Q\leq q|p>1/2)}{P(Q\leq q)}\).

pq_full <- readRDS("realdata.RDS")

par(mfrow = c(1,2))
hist(pq_full$p, breaks = 100, main = "Histogram of P")
hist(pq_full$q, breaks = 100, main = "Histogram of Q")

chr1 = which(pq_full$chr == 1)
candidate_indices = which(pq_full$p < 0.0005 | pq_full$q > 1)

indices = intersect(chr1, candidate_indices)

fold = chr1 # leave out these indices

mode = 2 # leave one-chromosome-out

adj = TRUE # new method does adjustment separately

nt = 5000 # resolution of x
nv = 1000 # resolution of L curves (q)

res_p = 500 # resolution for KDE estimates
res_q = 200

p <- pq_full$p
q <- pq_full$q

gx <- 10^-30

zp <- -qnorm(pq_full$p/2)

maxx = max(c(10, zp))

# zp[which(zp > maxx)] = maxx

maxy = max(q)
miny = min(q)

if (mode==2) sub = setdiff(1:length(zp), fold) else sub = 1:length(zp)

# define limits of q by adding 20% either side
q_20 <- (max(q[sub]) - min(q[sub])) * 0.2

# define limits of p similar to james
lims=c(0, max(12, maxx), miny-q_20, maxy+q_20) # c(xl, xu, yl, yu)

kpq = kde2d(zp[sub], q[sub], n=c(res_p, res_q), lims=lims) 

df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(kpq$z))

ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('KDE of P,Q')

# need Fdr rather than fdr to sum to get P(P\leq p, Q\leq q)
# think carefully about this bit because James' code looks for ZQ>zq (we need Q<q)
# int2_kpq=t(apply(apply(kpq$z[res:1,res:1],2,cumsum),1,cumsum))[res:1,res:1]

# int2_kpq estimates P(ZP\geq zp, Q\leq q)
int2_kpq = t(apply(apply(kpq$z[res_p:1,],2,cumsum),1,cumsum))[res_p:1,]

df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(int2_kpq))

ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('"Integration" (summing) for P(ZP>=zp, Q<= q)')

int2_kpq[which(int2_kpq==0)] = min(int2_kpq[which(int2_kpq>0)]) # avoid 0/0 error

# int_kpq estimates P(ZP\geq 0, Q\leq q) i.e. P(Q\leq q)
# this is replicated in each row
int_kpq = t(outer(int2_kpq[1,], rep(1,res_p))) 
par(mfrow = c(1,2))

plot(kpq$y, int_kpq[1,], type = "l", xlab = "q", ylab = "row of int_kpq", main = "KDE est of P(Q<= q) (int_kpq)")

# check with that from original method
qs  = q[sub]

kq = density(qs, from=lims[3], to=lims[4], n=res_q)
plot(kq$x, cumsum(kq$y), xlab = "q", ylab = "Density", main = "KDE est of P(Q<= q) (density func)", type = "l")

# why are y axis scales different?

# int_p is P(P\leq p)
int_p = outer(2*pnorm(-kpq$x), rep(1,res_q)) # rep P vals in all cols (biggest to smallest)

# kgrid is denominator of cfdr 
# P(ZP\geq zp | Q\leq q) = P(ZP\geq zp, Q\leq q)/P(Q\leq q)
kgrid = kpq
kgrid$z = int2_kpq/int_kpq

df <- data.frame(ZP = rep(kpq$x, times = res_q), Q = rep(kpq$y, each = res_p), z = as.vector(kgrid$z))

ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kgrid$z") + ggtitle('kgrid estimates P(ZP>= zp | Q<= q) = P(ZP>= zp, Q<= q)/P(Q<= q)')

# not sure why this is here
#kgrid$z[which(kgrid$z < 1/length(sub))] = 1/length(sub)

# df <- data.frame(ZP = rep(kpq$x, times = 200), Q = rep(kpq$y, each = 200), z = as.vector(kgrid$z))

#ggplot(data = df, aes(x = ZP, y = z, group = Q, col = Q)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kgrid$z") + ggtitle('kgrid estimates P(ZP>= zp | Q<= q) = P(ZP>= zp, Q<= q)/P(Q<= q)')

# cgrid is unadjusted cFDR = ( f(Q\leq q) * p ) / f(P\leq p, Q\leq q)
cgrid = kgrid
cgrid$z = int_p/kgrid$z
cgrid$z = apply(cgrid$z, 2, cummin) # make non-increasing
cgrid$z = apply(cgrid$z, 2, function(x) x-gx*kpq$x+gx*maxx) # make strictly decreasing

apply(cgrid$z, 2, function(x) all(diff(x)))

##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [23]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [34]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [45]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [56]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [67]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [78]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [89]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [100]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [122]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [133]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [144]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [155]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [166]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [177]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [188]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [199]  TRUE  TRUE

#par(mfrow = c(1,2))
#plot(cgrid$z[,1])
#all(diff(cgrid$z[,1]) < 0)
#plot(cgrid$z[,1]-gx*kpq$x+gx*maxx)
#all(diff(cgrid$z[,1]-gx*kpq$x+gx*maxx) < 0)

df <- data.frame(ZP = rep(cgrid$x, times = res_q), ZQ = rep(cgrid$y, each = res_p), z = as.vector(cgrid$z))

ggplot(data = df, aes(x = ZP, y = z, group = ZQ, col = ZQ)) + geom_line(cex = 0.2) + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + ylab("kpq$z") + ggtitle('Unadjusted cFdr')

# find unadjusted cFdr value for each p,q pair
if (!is.null(indices)) ccut = interp.surface(cgrid, cbind(pmin(zp[indices],maxx), pmin(q[indices],maxy))) 
out=rep(0,length(ccut))

# check unadjusted cfdr<=1
max(ccut)

## [1] 0.9994525

yval2 = seq(miny, maxy, length.out=nv+1)[1:nv]
xval2 = outer(rep(1,length(ccut)), yval2)
xtest = seq(0, maxx, length.out=nt)
ptest = 2*pnorm(-xtest)


# adjustment for P(H0|Q\leq q)
if (adj) {
  correct = cummin((1 + ecdf(q[sub][which(p[sub]>0.5)])(yval2)*length(p[sub]))/
                     (1 + ecdf(q[sub])(yval2)*length(p[sub])))
  if (!is.null(indices)) correct_ccut = approx(yval2, correct, q[indices],rule=2)$y #cummin((1+ecdf(pc[which(p>0.5)])(pc)*length(p))/(1+ rank(pc)))
} else {
  correct=rep(1,length(pval2)) # adjustment factor for q[i]
  correct_ccut=rep(1,length(ccut))
}

if (!is.null(indices)) ccut = ccut*correct_ccut

# check adjusted cfdr<=1
max(ccut)

## [1] 0.9954825

for (i in 1:length(yval2)) {
  w = which(q < yval2[i])
  xdenom = interp.surface(kgrid, cbind(xtest, rep(yval2[i], length(xtest))))
  if (length(w) > 2) {
    cfx = cummin(ptest/xdenom)
    xval2[,i] = approx(cfx-gx*xtest  + gx*maxx, xtest, ccut/correct[i], rule=2, method="const", f=1)$y
    #} else xval2[,i]=-qnorm(ccut/2)    
  } else if (i>1) xval2[,i]=xval2[,i-1] 
  else xval2[,i]=-qnorm(ccut/2)    
}

plot(cfx-gx*xtest  + gx*maxx, xtest, xlab = "adjusted cfdr (red line at ccut)")
abline(v = ccut[length(indices)]/correct[i], lty = "dashed", col = "red")

scale = "p"

if (scale[1]=="p") {
  X=2*pnorm(-abs(xval2))
} else {
  X=xval2
}

Y=yval2

L = list(x=X,y=Y)

plotL <- function(L,xlab="p",ylab="q",...) {
  plot(0,xlim=c(min(L$x),max(L$x)),ylim=c(min(L$y),max(L$y)),xlab=xlab,ylab=ylab,...)
  for(i in 1:nrow(L$x))
    lines(L$x[i,],L$y)
}

par(mfrow = c(1,1))

plotL(L, main = "L curves")

plot(L$x[199,], L$y, cex = 0.4, pch = 16, type = "l", main = "Example L curve", xlab = "p", ylab = "q")

######## now I have my L curves I need to find the joint null to integrate (the PDF not the CDF)

# ZP|H0
fzph0 = function(x) dnorm(x)

# P | H0 
fph0 = function(x) dunif(x)

# Q|H0
qs  = q[sub][which(p[sub]>0.5)]
kq = density(qs, from=lims[3], to=lims[4], n=res_q)

fqh0 <- approxfun(kq$x, kq$y)

integrate(fqh0, lims[3], lims[4])

## 1.000979 with absolute error < 6.1e-05

fpzqh0 <- function(s){
  fph0(s[,1])*fqh0(s[,2])
}

# check I'm defining the L regions correctly
plot(owin(poly=list(x = c(L$x[2,nv], 0,0, L$x[2,]), y = c(L$y[nv], max(L$y), min(L$y), L$y))))

# find v values
v_p <- apply(L$x, 1, function(x) {
  return(tryCatch(polyCub(owin(poly=list(x = c(x[nv], 0,0, x), y = c(L$y[nv], max(L$y), min(L$y), L$y)), check = FALSE, fix  = FALSE), fpzqh0), error = function(e) NA))
}) %>% unlist()

df <- data.frame(P = p[indices], v_p, Q = q[indices])

ggplot(df, aes(x = -log10(P) , y = -log10(v_p), col = Q)) + geom_point() + theme_cowplot(12) + background_grid(major = "xy", minor = "none") + geom_abline(intercept = 0, slope = 1,  linetype="dashed") + xlab("P (-log10)") + ylab("V (-log10)") + coord_cartesian(xlim = c(0,30))

Other things

I’ve looked at the impact of increasing nt (resolution of x on L curves) from 5000 to 10000 and increasing nv (resolution of y on L curves) from 1000 to 2000 but both of these had no impact. Note that these are used in the last stage where we find the xval2[i,j] s.t. cfdr(xval2[i,j],yval2[i,j])=ccut[i].
My Q values now range from approximately -1.5 to 3 whereas the ZQ vary from 0 to 30. It therefore doesn’t make sense to have one res (res=200) parameter which determines the number of grid points in each direction for the KDE estimation. For now, I set res_p=500 and res_q=200. This also had little impact on the results.
I’ve introduced check = FALSE, fix = FALSE into the owin function, which helped previously when owin was reporting “empty polygons” (and indeed helps with tailing off problem here).

Full genome

I’ve ran my method for all 121,000 SNPs.:

To do

Look into bandwidths for KDE estimation.
Figure out some formula to decide the number of grid points in x (Z scores) and y (Q values) directions
Investigate deinflation (see this earlier summary where I discuss this: https://annahutch.github.io/PhD/12feb2020.html)

24th April

Anna Hutchinson

cFDR

Other things

Full genome

To do