#' zq_h0_func
#'
#' Function to estimate P(ZQ>zq|H_0^p) using a GMM fitted to ZQ|P>1/2.
#'
#' @param zq z-score at which to evaluate the upper-tail probability
#' @param dens output from densityMclust() fitting a GMM to ZQ|P>1/2
#'
#' @return Estimate of P(ZQ>zq|H_0^p)
#' @export
#'
zq_h0_func <- function(zq, dens) {
  out <- 0
  sigmasq <- dens$parameters$variance$sigmasq
  # equal-variance ("E") fits return a single sigmasq; recycle across components
  if (length(sigmasq) == 1) sigmasq <- rep(sigmasq, length(dens$parameters$mean))
  # mixture survival function: weighted sum of component upper-tail probabilities
  for (i in 1:length(dens$parameters$mean)) {
    out <- out + dens$parameters$pro[i] *
      pnorm(zq, mean = dens$parameters$mean[i], sd = sqrt(sigmasq[i]),
            lower.tail = FALSE)
  }
  return(out)
}
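A minimal usage sketch (the mclust package provides `densityMclust()`; `zq_null` here is simulated stand-in data, not the real ZQ values):

library(mclust)
# hypothetical stand-in for ZQ restricted to variants with P > 1/2
zq_null <- abs(rnorm(5000))
zq_h0_gmm <- densityMclust(zq_null)
zq_h0_func(2, zq_h0_gmm)  # estimate of P(ZQ > 2 | H_0^p)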
#' q_h0_func
#'
#' Function to estimate P(Q<q|H_0^p) using a GMM fitted to Q|P>1/2.
#'
#' @param q value at which to evaluate the lower-tail probability
#' @param dens output from densityMclust() fitting a GMM to Q|P>1/2
#'
#' @return Estimate of P(Q<q|H_0^p)
#' @export
#'
q_h0_func <- function(q, dens) {
  out <- 0
  sigmasq <- dens$parameters$variance$sigmasq
  if (length(sigmasq) == 1) sigmasq <- rep(sigmasq, length(dens$parameters$mean))
  # mixture CDF: weighted sum of component lower-tail probabilities
  # (lower.tail = TRUE to match the documented P(Q<q|H_0^p))
  for (i in 1:length(dens$parameters$mean)) {
    out <- out + dens$parameters$pro[i] *
      pnorm(q, mean = dens$parameters$mean[i], sd = sqrt(sigmasq[i]),
            lower.tail = TRUE)
  }
  return(out)
}
#' joint_func_zq
#'
#' Function to estimate P(ZP>zp, ZQ>zq) using a bivariate GMM fitted to ZP, ZQ.
#'
#' @param zp value for the ZP coordinate
#' @param zq value for the ZQ coordinate
#' @param dens output from densityMclust() fitting a bivariate GMM to ZP,ZQ
#'
#' @return Estimate of P(ZP>zp, ZQ>zq)
#' @export
#'
joint_func_zq <- function(zp, zq, dens) {
  out <- 0
  # weighted sum of upper-orthant probabilities, one per mixture component
  # (pmvnorm() is from the mvtnorm package)
  for (i in 1:ncol(dens$parameters$mean)) {
    out <- out + (dens$parameters$pro[i] *
      pmvnorm(lower = c(zp, zq), upper = c(Inf, Inf),
              mean = dens$parameters$mean[, i],
              sigma = dens$parameters$variance$sigma[, , i]))[1]
  }
  return(out)
} # can potentially vectorise this later, but sigma is causing problems...
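One possible route around the `sigma` problem (a sketch under assumptions, not a drop-in replacement): instead of vectorising `pmvnorm()` itself, standardise each mixture component and use a vectorised standard bivariate normal CDF, e.g. `pbivnorm()` from the pbivnorm package, via inclusion-exclusion \(P(X>a, Y>b) = 1 - \Phi(a) - \Phi(b) + \Phi_2(a,b;\rho)\).

# sketch of a vectorised alternative to joint_func_zq() (vector zp, scalar zq);
# assumes the pbivnorm package, which provides a vectorised bivariate normal CDF
library(pbivnorm)
joint_func_zq_vec <- function(zp, zq, dens) {
  out <- rep(0, length(zp))
  for (i in 1:ncol(dens$parameters$mean)) {
    mu    <- dens$parameters$mean[, i]
    Sigma <- dens$parameters$variance$sigma[, , i]
    s1  <- sqrt(Sigma[1, 1]); s2 <- sqrt(Sigma[2, 2])
    rho <- Sigma[1, 2]/(s1*s2)
    a <- (zp - mu[1])/s1  # standardised lower limits (vector)
    b <- (zq - mu[2])/s2  # standardised lower limit (scalar)
    # inclusion-exclusion: P(X > a, Y > b) = 1 - pnorm(a) - pnorm(b) + Phi2(a, b; rho)
    out <- out + dens$parameters$pro[i] *
      (1 - pnorm(a) - pnorm(b) + pbivnorm(a, rep(b, length(a)), rho = rho))
  }
  out
}

This would replace the `sapply()` over `xtest` in the ccut functions below with a single vectorised call per mixture component.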
#' joint_func_q
#'
#' Function to estimate P(ZP>zp, Q<q) using a bivariate GMM fitted to ZP, Q.
#'
#' @param zp value for the ZP coordinate
#' @param q value for the Q coordinate
#' @param dens output from densityMclust() fitting a bivariate GMM to ZP,Q
#'
#' @return Estimate of P(ZP>zp, Q<q)
#' @export
#'
joint_func_q <- function(zp, q, dens) {
  out <- 0
  # weighted sum of P(ZP > zp, Q < q) over mixture components
  for (i in 1:ncol(dens$parameters$mean)) {
    out <- out + (dens$parameters$pro[i] *
      pmvnorm(lower = c(zp, -Inf), upper = c(Inf, q),
              mean = dens$parameters$mean[, i],
              sigma = dens$parameters$variance$sigma[, , i]))[1]
  }
  return(out)
}
# can potentially vectorise this later, but sigma is causing problems...
# can also adjust this to take in p rather than zp if needed
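A sketch of the p-scale version mentioned in the comment above (a hypothetical wrapper, nothing more):

# hypothetical convenience wrapper: accept p on the P-value scale and
# convert to a z-score internally via zp = -qnorm(p/2)
joint_func_q_pscale <- function(p, q, dens) {
  joint_func_q(zp = -qnorm(p/2), q = q, dens = dens)
}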
Method for generating ccut values: for a given `zq`, say `zq[1]`, find `cfdr(xtest, zq[1])`, where `xtest` are equally spaced points on the x axis, then interpolate to find `cfdr(zp[1], zq[1]) = ccut[1]`.
I compare the ccut values from my cfdr approximation method and James', where p and q are P-values (a requirement for James' functions).
Recall that James’ approximation is:
\[\begin{equation} \text{cfsub}_{\text{james}} = \dfrac{\left(1+\dfrac{1}{|w|}\right) \times ptest}{\left(1+\dfrac{1}{|w|}\right) - \text{ecdf}_{zp \mid zq \geq zq[1]}(xtest)} \end{equation}\]
and mine is:
\[\begin{equation} \text{cfsub}_{\text{anna}} = \dfrac{ptest \times P(q \leq q[1] \mid p > 1/2)}{P(p \leq ptest,\, q \leq q[1])}. \end{equation}\]
# function to generate ccut values using James' cfdr approximation
# (p and q on the P-value scale; nv is unused here)
james_ccut_z <- function(p, q, indices=NULL, fold=NULL, nt=5000, nv=1000){
  zp = -qnorm(p/2)
  zq = -qnorm(q/2)
  mx = max(c(10, abs(-qnorm(p/2))))
  my = max(c(10, abs(-qnorm(q/2))))  # maximum limits for integration (my unused below)
  gx = min(p[indices])
  xtest = seq(0, mx, length.out = nt)
  ptest = 2*pnorm(-xtest)
  ccut = rep(0, length(indices))
  for (i in 1:length(indices)) {
    w = which(zq[-fold] >= zq[indices[i]])
    if (length(w) >= 1) {
      # James' approximation; cummin() makes the curve non-increasing in xtest
      cfsub = (1 + (1/length(w)))*ptest/(1 + (1/length(w)) - ecdf(zp[-fold][w])(xtest))
      cfsub = cummin(cfsub)
      ccut[i] = approx(xtest, cfsub - gx*xtest + gx*mx, zp[indices[i]], rule = 2)$y
    } else ccut[i] = p[indices[i]]
  }
  return(ccut)
}
# function to generate ccut values using my cfdr approximation
# (densityMclust() is from the mclust package; nv is unused here)
anna_ccut_pscale <- function(p, q, indices=NULL, fold=NULL, nt=5000, nv=1000){
  zp = -qnorm(p/2)
  zq = -qnorm(q/2)
  mx = max(zp)
  gx = min(p[indices])
  xtest = seq(0, mx, length.out = nt)
  ptest = 2*pnorm(-xtest)
  ccut = rep(0, length(indices))
  # GMM for P(ZQ | H_0^p), fitted to ZQ for variants with P > 1/2
  zq_h0_gmm <- densityMclust(zq[-fold][which(p[-fold] > 1/2)])
  # bivariate GMM for the joint distribution of (ZP, ZQ)
  joint_pq <- densityMclust(data.frame("zp" = zp[-fold], "zq" = zq[-fold]))
  if (!is.null(indices)) {
    # set ccut; NOT equal to cfdr at the points - needs to be adjusted since an
    # additional test point is used in the determination of L
    for (i in 1:length(indices)) {
      w = which(zq[-fold] >= zq[indices[i]])
      if (length(w) >= 1) {
        cfsub = (ptest * zq_h0_func(zq = zq[indices[i]], dens = zq_h0_gmm)) /
          sapply(xtest, function(x) joint_func_zq(x, zq[indices[i]], dens = joint_pq))
        cfsub = cummin(cfsub)  # make non-increasing in xtest
        ccut[i] = approx(xtest, cfsub - gx*xtest + gx*mx, zp[indices[i]], rule = 2)$y
      } else ccut[i] = p[indices[i]]
    }
  }
  return(ccut)
}
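To compare the two functions end-to-end, a sketch on simulated stand-in data (illustrative only; not the real data used in the tables below):

set.seed(1)
n <- 1000
p <- runif(n)  # hypothetical P-values
q <- runif(n)
fold <- 1:100  # hypothetical leave-out fold
indices <- sample(setdiff(1:n, fold), 5)
ccut_james <- james_ccut_z(p, q, indices = indices, fold = fold)
ccut_anna  <- anna_ccut_pscale(p, q, indices = indices, fold = fold)
cbind(ccut_james, ccut_anna)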
Why are some of my ccut values >>1?
Below is a table of the p, q and ccut values for which my ccut values are \(>1\). I've also included a column for `length(w)`, i.e. the number of variants with \(ZQ \geq zq\).
## p q anna_ccut james_ccut w
## 1 0.9413570 8.847909e-04 1.045685 0.9728717 38
## 2 0.2393619 5.025355e-10 1.334804 0.3989367 4
## 3 0.6525755 1.325162e-08 1.749192 0.7504367 4
## 4 0.6219225 1.923871e-12 2.946177 0.6219227 1
## 5 0.8933018 1.387191e-04 1.040314 0.9728717 22
## 6 0.9773098 2.627178e-04 1.071522 0.9773098 26
Recall that James’ cfdr approximation is:
\[\begin{equation} \text{cfsub} = \dfrac{(1+\dfrac{1}{|w|})*ptest}{(1+\dfrac{1}{|w|})-ecdf_{zp|zq\geq zq[1]}(xtest)} \end{equation}\]
I’m still not sure where the \((1+\dfrac{1}{|w|})\) thing comes from but perhaps it bounds James’ ccut values between \([0,1]\)?
If I plot the difference between the ccut values from mine and James' cfdr approximations against `length(w)`, then we see there is (perhaps) a relationship: the smaller `length(w)`, the bigger the difference (smaller -log10 value) between mine and James' ccut values. Probably because \((1+\dfrac{1}{|w|})\longrightarrow 2\) as \(|w| \longrightarrow 1\).
Taking row 4 of the above table \((p = 0.6219225,\ q = 1.923871\times 10^{-12},\ \texttt{length(w)} = 1)\) as an example, I plot the `cfsub(xtest, zq)` curve against the `xtest` values, with a vertical line at the `zp` value at which we read off the y value to obtain ccut.
I think that I need to ensure that my curve is also bounded by 0 and 1 on the y axis. How can I do this? Both `ptest` and \(P(q\leq q[1] \mid p>1/2)\) are \(<1\), so to get a value \(>1\) the denominator must be less than the numerator, i.e. \(P(p \leq ptest,\, q \leq q[1]) < ptest \times P(q \leq q[1] \mid p > 1/2)\).
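To see why James' values stay \(\leq 1\) while mine need not, it helps to look at the start of each curve (my own reasoning, not from the papers). At \(xtest = 0\) we have \(ptest = 1\) and the ecdf term is \(\approx 0\) (since all \(zp \geq 0\)), so
\[\begin{equation} \text{cfsub}_{\text{james}}(0) \approx 1, \qquad \text{cfsub}_{\text{anna}}(0) = \dfrac{P(q \leq q[1] \mid p > 1/2)}{P(q \leq q[1])}. \end{equation}\]
After `cummin()`, James' curve can never exceed its starting value of \(\approx 1\). Mine starts above 1 whenever the null-GMM estimate of \(P(q \leq q[1] \mid p > 1/2)\) exceeds the joint-GMM estimate of \(P(q \leq q[1])\); since the two GMMs are fitted separately, nothing constrains them to be consistent with each other, particularly far out in the \(q\) tail.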
Method for finding \(x\) coordinates: for each `yval2[j]` (equally spaced points on the y axis), find the corresponding x value such that `cfdr(xval2[i,j], yval2[j]) = ccut[i]`.

Focussing on James' functions first, he has an `adj=TRUE` parameter which "adjust cFDR values and hence curves L using estimate of Pr(H0|Pj<pj)". I don't know where this comes from. I think he means adjusting by finding a better estimate for Pr(H0|Qj<qj) than 1? The adjustment is:
if (adj) {
  correct = cummin((1 + ecdf(q[-fold][which(p[-fold] > 0.5)])(pval2)*length(p[-fold])) /
                   (1 + ecdf(q[-fold])(pval2)*length(p[-fold])))
  if (!is.null(indices)) correct_ccut = approx(pval2, correct, q[indices], rule = 2)$y
} else {
  correct = rep(1, length(pval2))  # adjustment factor for pc[i]
  correct_ccut = rep(1, length(ccut))
}
if (!is.null(indices)) ccut = ccut*correct_ccut
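Reading the adjustment off the code directly (my own transcription), with \(n = \texttt{length(p[-fold])}\) and \(\hat{F}\) denoting an ecdf:
\[\begin{equation} \text{correct}(x) = \text{cummin}\left( \dfrac{1 + n\,\hat{F}_{q \mid p > 1/2}(x)}{1 + n\,\hat{F}_{q}(x)} \right) \approx \dfrac{P(q \leq x \mid p > 1/2)}{P(q \leq x)}, \end{equation}\]
and if \(p > 1/2\) is taken as a proxy for \(H_0\), Bayes' theorem gives \(P(q \leq x \mid H_0)/P(q \leq x) = Pr(H_0 \mid q \leq x)/Pr(H_0)\), which supports my reading that the adjustment involves Pr(H0|Qj<qj) rather than Pr(H0|Pj<pj).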
But I’m pretty sure the correct
derivation is not equal to the adjustment in the paper:
I use my ccut
values in James’ code to generate L curves. The results look very similar to those using James’ adjusted cfdr method which is reassuring.
Problematic indices:

- 19: the example looked at earlier - jumps to the else statement (so tries to do `-qnorm(2.9/2)`, which is NaN since 2.9/2 > 1)?
- 55: gives all 1s for some reason.
I write a function to calculate the x coordinates of the L curves using my cfdr approximation, and plot the results using both mine and James' ccut values. Note that this takes ages to run.
xvals_anna_func <- function(p, q, indices, fold, ccut, nt = 5000, nv = 1000){
  zp = -qnorm(p/2)
  zq = -qnorm(q/2)
  mx = max(zp)
  my = max(zq)
  gx = min(p[indices])
  yval2 = seq(0, my, length.out = nv + 1)[1:nv]  # equally spaced points on the y axis
  xval2 = outer(rep(1, length(ccut)), yval2)
  pval2 = 2*pnorm(-yval2)
  xtest = seq(0, mx, length.out = nt)
  ptest = 2*pnorm(-xtest)
  ccut = ccut*(1 + 1e-6)  # prevent floating-point comparison errors
  correct = rep(1, length(pval2))  # adjustment factor for pc[i] (no adjustment here)
  correct_ccut = rep(1, length(ccut))
  ccut = ccut*correct_ccut
  # GMM for P(ZQ | H_0^p), fitted to ZQ for variants with P > 1/2
  zq_h0_gmm <- densityMclust(zq[-fold][which(p[-fold] > 1/2)])
  # bivariate GMM for the joint distribution of (ZP, ZQ)
  joint_pq <- densityMclust(data.frame("zp" = zp[-fold], "zq" = zq[-fold]))
  for (i in 1:length(yval2)) {
    w = which(zq[-fold] > yval2[i])
    if (length(w) >= 1) {
      cfsub = (ptest * zq_h0_func(zq = yval2[i], dens = zq_h0_gmm)) /
        sapply(xtest, function(x) joint_func_zq(x, yval2[i], dens = joint_pq))
      cfsub = cummin(cfsub)  # make non-increasing in xtest
      # invert the curve: find the x value at which it crosses each ccut
      xval2[, i] = approx(cfsub - gx*xtest + gx*mx, xtest, ccut, rule = 2, f = 1)$y
    } else xval2[, i] = -qnorm((ccut/correct_ccut)/2)
  }
  # close off the L curves and map back to the P-value scale
  yval2 = c(Inf, 0, yval2, Inf, Inf)
  xval2 = cbind(Inf, Inf, xval2, xval2[, nv], Inf)
  X = 2*pnorm(-abs(xval2))
  Y = 2*pnorm(-abs(yval2))
  return(list(x = X, y = Y))
}
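A sketch of how the output might be plotted (reusing the simulated `p`, `q`, `indices`, `fold` and `ccut_anna` from the comparison sketch above):

L <- xvals_anna_func(p, q, indices = indices, fold = fold, ccut = ccut_anna)
# each row of L$x is one L curve on the P-value scale; plot the first
plot(L$x[1, ], L$y, type = "l", xlab = "p", ylab = "q")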
The two sticky-out ones (from mine and James' ccut values) are for indices 10 and 55, which are the first and last rows in the earlier table where I look at the indices that give me ccut > 1, i.e. these ones:
## p q anna_ccut james_ccut w
## 1 0.9413570 0.0008847909 1.045685 0.9728717 38
## 6 0.9773098 0.0002627178 1.071522 0.9773098 26
To better understand the difference between mine and James' L curves, I consider a single L curve for a single \((p,q)\) pair with \((p = 1.02\times 10^{-5},\ q = 0.856)\).
4. Comments and questions

- Should my `ccut` values be bounded by 1, like James'? If so, how do I do this?
- James' `adj` parameter: he says that this estimates Pr(H0|Pj<pj), but from his paper it seems that it estimates Pr(H0|Qj<qj)? For now I'm just using non-adjusted cfdr, but this is something to think about for the future.
- I really need to speed up my code - it takes all day to run on 55 SNPs! I have looked at `Vectorize()` but it doesn't work with the form of sigma. Specifically, I need to speed up the following: