mclust::densityMclust takes on average 10 seconds for 50,000 data points.
mixtools::normalmixEM takes on average 24 seconds for 50,000 data points and requires specifying how many Gaussians to fit.
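For reference, a minimal sketch of the kind of timing run behind these numbers (simulated standard-normal data; not the exact benchmark, and timings will vary by machine):
library(mclust)
library(mixtools)
set.seed(1)
x <- rnorm(5e4)                                # 50,000 simulated data points
system.time(fit_mclust <- densityMclust(x))    # number of components chosen automatically by BIC
system.time(fit_mix <- normalmixEM(x, k = 2))  # k (the number of Gaussians) must be supplied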
At the moment I am using:
\[\begin{equation} P(P\leq p|Q\leq q) = \dfrac{P(P\leq p, Q\leq q)}{P(Q\leq q)} \end{equation}\]
where \(P(Q\leq q)\) cancels with part of the adjustment factor on the numerator.
\(P(P\leq p, Q\leq q)\) is then estimated using the joint_func_zq function.
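For orientation, the quantity being estimated is just a joint tail probability. A crude empirical version (my own illustration, assuming p and q are the vectors of p-values; joint_func_zq itself presumably uses the fitted density rather than raw counts) would be:
p_joint_emp <- function(p0, q0) mean(p <= p0 & q <= q0)            # empirical estimate of P(P<=p0, Q<=q0)
p_cond_emp  <- function(p0, q0) p_joint_emp(p0, q0) / mean(q <= q0) # empirical estimate of P(P<=p0 | Q<=q0)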
For each (p,q) pair, joint_func_zq is called nt=5000 times (the length of xtest) to generate the cfdr curve used to estimate ccut. cfdr curves are then fit for each entry in yval2, meaning that for each (p,q) pair, joint_func_zq is called nv*nt = 1000*5000 = 5,000,000 times (length of yval2 * length of xtest). Across all the (p,q) pairs that works out at roughly 25,000,000,000 calls!! A single call takes 4.6 milliseconds, but in my sapply loop it takes 14 seconds to fit to nt=5000 x coordinates.
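One possible way to cut this down (just a sketch - it assumes joint_func_zq can be called as joint_func_zq(zp, zq) on scalar coordinates, which may not match its real signature) is to evaluate it once on a moderate grid and then interpolate, exactly as vlyl() does with cgrid:
library(fields)                                             # for interp.surface
grid_x <- seq(0, 12, length.out = 200)                      # zp grid
grid_y <- seq(0, 12, length.out = 200)                      # zq grid
grid_z <- outer(grid_x, grid_y, Vectorize(joint_func_zq))   # 200*200 = 40,000 calls, done once
joint_grid <- list(x = grid_x, y = grid_y, z = grid_z)
# after that, evaluating along xtest for a given yval2 value is an interpolation, not 5000 calls
vals <- interp.surface(joint_grid, cbind(xtest, rep(yval2[1], length(xtest))))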
Alternatively, Chris has suggested estimating this conditional probability directly using a univariate mixture of normals, i.e. fitting it to all (SIGNED) zp values and then doing something else… We use signed z scores because then hopefully all the means are approximately 0. [May come back to this later.]
I summarise the methods I have explored so far.
The idea now is to use the same density estimate to (i) generate the L curves and (ii) estimate \(Q|H_0^p\) to integrate the L curves over.
I investigate using KDEs both for the L curves and for estimating the null to integrate over. Using James’ standard code in the cfdr package, I observe that the L curves are pretty different when using vlyl() compared with vl(). [Note that vlyl() includes the adjustment, so adj=TRUE needs to be specified in the vl() function.]
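The comparison is along these lines (a sketch, using the simulated p, q, indices and fold defined in the simulation section further down):
l_vl   <- vl(p, q, indices = indices, mode = 2, fold = fold, adj = TRUE)
l_vlyl <- vlyl(p, q, indices = indices, mode = 2, fold = fold)
plotL(l_vl,   main = "L curves from vl (adj = TRUE)")
plotL(l_vlyl, main = "L curves from vlyl")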
I annotate the vlyl function to try and get my head around what it is doing.
## annotated original vlyl() function
vlyl=function(p,q,indices=NULL,at=NULL,mode=0,fold=NULL,p_threshold=0,nt=5000,nv=1000,scale=c("p","z"),closed=T,bw=1,...) {
res=200 # resolution. the number of grid points in each direction
zp=-qnorm(p/2)
zq=-qnorm(q/2)
# define subset of indices to evaluate at (leaving out the held-out chromosome/fold)
if (mode==2) sub=setdiff(1:length(zp),fold) else sub=1:length(zp)
mx=max(c(10,abs(-qnorm(p/2))))
my=max(c(10,abs(-qnorm(q/2)))) # why does he choose 10?
zp[which(zp > mx)]=mx
zq[which(zq > my)]=my # any bigger than limits, set to limits
w=which(is.finite(zp[sub]+zq[sub])) # indices where both zp[sub] and zq[sub] are finite
rr=max(c(12,mx,my)) # why does he choose 12?
lims=c(0,rr,0,rr) # c(xl, xu, yl, yu)
px=dnorm(seq(lims[1],lims[2],length.out=res)) # standard normal density on the grid: the density of ZP under the null
kpq=kde2d(c(zp[sub],zp[sub],-zp[sub],-zp[sub]),c(zq[sub],-zq[sub],-zq[sub],zq[sub]),n=res,lims=lims,h=c(1,1),...) # Kernel density of P,Q
# kpq$z=pmax(kpq$z,outer(px,px)) # minimum value is normal density; needed for regularisation
kpq$z=apply(kpq$z,2,function(x) rev(cummax(rev(x)))) # ensure non-increasing - i assume for same reason as gx in vl
zqs=zq[sub][which(p[sub]>0.5)] # zq|P>1/2
kq=density(c(zqs,-zqs),from=lims[1],to=lims[2],n=res) # Kernel density of Q|HP0 (marginal)
# find Kernel density of P,Q|H0 using independence (P,Q|H0 = P|H0 * Q|H0)
kpq0=kpq # x and y coords are same for KDE of P,Q and P,Q|H0
# we change the z coords only, i assume to save computations
kpq0$z=outer(px,kq$y) # px is ZP|H0 and kq$y is ZQ|H0
kpq0$z=kpq0$z*(sum(kpq$z)/sum(kpq0$z)) # rescale so kpq0$z sums to the same total as kpq$z over the grid
kpq$z[which(kpq$z==0)]=min(kpq$z[which(kpq$z>0)]) # ensure none are exactly 0
kpq0$z[which(kpq0$z==0)]=min(kpq0$z[which(kpq0$z>0)])
cgrid=kpq
cgrid$z=pmin(kpq0$z/kpq$z,10) # cfdr value
# cgrid$z=apply(cgrid$z,2,function(x) rev(cummax(rev(x-rev(cummin(rev(x)))))))
cgrid$z=apply(cgrid$z,2,function(x) cummin(x-rev(cummin(rev(x)))))
# interpolate surface to get ccut=cfdr(zp,zq) [pmin to make sure none out of range]
# why * 1.01?
if (!is.null(indices)) ccut= interp.surface(cgrid,cbind(pmin(zp[indices],mx),pmin(zq[indices],my)))*1.01
out=rep(0,length(ccut))
yval2=seq(0,my,length.out=nv+1)[1:nv]
xval2=outer(rep(1,length(ccut)),yval2) # set up structure, need x coords for each ccut
pval2=2*pnorm(-yval2)
xtest=seq(0,my,length.out=nt)
ptest=2*pnorm(-xtest)
for (i in 1:length(yval2)) {
xdenom=interp.surface(cgrid,cbind(xtest,rep(yval2[i],length(xtest))))
cfx=xdenom
xval2[,i]=approx(c(cfx,1.005*max(cfx)),c(xtest,0),ccut,rule=2,method="const",f=1)$y
# i assume the 1.005 and 0 bits of the vectors are for the additional point?
# rule=2 returns closest value rather than NA for points outside of the grid
}
xval2[which(xval2> -qnorm(p_threshold/2))]=-qnorm(p_threshold/2)
if (closed) {
yval2=c(Inf,0,yval2,Inf,Inf)
xval2=cbind(Inf,Inf,xval2,xval2[,nv],Inf)
}
if (scale[1]=="p") {
X=2*pnorm(-abs(xval2))
Y=2*pnorm(-abs(yval2))
} else {
X=xval2
Y=yval2
}
return(list(x=X,y=Y))
}
Interpolate cgrid to get ccut=cfdr(zp,zq) for each (p,q) pair. Interpolate cgrid to get cfx=cfdr(xtest,yval2[i]) for each yval2 value. Use approx with these cfdr curves to find xval2[j,i] such that cfdr(xval2[j,i], yval2[i]) = ccut[j].
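As a toy illustration of that inversion step (my own example, not from the package): with a non-increasing cfdr curve cfx along xtest, approx with method="const", f=1 returns the x coordinate of the grid point whose cfdr value is the smallest one at or above the requested ccut:
cfx   <- c(1.0, 0.8, 0.5, 0.2, 0.1)   # non-increasing cfdr values along xtest
xtest <- 0:4                          # the corresponding x (ZP) coordinates
ccut  <- 0.4
approx(c(cfx, 1.005*max(cfx)), c(xtest, 0), ccut, rule = 2, method = "const", f = 1)$y
# returns 2: the curve first drops below ccut just beyond x = 2
# the extra point (1.005*max(cfx), 0) means any ccut above max(cfx) maps to x = 0,
# and rule = 2 clamps rather than returning NA outside the range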
So what does cgrid look like?
cgrid = kpq
cgrid$z = pmin(kpq0$z/kpq$z,10)
filled.contour(cgrid, xlab = "ZP", ylab = "ZQ", main = "cgrid$z = pmin(kpq0$z/kpq$z,10)")
To then obtain his final cgrid he applies cummin(x-rev(cummin(rev(x)))) to every column of cgrid$z (each column corresponding to a fixed Q value). But what does this do? It makes each column of cgrid$z non-increasing, and it also shrinks the values… E.g.
par(mfrow=c(1,2))
plot(cgrid$z[,10], ylab = "", main = "Example column of cgrid$z")
plot(cummin(cgrid$z[,10]-rev(cummin(rev(cgrid$z[,10])))), ylab = "", main = "Example column of cgrid$z after\ncummin(x-rev(cummin(rev(x))))")
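And a small numeric example (my own) of what the composite transform does to an arbitrary vector:
x <- c(5, 2, 4, 1, 3)
rev(cummin(rev(x)))                   # running minimum from the right: 1 1 1 1 3
x - rev(cummin(rev(x)))               # 4 1 3 0 0
cummin(x - rev(cummin(rev(x))))       # 4 1 1 0 0: non-increasing, and shrunk towards zero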
James’ final cgrid therefore looks like this:
cgrid$z=apply(cgrid$z,2,function(x) cummin(x-rev(cummin(rev(x)))))
filled.contour(cgrid, xlab = "ZP", ylab = "ZQ", main = "Final cgrid after cummin(x-rev(cummin(rev(x))))")
James finds the kernel density of P,Q|HP0 using the independence assumption, that is, P,Q|HP0 = P|HP0 * Q|HP0. He first sets the kernel density of P,Q|HP0 equal to the kernel density of P,Q (the x and y grid coordinates will be the same).
kpq0 = kpq
Then he changes the z values to the outer product of the standard normal density (for P) and the marginal kernel density of Q|HP0 [both evaluated from lims[1] to lims[2] - but in my case P and Q may be defined on different supports(?)]. The second line then rescales kpq0$z so that it sums to the same total as kpq$z over the grid.
kpq0$z = outer(px,kq$y)
kpq0$z = kpq0$z*(sum(kpq$z)/sum(kpq0$z))
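A quick check of this (the rescaling forces the two grids to carry the same total mass):
all.equal(sum(kpq0$z), sum(kpq$z))   # TRUE once the rescaling has been applied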
I compare the kernel density of P,Q|H0 obtained with this method against one obtained by applying kde2d directly to the null subset (p > 0.5). The results look pretty different… why?
filled.contour(kpq0, main = "P,Q|H0 using independence assumption", ylab = "ZQ|H0", xlab = "ZP|H0")
kpq0_kde2d <- kde2d(c(zp[sub][which(p[sub]>0.5)],zp[sub][which(p[sub]>0.5)],-zp[sub][which(p[sub]>0.5)],-zp[sub][which(p[sub]>0.5)]),c(zq[sub][which(p[sub]>0.5)],-zq[sub][which(p[sub]>0.5)],-zq[sub][which(p[sub]>0.5)],zq[sub][which(p[sub]>0.5)]),n=res,lims=lims,h=c(1,1))
filled.contour(kpq0_kde2d, main = "P,Q|H0 using kde2d", ylab = "ZQ|H0", xlab = "ZP|H0")
Anyhow, I assume James uses this method rather than the kde2d method to ensure that the KDEs of P,Q and P,Q|H0 are defined on the same support, and to avoid a second expensive kde2d call.
In James’ normal method, L curves are generated (e.g. using vl() or vlyl()) and then the il() function is used to integrate the null over the L curves. I investigate the difference in V values when using:
1. vl() then il()
2. vlyl() then il()
3. vl() then a KDE approximation of the null and polyCub
4. vlyl() then a KDE approximation of the null and polyCub
We hope that option 4 works well (the same density estimate is used to generate the L curves and to estimate the null).
To try and integrate the exact same joint null that was used in generating the L curves (using the same limits and the same subset of data points from the fold removal method) I use:
fzph0 <- function(x) dnorm(x, 0, 1)
sub = setdiff(1:length(p),fold1)
qs = zq[sub][which(p_vals[sub] > 0.5)] # q|P>1/2
kq = density(qs, from = 0, to = 12, n = res) # Kernel density of Q|HP0 (marginal)
fzqh0 <- approxfun(kq$x, kq$y)
# estimate joint density
fzpzqh0 <- function(s){
fzph0(s[,1])*fzqh0(s[,2])
}
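The integration over an L region could then be done with polyCub rather than il() (a sketch only - lx and ly here are hypothetical vectors holding the vertices of one closed L-curve polygon on the Z scale, taken from the vl()/vlyl() output):
library(polyCub)
library(spatstat.geom)
# hypothetical vertices (lx, ly) of a closed L region, anticlockwise, first vertex not repeated
L_poly <- owin(poly = list(x = lx, y = ly))
v <- polyCub.SV(L_poly, fzpzqh0, nGQ = 5)   # product Gauss cubature of the estimated null over L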
[Recall that we are only finding V values for the subset of data pairs with p < 0.005 or q < 0.001.]
vlyl to take positive continuous Q and P

The aim is to provide the new function with continuous Q (positive and negative) and signed Z scores. For now, I amend the function so that it can take in positive continuous P and Q. I will then amend it to take negative values as well.
As a check, I use absolute zq as the “continuous q” and absolute zp as the “continuous p”, so that I can compare the resulting L curves with those from the standard vlyl function on the Z scale.
# vlyl_contpq takes in positive continuous P and Q values
vlyl_contpq = function(p,q,indices=NULL,at=NULL,mode=0,fold=NULL,p_threshold=0,nt=5000,nv=1000,scale=c("p","z"),closed=T,bw=1,...) {
res=200
# define subset of indices to evaluate at (leaving out chr)
if (mode==2) sub=setdiff(1:length(p),fold) else sub=1:length(p)
mx=max(p)
my=max(q)
w=which(is.finite(p[sub]+q[sub])) # indices where both p[sub] and q[sub] are finite
rr=max(c(12,mx,my))
lims=c(0,rr,0,rr) # c(xl, xu, yl, yu)
px=dnorm(seq(lims[1],lims[2],length.out=res)) # standard normal density on the grid (ZP under the null)
kpq=kde2d(c(p[sub],p[sub],-p[sub],-p[sub]),c(q[sub],-q[sub],-q[sub],q[sub]),n=res,lims=lims,h=c(1,1)) # Kernel density of P,Q
# kpq$z=pmax(kpq$z,outer(px,px)) # minimum value is normal density; needed for regularisation
# filled.contour(kpq)
kpq$z=apply(kpq$z,2,function(x) rev(cummax(rev(x)))) # ensure non-increasing (as in vlyl)
p_vals <- 2*pnorm(-abs(zp)) # note: uses zp from the calling environment, not the p argument (which now holds the continuous values)
qs=q[sub][which(p_vals[sub]>0.5)] # q|P>1/2
kq=density(c(qs,-qs),from=lims[1],to=lims[2],n=res) # Kernel density of Q|HP0 (marginal)
# find Kernel density of P,Q|H0 using independence (P,Q|H0 = P|H0 * Q|H0)
kpq0=kpq # x and y coords are same for KDE of P,Q and P,Q|H0
# we change the z coords only, i assume to save computations
kpq0$z=outer(px,kq$y) # px is ZP|H0 and kq$y is Q|H0
kpq0$z=kpq0$z*(sum(kpq$z)/sum(kpq0$z)) # rescale so kpq0$z sums to the same total as kpq$z over the grid
kpq$z[which(kpq$z==0)]=min(kpq$z[which(kpq$z>0)]) # ensure none are exactly 0
kpq0$z[which(kpq0$z==0)]=min(kpq0$z[which(kpq0$z>0)])
cgrid=kpq
cgrid$z=pmin(kpq0$z/kpq$z,10)
# cgrid$z=apply(cgrid$z,2,function(x) rev(cummax(rev(x-rev(cummin(rev(x)))))))
cgrid$z=apply(cgrid$z,2,function(x) cummin(x-rev(cummin(rev(x)))))
# interpolate surface to get ccut=cfdr(zp,zq) [pmin to make sure none out of range]
# why * 1.01?
if (!is.null(indices)) ccut= interp.surface(cgrid,cbind(pmin(p[indices],mx),pmin(q[indices],my)))*1.01
out=rep(0,length(ccut))
yval2=seq(0,my,length.out=nv+1)[1:nv]
xval2=outer(rep(1,length(ccut)),yval2) # set up structure, need x coords for each ccut
xtest=seq(0,my,length.out=nt)
for (i in 1:length(yval2)) {
xdenom=interp.surface(cgrid,cbind(xtest,rep(yval2[i],length(xtest))))
cfx=xdenom
xval2[,i]=approx(c(cfx,1.005*max(cfx)),c(xtest,0),ccut,rule=2,method="const",f=1)$y
# i assume the 1.005 and 0 bits of the vectors are for the additional point?
# rule=2 returns closest value rather than NA for points outside of the grid
}
if (closed) {
yval2=c(Inf,0,yval2,Inf,Inf)
xval2=cbind(Inf,Inf,xval2,xval2[,nv],Inf)
}
if (scale[1]=="p") {
X=2*pnorm(-abs(xval2))
} else {
X=xval2
}
Y=yval2
return(list(x=X,y=Y))
}
set.seed(1) # ensure reproducibility
n=10000; n1p=100; n1pq=100; n1q=100 # parameters
zp=c(rnorm(n1p,sd=3), rnorm(n1q,sd=1),
rnorm(n1pq,sd=3), rnorm(n-n1p-n1q-n1pq,sd=1)) # simulate z-scores corresponding to p
zq=c(rnorm(n1p,sd=1), rnorm(n1q,sd=3),
rnorm(n1pq,sd=3), rnorm(n-n1p-n1q-n1pq,sd=1)) # simulate z-scores corresponding to q
p=2*pnorm(-abs(zp)); q=2*pnorm(-abs(zq))
fold_id=(1:n) %% 3
fold=which(fold_id==1)
candidate_indices=which(p<0.005 | q< 0.001)
indices = intersect(candidate_indices,fold)
l_pscale <- vlyl(p, q, indices = indices, mode = 2, fold = fold, scale = "p", closed = FALSE)
plotL(l_pscale, main = "L curves using standard vlyl on P scale")
l_zscale <- vlyl(p, q, indices = indices, mode = 2, fold = fold, scale = "z", closed = FALSE)
plotL(l_zscale, main = "L curves using standard vlyl on Z scale")
l_contpq <- vlyl_contpq(p = -qnorm(p/2), q = -qnorm(q/2), indices = indices, mode = 2, fold = fold, scale = "z", closed = FALSE)
plotL(l_contpq, main = "L curves using new vlyl for positive continuous Q and P")
vlyl to take +/- P and Q

I try to amend the vlyl function so that it takes in (i) ZP (signed Z scores for the principal trait) and (ii) Q (continuous Q for functional annotations).
set.seed(1) # ensure reproducibility
n = 10000
n1p = 100
n1pq = 100
n1q = 100 # parameters
zp = c(rnorm(n1p, sd = 3), rnorm(n1q, sd = 1), rnorm(n1pq, sd = 3), rnorm(n - n1p - n1q - n1pq, sd = 1)) # simulate z-scores corresponding to p
zq = c(rnorm(n1p, sd = 1), rnorm(n1q, sd = 3), rnorm(n1pq, sd = 3), rnorm(n - n1p - n1q - n1pq, sd = 1)) # simulate z-scores corresponding to q
fold_id = (1:n)%%3 # chr for each p,q pair
candidate_indices = which(p < 0.005 | q < 0.001)
fold = which(fold_id == 1) # which p,q pair on chr 1
indices = intersect(candidate_indices, fold)
mode = 2
at = NULL
fold = which(fold_id == 1)
p_threshold = 0
nt = 5000
nv = 1000
closed = TRUE
scale = c("p", "z")
res = 200
# set p and q parameters equal to zp and zq (just because they are continuous)
p = zp
q = zq
summary(p)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.666762 -0.683036 -0.017094 -0.003766 0.690212 7.947501
summary(q)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -8.355331 -0.696668 -0.019058 -0.003808 0.672468 7.039893
# define subset of indices to evaluate at (leaving out chr)
if (mode == 2) sub = setdiff(1:length(p), fold) else sub = 1:length(p)
mx = max(p)
my = max(q)
w = which(is.finite(p[sub] + q[sub])) # indices where both p[sub] and q[sub] are finite
rr = max(c(12, mx, my))
ll = min(c(-12, min(p), min(q)))
lims = c(ll, rr, ll, rr) # c(xl, xu, yl, yu)
lims
## [1] -12 12 -12 12
Note that I probably don’t want the limits to be the same for P and Q, because P (signed Z scores) will be more extreme than Q (which tends to vary between about -1 and 3).
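A possible tweak (my own sketch, not implemented above) would be to give each axis its own limits instead of a common rr:
lims_p <- c(min(c(-12, p)), max(c(12, p)))   # P axis: signed Z scores, keep at least (-12, 12)
lims_q <- range(q) + c(-1, 1)                # Q axis: just pad the observed range a little
lims   <- c(lims_p, lims_q)                  # c(xl, xu, yl, yu), the format kde2d expects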
px = dnorm(seq(lims[1], lims[2], length.out = res)) # Expected normal kernel density
plot(px)
kpq = kde2d(p[sub], q[sub], n = res, lims = lims, h = c(1, 1)) # Kernel density of P,Q
filled.contour(kpq)
The next step is to ensure that the cfdr curves for each Q value (each column of kpq$z) are non-increasing. Does this still make sense to do?
plot(kpq$z[,10], main = "kpq$z[,10]")
kpq$z = apply(kpq$z, 2, function(x) rev(cummax(rev(x))))
plot(kpq$z[,10], main = "Non-increasing kpq$z[,10]")
p_vals <- 2 * pnorm(-abs(zp))
qs = q[sub][which(p_vals[sub] > 0.5)] # q|P>1/2
kq = density(qs, from = lims[1], to = lims[2], n = res) # Kernel density of Q|HP0 (marginal)
plot(kq, main = "Kernel density of Q|HP0 (marginal)")
# find Kernel density of P,Q|H0 using independence (P,Q|H0 = P|H0 * Q|H0)
kpq0 = kpq # x and y coords are same for KDE of P,Q and P,Q|H0
kpq0$z = outer(px, kq$y) # px is P|H0 and kq$y is Q|H0
kpq0$z = kpq0$z * (sum(kpq$z)/sum(kpq0$z)) # I assume just some normalisation thing
kpq$z[which(kpq$z == 0)] = min(kpq$z[which(kpq$z > 0)]) # ensure none are exactly 0
kpq0$z[which(kpq0$z == 0)] = min(kpq0$z[which(kpq0$z > 0)])
Notice that the z values (the density heights in the contour plot) are now much higher than in the earlier contour plots for P,Q|H0
filled.contour(kpq0)
The cummin(x - rev(cummin(rev(x)))) trick used to get the z values of cgrid now doesn’t work…
cgrid = kpq
cgrid$z = pmin(kpq0$z/kpq$z, 10)
filled.contour(cgrid, main = "cgrid$z = pmin(kpq0$z/kpq$z, 10)")
par(mfrow=c(1,2))
plot(cgrid$z[,10], ylab = "", main = "Example column of cgrid$z")
plot(cummin(cgrid$z[,10]-rev(cummin(rev(cgrid$z[,10])))), ylab = "", main = "Example column of cgrid$z after\ncummin(x-rev(cummin(rev(x))))")
cgrid$z = apply(cgrid$z, 2, function(x) cummin(x - rev(cummin(rev(x)))))
filled.contour(cgrid, main = "cgrid$z after cummin(x - rev(cummin(rev(x))))")
Comments + questions
Questions about the vlyl function:
- Where does 10 come from in mx=max(c(10,abs(-qnorm(p/2)))); my=max(c(10,abs(-qnorm(q/2))))?
- Where does 12 come from in rr=max(c(12,mx,my)); lims=c(0,rr,0,rr)?
- cgrid$z are the cfdr values, but why is cgrid$z=pmin(kpq0$z/kpq$z,10)? The denominator makes sense, but shouldn’t the numerator be p * P(Q<=q) * P(H0P|Q<=q)? I also don’t really understand the next step: cgrid$z=apply(cgrid$z,2,function(x) cummin(x-rev(cummin(rev(x))))).
- Why does he multiply by 1.01 in ccut=interp.surface(cgrid,cbind(pmin(zp[indices],mx),pmin(zq[indices],my)))*1.01?
- Why has he added the additional point (cfdr = 1.005*max(cfx), zp = 0) to approximate over in xval2[,i]=approx(c(cfx,1.005*max(cfx)),c(xtest,0),ccut,rule=2,method="const",f=1)$y?
L curves:
Integration: