Skip to content

Commit

Permalink
Adding Barber (2019) examples
Browse files Browse the repository at this point in the history
  • Loading branch information
ryantibs committed Apr 18, 2019
1 parent ca7de81 commit c8b820a
Show file tree
Hide file tree
Showing 8 changed files with 1,814 additions and 0 deletions.
1,503 changes: 1,503 additions & 0 deletions barber2019/airfoil.txt

Large diffs are not rendered by default.

164 changes: 164 additions & 0 deletions barber2019/fig.airfoil.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
## Statistics
load("rda/airfoil_sim.rda")
apply(cov.mat, 2, mean, na.rm=TRUE)
apply(len.mat, 2, median, na.rm=TRUE)
colMeans(!is.finite(len.mat))
apply(beta.mat1, 2, mean, na.rm=TRUE)
apply(beta.mat2, 2, mean, na.rm=TRUE)
mean(ssize)

## Histograms, coverage
xlim = range(cov.mat[,1:6], na.rm=TRUE)
bks = seq(xlim[1],xlim[2],length=60)
ylim = c()
for (i in 1:6) {
ylim = range(c(ylim, hist(cov.mat[,i],breaks=bks,plot=FALSE)$count))
}

hh = 8; ww = 6
mar = c(4.25,4.25,1,1)
xlab = "Coverage"; ylab = "Frequency"

pdf(file="fig/airfoil_cov.pdf", height=hh, width=ww)
par(mar=mar, mfrow=c(3,1))
hist(cov.mat[,1], breaks=bks, col="red", xlim=xlim, ylim=ylim,
main="", xlab=xlab, ylab=ylab)
hist(cov.mat[,2], breaks=bks, col=rgb(0,0,1,0.5), add=TRUE)
rug(mean(cov.mat[,1]), col="red", lwd=4)
rug(mean(cov.mat[,2]), col=rgb(0,0,1,0.5), lwd=4)
legend("topleft", col=c("red",rgb(0,0,1,0.5)), lwd=4, lty=1,
legend=c("No covariate shift", "Covariate shift"))

hist(cov.mat[,3], breaks=bks, col="orange", xlim=xlim, ylim=ylim,
main="", xlab=xlab, ylab=ylab)
hist(cov.mat[,4], breaks=bks, col=rgb(0.5,0,0.5,0.5), add=TRUE)
rug(mean(cov.mat[,3]), col="orange", lwd=4)
rug(mean(cov.mat[,4]), col=rgb(0.5,0,0.5,0.5), lwd=4)
legend("topleft", col=c("orange",rgb(0.5,0,0.5,0.5)), lwd=4, lty=1,
legend=c("Oracle weights", "No shift, fewer samples"))

hist(cov.mat[,5], breaks=bks, col="gray65", xlim=xlim, ylim=ylim,
main="", xlab=xlab, ylab=ylab)
hist(cov.mat[,6], breaks=bks, col=rgb(0,1,0,0.5), add=TRUE)
rug(mean(cov.mat[,5]), col="gray65", lwd=4)
rug(mean(cov.mat[,6]), col=rgb(0,1,0,0.5), lwd=4)
legend("topleft", col=c("gray65",rgb(0,1,0,0.5)), lwd=4, lty=1,
legend=c("Logistic regression weights", "Random forest weights"))
graphics.off()

## Histograms, length
xlim = range(len.mat[,1:6], na.rm=TRUE, finite=TRUE)
bks = seq(xlim[1],xlim[2],length=75)
ylim = c()
for (i in 1:6) {
ylim = range(c(ylim, hist(len.mat[,i],breaks=bks,plot=FALSE)$count))
}

hh = 8; ww = 6
mar = c(4.25,4.25,1,1)
main = ""
xlab = "Length"; ylab = "Frequency"
xlim = c(xlim[1],40) # Manually trim this, due to RF long right tail

pdf(file="fig/airfoil_len.pdf", height=hh, width=ww)
par(mar=mar, mfrow=c(3,1))
hist(len.mat[,1], breaks=bks, col="red", xlim=xlim, ylim=ylim,
main="", xlab=xlab, ylab=ylab)
hist(len.mat[,2], breaks=bks, col=rgb(0,0,1,0.5), add=TRUE)
rug(median(len.mat[,1]), col="red", lwd=4)
rug(median(len.mat[,2]), col=rgb(0,0,1,0.5), lwd=4)
legend("topright", col=c("red",rgb(0,0,1,0.5)), lwd=4, lty=1,
legend=c("No covariate shift", "Covariate shift"))

hist(len.mat[,3], breaks=bks, col="orange", xlim=xlim, ylim=ylim,
main="", xlab=xlab, ylab=ylab)
hist(len.mat[,4], breaks=bks, col=rgb(0.5,0,0.5,0.5), add=TRUE)
rug(median(len.mat[,3]), col="orange", lwd=4)
rug(median(len.mat[,4]), col=rgb(0.5,0,0.5,0.5), lwd=4)
legend("topright", col=c("orange",rgb(0.5,0,0.5,0.5)), lwd=4, lty=1,
legend=c("Oracle weights", "No shift, fewer samples"))

hist(len.mat[,5], breaks=bks, col="gray65", xlim=xlim, ylim=ylim,
main="", xlab=xlab, ylab=ylab)
hist(len.mat[,6], breaks=bks, col=rgb(0,1,0,0.5), add=TRUE)
rug(median(len.mat[,5]), col="gray65", lwd=4)
rug(median(len.mat[,6]), col=rgb(0,1,0,0.5), lwd=4)
legend("topright", col=c("gray65",rgb(0,1,0,0.5)), lwd=4, lty=1,
legend=c("Logistic regression weights", "Random forest weights"))
graphics.off()

## Histograms, adaptive?
xlim1 = range(cov.mat[,c(1,7:8)], na.rm=TRUE, finite=TRUE)
bks1 = seq(xlim1[1],xlim1[2],length=40)
ylim1 = c()
for (i in c(1,7:8)) {
ylim1 = range(c(ylim1, hist(cov.mat[,i],breaks=bks1,plot=FALSE)$count))
}
xlab1 = "Coverage"; ylab1 = "Frequency"; cex = 0.9

xlim2 = range(len.mat[,c(1,7:8)], na.rm=TRUE, finite=TRUE)
bks2 = seq(xlim2[1],xlim2[2],length=70)
ylim2 = c()
for (i in c(1,7:8)) {
ylim2 = range(c(ylim2, hist(len.mat[,i],breaks=bks2,plot=FALSE)$count))
}
xlab2 = "Length"; ylab2 = "Frequency"
xlim2 = c(xlim2[1],25) # Manually trim this, due to RF long right tail

hh = 6.5; ww = 6.5
mar = c(4.25,4.25,1,1)

pdf(file="fig/airfoil_ada.pdf", height=hh, width=ww)
par(mar=mar, mfrow=c(2,1))
hist(cov.mat[,1], breaks=bks1, col="red", xlim=xlim1, ylim=ylim1,
main="", xlab=xlab1, ylab=ylab1, cex.lab=cex, cex.axis=cex)
hist(cov.mat[,7], breaks=bks1, col=rgb(0.65,0.65,0.65,0.5), add=TRUE)
hist(cov.mat[,8], breaks=bks1, col=rgb(0,1,0,0.5), add=TRUE)
rug(mean(cov.mat[,1]), col="red", lwd=4)
rug(mean(cov.mat[,7]), col="gray65", lwd=4)
rug(mean(cov.mat[,8]), col=rgb(0,1,0,0.5), lwd=4)
legend("topleft", col=c("red", rgb(0.65,0.65,0.65,0.5),rgb(0,1,0,0.5)),
lwd=4, lty=1, legend=c("Unity weights", "Logistic regression weights",
"Random forest weights"), cex=cex)

hist(len.mat[,1], breaks=bks2, col="red", xlim=xlim2, ylim=ylim2,
main="", xlab=xlab2, ylab=ylab2, cex.lab=cex, cex.axis=cex)
hist(len.mat[,7], breaks=bks2, col=rgb(0.65,0.65,0.65,0.5), add=TRUE)
hist(len.mat[,8], breaks=bks2, col=rgb(0,1,0,0.5), add=TRUE)
rug(median(len.mat[,1]), col="gray65", lwd=4)
rug(median(len.mat[,7]), col="gray65", lwd=4)
rug(median(len.mat[,8]), col=rgb(0,1,0,0.5), lwd=4)
graphics.off()

## Tilted densities
set.seed(0)
i = sample(N,n)
x = dat.x[i,]; y = dat.y[i]
x0 = dat.x[-i,]; y0 = dat.y[-i]
i0 = wsample(w(x0))
n00 = length(i0); x00 = x0[i0,]; y00 = y0[i0]

cex = 1.3
hh = 8; ww = 8.5
mar = c(4.25,4.25,1,1)

pdf(file="fig/airfoil_tilt.pdf", height=hh, width=ww)
par(mar=mar, mfcol=c(2,2))
plot(x0[,1], y0, main="", xlab="x1", ylab="y",
cex.lab=cex, cex.axis=cex)

d1 = density(x0[,1]); d2 = density(x00[,1])
plot(d1, type="l", ylim=c(min(d1$y,d2$y),max(d1$y,d2$y)),
main="", xlab="x1", ylab="Frequency",
cex.lab=cex, cex.axis=cex)
lines(d2, col="blue")

plot(x0[,5], y0, main="", xlab="x5", ylab="y",
cex.lab=cex, cex.axis=cex)

d1 = density(x0[,5]); d2 = density(x00[,5])
plot(d1, type="l", ylim=c(min(d1$y,d2$y),max(d1$y,d2$y)),
main="", xlab="x5", ylab="Frequency",
cex.lab=cex, cex.axis=cex)
lines(d2, col="blue")
graphics.off()
Binary file added barber2019/fig/airfoil_ada.pdf
Binary file not shown.
Binary file added barber2019/fig/airfoil_cov.pdf
Binary file not shown.
Binary file added barber2019/fig/airfoil_len.pdf
Binary file not shown.
Binary file added barber2019/fig/airfoil_tilt.pdf
Binary file not shown.
Binary file added barber2019/rda/airfoil_sim.rda
Binary file not shown.
147 changes: 147 additions & 0 deletions barber2019/sim.airfoil.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
## Load the airfoil data
dat = read.table("airfoil.txt")
dim(dat)
colnames(dat) = c("Frequency",
"Angle",
"Chord",
"Velocity",
"Suction",
"Sound")

dat.x = as.matrix(dat[,1:5])
dat.y = as.numeric(dat[,6])
dat.x[,1] = log(dat.x[,1]) # Log transform
dat.x[,5] = log(dat.x[,5]) # Log transform
N = nrow(dat.x); p = ncol(dat.x)

## Exponential tilting functions
w = function(x) {
exp(x[,c(1,5)] %*% c(-1,1))
}

wsample = function(wts, frac=0.25) {
n = length(wts)
i = c()
while(length(i) <= n*frac) {
i = c(i, which(runif(n) <= wts/max(wts)))
}
return(i)
}

## Run the simulation
library(randomForest)
library("conformalInference")

set.seed(0)
R = 5000
cov.mat = len.mat = matrix(0,R,8)
beta.mat1 = beta.mat2 = matrix(0,R,p+1)
ssize = numeric(R)
n = round(N/2)
my.lm.funs = lm.funs()

for (r in 1:R) {
if (r %% 20 == 0) cat(r,".. ")
i = sample(N,n)
x = dat.x[i,]; y = dat.y[i]
x0 = dat.x[-i,]; y0 = dat.y[-i]

# No tilting
out1 = conformal.pred.split(x, y, x0,
train.fun=my.lm.funs$train,
predict.fun=my.lm.funs$predict)

cov.mat[r,1] = mean(out1$lo <= y0 & y0 <= out1$up)
len.mat[r,1] = median(out1$up - out1$lo)

# Tilting
i0 = wsample(w(x0))
x00 = x0[i0,]; y00 = y0[i0]
i1 = out1$split; n1 = length(out1$split)
ssize[r] = sum(w(x[-i1,]))^2/sum(w(x[-i1,])^2)

# Unweighted
out2 = conformal.pred.split(x, y, x00,
train.fun=my.lm.funs$train,
predict.fun=my.lm.funs$predict)

cov.mat[r,2] = mean(out2$lo <= y00 & y00 <= out2$up)
len.mat[r,2] = median(out2$up - out2$lo)

# Weighted, oracle
out3 = conformal.pred.split(x, y, x00, w=w(rbind(x,x00)),
train.fun=my.lm.funs$train,
predict.fun=my.lm.funs$predict)

cov.mat[r,3] = mean(out3$lo <= y00 & y00 <= out3$up)
len.mat[r,3] = median(out3$up - out3$lo)

# No tilting, smaller sample size
i2 = (1:n)[-i1]
i3 = c(i1, sample(i2, floor(ssize[r])))
out4 = conformal.pred.split(x[i3,], y[i3], x0, split=1:n1,
train.fun=my.lm.funs$train,
predict.fun=my.lm.funs$predict)

cov.mat[r,4] = mean(out4$lo <= y0 & y0 <= out4$up)
len.mat[r,4] = median(out4$up - out4$lo)

# Weighted, estimated with logistic regression
zy = as.factor(c(rep(0,n),rep(1,nrow(x00))))
zx = rbind(x,x00)
obj.glm = glm(zy ~ zx, family="binomial")
prob.glm = predict(obj.glm, type="response")
wts.glm = prob.glm / (1-prob.glm)
beta.mat1[r,] = coef(obj.glm)

out5 = conformal.pred.split(x, y, x00, w=wts.glm,
train.fun=my.lm.funs$train,
predict.fun=my.lm.funs$predict)

cov.mat[r,5] = mean(out5$lo <= y00 & y00 <= out5$up)
len.mat[r,5] = median(out5$up - out5$lo)

# Weighted, estimated with random forest
obj.rf = randomForest(zx, zy)
prob.rf = predict(obj.rf, type="prob")[,2]
prob.rf = pmax(pmin(prob.rf, 0.99), 0.01)
wts.rf = prob.rf / (1-prob.rf)

out6 = conformal.pred.split(x, y, x00, w=wts.rf,
train.fun=my.lm.funs$train,
predict.fun=my.lm.funs$predict)

cov.mat[r,6] = mean(out6$lo <= y00 & y00 <= out6$up)
len.mat[r,6] = median(out6$up - out6$lo)

# No tilting, weighted, estimated with logistic
zy = as.factor(c(rep(0,n),rep(1,nrow(x0))))
zx = rbind(x,x0)
obj.glm = glm(zy ~ zx, family="binomial")
prob.glm = predict(obj.glm, type="response")
wts.glm = prob.glm / (1-prob.glm)
beta.mat2[r,] = coef(obj.glm)

out7 = conformal.pred.split(x, y, x0, w=wts.glm,
train.fun=my.lm.funs$train,
predict.fun=my.lm.funs$predict)

cov.mat[r,7] = mean(out7$lo <= y0 & y0 <= out7$up)
len.mat[r,7] = median(out7$up - out7$lo)

# Weighted, estimated with random forest
obj.rf = randomForest(zx, zy)
prob.rf = predict(obj.rf, type="prob")[,2]
prob.rf = pmax(pmin(prob.rf, 0.99), 0.01)
wts.rf = prob.rf / (1-prob.rf)

out8 = conformal.pred.split(x, y, x0, w=wts.rf,
train.fun=my.lm.funs$train,
predict.fun=my.lm.funs$predict)

cov.mat[r,8] = mean(out8$lo <= y0 & y0 <= out8$up)
len.mat[r,8] = median(out8$up - out8$lo)
}
cat("\n")

save(list=ls(), file="rda/airfoil_sim.rda")

0 comments on commit c8b820a

Please sign in to comment.