surveyCV
Stat paper based on SDSS presentationWriting cleaner code for only the simulations needed for our submission to Stat (the special issue for SDSS 2021 talks).
Based on the earlier sims (now on GitHub in data-raw/plot_generation.R
and data-raw/early-results.html
) – but instead of having similar functions repeatedly defined separately as in that file, use our functions such as cv.svy()
directly from the package.
library(survey)
library(ggplot2)
library(splines)
library(magrittr)
library(surveyCV)
<- FALSE # when we *do* rerun sims, remember to reinstall package & restart R
rerunSims ::opts_chunk$set(dpi=300, out.extra ='WIDTH="95%"') knitr
# Generate a cubic-ish artificial population with 1000 units
set.seed(47)
<- 1000
N <- 10
nrStrata <- 100
nrClusters <- N/nrClusters
clusSizes
= stats::runif(1:(N/2), min = 26, max = 38)
x1 = (x1-29)^3 - 13*(x1-29)^2 + 0*(x1-29) + 900
y1
= stats::runif(1:(N/2), min = 38, max = 50)
x2 = (x2-36)^3 - 10*(x2-36)^2 + 2*(x2-36) + 600
y2
= jitter(y1, 15000)
z1 = jitter(y2, 15000)
z2
<- data.frame(Response = z1, Predictor = x1)
ds1 <- data.frame(Response = z2, Predictor = x2)
ds2 <- rbind(ds1, ds2)
ds12
<- data.frame(ID = c(1:1000))
b12 <- cbind(b12, ds12)
splinepop.df <- splinepop.df %>%
splinepop.df ::arrange(Predictor) %>%
dplyr# Create Cluster and Stratum variables so that
# we can (separately) simulate both kinds of sampling
::mutate(Stratum = dplyr::row_number(),
dplyrCluster = dplyr::row_number()) %>%
::mutate(Stratum = cut(Stratum, nrStrata, 1:nrStrata),
dplyrCluster = cut(Cluster, nrClusters, 1:nrClusters)) %>%
# Create fpc variables for each sampling type:
# full pop has 1000 units; 100 units/stratum; 100 clusters (of 10 units each);
# & if we added a combined Strat+Clus sim, we'd need different fpc's for that
::mutate(fpcSRS = N,
dplyrfpcStratum = N/nrStrata,
fpcCluster = nrClusters) %>%
::arrange(ID)
dplyr
# Create unequal sampling weights that are intentionally
# NOT well matched to the true shape of the population,
# so we can see how the use of weights impacts
# model-training step vs test-error estimation step
# in a case where naively ignoring weights will confidently pick wrong model
<- stats::lm(Response ~ Predictor + I(Predictor^2),
lm_quad data = splinepop.df)
$samp_prob_quad <-
splinepop.df1/(abs(lm_quad$residuals))) / sum(1/(abs(lm_quad$residuals)))
($samp_wt_quad <- 1/splinepop.df$samp_prob_quad
splinepop.df
stopifnot(all.equal(sum(1/splinepop.df$samp_wt_quad), 1))
<- 100
n
<- dplyr::sample_n(splinepop.df, n)
srs.df
<- rep(n/nrStrata, nrStrata)
stratcounts names(stratcounts) <- 1:nrStrata
<- stratsample(splinepop.df$Stratum, stratcounts)
s <- splinepop.df[s,]
strat.df
<- unique(splinepop.df[["Cluster"]])
c <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]
clus.df
<- ggplot(splinepop.df, aes(x = Predictor, y = Response)) +
a geom_point(shape = 1) +
labs(title = "Artificial Pop.", x = "Predictor", y = "Response") +
ylim(100,1650) + xlim(25, 51)
<- ggplot(srs.df, aes(x = Predictor, y = Response)) +
b geom_point(shape = 8, color = "darkgreen") +
labs(title = "SRS", x = "Predictor", y = "Response") +
ylim(100,1650) + xlim(25, 51)
<- ggplot(strat.df, aes(x = Predictor, y = Response)) +
c geom_point(shape = 3, color = "darkred") +
labs(title = "Stratified Sample", x = "Predictor", y = "Response") +
ylim(100,1650) + xlim(25, 51)
<- ggplot(clus.df, aes(x = Predictor, y = Response)) +
d geom_point(shape = 4, color = "steelblue") +
labs(title = "Cluster Sample", x = "Predictor", y = "Response") +
ylim(100,1650) + xlim(25, 51)
::grid.arrange(a, b, c, d, ncol = 4,
gridExtratop = "Simulated Data and Examples of How It Was Sampled")
# TODO: redo with different n's?
<- 100
n <- 500 loops
# Use SRS samples and make SRS folds
<- data.frame(df = c(), MSE = c())
srssrsds # Use SRS samples and evaluate on full pop
<- data.frame(df = c(), MSE = c())
srspopds
# Use Cluster samples and make Cluster folds
<- data.frame(df = c(), MSE = c())
clusclusds # Use Cluster samples and make SRS folds
<- data.frame(df = c(), MSE = c())
clussrsds # Use Cluster samples and evaluate on full pop
<- data.frame(df = c(), MSE = c())
cluspopds
# Use Strat samples and make Strat folds
<- data.frame(df = c(), MSE = c())
stratstratds # Use Strat samples and make SRS folds
<- data.frame(df = c(), MSE = c())
stratsrsds # Use Strat samples and evaluate on full pop
<- data.frame(df = c(), MSE = c())
stratpopds
<- c("Response~splines::ns(Predictor, df=1)",
modelsToFit "Response~splines::ns(Predictor, df=2)",
"Response~splines::ns(Predictor, df=3)",
"Response~splines::ns(Predictor, df=4)",
"Response~splines::ns(Predictor, df=5)",
"Response~splines::ns(Predictor, df=6)")
# Making as many simple random samples as we specify for 'loops'
for (i in 1:loops) {
# Take a SRS
<- dplyr::sample_n(splinepop.df, n)
sim.srs
# Using our SRS function on SRS samples
<- cv.svy(sim.srs, modelsToFit,
srs.CV.out nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
<- data.frame(df = 1:6, MSE = srs.CV.out)
srssrsds2 <- rbind(srssrsds, srssrsds2)
srssrsds
# Eval SRS samples on full pop
<- dplyr::sample_n(sim.srs, n*.8)
sub.srs <- svydesign(ids = ~1, fpc = ~fpcSRS, data = sub.srs)
srs.des <- sapply(1:6,
srs.pop.out function(ii) {
<- predict(svyglm(modelsToFit[ii], srs.des),
yhat newdata = splinepop.df)
return(mean((yhat - splinepop.df$Response)^2))
})<- data.frame(df = 1:6, MSE = srs.pop.out)
srspopds2 <- rbind(srspopds, srspopds2)
srspopds
}
# Making as many cluster samples as we specify for 'loops'
for (i in 1:loops) {
# Take a Cluster sample
<- unique(splinepop.df[["Cluster"]])
c <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]
sim.clus
# Using our Cluster function on Cluster samples
<- cv.svy(sim.clus, modelsToFit,
clus.CV.out clusterID = "Cluster", nfolds = 5, fpcID = "fpcCluster") %>% as.vector()
<- data.frame(df = 1:6, MSE = clus.CV.out)
clusclusds2 <- rbind(clusclusds, clusclusds2)
clusclusds
# Using our SRS function on Cluster samples
<- cv.svy(sim.clus, modelsToFit,
srs.CV.out nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
<- data.frame(df = 1:6, MSE = srs.CV.out)
clussrsds2 <- rbind(clussrsds, clussrsds2)
clussrsds
# Eval 80% Cluster samples on full pop
<- unique(sim.clus[["Cluster"]])
sub.c <- sim.clus[sim.clus[["Cluster"]] %in% sample(sub.c, (n/clusSizes)*.8),]
sub.clus
<- svydesign(ids = ~Cluster, fpc = ~fpcCluster, data = sub.clus)
clus.des <- sapply(1:6,
clus.pop.out function(ii) {
<- predict(svyglm(modelsToFit[ii], clus.des),
yhat newdata = splinepop.df)
return(mean((yhat - splinepop.df$Response)^2))
})<- data.frame(df = 1:6, MSE = clus.pop.out)
cluspopds2 <- rbind(cluspopds, cluspopds2)
cluspopds
}
# Making as many stratified samples as we specify for 'loops'
for (i in 1:loops) {
# Take a Strat sample
<- rep(n/nrStrata, nrStrata)
stratcounts names(stratcounts) <- 1:nrStrata
<- stratsample(splinepop.df$Stratum, stratcounts)
s <- splinepop.df[s,]
sim.strat
# Using our Strat function on Strat samples
<- cv.svy(sim.strat, modelsToFit,
strat.CV.out strataID = "Stratum", nfolds = 5, fpcID = "fpcStratum") %>% as.vector()
<- data.frame(df = 1:6, MSE = strat.CV.out)
stratstratds2 <- rbind(stratstratds, stratstratds2)
stratstratds
# Using our SRS function on Strat samples
<- cv.svy(sim.strat, modelsToFit,
srs.CV.out nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
<- data.frame(df = 1:6, MSE = srs.CV.out)
stratsrsds2 <- rbind(stratsrsds, stratsrsds2)
stratsrsds
# Eval 80% Strat samples on full pop
<- rep((n/nrStrata)*.8, nrStrata)
sub.stratcounts names(sub.stratcounts) <- 1:nrStrata
<- survey::stratsample(sim.strat$Stratum, sub.stratcounts)
sub.s <- sim.strat[sub.s,]
sub.strat
<- svydesign(ids = ~1, strata = ~Stratum, fpc = ~fpcStratum, data = sub.strat)
strat.des <- sapply(1:6,
strat.pop.out function(ii) {
<- predict(svyglm(modelsToFit[ii], strat.des),
yhat newdata = splinepop.df)
return(mean((yhat - splinepop.df$Response)^2))
})<- data.frame(df = 1:6, MSE = strat.pop.out)
stratpopds2 <- rbind(stratpopds, stratpopds2)
stratpopds
}
# Making the degrees of freedom variable a factor variable
$df <- as.factor(srssrsds$df)
srssrsds$df <- as.factor(srspopds$df)
srspopds$df <- as.factor(clusclusds$df)
clusclusds$df <- as.factor(clussrsds$df)
clussrsds$df <- as.factor(cluspopds$df)
cluspopds$df <- as.factor(stratstratds$df)
stratstratds$df <- as.factor(stratsrsds$df)
stratsrsds$df <- as.factor(stratpopds$df)
stratpopds
::use_data(srssrsds, srspopds,
usethis
clusclusds, clussrsds, cluspopds,
stratstratds, stratsrsds, stratpopds,internal = FALSE, overwrite = TRUE)
# TODO: for now setting internal=FALSE
# so that we don't overwrite the sysdata.Rda for intro.Rmd;
# but eventually when we're ready to replace that old vignette,
# we can return to internal=TRUE here
# if that's indeed better for R packages
# ...
# OK, now we've written those dataframes,
# AND the 4 below, all into R/sysdata.Rda together
if(!rerunSims) {
<- surveyCV:::srssrsds
srssrsds <- surveyCV:::clusclusds
clusclusds <- surveyCV:::clussrsds
clussrsds <- surveyCV:::stratstratds
stratstratds <- surveyCV:::stratsrsds
stratsrsds <- surveyCV:::srspopds
srspopds <- surveyCV:::cluspopds
cluspopds <- surveyCV:::stratpopds
stratpopds
}
# Find the y-ranges of MSEs
<- min(min(srssrsds$MSE),
yminMSE min(clusclusds$MSE), min(clussrsds$MSE),
min(stratstratds$MSE), min(stratsrsds$MSE))
<- max(max(srssrsds$MSE),
ymaxMSE max(clusclusds$MSE), max(clussrsds$MSE),
max(stratstratds$MSE), max(stratsrsds$MSE))
<- dplyr::group_by(srspopds, df) %>%
srspopds ::summarise(MSE = mean(MSE)) %>%
dplyr::mutate(df = as.numeric(df))
dplyr<- dplyr::group_by(cluspopds, df) %>%
cluspopds ::summarise(MSE = mean(MSE)) %>%
dplyr::mutate(df = as.numeric(df))
dplyr<- dplyr::group_by(stratpopds, df) %>%
stratpopds ::summarise(MSE = mean(MSE)) %>%
dplyr::mutate(df = as.numeric(df))
dplyr
# Making ggplot objects for the MSEs and AICs
<- ggplot(data = srssrsds, mapping = aes(x = df, y = MSE/1e4)) +
plot.srssrs geom_boxplot() +
geom_line(data = srspopds, mapping = aes(x = df, y = MSE/1e4)) +
ggtitle("CV: SRS folds,\nSRS sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
<- rectGrob(width = 0, height = 0) # empty spot here -- no corresponding sim
plot.blank <- ggplot(data = clusclusds, mapping = aes(x = df, y = MSE/1e4)) +
plot.clusclus geom_boxplot() +
geom_line(data = cluspopds) +
ggtitle("CV: Cluster folds,\nCluster sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
<- ggplot(data = clussrsds, mapping = aes(x = df, y = MSE/1e4)) +
plot.clussrs geom_boxplot() +
geom_line(data = cluspopds) +
ggtitle("CV: SRS folds,\nCluster sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
<- ggplot(data = stratstratds, mapping = aes(x = df, y = MSE/1e4)) +
plot.stratstrat geom_boxplot() +
geom_line(data = stratpopds) +
ggtitle("CV: Strat folds,\nStrat sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
<- ggplot(data = stratsrsds, mapping = aes(x = df, y = MSE/1e4)) +
plot.stratsrs geom_boxplot() +
geom_line(data = stratpopds) +
ggtitle("CV: SRS folds,\nStrat sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
::grid.arrange(plot.srssrs, plot.clussrs, plot.stratsrs,
gridExtra
plot.blank, plot.clusclus, plot.stratstrat,ncol = 3,
top = paste0("Simulated Data (Sample Sizes = ", n,
", Clusters = ", n/clusSizes,
" or Strata = ", nrStrata,
", Loops = ", loops, ", Folds = 5)"))
ggplot(splinepop.df,
aes(x = Predictor, y = Response)) +
geom_point(aes(size = samp_prob_quad), alpha = 0.1) +
stat_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
labs(title = "Artificial Population with Sampling Probabilities",
subtitle = "Sampling prob. is higher for points nearer the solid curve",
x = "Predictor", y = "Response",
size = "Sampling Probability")
# TODO: redo with different n's?
<- 100
n <- 500
loops <- "samp_wt_quad" weights
# In all the following cases,
# we are only checking the effect of using sampling **weights**
# during model-fitting on training sets, and testing on test sets;
# we did not use cluster or stratified sampling,
# so all CV folds will (sensibly) be SRS (regardless of samp weights)
# Use weighted models, and calculate MSEs using weighted design
<- data.frame(df = c(), MSE = c())
AllW # Use SRS models, and calculate MSEs using SRS design
<- data.frame(df = c(), MSE = c())
NoW # Use weighted models, and calculate MSEs using SRS design
<- data.frame(df = c(), MSE = c())
ModW # Use SRS models, and calculate MSEs using weighted design
<- data.frame(df = c(), MSE = c())
MSEW
for (i in 1:loops) {
# Take a sample of size n, using the sampling probabilities instead of SRS
# (using 1/samp_wt as the samp_prob per unit,
# or n/samp_wt as overall samp_prob to get a sample of size n)
<- sampling::UPtille(n / splinepop.df[[weights]])
in.sample <- splinepop.df[in.sample > 0, ]
splinesamp.df <- c("Response~ns(Predictor, df=1)", "Response~ns(Predictor, df=2)",
modelsToFit "Response~ns(Predictor, df=3)", "Response~ns(Predictor, df=4)",
"Response~ns(Predictor, df=5)", "Response~ns(Predictor, df=6)")
<- cv.svy(splinesamp.df, modelsToFit,
AllWdat nfolds = 5, weightsID = weights) %>% as.vector()
<- cv.svy(splinesamp.df, modelsToFit,
NoWdat nfolds = 5) %>% as.vector()
<- cv.svy(splinesamp.df, modelsToFit,
ModWdat nfolds = 5, weightsID = weights,
useSvyForLoss = FALSE) %>% as.vector()
<- cv.svy(splinesamp.df, modelsToFit,
MSEWdat nfolds = 5, weightsID = weights,
useSvyForFits = FALSE) %>% as.vector()
# compiling one data frame
<- data.frame(df = 1:6, MSE = AllWdat, sample = rep(i, 6))
AllW2 <- rbind(AllW, AllW2)
AllW <- data.frame(df = 1:6, MSE = NoWdat, sample = rep(i, 6))
NoW2 <- rbind(NoW, NoW2)
NoW <- data.frame(df = 1:6, MSE = ModWdat, sample = rep(i, 6))
ModW2 <- rbind(ModW, ModW2)
ModW <- data.frame(df = 1:6, MSE = MSEWdat, sample = rep(i, 6))
MSEW2 <- rbind(MSEW, MSEW2)
MSEW
}
# Making the degrees of freedom variable a factor variable
$df <- as.factor(AllW$df)
AllW$df <- as.factor(NoW$df)
NoW$df <- as.factor(MSEW$df)
MSEW$df <- as.factor(ModW$df)
ModW
::use_data(AllW, NoW,
usethis
MSEW, ModW,internal = FALSE, overwrite = TRUE)
# TODO: as above, for now setting internal=FALSE
# so that we don't overwrite the sysdata.Rda for intro.Rmd;
# but eventually when we're ready to replace that old vignette,
# we can return to internal=TRUE here
# if that's indeed better for R packages
# ...
# OK, now we've written those dataframes,
# AND the 8 above, all into R/sysdata.Rda together
if(!rerunSims) {
<- surveyCV:::AllW
AllW <- surveyCV:::NoW
NoW <- surveyCV:::MSEW
MSEW <- surveyCV:::ModW
ModW
}
# Find the y-range of MSEs
<- min(min(AllW$MSE), min(NoW$MSE), min(MSEW$MSE), min(ModW$MSE))
ymin <- max(max(AllW$MSE), max(NoW$MSE), max(MSEW$MSE), max(ModW$MSE))
ymax # Making a ggplot object for the MSEs collected when using
# SRS folds, SRS models, and SRS error calculations
<- ggplot(data = AllW, mapping = aes(x = df, y = MSE/1e5)) +
pAllW ggtitle("Weights for both training and testing") +
scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# SRS folds, Clus models, and SRS error calculations
<- ggplot(data = NoW, mapping = aes(x = df, y = MSE/1e5)) +
pNoW ggtitle("No weights") +
scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# Clus folds, Clus models, and Clus error calculations
<- ggplot(data = ModW, mapping = aes(x = df, y = MSE/1e5)) +
pModW ggtitle("Weights when training models") +
scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# Clus folds, SRS models, and Clus error calculations
<- ggplot(data = MSEW, mapping = aes(x = df, y = MSE/1e5)) +
pMSEW ggtitle("Weights when estimating test MSE") +
scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a grid display of the four plot objects above
<- matrix(c(NA, 1, 1, NA,
lay NA, 1, 1, NA,
2, 2, 3, 3,
2, 2, 3, 3,
NA, 4, 4, NA,
NA, 4, 4, NA), byrow = TRUE, ncol = 4)
::grid.arrange(pAllW, pModW, pMSEW, pNoW, ncol = 2,
gridExtratop = paste0("Simulated Data (Sample Sizes = ", n,
", Loops = ", loops, ", 5 Folds, Samp. Wts from Quad. Fit)"),
layout_matrix = lay)