Hi all,
A new version of ipumsr has been posted to CRAN. It features some speed improvements, a new style of “yield” reading that gives more flexibility when working with large extracts, and a fancy new pkgdown website.
The “yield” reading is described in the “big-data” vignette, but as a preview, here’s how you can now run a logistic regression without ever holding the full data in memory:
library(ipumsr)
library(biglm)
library(dplyr)
# Extract files used in the CPS vignette: DDI codebook plus fixed-width data
cps_ddi_file <- "cps_00001.xml"
cps_data_file <- "cps_00001.dat"

# Setup connection to data ----
# `data` is the yield reader; rows are pulled from it in chunks further down
# via data$yield(), and it can be rewound with data$reset().
data <- read_ipums_micro_yield(
  ddi = cps_ddi_file,
  data_file = cps_data_file,
  verbose = FALSE
)
# Function to prepare data for biglm ----
# Chunk supplier handed to bigglm() as its `data` argument.
#
# reset: TRUE  -> rewind the yield reader (`data`) and return whatever
#                 data$reset() returns.
#        FALSE -> pull the next chunk, recode it, and return it; returns NULL
#                 once the reader has nothing left to yield.
get_model_data <- function(reset) {
  if (reset) {
    return(data$reset())
  }

  chunk <- data$yield(n = 1000) # Set n pretty low for example
  if (is.null(chunk)) {
    return(NULL)
  }

  recoded <- mutate(
    chunk,
    HEALTH = as_factor(HEALTH),
    # NIU hours become NA first, so the comparison yields NA for those rows
    WORK30PLUS = lbl_na_if(AHRSWORKT, ~.lbl == "NIU (Not in universe)") >= 30,
    # Collapse employment status to a Yes/No factor
    AT_WORK = as_factor(
      lbl_relabel(
        EMPSTAT,
        lbl(1, "Yes") ~ .lbl == "At work",
        lbl(0, "No") ~ .lbl != "At work"
      )
    )
  )

  # Only people who were at work enter the model
  filter(recoded, AT_WORK == "Yes")
}
# Run regression ----
# Fit a logit of working 30+ hours on age, age squared, and self-reported
# health. `data` is given the chunk-supplier function rather than a data
# frame, so bigglm pulls the rows through get_model_data.
# NOTE: the call is kept exactly as written because summary() echoes it
# verbatim in its header.
results <- bigglm(
WORK30PLUS ~ AGE + I(AGE^2) + HEALTH,
family = binomial(link = "logit"),
data = get_model_data
)
# Print sample size, coefficients, 95% CIs, SEs, and p-values
summary(results)
#> Large data regression model: bigglm(WORK30PLUS ~ AGE + I(AGE^2) + HEALTH, family = binomial(link = "logit"),
#> data = get_model_data)
#> Sample size = 88801
#> Coef (95% CI) SE p
#> (Intercept) -3.9623 -4.0956 -3.8290 0.0667 0.0000
#> AGE 0.2677 0.2609 0.2744 0.0034 0.0000
#> I(AGE^2) -0.0029 -0.0030 -0.0028 0.0000 0.0000
#> HEALTHVery good 0.0491 0.0043 0.0939 0.0224 0.0282
#> HEALTHGood -0.1248 -0.1737 -0.0760 0.0244 0.0000
#> HEALTHFair -0.6458 -0.7250 -0.5666 0.0396 0.0000
#> HEALTHPoor -0.9561 -1.1209 -0.7913 0.0824 0.0000
Created on 2019-03-11 by the reprex package (v0.2.1)