outfile <- "data/blockshuffle_testing_data.csv.gz"
if(!file.exists(outfile)) {
libs <- system("pkg-config --libs libzstd", intern = TRUE)
cflags <- system("pkg-config --cflags libzstd", intern = TRUE)
Sys.setenv(PKG_CPPFLAGS = "-mavx2 %s %s" | c(cflags, libs))
Sys.setenv(PKG_LIBS = "-mavx2 %s %s" | c(cflags, libs))
sourceCpp("blockshuffle_heuristic.cpp", verbose=TRUE, rebuild = TRUE)
min_shuffleblock_size <- 262144
blocks_df <- lapply(datasets, function(d) {
tmp <- tempfile()
data <- read_dataset(d)
dname<- basename(d) %>% gsub("\\..+", "", .)
qs2::qd_save(data, file = tmp)
x <- qs2::qx_dump(tmp)
r1 <- tibble(dataset = dname, blocks = x$blocks, algo = "qdata")
qs2::qs_save(data, file = tmp)
x <- qs2::qx_dump(tmp)
r2 <- tibble(dataset = dname, blocks = x$blocks, algo = "qs2")
rbind(r1, r2)
}) %>% rbindlist
blocks_df$blocksize <- sapply(blocks_df$blocks, length)
blocks_df <- filter(blocks_df, blocksize >= min_shuffleblock_size)
gc(full=TRUE)
compress_levels <- 22:1
results <- mclapply(compress_levels, function(cl) {
print(cl)
output <- shuffle_heuristic(blocks_df$blocks)
output$no_shuffle_zblocksize <- og_compress(blocks_df$blocks, cl)$size
output$shuffle_zblocksize <- shuffle_compress(blocks_df$blocks, 8, cl)$size
output <- output %>% mutate(compress_level = cl)
}, mc.cores=8, mc.preschedule=FALSE) %>% rbindlist
results2 <- blocks_df %>% dplyr::select(dataset, algo, blocksize) %>%
{lapply(1:length(compress_levels), function(i) .)} %>% rbindlist
results <- cbind(results2, results)
# add block index per dataset
results <- results %>%
group_by(dataset, compress_level, algo) %>%
mutate(index = 1:n()) %>%
as.data.frame
fwrite(results, outfile, sep = ",")
} else {
results <- fread(outfile, data.table=FALSE)
}
##
## Generated extern "C" functions
## --------------------------------------------------------
##
##
## #include <Rcpp.h>
## #ifdef RCPP_USE_GLOBAL_ROSTREAM
## Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
## Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
## #endif
##
## // SIMD_test
## std::string SIMD_test();
## RcppExport SEXP sourceCpp_3_SIMD_test() {
## BEGIN_RCPP
## Rcpp::RObject rcpp_result_gen;
## rcpp_result_gen = Rcpp::wrap(SIMD_test());
## return rcpp_result_gen;
## END_RCPP
## }
## // shuffle_compress
## DataFrame shuffle_compress(List blocks, int elementsize, int compress_level);
## RcppExport SEXP sourceCpp_3_shuffle_compress(SEXP blocksSEXP, SEXP elementsizeSEXP, SEXP compress_levelSEXP) {
## BEGIN_RCPP
## Rcpp::RObject rcpp_result_gen;
## Rcpp::traits::input_parameter< List >::type blocks(blocksSEXP);
## Rcpp::traits::input_parameter< int >::type elementsize(elementsizeSEXP);
## Rcpp::traits::input_parameter< int >::type compress_level(compress_levelSEXP);
## rcpp_result_gen = Rcpp::wrap(shuffle_compress(blocks, elementsize, compress_level));
## return rcpp_result_gen;
## END_RCPP
## }
## // og_compress
## DataFrame og_compress(List blocks, int compress_level);
## RcppExport SEXP sourceCpp_3_og_compress(SEXP blocksSEXP, SEXP compress_levelSEXP) {
## BEGIN_RCPP
## Rcpp::RObject rcpp_result_gen;
## Rcpp::traits::input_parameter< List >::type blocks(blocksSEXP);
## Rcpp::traits::input_parameter< int >::type compress_level(compress_levelSEXP);
## rcpp_result_gen = Rcpp::wrap(og_compress(blocks, compress_level));
## return rcpp_result_gen;
## END_RCPP
## }
## // shuffle_heuristic
## DataFrame shuffle_heuristic(List blocks);
## RcppExport SEXP sourceCpp_3_shuffle_heuristic(SEXP blocksSEXP) {
## BEGIN_RCPP
## Rcpp::RObject rcpp_result_gen;
## Rcpp::traits::input_parameter< List >::type blocks(blocksSEXP);
## rcpp_result_gen = Rcpp::wrap(shuffle_heuristic(blocks));
## return rcpp_result_gen;
## END_RCPP
## }
##
## Generated R functions
## -------------------------------------------------------
##
## `.sourceCpp_3_DLLInfo` <- dyn.load('/tmp/Rtmpf23OsJ/sourceCpp-x86_64-pc-linux-gnu-1.0.12/sourcecpp_1e9091a5542cd/sourceCpp_4.so')
##
## SIMD_test <- Rcpp:::sourceCppFunction(function() {}, FALSE, `.sourceCpp_3_DLLInfo`, 'sourceCpp_3_SIMD_test')
## shuffle_compress <- Rcpp:::sourceCppFunction(function(blocks, elementsize, compress_level) {}, FALSE, `.sourceCpp_3_DLLInfo`, 'sourceCpp_3_shuffle_compress')
## og_compress <- Rcpp:::sourceCppFunction(function(blocks, compress_level) {}, FALSE, `.sourceCpp_3_DLLInfo`, 'sourceCpp_3_og_compress')
## shuffle_heuristic <- Rcpp:::sourceCppFunction(function(blocks) {}, FALSE, `.sourceCpp_3_DLLInfo`, 'sourceCpp_3_shuffle_heuristic')
##
## rm(`.sourceCpp_3_DLLInfo`)
##
## Building shared library
## --------------------------------------------------------
##
## DIR: /tmp/Rtmpf23OsJ/sourceCpp-x86_64-pc-linux-gnu-1.0.12/sourcecpp_1e9091a5542cd
##
## /usr/lib/R/bin/R CMD SHLIB --preclean -o 'sourceCpp_4.so' 'blockshuffle_heuristic.cpp'
# compare C++ implementation
test_data <- results %>% mutate(improvement = log(no_shuffle_zblocksize/shuffle_zblocksize))
timing_data <- test_data %>% dplyr::select(h1,h2,h3,h4,h5,h6,h7,h8,compress_level) %>% {lapply(1:5, function(i) .)} %>% rbindlist
tic(msg = "R package prediction time")
dtest <- xgb.DMatrix(data = timing_data %>% data.matrix)
r_pred <- predict(bst, dtest)
toc()
## R package prediction time: 3.576 sec elapsed
tic(msg = "Cpp package prediction time")
cpp_pred <- predict_xgboost_cpp(timing_data)
toc()
## Cpp package prediction time: 6.966 sec elapsed
dtest <- xgb.DMatrix(data = test_data %>% dplyr::select(h1,h2,h3,h4,h5,h6,h7,h8,compress_level) %>% data.matrix)
test_data <- test_data %>%
mutate(r_prediction = predict(bst, dtest)) %>%
mutate(cpp_prediction = predict_xgboost_cpp(test_data %>% dplyr::select(h1,h2,h3,h4,h5,h6,h7,h8,compress_level)))
# compare predictions
abs(test_data$r_prediction - test_data$cpp_prediction) %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000001 0.032395 0.072015 0.119527 0.129750 1.731415
# compare prediction to actual
MAX_COMPRESS <- 16 # don't plot blocks with >16x compression, noisy
test_data2 <- test_data %>%
filter(algo == "qdata") %>%
filter(32768*4/(h2 + h4 + h6 + h8) < MAX_COMPRESS)
pal <- palette.colors(palette = "Okabe-Ito")
ggplot(test_data2, aes(x = cpp_prediction, y = improvement, color = dataset)) +
geom_abline(aes(slope=1, intercept = 0), lty = 2) +
geom_vline(aes(xintercept=0), lty=2, color = "orange") +
geom_point(shape=21, alpha=0.75) +
facet_wrap(~compress_level, ncol=4) +
scale_color_manual(values = pal) +
theme_bw(base_size = 12)

# plot cumulative savings
test_data3 <- test_data %>%
filter(algo == "qdata") %>%
mutate(predicted_compression = 32768*4/(h2 + h4 + h6 + h8)) %>%
mutate(do_blockshuffle = predicted_compression < MAX_COMPRESS & cpp_prediction > 0) %>%
group_by(dataset, compress_level) %>%
mutate(optimal = cumsum(pmin(no_shuffle_zblocksize, shuffle_zblocksize))) %>%
mutate(shuffle_heuristic = cumsum(ifelse(do_blockshuffle, shuffle_zblocksize, no_shuffle_zblocksize))) %>%
mutate(no_shuffle = cumsum(no_shuffle_zblocksize)) %>%
mutate(heuristic_was_optimal = do_blockshuffle == (shuffle_zblocksize < no_shuffle_zblocksize) ) %>%
ungroup
test_data3 <- test_data3 %>%
filter(compress_level %in% c(3,9)) %>%
dplyr::select(dataset, compress_level, index, optimal, shuffle_heuristic, no_shuffle) %>%
pivot_longer(c(-index, -dataset, -optimal, -compress_level), names_to = "shuffle_selection", values_to = "cumulative_bytes") %>%
mutate(inefficiency = (cumulative_bytes - optimal)/max(optimal) )
ggplot(test_data3, aes(x = index, y = inefficiency, color = shuffle_selection, lty = factor(compress_level))) +
geom_line() +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~dataset, scales = "free") +
theme_bw(base_size=12) +
labs(x = "Block Index", y = "Inefficiency", lty = "Compress Level", color = "Shuffle Selection")

# compare prediction to actual
MAX_COMPRESS <- 16 # don't plot blocks with >16x compression, noisy
test_data2 <- test_data %>%
filter(algo == "qs2") %>%
filter(32768*4/(h2 + h4 + h6 + h8) < MAX_COMPRESS)
pal <- palette.colors(palette = "Okabe-Ito")
ggplot(test_data2, aes(x = cpp_prediction, y = improvement, color = dataset)) +
geom_abline(aes(slope=1, intercept = 0), lty = 2) +
geom_vline(aes(xintercept=0), lty=2, color = "orange") +
geom_point(shape=21, alpha=0.75) +
facet_wrap(~compress_level, ncol=4) +
scale_color_manual(values = pal) +
theme_bw(base_size = 12)

# plot cumulative savings
test_data3 <- test_data %>%
filter(algo == "qs2") %>%
mutate(predicted_compression = 32768*4/(h2 + h4 + h6 + h8)) %>%
mutate(do_blockshuffle = predicted_compression < MAX_COMPRESS & cpp_prediction > 0) %>%
group_by(dataset, compress_level) %>%
mutate(optimal = cumsum(pmin(no_shuffle_zblocksize, shuffle_zblocksize))) %>%
mutate(shuffle_heuristic = cumsum(ifelse(do_blockshuffle, shuffle_zblocksize, no_shuffle_zblocksize))) %>%
mutate(no_shuffle = cumsum(no_shuffle_zblocksize)) %>%
mutate(heuristic_was_optimal = do_blockshuffle == (shuffle_zblocksize < no_shuffle_zblocksize) ) %>%
ungroup
test_data3 <- test_data3 %>%
filter(compress_level %in% c(3,9)) %>%
dplyr::select(dataset, compress_level, index, optimal, shuffle_heuristic, no_shuffle) %>%
pivot_longer(c(-index, -dataset, -optimal, -compress_level), names_to = "shuffle_selection", values_to = "cumulative_bytes") %>%
mutate(inefficiency = (cumulative_bytes - optimal)/max(optimal) )
ggplot(test_data3, aes(x = index, y = inefficiency, color = shuffle_selection, lty = factor(compress_level))) +
geom_line() +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~dataset, scales = "free") +
theme_bw(base_size=12) +
labs(x = "Block Index", y = "Inefficiency", lty = "Compress Level", color = "Shuffle Selection")
