# Consider an infinite population where the association between two
# variables x and y is described by:
#   y = x^2 + e,  x ~ U(0, 15),  e ~ N(0, 10)
# We seek a linear approximation of the relationship between x and y,
#   hat(y) = a + b*x,
# that minimizes the average squared deviation (i.e., the 'best'
# linear approximation).
set.seed(42)

# Simulate "n" observations from the population described above.
simulate <- function(n = 20) {
  x <- runif(n, 0, 15)
  y <- rnorm(n, x^2, 10)
  data.frame(y = y, x = x)
}

# Fit the 'sample' best linear approximation to the relationship
# between x and y.
sam_fit <- function(dat) lm(y ~ x, data = dat)

# Approximate the 'population' best linear approximation by taking a
# very large sample. Note that this only works well for statistics
# that converge to a population quantity.
pop_fit <- sam_fit(simulate(1e6))
## > pop_fit
##
## Call:
## lm(formula = y ~ x, data = simulate(1e+06))
##
## Coefficients:
## (Intercept)            x
##      -37.53        15.00

The figure below illustrates the true quadratic curve and the best linear approximation (`pop_fit`), overlaid against 10000 samples.

# Construct a `level` joint confidence region (a Wald ellipse) for the
# intercept and slope of the best linear approximation, and test
# whether the region includes the corresponding population values.
#
# dat   - data.frame with columns y and x (as produced by simulate())
# val   - named vector: population intercept (a) and slope (b)
# level - nominal confidence level of the region
# adj   - adjustment to the critical value's probability, making the
#         confidence region larger (adj > 0) or smaller (adj < 0)
#
# Returns TRUE when `val` falls inside the region.
sam_int <- function(dat, val = c(a = 0, b = 0), level = 0.95, adj = 0.00) {
  fit   <- sam_fit(dat)
  delta <- coef(fit) - val
  vc    <- vcov(fit)
  # chi-squared critical value on 2 df; renamed from `c`, which
  # shadowed the base function c()
  crit  <- qchisq(level + adj, 2)
  as.numeric(delta %*% solve(vc) %*% delta) < crit
}

# By specifying that the region has 95% confidence, we intend that the
# region includes the population quantity in 95% of samples. We can
# assess the coverage of the above 95% confidence region by drawing
# repeated samples from the population and checking whether the
# associated confidence regions include the population values:
coverage20 <- mean(replicate(1e4, sam_int(simulate(20), val = coef(pop_fit))))
## > coverage20
## [1] 0.855

# Because the true coverage is less than the nominal coverage, the
# confidence region is anti-conservative. However, suppose that we can
# adjust the critical value of the region so that the true coverage is
# equal to the nominal value:
coverage20a <- mean(replicate(1e4, sam_int(simulate(20), val = coef(pop_fit),
                                           adj = 0.045)))
## > coverage20a
## [1] 0.9476

The code above illustrates that if we had access to a population, we can adjust the coverage of a confidence region to be correct. The animation (created using Yihui's animation package) below illustrates the original and corrected confidence regions for ten different samples of size 20, overlaid against 10000 sample estimates of `a` and `b`. Unfortunately, we don't generally have access to the population. Hence, the adjustment must be made by an alternative, empirical, mechanism. In the next post, I will show how to use iterated Monte Carlo (specifically the double bootstrap) to make such an adjustment.

In the prediction framework, we use model diagnostics to verify that the model fits well, which has a direct bearing on the quality of predictions. For example, a line generally does not approximate a quadratic curve. However, it is possible to make accurate inferences about a linear approximation to a quadratic curve. Hence, model fit is not required to make quality inferences. Rather, the requirement is that the associated probability statements are correct.

Assessing model diagnostics is an indirect mechanism to comfort ourselves about the quality of inferences. As an alternative, we might attempt a more direct check, for example, by constructing an empirical estimate of coverage. We may then go further and adjust, or calibrate, the confidence interval to have the correct empirical coverage. These ideas are fundamental parts of the 'double bootstrap' and 'iterated Monte Carlo' methods. For the sake of argument, I will state that this type of empirical check and calibration is sufficient to fully replace model diagnostics for statistical inference. It is also my hypothesis that model diagnostics have been historically favored over iterated Monte Carlo methods (the double bootstrap appeared in the late 1980's) because the latter are more computationally intensive. Current computational tools mitigate, but do not eliminate, this concern.

I will present examples with R code in a later post.

#!/bin/sh
# CGI script: render a 3D sinc surface with R and stream the result to
# stdout as a PNG image (with HTTP headers).
REXEC="/usr/local/bin/R --vanilla --slave"

$REXEC <<EOF
# create a random file name
pngfile <- paste0(format(Sys.time(), "%Y%m%d%H%M%S"),
                  paste(sample(letters, 10), collapse = ""), ".png")

# create temporary graphic file
png(pngfile, type = "cairo")
x <- seq(-10, 10, length = 30)
y <- x
f <- function(x, y) { r <- sqrt(x^2 + y^2); 10 * sin(r)/r }
z <- outer(x, y, f)
z[is.na(z)] <- 1
op <- par(bg = rgb(1, 1, 1, 0))
persp(x, y, z, theta = 30, phi = 30, expand = 0.5, col = "lightblue")
persp(x, y, z, theta = 30, phi = 30, expand = 0.5, col = "lightblue",
      ltheta = 120, shade = 0.75, ticktype = "detailed",
      xlab = "X", ylab = "Y", zlab = "Sinc( r )")
invisible(dev.off())

# write headers
pngsize <- file.info(pngfile)[["size"]]
cat("Content-type: image/png\n")
cat(paste("Content-length: ", pngsize, "\n\n", sep=""))

# open pipe to stdout and pass image data
con <- pipe("cat", "wb")
writeBin(readBin(pngfile, 'raw', n=pngsize), con)
flush(con)
close(con)

# remove intermediate graphic
invisible(file.remove(pngfile))
EOF

We have a loan with Volkswagen Credit (VC), and today received an offer to postpone our December payment of $447.50, for a small fee of $25. VC does a good job of making the offer read like a friendly holiday gesture:

'Tis the season to spread holiday cheer and join in the spirit of giving. That's why Volkswagen Credit wants to thank loyal customers like you by offering the opportunity to skip your December 2013 payment...

This is a terrible opportunity! It's essentially a new, one month loan. The annualized simple interest rate paid to VC on this loan would have been 25 / 447.50 * 100 * 12 = 67%! That would be considered usurious in Canada.

]]>Below is the type of graphic that I had initially created. It's a binary calibration plot; it plots model predictions (probabilities) against empirical estimates of the outcome probability using a kernel smoother. The kernel smoother isn't optimal because it's prone to bias, especially near the extreme predictions. This is clear in the figure below, near the origin. Although the simulated data are perfectly calibrated, the figure might lead us to conclude otherwise for predictions near zero. A 95% pointwise confidence band and an (inverted) histogram of model predictions are also displayed.

> set.seed(42) > x <- rbeta(5000, 1, 5) > y <- rbinom(5000, 1, x) > calib(y, x, sx = seq(min(x), max(x), + length.out=100))

For completeness, listed below are the two functions that implement the kernel smoothing and calibration plot, respectively. However, the recipe for faceting with base R, outlined below, is agnostic to the type of plot used.

# Binary kernel smoothing, with pointwise confidence band.
# y    - vector of binary outcomes
# x    - vector of probabilities
# bw   - bandwidth of the normal kernel
# sx   - values of x where p(y|x) is estimated
# conf - confidence level for the pointwise confidence band
bks <- function(y, x, bw = 0.0075, sx = x, conf = 0.95) {
  # weight of observation x[j] at evaluation point sx[i]
  wts <- outer(sx, x, function(s0, x0) exp(-(x0 - s0)^2 / bw))
  tot <- rowSums(wts)   # total kernel weight at each sx
  hit <- wts %*% y      # weighted count of successes at each sx
  est <- hit / tot
  # Clopper-Pearson style intervals based on the smoothed counts
  list(x    = sx,
       est  = est,
       clo  = qbeta((1 - conf) / 2, hit, tot - hit + 1),
       chi  = qbeta(1 - (1 - conf) / 2, hit + 1, tot - hit),
       conf = conf)
}

# Calibration curve: model predictions versus the smoothed empirical
# outcome probability, with an inverted histogram of the predictions.
# y    - vector of binary outcomes
# x    - vector of probabilities (predictions)
# bw   - bandwidth
# sx   - values of x where p(y|x) is estimated
# conf - confidence level for the pointwise confidence band
# ...  - additional arguments passed to plot()
calib <- function(y, x, bw = 0.0075, sx = sort(x), conf = 0.95, ...) {
  ord <- order(x)
  x   <- x[ord]
  y   <- y[ord]
  sm  <- bks(y, x, bw, sx, conf)
  # inverted, normalized histogram of the model predictions
  h <- hist(x, breaks = 100, plot = FALSE)
  h$counts <- h$counts / sum(h$counts)
  plot(h$mids, h$counts, ylim = c(1, 0), xlim = c(0, 1), type = 'h',
       ann = FALSE, yaxt = 'n', xaxt = 'n', bty = 'n')
  par(new = TRUE)   # overlay the calibration curve on the same figure
  plot(sm$x, sm$est, type = "n", xlim = c(0, 1), ylim = c(0, 1),
       xlab = "Model", ylab = "Empirical", ...)
  lines(range(sm$x), range(sm$x), col = "darkgray")  # identity reference
  lines(sm$x, sm$est, lty = 1)   # smoothed estimate
  lines(sm$x, sm$clo, lty = 2)   # lower confidence bound
  lines(sm$x, sm$chi, lty = 2)   # upper confidence bound
}

The conventional approach to faceting using base R functionality is to arrange whole plots (i.e., including titles, axes, and labels) in a rectangular array within a single figure. This is accomplished, for example, by setting the `mfrow` or `mfcol` parameters using the `par` function, or by using the `layout` function. The result is something similar to the following:

> par(mfrow=c(2,2)) > replicate(4, { + x <- rbeta(5000, 1, 5) + y <- rbinom(5000, 1, x) + calib(y, x, sx = seq(min(x), max(x), + length.out=100)) + })

The `mfrow` solution is not optimal in this application, for several reasons that I won't mention here. This brings me to my solution:

- Set an outer margin to be used for labels and axes.
- Eliminate the inner margin.
- Use
`mfrow`/`mfcol`. - For each plot:
- Create a plot without titles, axes, or labels.
- Add titles, axes, and labels manually.

> CalibPlot <- expression({ + x <- rbeta(5000, 1, 5) + y <- rbinom(5000, 1, x) + calib(y, x, + sx = seq(min(x), max(x), + length.out=100), + xaxt="n", yaxt="n") + }) > > par(omi=rep(1.0, 4), mar=c(0,0,0,0), mfrow=c(2,2)) > > #1,1 > eval(CalibPlot) > mtexti("Column I", 3) > > #1,2 > eval(CalibPlot) > mtexti("Column II", 3) > mtexti("Row I", 4) > > #2,1 > eval(CalibPlot) > axis(1) > mtexti("Model Probability", 1, 0.75) > axis(2) > mtexti("Empirical Probability", 2, 0.75) > > #2,2 > eval(CalibPlot) > mtexti("Row II", 4)

The function `mtexti` (not defined here) behaves similarly to `mtext`. I recently wrote about `mtexti` (see post 2522). There are many ways in which to style and automate the faceted version. I've elected to put the axes and axis labels on the bottom-left figure only, which has some drawbacks. Hence, I haven't made much effort to encapsulate this recipe for faceting within a function, since I'm not quite sure how to best display the axes, etc.

In a comment below, Sebastian gives code for the lattice version, which is easier than I had thought. Here is the result:

]]>The `mtexti` function defined below takes arguments that are similar to `mtext`, with one major exception. Rather than specifying the margin line on which to render the text, the offset (in inches) from the edge of the plotting region is specified instead. Hence, the "`i`" in `mtexti` is intended to remind the user of this distinction.

# Render text outside the plotting region, offset in inches from the
# edge (rather than by margin line, as mtext() does).
# text - character, text to be plotted
# side - numeric, 1=bottom 2=left 3=top 4=right
# off  - numeric, offset in inches from the edge of the plotting region
# srt  - string rotation in degrees
# ...  - additional arguments passed to text()
mtexti <- function(text, side, off = 0.25,
                   srt = if(side == 2) 90 else if(side == 4) 270 else 0, ...) {
  usr <- par('usr')   # plotting region in user units
  pin <- par('pin')   # plotting region in inches
  # user units per inch, horizontally and vertically
  upi <- c(usr[2] - usr[1], usr[4] - usr[3]) / pin
  # start at the center of the plotting region ...
  xpos <- (usr[1] + usr[2]) / 2
  ypos <- (usr[3] + usr[4]) / 2
  # ... then push the relevant coordinate `off` inches past the edge
  if (side == 1) {
    ypos <- usr[3] - upi[2] * off
  } else if (side == 2) {
    xpos <- usr[1] - upi[1] * off
  } else if (side == 3) {
    ypos <- usr[4] + upi[2] * off
  } else if (side == 4) {
    xpos <- usr[2] + upi[1] * off
  }
  text(x = xpos, y = ypos, text, xpd = NA, srt = srt, ...)
}

Here is an example:

# Demonstrate mtexti() by labeling all four sides of an empty plot.
plot(1, yaxt='n', xaxt='n', xlab='', ylab='', type='n')
for (s in 1:4) mtexti("test", s)

- Program 1. Obese employees are given a monthly weight loss goal. If the goal is reached, the participant receives $100, otherwise the employer keeps the money. This is called the
*individual incentive*. - Program 2. Obese employees are organized into groups of five, and each participant is given a monthly weight loss goal. A sum of $500 is evenly split among those participants who achieve their monthly weight loss goal. In the event that no participant achieves their monthly goal, the employer keeps the incentive money. This is called the
*group incentive*.

The researchers found that the group incentive was associated with greater average weight loss than the individual incentive. This result is especially interesting from a psychological perspective, but I was most drawn to the issue of cost. I found it odd that the authors focused on the fact that "both designs used the same up-front allocation of resources". Presumably, this is to argue that the second program was more effective at no additional up-front cost. For example, the authors write: "Similar to that in the individual-incentive group, the up-front allocation of incentives for meeting weight-loss goals was $100 per participant per month (totaling $21 000)." But, the authors later write that, over a 24 week period: "Mean earnings were $514.70 (SD, $522.60) in the group-incentive group and $128.60 (SD, $165.50) in the individual-incentive group (mean between-group difference, $386.10 [CI, $201.00 to $571.30]; P < 0.001)." Hence, it's clear that the second program is more expensive, as one might expect. It's also a little odd that the study consisted mostly of women (89%). The allocation of race/ethnicity was also somewhat imbalanced.

I like that the authors used confidence intervals throughout to summarize the differences in average weight loss (and incentive earnings) between groups. They also used p-values, but I think this was unnecessary. The authors used multiple imputation for missing weights at 24 and 36 weeks. I've always had trouble accepting multiple imputation of outcomes, because the imputation depends so heavily on the method and model used for imputation. In the appendix, the authors write that weight was imputed "adjusting for incentive group, age, sex, race, education, household income, baseline weight, importance of controlling weight, and confidence in controlling weight". No additional details are given about the model, although the software used to implement the method is listed (SAS PROC MI and MIANALYZE). Finally, I felt this sentence was incomplete: "To maintain the type I error rate while testing the 3 hypotheses of primary interest, we used a Bonferroni correction to define an α of 0.0167 as our threshold for statistical significance." The authors neglected to mention that this approach attempts to control the *familywise* type I error rate. This is an important omission.

So that we know who to thank:

matt@deb6box$ svn log -r 62016 http://svn.r-project.org/R/trunk/src/include/R_ext/Connections.h ------------------------------------------------------------------------ r62016 | urbaneks | 2013-02-21 14:29:44 -0500 (Thu, 21 Feb 2013) | 1 line add API to create custom connections ------------------------------------------------------------------------

Here is the header file itself, with credits. It looks like all of the `Rconnection` struct is made available. But, notice the warning!

matt@deb6box$ svn blame -r 62016 http://svn.r-project.org/R/trunk/src/include/R_ext/Connections.h 11656 ripley /* 11656 ripley * R : A Computer Language for Statistical Data Analysis 62016 urbaneks * Copyright (C) 2000-2013 The R Core Team. 11656 ripley * 11656 ripley * This program is free software; you can redistribute it and/or modify 11656 ripley * it under the terms of the GNU General Public License as published by 11656 ripley * the Free Software Foundation; either version 2 of the License, or 11656 ripley * (at your option) any later version. 11656 ripley * 11656 ripley * This program is distributed in the hope that it will be useful, 11656 ripley * but WITHOUT ANY WARRANTY; without even the implied warranty of 11656 ripley * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11656 ripley * GNU General Public License for more details. 11656 ripley * 11656 ripley * You should have received a copy of the GNU General Public License 42308 ripley * along with this program; if not, a copy is available at 42308 ripley * http://www.r-project.org/Licenses/ 11656 ripley */ 11656 ripley 62016 urbaneks #ifndef R_EXT_CONNECTIONS_H_ 62016 urbaneks #define R_EXT_CONNECTIONS_H_ 62016 urbaneks 11668 ripley #include]]>11668 ripley 62016 urbaneks #ifndef NO_C_HEADERS 62016 urbaneks # include /* for size_t */ 62016 urbaneks # include /* for va_list */ 42677 urbaneks #endif 42677 urbaneks 62016 urbaneks /* IMPORTANT: we do not expect future connection APIs to be 62016 urbaneks backward-compatible so if you use this, you *must* check the version 62016 urbaneks and proceed only if it matches what you expect 62016 urbaneks 62016 urbaneks We explicitly reserve the right to change the connection 62016 urbaneks implementation without a compatibility layer. 
62016 urbaneks */ 62016 urbaneks #define R_CONNECTIONS_VERSION 1 62016 urbaneks 45984 ripley /* this allows the opaque pointer definition to be made available 44013 ripley in Rinternals.h */ 16472 luke #ifndef HAVE_RCONNECTION_TYPEDEF 16472 luke typedef struct Rconn *Rconnection; 16472 luke #endif 16472 luke struct Rconn { 11656 ripley char* class; 11656 ripley char* description; 44013 ripley int enc; /* the encoding of 'description' */ 11656 ripley char mode[5]; 23228 ripley Rboolean text, isopen, incomplete, canread, canwrite, canseek, blocking, 23228 ripley isGzcon; 18583 ripley Rboolean (*open)(struct Rconn *); 11656 ripley void (*close)(struct Rconn *); /* routine closing after auto open */ 11656 ripley void (*destroy)(struct Rconn *); /* when closing connection */ 11656 ripley int (*vfprintf)(struct Rconn *, const char *, va_list); 11656 ripley int (*fgetc)(struct Rconn *); 32497 ripley int (*fgetc_internal)(struct Rconn *); 31166 ripley double (*seek)(struct Rconn *, double, int, int); 13305 ripley void (*truncate)(struct Rconn *); 11656 ripley int (*fflush)(struct Rconn *); 11656 ripley size_t (*read)(void *, size_t, size_t, struct Rconn *); 11656 ripley size_t (*write)(const void *, size_t, size_t, struct Rconn *); 59167 ripley int nPushBack, posPushBack; /* number of lines, position on top line */ 11656 ripley char **PushBack; 12256 pd int save, save2; 32492 ripley char encname[101]; 32492 ripley /* will be iconv_t, which is a pointer. 
NULL if not in use */ 32492 ripley void *inconv, *outconv; 32497 ripley /* The idea here is that no MBCS char will ever not fit */ 32497 ripley char iconvbuff[25], oconvbuff[50], *next, init_out[25]; 32497 ripley short navail, inavail; 32497 ripley Rboolean EOF_signalled; 44101 ripley Rboolean UTF8out; 41765 ripley void *id; 41765 ripley void *ex_ptr; 11656 ripley void *private; 61527 ripley int status; /* for pipes etc */ 16472 luke }; 11656 ripley 62016 urbaneks #ifdef __cplusplus 62016 urbaneks extern "C" { 62016 urbaneks #endif 11656 ripley 62016 urbaneks SEXP R_new_custom_connection(const char *description, const char *mode, const char *class_name, Rconnection *ptr); 62016 urbaneks size_t R_ReadConnection(Rconnection con, void *buf, size_t n); 62016 urbaneks size_t R_WriteConnection(Rconnection con, void *buf, size_t n); 13366 ripley 62016 urbaneks #ifdef __cplusplus 62016 urbaneks } 25961 ripley #endif 25961 ripley 62016 urbaneks #endif