################### ## PREPROCESSING ## ################### ## read in raw information rk <- readLines("rk.raw") n <- length(rk)/2 ## process information on model dimension mdim <- strsplit(rk[(1:n) * 2], "\t") neq <- sapply(mdim, length) mdim <- sapply(strsplit(unlist(mdim), ","), rbind) ## process information on publication info <- rep(rk[(1:n) * 2 - 1], neq) info <- sapply(strsplit(info, "\t"), rbind) ## combine and turn to data.frame rk <- data.frame(id = rep(1:n, neq), neq = rep(neq, neq), nreg = as.numeric(mdim[1,]), nobs = as.numeric(mdim[2,]), author = info[1,], journal = info[2,], year = as.numeric(info[3,]), page = as.numeric(gsub("s", "", info[4,])), subject = factor(info[5,], levels = c("d", "h", "g", "u", "w", "b", "m", "c"))) levels(rk$subject) <- c("discrimination", "human capital", "general", "unionism", "women")[c(1:5, 1, 3, 3)] rk$collection <- factor(rk$journal %in% c("ir", "jpe", "jhr", "ilrr", "aer", "ej", "restat"), levels = c(TRUE, FALSE), labels = c("no", "yes")) ## write as CSV write.table(rk, file = "rk.csv", sep = ",", quote = FALSE) ################### ## DATA ANALYSIS ## ################### ## start from pre-processed CSV file rk <- read.csv("rk.csv") ## Figure 1 hist(log10(rk$nobs), xlab = "log (base 10) sample size", main = "") ## dispersion estimation: ## - in R based on Poisson Chi-squared (by default) ## - RK claims to use this (Remark b, page 142), but in fact ## used deviance-based estimate in GLIM dispersion <- function(object, type = "deviance") sum(residuals(object, type = type)^2)/df.residual(object) ## Equation 1 rk1 <- glm(nreg ~ log(nobs), weights = 1/neq, family = quasipoisson, data = rk) summary(rk1, dispersion = dispersion(rk1)) ## Equation 2 rk2 <- glm(nreg ~ log(nobs) + I(log(nobs)^2), weights = 1/neq, family = quasipoisson, data = rk) summary(rk2, dispersion = dispersion(rk2)) ## typo in log(nobs) coef (needs to be a typo due to parsity estimate!) ## parsity at nobs = 1000 coef(rk2)[2] + 2 * coef(rk2)[3] * log(1000) ## Equation 3 rk3 <- glm(nreg ~ log(nobs) + I(log(nobs)^2), weights = 1/neq, family = quasipoisson, data = rk, subset = nobs < 250000) summary(rk3, dispersion = dispersion(rk3)) ## difference in observations claimed: sum(rk$nobs >= 250000 & rk$nobs < 500000) ## parsity at nobs = 1000 coef(rk3)[2] + 2 * coef(rk3)[3] * log(1000) ## Equation 4 rk4 <- glm(nreg ~ log(log(nobs)), weights = 1/neq, family = quasipoisson, data = rk) summary(rk4, dispersion = dispersion(rk4)) ## parsity at nobs = 1000 coef(rk4)[2]/log(1000) ## Figure 2 plot(0, 0, xlim = c(1, 6) * log(10), ylim = c(-0.2, 1), type = "l", axes = FALSE, xlab = "sample size", ylab = "parsity") axis(1, at = (1:6) * log(10), labels = c("10", "100", "1000", "10000", "1000000", "1e6")) axis(2) box() abline(h = coef(rk1)[2]) abline(coef(rk2)[-1] * c(1, 2), lty = 3) abline(coef(rk3)[-1] * c(1, 2), lty = 2) ln <- seq(from = 1.5, to = 15, length = 50) p4 <- coef(rk4)[2]/ln lines(p4 ~ ln, lty = 5) legend("topright", paste("Model 2.", 4:1, sep = ""), lty = c(5, 2, 3, 1), bty = "n") ## Negative binomial model (p. 144) library("MASS") library("lmtest") library("sandwich") rk_nb <- glm.nb(nreg ~ log(log(nobs)), data = rk, weights = 1/neq) coeftest(rk_nb, vcov = sandwich) ## theta rk_nb$theta ## standard error rk_nb$SE.theta