###################
## PREPROCESSING ##
###################

## read in raw information
rk <- readLines("rk.raw")
n <- length(rk)/2

## process information on model dimension
mdim <- strsplit(rk[(1:n) * 2], "\t")
neq <- sapply(mdim, length)
mdim <- sapply(strsplit(unlist(mdim), ","), rbind)

## process information on publication
info <- rep(rk[(1:n) * 2 - 1], neq)
info <- sapply(strsplit(info, "\t"), rbind)

## combine and turn to data.frame
rk <- data.frame(id = rep(1:n, neq), neq = rep(neq, neq),
  nreg = as.numeric(mdim[1,]), nobs = as.numeric(mdim[2,]),
  author = info[1,], journal = info[2,],
  year = as.numeric(info[3,]), page = as.numeric(gsub("s", "", info[4,])),
  subject = factor(info[5,], levels = c("d", "h", "g", "u", "w", "b", "m", "c")))
levels(rk$subject) <- c("discrimination", "human capital", "general", "unionism", "women")[c(1:5, 1, 3, 3)]
rk$collection <- factor(rk$journal %in% c("ir", "jpe", "jhr", "ilrr", "aer", "ej", "restat"),
  levels = c(TRUE, FALSE), labels = c("no", "yes"))

## write as CSV
write.table(rk, file = "rk.csv", sep = ",", quote = FALSE)

###################
## DATA ANALYSIS ##
###################

## start from pre-processed CSV file
rk <- read.csv("rk.csv")

## Figure 1
hist(log10(rk$nobs), xlab = "log (base 10) sample size", main = "")

## dispersion estimation:
##   - in R based on Poisson Chi-squared (by default)
##   - RK claims to use this (Remark b, page 142), but in fact
##     used deviance-based estimate in GLIM
dispersion <- function(object, type = "deviance")
  sum(residuals(object, type = type)^2)/df.residual(object)

## Equation 1
rk1 <- glm(nreg ~ log(nobs), weights = 1/neq, family = quasipoisson, data = rk)
summary(rk1, dispersion = dispersion(rk1))

## Equation 2
rk2 <- glm(nreg ~ log(nobs) + I(log(nobs)^2), weights = 1/neq, family = quasipoisson, data = rk)
summary(rk2, dispersion = dispersion(rk2))
## typo in log(nobs) coef (needs to be a typo due to parsity estimate!)
## parsity at nobs = 1000
coef(rk2)[2] + 2 * coef(rk2)[3] * log(1000)

## Equation 3
rk3 <- glm(nreg ~ log(nobs) + I(log(nobs)^2), weights = 1/neq, family = quasipoisson,
  data = rk, subset = nobs < 250000)
summary(rk3, dispersion = dispersion(rk3))
## difference in observations claimed:
sum(rk$nobs >= 250000 & rk$nobs < 500000)
## parsity at nobs = 1000
coef(rk3)[2] + 2 * coef(rk3)[3] * log(1000)

## Equation 4
rk4 <- glm(nreg ~ log(log(nobs)), weights = 1/neq, family = quasipoisson, data = rk)
summary(rk4, dispersion = dispersion(rk4))
## parsity at nobs = 1000
coef(rk4)[2]/log(1000)

## Figure 2
plot(0, 0, xlim = c(1, 6) * log(10), ylim = c(-0.2, 1), type = "l", axes = FALSE,
  xlab = "sample size", ylab = "parsity")
axis(1, at = (1:6) * log(10), labels = c("10", "100", "1000", "10000", "1000000", "1e6"))
axis(2)
box()
abline(h = coef(rk1)[2])
abline(coef(rk2)[-1] * c(1, 2), lty = 3)
abline(coef(rk3)[-1] * c(1, 2), lty = 2)
ln <- seq(from = 1.5, to = 15, length = 50)
p4 <- coef(rk4)[2]/ln
lines(p4 ~ ln, lty = 5)
legend("topright", paste("Model 2.", 4:1, sep = ""), lty = c(5, 2, 3, 1), bty = "n")

## Negative binomial model (p. 144)
library("MASS")
library("lmtest")
library("sandwich")
rk_nb <- glm.nb(nreg ~ log(log(nobs)), data = rk, weights = 1/neq)
coeftest(rk_nb, vcov = sandwich)
## theta
rk_nb$theta
## standard error
rk_nb$SE.theta