# Data processing for arch.dta, the dataset used in Meghir and Pistaferri (2004, Econometrica).
# The sample is further restricted to individuals whose PSID records start at age 25 (implemented below as: earliest recorded age below 26).
# Common practice in the literature: a pre-stage regression (allowing time-varying mean effects of observables) is run to obtain log earnings residuals.
# The recorded age variable contains some errors (repeated values); since ages are supposed to be consecutive, this is easily fixed.



require(foreign)

# Rebuild one individual's age series as a run of consecutive values.
#
# x: vector of recorded ages (at least one element; the first one is trusted).
# Returns x[1], x[1] + 1, ..., with the same length as x, so repeated entries
# in the recorded ages are replaced by a strictly consecutive sequence.
fixage <- function(x) {
  first_age <- x[1]
  last_age <- first_age + length(x) - 1
  first_age:last_age
}


# Quasi-difference of a series: element t is x[t] - rho * x[t - 1].
#
# x:   numeric vector, in time order.
# rho: scalar weight applied to the lagged value.
# Returns a vector of the same length as x whose first element is NA (no lag
# is available for it). An empty x is returned unchanged.
pardiff <- function(x, rho) {
  r <- length(x)
  if (r == 0L) {
    return(x)
  }
  # Negative indexing instead of x[2:r] / x[1:(r - 1)]: for r == 1 the ranges
  # 2:1 and 1:0 count downwards and would produce a result of length 3.
  c(NA, x[-1] - rho * x[-r])
}
# Load the Stata file into a data frame.
psid <- read.dta("arch.dta")

# Person index: individuals are numbered 1, 2, ... in the order in which their
# newid first appears in the data. A single match() against the unique values
# gives the same numbering as looping over them, without rescanning the whole
# column once per individual. incomparables = NA leaves rows with a missing
# newid unindexed (NA), as the element-wise == comparison did.
uni <- unique(psid$newid)
psid$id <- match(psid$newid, uni, incomparables = NA)

# 0/1 indicator columns built from the categorical variable studio:
# college (studio == 3), grad (studio == 2), dropout (studio == 1).
# %in% never yields NA, so a missing studio gives 0 in all three columns.
psid$college <- as.numeric(psid$studio %in% 3)
psid$grad <- as.numeric(psid$studio %in% 2)
psid$dropout <- as.numeric(psid$studio %in% 1)

# Missing region is assigned code 3 before recoding.
psid$region[is.na(psid$region)] <- 3

# reg collapses region into four groups: codes 1-4 are kept, codes 5 and
# above are folded into 4, anything else stays NA.
# 1 NE, 2 NC, 3 South, 4 West
psid$reg <- NA
psid$reg[psid$region >= 5] <- 4
psid$reg[psid$region >= 1 & psid$region <= 4] <-
  psid$region[psid$region >= 1 & psid$region <= 4]

# One 0/1 indicator per group; rows with reg == NA get 0 everywhere.
psid$neast <- as.numeric(psid$reg %in% 1)
psid$ncentr <- as.numeric(psid$reg %in% 2)
psid$south <- as.numeric(psid$reg %in% 3)
psid$west <- as.numeric(psid$reg %in% 4)

# mst: recoded marital status.
#   marit codes 1-5 map to themselves,
#   marit code 8 counts as 1 only when year == 67 (year is still the raw
#   two-digit code at this point in the script),
#   everything else, including missing marit, becomes 6.
psid$mst <- as.numeric(match(psid$marit, 1:5))
psid$mst[psid$marit == 8 & psid$year == 67] <- 1
psid$mst[is.na(psid$mst)] <- 6

# married: 0/1 indicator for mst == 1 (mst has no NA after the step above).
psid$married <- as.numeric(psid$mst == 1)

# Turn smsa into a 0/1 indicator: 1 when the original code lies in [1, 3],
# 0 otherwise (including missing). The helper column smsa1 is kept in the
# data frame and smsa itself is overwritten with the indicator.
psid$smsa1 <- as.numeric(!is.na(psid$smsa) & psid$smsa >= 1 & psid$smsa <= 3)
psid$smsa <- psid$smsa1

# Fixing age: within each individual, replace the recorded ages by a
# consecutive run starting at the first recorded age (see fixage).
# ave() writes each individual's corrected ages back onto that individual's
# own rows, so the result stays aligned with the data frame even if the rows
# of one person are not stored contiguously or in id order; pasting together
# the per-person results of tapply() would silently misalign in that case.
# NOTE(review): rows of each individual are assumed to be in time order.
psid$agem <- ave(psid$age, psid$id, FUN = fixage)

# Shift the raw year code by 1901 (e.g. code 67 becomes 1968).
psid$year <- psid$year + 1900 + 1

# exp: corrected age minus max(educ, 12) minus 6.
psid$exp <- psid$agem - pmax(psid$educ, 12) - 6

# From here on, age is the corrected age.
psid$age <- psid$agem

# cohort: year minus corrected age.
psid$cohort <- psid$year - psid$age

# We further cut the sample by keeping only individuals whose earliest
# recorded age in the PSID observations is below 26.
# minage has one entry per id; because ids run 1, 2, ... the positions
# returned by which() coincide with the id values themselves.
minage <- tapply(psid$age, psid$id, min)
idset <- which(minage < 26)
sun <- match(psid$id, idset)
psid <- psid[which(!is.na(sun)), , drop = FALSE]

# Another pass over id so that the remaining individuals are numbered
# 1, 2, ... without gaps, in order of first appearance.
# The previous id is kept in newid (overwriting the original newid).
# match() against the unique values yields the same numbering as looping
# over them, without rescanning the whole column once per individual.

psid$newid <- psid$id
uni <- unique(psid$id)
psid$id <- match(psid$newid, uni)

# Pre-stage regression to purge mean effects of covariates
# Regression specification follows Meghir and Pistaferri (2004).
# Same specification was used in Hospido (2012)
#
# A separate OLS regression of ly on the covariates is run for every year in
# 1968-1993 and its residuals are stored in res.
#  - Years in that range with no rows in the data are skipped instead of
#    calling lm() on an empty data frame.
#  - na.action = na.exclude together with residuals() pads the residual vector
#    with NA for rows that lm() drops because of missing values, so the
#    residuals always line up with the rows of that year. Taking $residuals
#    directly returns a shorter vector in that case and the assignment would
#    recycle it onto the wrong rows.
#  - which() keeps rows with a missing year out of the subscript.

psid$res <- rep(NA, nrow(psid))
for (k in intersect(1968:1993, psid$year)) {
  psid$res[which(psid$year == k)] <- residuals(
    lm(
      ly ~ age + I(age^2) + white + college + grad + dropout + married +
        neast + ncentr + south + smsa,
      data = psid[which(psid$year == k), , drop = FALSE],
      na.action = na.exclude
    )
  )
}

psid$ystand <- psid$res  # These are the log earnings residuals that will be analyzed
