######################## # Lecture 8 # Measurement Error and Endogeneity # POLS 509: The Linear Model # Dr. Justin Esarey ######################## # Measurement Error ###################### rm(list=ls()) set.seed(123456) X<-runif(100, min=-10, max=10) y<-2 + 3*X + rnorm(100, mean=0, sd=2) summary(lm(y~X)) # with error on y summary(lm((y+rnorm(100, mean=0, sd=3))~X)) # with error on X XX<-X+rnorm(100, mean=0, sd=3) summary(lm(y~XX)) # do it in a formal simulation rm(list=ls()) set.seed(123456) bias<-c() ci<-c() ciwid<-c() bias.xerr<-c() ci.xerr<-c() ciwid.xerr<-c() bias.yerr<-c() ci.yerr<-c() ciwid.yerr<-c() pb<-txtProgressBar(min=0, max=1000, style=3) for(i in 1:1000){ setTxtProgressBar(pb, i) X<-runif(100, min=-10, max=10) b<-runif(2, min=1, max=3) y<-b[1]+ b[2]*X + rnorm(100, mean=0, sd=2) model<-lm(y~X) bias[i]<-b[2]-summary(model)$coefficients[2,1] ci[i]<-ifelse(confint(model)[2,1]b[2],1,0) ciwid[i]<-abs(confint(model)[2,1] -confint(model)[2,2]) # with error on y model<-lm((y+rnorm(100, mean=0, sd=3))~X) bias.yerr[i]<-b[2]-summary(model)$coefficients[2,1] ci.yerr[i]<-ifelse(confint(model)[2,1]b[2],1,0) ciwid.yerr[i]<-abs(confint(model)[2,1] -confint(model)[2,2]) # with error on X XX<-X+rnorm(100, mean=0, sd=3) model<-lm(y~XX) bias.xerr[i]<-b[2]-summary(model)$coefficients[2,1] ci.xerr[i]<-ifelse(confint(model)[2,1]b[2],1,0) ciwid.xerr[i]<-abs(confint(model)[2,1] -confint(model)[2,2]) } close(pb) summary(bias) summary(ci) summary(ciwid) summary(bias.xerr) summary(ci.xerr) summary(ciwid.xerr) summary(bias.yerr) summary(ci.yerr) summary(ciwid.yerr) # bigger N does not solve the problem rm(list=ls()) set.seed(123456) bias<-c() ci<-c() ciwid<-c() bias.xerr<-c() ci.xerr<-c() ciwid.xerr<-c() bias.yerr<-c() ci.yerr<-c() ciwid.yerr<-c() pb<-txtProgressBar(min=0, max=1000, style=3) for(i in 1:1000){ setTxtProgressBar(pb, i) X<-runif(1000, min=-10, max=10) b<-runif(2, min=1, max=3) y<-b[1]+ b[2]*X + rnorm(1000, mean=0, sd=2) model<-lm(y~X) bias[i]<-b[2]-summary(model)$coefficients[2,1] ci[i]<-ifelse(confint(model)[2,1]b[2],1,0) ciwid[i]<-abs(confint(model)[2,1] -confint(model)[2,2]) # with error on y model<-lm((y+rnorm(1000, mean=0, sd=3))~X) bias.yerr[i]<-b[2]-summary(model)$coefficients[2,1] ci.yerr[i]<-ifelse(confint(model)[2,1]b[2],1,0) ciwid.yerr[i]<-abs(confint(model)[2,1] -confint(model)[2,2]) # with error on X XX<-X+rnorm(1000, mean=0, sd=3) model<-lm(y~XX) bias.xerr[i]<-b[2]-summary(model)$coefficients[2,1] ci.xerr[i]<-ifelse(confint(model)[2,1]b[2],1,0) ciwid.xerr[i]<-abs(confint(model)[2,1] -confint(model)[2,2]) } close(pb) summary(bias) summary(ci) summary(ciwid) summary(bias.xerr) summary(ci.xerr) summary(ciwid.xerr) summary(bias.yerr) summary(ci.yerr) summary(ciwid.yerr) # possible correction: multiple measurements of X rm(list=ls()) set.seed(123456) X<-runif(100, min=-10, max=10) X1<-X+rnorm(100, mean=0, sd=3) X2<-X+rnorm(100, mean=0, sd=3) X3<-X+rnorm(100, mean=0, sd=3) X4<-X+rnorm(100, mean=0, sd=3) X5<-X+rnorm(100, mean=0, sd=3) y<-2 + 3*X + rnorm(100, mean=0, sd=2) # use one measurement only summary(lm(y~X1)) summary(lm(y~X2)) summary(lm(y~X3)) summary(lm(y~X4)) summary(lm(y~X5)) # average X values before run Xmat<-cbind(X1, X2, X3, X4, X5) XX<-apply(Xmat, MARGIN=1, FUN=mean) summary(lm(y~XX)) # Endogeneity ###################### rm(list=ls()) set.seed(123456) require(sem) # Create an endogenous fake data set Z<-runif(500, min=-5, max=5) # this is the instrumental variable y<-c() X<-c() # beta_z=1 # beta_X=1 # beta_Y=2 # Y_intercept=3 # X_intercept=4 # y = 3 + X # X = 4+ Z + 2Y for(i in 1:500){ mat<-matrix(data=c(1, -1, -2, 1), nrow=2, ncol=2, byrow=T) YY<-qr.solve(mat, b=c(3+rnorm(1, mean=0, sd=2),4+Z[i]+rnorm(1, mean=0, sd=2))) y[i]<-YY[1] X[i]<-YY[2] } # Try a standard OLS model model<-lm(y~X) summary(model) # Try a 2SLS model summary(tsls(y~X, ~Z)) # more variables ################## rm(list=ls()) set.seed(123456) require(sem) # Create an endogenous fake data set K<-runif(500, min=-5, max=5) # this is one instrumental variable for X L<-runif(500, min=-5, max=5) # this is one instrumental variable for X M<-runif(500, min=-5, max=5) # this is one instrumental variable for y N<-runif(500, min=-5, max=5) # this is one instrumental variable for y Z<-runif(500, min=-5, max=5) # a common exogenous variable y<-c() X<-c() # y = f(X,M,N,Z) # X = f(y,K,L,Z) # beta_z=1 # beta_X=1 # beta_Y=2 # Y_intercept=3 # X_intercept=4 for(i in 1:500){ mat<-matrix(data=c(1, -1, -2, 1), nrow=2, ncol=2, byrow=T) YY<-qr.solve(mat, b=c(3+2.5*M[i]+1.25*N[i]+5*Z[i]+rnorm(1, mean=0, sd=2),4+K[i]+2*L[i]+4*Z[i]+rnorm(1, mean=0, sd=2))) y[i]<-YY[1] X[i]<-YY[2] } # Try a standard OLS model model<-lm(y~X+M+N+Z) summary(model) # Try a 2SLS model summary(tsls(y~X+M+N+Z, ~K)) summary(tsls(y~X+M+N+Z, ~K+L)) summary(tsls(y~X+M+N+Z, ~K+L+Z)) summary(tsls(y~X+M+N+Z, ~K+L+M+N+Z)) # Try a 2SLS model for X summary(tsls(~X, ~K+L)) # weak instrument ################## rm(list=ls()) set.seed(123456) require(sem) # Create an endogenous fake data set Z<-runif(500, min=-5, max=5) # this is the instrumental variable y<-c() X<-c() # beta_z=.06 # beta_X=1 # beta_Y=2 # Y_intercept=3 # X_intercept=4 for(i in 1:500){ mat<-matrix(data=c(1, -1, -2, 1), nrow=2, ncol=2, byrow=T) YY<-qr.solve(mat, b=c(3+rnorm(1, mean=0, sd=2),4+.06*Z[i]+rnorm(1, mean=0, sd=2))) y[i]<-YY[1] X[i]<-YY[2] } # Try a standard OLS model model<-lm(y~X) summary(model) # Try a 2SLS model summary(tsls(y~X, ~Z)) # insubstantial error in Y ################################## rm(list=ls()) set.seed(123456) require(sem) # Create an endogenous fake data set Z<-runif(500, min=-5, max=5) # this is the instrumental variable y<-c() X<-c() # beta_z=1 # beta_X=1 # beta_Y=2 # Y_intercept=3 # X_intercept=4 for(i in 1:500){ mat<-matrix(data=c(1, -1, -2, 1), nrow=2, ncol=2, byrow=T) YY<-qr.solve(mat, b=c(3+rnorm(1, mean=0, sd=.2),4+Z[i]+rnorm(1, mean=0, sd=2))) y[i]<-YY[1] X[i]<-YY[2] } # Try a standard OLS model model<-lm(y~X) summary(model) # Try a 2SLS model summary(tsls(y~X, ~Z)) # insubstantial error in X ################################## rm(list=ls()) set.seed(123456) require(sem) # Create an endogenous fake data set Z<-runif(500, min=-5, max=5) # this is the instrumental variable y<-c() X<-c() # beta_z=1 # beta_X=1 # beta_Y=2 # Y_intercept=3 # X_intercept=4 for(i in 1:500){ mat<-matrix(data=c(1, -1, -2, 1), nrow=2, ncol=2, byrow=T) YY<-qr.solve(mat, b=c(3+rnorm(1, mean=0, sd=2),4+Z[i]+rnorm(1, mean=0, sd=.02))) y[i]<-YY[1] X[i]<-YY[2] } # Try a standard OLS model model<-lm(y~X) summary(model) # Try a 2SLS model summary(tsls(y~X, ~Z))