Chapter 9

Example-9-2-automatic.R
data(hprice1, package='wooldridge')

# original linear regression
orig <- lm(price ~ lotsize+sqrft+bdrms, data=hprice1)

# RESET test
library(lmtest)
resettest(orig)
Example-9-2-manual.R
data(hprice1, package='wooldridge')

# original linear regression
orig <- lm(price ~ lotsize+sqrft+bdrms, data=hprice1)

# regression for RESET test
RESETreg <- lm(price ~ lotsize+sqrft+bdrms+I(fitted(orig)^2)+ 
                                        I(fitted(orig)^3), data=hprice1)
RESETreg

# RESET test. H0: all coeffs including "fitted" are=0 
library(car)
linearHypothesis(RESETreg, matchCoefs(RESETreg,"fitted"))
LAD.R
data(rdchem, package='wooldridge')

# OLS Regression
ols <- lm(rdintens ~ I(sales/1000) +profmarg, data=rdchem)
# LAD Regression
library(quantreg)
lad <- rq(rdintens ~ I(sales/1000) +profmarg, data=rdchem)

# regression table
library(stargazer)
stargazer(ols,lad,  type = "text")
Missings-Analyses.R
data(lawsch85, package='wooldridge')

# Mean of a variable with missings:
mean(lawsch85$LSAT)
mean(lawsch85$LSAT,na.rm=TRUE)

# Regression with missings
summary(lm(log(salary)~LSAT+cost+age, data=lawsch85))
Missings.R
data(lawsch85, package='wooldridge')

# extract LSAT
lsat <- lawsch85$LSAT

# Create logical indicator for missings
missLSAT <- is.na(lawsch85$LSAT)

# LSAT and indicator for Schools No. 120-129:
rbind(lsat,missLSAT)[,120:129]

# Frequencies of indicator
table(missLSAT)

# Missings for all variables in data frame (counts)
colSums(is.na(lawsch85))

# Indicator for complete cases
compl <- complete.cases(lawsch85)
table(compl)
NA-NaN-Inf.R
x <- c(-1,0,1,NA,NaN,-Inf,Inf)
logx <- log(x)
invx <- 1/x
ncdf <- pnorm(x)
isna <- is.na(x)

data.frame(x,logx,invx,ncdf,isna)
Nonnested-Test.R
data(hprice1, package='wooldridge')

# two alternative models
model1 <- lm(price ~     lotsize  +     sqrft  + bdrms, data=hprice1)
model2 <- lm(price ~ log(lotsize) + log(sqrft) + bdrms, data=hprice1)

# Test against comprehensive model
library(lmtest)
encomptest(model1,model2, data=hprice1)
Outliers.R
data(rdchem, package='wooldridge')

# Regression
reg <- lm(rdintens~sales+profmarg, data=rdchem)

# Studentized residuals for all observations:
studres <- rstudent(reg)

# Display extreme values:
min(studres)
max(studres)

# Histogram (and overlayed density plot):
hist(studres, freq=FALSE)
lines(density(studres), lwd=2)
Example-9-2-automatic.py
import wooldridge as woo
import statsmodels.formula.api as smf
import statsmodels.stats.outliers_influence as smo

hprice1 = woo.dataWoo('hprice1')

# original linear regression:
reg = smf.ols(formula='price ~ lotsize + sqrft + bdrms', data=hprice1)
results = reg.fit()

# automated RESET test:
reset_output = smo.reset_ramsey(res=results, degree=3)
fstat_auto = reset_output.statistic[0][0]
fpval_auto = reset_output.pvalue

print(f'fstat_auto: {fstat_auto}\n')
print(f'fpval_auto: {fpval_auto}\n')
Example-9-2-manual.py
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

hprice1 = woo.dataWoo('hprice1')

# original OLS:
reg = smf.ols(formula='price ~ lotsize + sqrft + bdrms', data=hprice1)
results = reg.fit()

# regression for RESET test:
hprice1['fitted_sq'] = results.fittedvalues ** 2
hprice1['fitted_cub'] = results.fittedvalues ** 3
reg_reset = smf.ols(formula='price ~ lotsize + sqrft + bdrms +'
                            ' fitted_sq + fitted_cub', data=hprice1)
results_reset = reg_reset.fit()

# print regression table:
table = pd.DataFrame({'b': round(results_reset.params, 4),
                      'se': round(results_reset.bse, 4),
                      't': round(results_reset.tvalues, 4),
                      'pval': round(results_reset.pvalues, 4)})
print(f'table: \n{table}\n')

# RESET test (H0: all coeffs including "fitted" are=0):
hypotheses = ['fitted_sq = 0', 'fitted_cub = 0']
ftest_man = results_reset.f_test(hypotheses)
fstat_man = ftest_man.statistic[0][0]
fpval_man = ftest_man.pvalue

print(f'fstat_man: {fstat_man}\n')
print(f'fpval_man: {fpval_man}\n')
LAD.py
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

rdchem = woo.dataWoo('rdchem')

# OLS regression:
reg_ols = smf.ols(formula='rdintens ~ I(sales/1000) + profmarg', data=rdchem)
results_ols = reg_ols.fit()

table_ols = pd.DataFrame({'b': round(results_ols.params, 4),
                          'se': round(results_ols.bse, 4),
                          't': round(results_ols.tvalues, 4),
                          'pval': round(results_ols.pvalues, 4)})
print(f'table_ols: \n{table_ols}\n')

# LAD regression:
reg_lad = smf.quantreg(formula='rdintens ~ I(sales/1000) + profmarg', data=rdchem)
results_lad = reg_lad.fit(q=.5)

table_lad = pd.DataFrame({'b': round(results_lad.params, 4),
                          'se': round(results_lad.bse, 4),
                          't': round(results_lad.tvalues, 4),
                          'pval': round(results_lad.pvalues, 4)})
print(f'table_lad: \n{table_lad}\n')
Missings-Analyses.py
import wooldridge as woo
import numpy as np
import statsmodels.formula.api as smf

lawsch85 = woo.dataWoo('lawsch85')

# missings in numpy:
x_np = np.array(lawsch85['LSAT'])
x_np_bar1 = np.mean(x_np)
x_np_bar2 = np.nanmean(x_np)
print(f'x_np_bar1: {x_np_bar1}\n')
print(f'x_np_bar2: {x_np_bar2}\n')

# missings in pandas:
x_pd = lawsch85['LSAT']
x_pd_bar1 = np.mean(x_pd)
x_pd_bar2 = np.nanmean(x_pd)
print(f'x_pd_bar1: {x_pd_bar1}\n')
print(f'x_pd_bar2: {x_pd_bar2}\n')

# observations and variables:
print(f'lawsch85.shape: {lawsch85.shape}\n')

# regression (missings are taken care of by default):
reg = smf.ols(formula='np.log(salary) ~ LSAT + cost + age', data=lawsch85)
results = reg.fit()
print(f'results.nobs: {results.nobs}\n')
Missings.py
import wooldridge as woo
import pandas as pd

lawsch85 = woo.dataWoo('lawsch85')
lsat_pd = lawsch85['LSAT']

# create boolean indicator for missings:
missLSAT = lsat_pd.isna()

# LSAT and indicator for Schools No. 120-129:
preview = pd.DataFrame({'lsat_pd': lsat_pd[119:129],
                        'missLSAT': missLSAT[119:129]})
print(f'preview: \n{preview}\n')

# frequencies of indicator:
freq_missLSAT = pd.crosstab(missLSAT, columns='count')
print(f'freq_missLSAT: \n{freq_missLSAT}\n')

# missings for all variables in data frame (counts):
miss_all = lawsch85.isna()
colsums = miss_all.sum(axis=0)
print(f'colsums: \n{colsums}\n')

# computing amount of complete cases:
complete_cases = (miss_all.sum(axis=1) == 0)
freq_complete_cases = pd.crosstab(complete_cases, columns='count')
print(f'freq_complete_cases: \n{freq_complete_cases}\n')
NA-NaN-Inf.py
import numpy as np
import pandas as pd
import scipy.stats as stats

# nan and inf handling in numpy:
x = np.array([-1, 0, 1, np.nan, np.inf, -np.inf])
logx = np.log(x)
invx = np.array(1 / x)
ncdf = np.array(stats.norm.cdf(x))
isnanx = np.isnan(x)

results = pd.DataFrame({'x': x, 'logx': logx, 'invx': invx,
                        'logx': logx, 'ncdf': ncdf, 'isnanx': isnanx})
print(f'results: \n{results}\n')
Nonnested-Test.py
import wooldridge as woo
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

hprice1 = woo.dataWoo('hprice1')

# two alternative models:
reg1 = smf.ols(formula='price ~ lotsize + sqrft + bdrms', data=hprice1)
results1 = reg1.fit()

reg2 = smf.ols(formula='price ~ np.log(lotsize) +'
                       'np.log(sqrft) + bdrms', data=hprice1)
results2 = reg2.fit()

# encompassing test of Davidson & MacKinnon:
# comprehensive model:
reg3 = smf.ols(formula='price ~ lotsize + sqrft + bdrms + '
                       'np.log(lotsize) + np.log(sqrft)', data=hprice1)
results3 = reg3.fit()

# model 1 vs. comprehensive model:
anovaResults1 = sm.stats.anova_lm(results1, results3)
print(f'anovaResults1: \n{anovaResults1}\n')

# model 2 vs. comprehensive model:
anovaResults2 = sm.stats.anova_lm(results2, results3)
print(f'anovaResults2: \n{anovaResults2}\n')
Outliers.py
import wooldridge as woo
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

rdchem = woo.dataWoo('rdchem')

# OLS regression:
reg = smf.ols(formula='rdintens ~ sales + profmarg', data=rdchem)
results = reg.fit()

# studentized residuals for all observations:
studres = results.get_influence().resid_studentized_external

# display extreme values:
studres_max = np.max(studres)
studres_min = np.min(studres)
print(f'studres_max: {studres_max}\n')
print(f'studres_min: {studres_min}\n')

# histogram (and overlayed density plot):
kde = sm.nonparametric.KDEUnivariate(studres)
kde.fit()

plt.hist(studres, color='grey', density=True)
plt.plot(kde.support, kde.density, color='black', linewidth=2)
plt.ylabel('density')
plt.xlabel('studres')
plt.savefig('PyGraphs/Outliers.pdf')
Sim-ME-Dep.py
import numpy as np
import scipy.stats as stats
import pandas as pd
import statsmodels.formula.api as smf

# set the random seed:
np.random.seed(1234567)

# set sample size and number of simulations:
n = 1000
r = 10000

# set true parameters (betas):
beta0 = 1
beta1 = 0.5

# initialize arrays to store results later (b1 without ME, b1_me with ME):
b1 = np.empty(r)
b1_me = np.empty(r)

# draw a sample of x, fixed over replications:
x = stats.norm.rvs(4, 1, size=n)

# repeat r times:
for i in range(r):
    # draw a sample of u:
    u = stats.norm.rvs(0, 1, size=n)

    # draw a sample of ystar:
    ystar = beta0 + beta1 * x + u

    # measurement error and mismeasured y:
    e0 = stats.norm.rvs(0, 1, size=n)
    y = ystar + e0
    df = pd.DataFrame({'ystar': ystar, 'y': y, 'x': x})

    # regress ystar on x and store slope estimate at position i:
    reg_star = smf.ols(formula='ystar ~ x', data=df)
    results_star = reg_star.fit()
    b1[i] = results_star.params['x']

    # regress y on x and store slope estimate at position i:
    reg_me = smf.ols(formula='y ~ x', data=df)
    results_me = reg_me.fit()
    b1_me[i] = results_me.params['x']

# mean with and without ME:
b1_mean = np.mean(b1)
b1_me_mean = np.mean(b1_me)
print(f'b1_mean: {b1_mean}\n')
print(f'b1_me_mean: {b1_me_mean}\n')

# variance with and without ME:
b1_var = np.var(b1, ddof=1)
b1_me_var = np.var(b1_me, ddof=1)
print(f'b1_var: {b1_var}\n')
print(f'b1_me_var: {b1_me_var}\n')
Sim-ME-Explan.py
import numpy as np
import scipy.stats as stats
import pandas as pd
import statsmodels.formula.api as smf

# set the random seed:
np.random.seed(1234567)

# set sample size and number of simulations:
n = 1000
r = 10000

# set true parameters (betas):
beta0 = 1
beta1 = 0.5

# initialize b1 arrays to store results later:
b1 = np.empty(r)
b1_me = np.empty(r)

# draw a sample of x, fixed over replications:
xstar = stats.norm.rvs(4, 1, size=n)

# repeat r times:
for i in range(r):
    # draw a sample of u:
    u = stats.norm.rvs(0, 1, size=n)

    # draw a sample of y:
    y = beta0 + beta1 * xstar + u

    # measurement error and mismeasured x:
    e1 = stats.norm.rvs(0, 1, size=n)
    x = xstar + e1
    df = pd.DataFrame({'y': y, 'xstar': xstar, 'x': x})

    # regress y on xstar and store slope estimate at position i:
    reg_star = smf.ols(formula='y ~ xstar', data=df)
    results_star = reg_star.fit()
    b1[i] = results_star.params['xstar']

    # regress y on x and store slope estimate at position i:
    reg_me = smf.ols(formula='y ~ x', data=df)
    results_me = reg_me.fit()
    b1_me[i] = results_me.params['x']

# mean with and without ME:
b1_mean = np.mean(b1)
b1_me_mean = np.mean(b1_me)
print(f'b1_mean: {b1_mean}\n')
print(f'b1_me_mean: {b1_me_mean}\n')

# variance with and without ME:
b1_var = np.var(b1, ddof=1)
b1_me_var = np.var(b1_me, ddof=1)
print(f'b1_var: {b1_var}\n')
print(f'b1_me_var: {b1_me_var}\n')
Example-9-2.jl
using WooldridgeDatasets, GLM, DataFrames

hprice1 = DataFrame(wooldridge("hprice1"))

# original OLS:
reg = lm(@formula(price ~ lotsize + sqrft + bdrms), hprice1)

# regression for RESET test:
hprice1.fitted_sq = predict(reg) .^ 2 ./ 1000
hprice1.fitted_cub = predict(reg) .^ 3 ./ 1000

reg_reset = lm(@formula(price ~ lotsize + sqrft + bdrms +
                                fitted_sq + fitted_cub), hprice1)
table_reg_reset = coeftable(reg_reset)
println("table_reg_reset: \n$table_reg_reset\n")

# RESET test (H0: all coefficients including "fitted" are zero):
ftest_res = ftest(reg.model, reg_reset.model)
fstat = ftest_res.fstat[2]
fpval = ftest_res.pval[2]
println("fstat = $fstat\n")
println("fpval = $fpval")
LAD.jl
using WooldridgeDatasets, DataFrames, GLM, QuantileRegressions

rdchem = DataFrame(wooldridge("rdchem"))

# OLS regression:
reg_ols = lm(@formula(rdintens ~ sales / 1000 + profmarg), rdchem)
table_reg_ols = coeftable(reg_ols)
println("table_reg_ols: \n$table_reg_ols\n")

# LAD regression:
reg_lad = qreg(@formula(rdintens ~ sales / 1000 + profmarg), rdchem, 0.5)
table_reg_lad = coeftable(reg_lad)
println("table_reg_lad: \n$table_reg_lad")
Missings-Analyses.jl
using WooldridgeDatasets, GLM, DataFrames, Statistics

lawsch85 = DataFrame(wooldridge("lawsch85"))

# missings:
x = lawsch85.LSAT
x_bar1 = mean(x)
x_bar2 = mean(skipmissing(x))
println("x_bar1 = $x_bar1\n")
println("x_bar2 = $x_bar2\n")

# observations and variables:
nrows = nrow(lawsch85)
ncols = ncol(lawsch85)
println("nrows = $nrows\n")
println("ncols = $ncols\n")

# regression (missings are taken care of by default):
reg = lm(@formula(log(salary) ~ LSAT + cost + age), lawsch85)
n = nobs(reg)
println("n = $n")
Missings.jl
using WooldridgeDatasets, GLM, DataFrames

lawsch85 = DataFrame(wooldridge("lawsch85"))
lsat = lawsch85.LSAT

# create boolean indicator for missings:
missLSAT = ismissing.(lsat)

# LSAT and indicator for Schools No. 120-129:
preview = DataFrame(lsat=lsat[120:129],
    missLSAT=missLSAT[120:129])
println("preview: \n$preview\n")

# frequencies of indicator:
tot_missing = count(missLSAT) # same as sum(missLSAT)
tot_nonmissings = count(.!missLSAT)
println("tot_missing = $tot_missing\n")
println("tot_nonmissings = $tot_nonmissings\n")

# missings for all variables in data frame (counts):
miss_all = ismissing.(lawsch85)
freq_missLSAT = mapcols(count, miss_all)
freq_missLSAT_preview = freq_missLSAT[:, 1:9] # print only first nine columns
println("freq_missLSAT_preview: \n$freq_missLSAT_preview\n")

# computing amount of complete cases:
lsat_compl_cases1 = dropmissing(lawsch85)
complete_cases1 = nrow(lsat_compl_cases1)
println("complete_cases1 = $complete_cases1\n")

lsat_compl_cases2 = completecases(lawsch85)
complete_cases2 = count(lsat_compl_cases2)
println("complete_cases2 = $complete_cases2")
NaN-Inf-Missing.jl
using Distributions, DataFrames, Statistics

# NaN, missings and infinite values in Julia:
x1 = [0, 2, NaN, Inf, missing]
logx = log.(x1)
invx = 0 ./ x1
isnanx = isnan.(x1)
isinfx = isinf.(x1)
ismissingx = ismissing.(x1)

results = DataFrame(x1=x1, logx=logx, invx=invx, ismissingx=ismissingx,
    isnanx=isnanx, isinfx=isinfx)
println("results = $results\n")

# mathematically not defined is not always NaN (like in R or Python):
test = try
    log(-1) # results in an ERROR
catch e
    NaN
end
println("test = $test\n")

# handling missings:
x2 = [4, 2, missing, 3]
meanx2_1 = mean(x2)
println("meanx2_1 = $meanx2_1\n")

meanx2_2 = mean(skipmissing(x2))
println("meanx2_2 = $meanx2_2\n")

x3 = [4, 2, NaN, 3]
meanx3_1 = mean(x3)
println("meanx3_1 = $meanx3_1\n")

meanx3_2 = mean(x3[.!isnan.(x3)])
println("meanx3_2 = $meanx3_2")
Nonnested-Test.jl
using WooldridgeDatasets, GLM, DataFrames

hprice1 = DataFrame(wooldridge("hprice1"))

# two alternative models:
reg1 = lm(@formula(price ~ lotsize + sqrft + bdrms), hprice1)
reg2 = lm(@formula(price ~ log(lotsize) + log(sqrft) + bdrms), hprice1)

# encompassing test of Davidson & MacKinnon:
# comprehensive model:
reg3 = lm(@formula(price ~ lotsize + sqrft + bdrms +
                           log(lotsize) + log(sqrft)), hprice1)

# test model 1:
ftest_res1 = ftest(reg1.model, reg3.model)

fstat1 = ftest_res1.fstat[2]
fpval1 = ftest_res1.pval[2]
println("fstat1 = $fstat1\n")
println("fpval1 = $fpval1\n")

# test model 2:
ftest_res2 = ftest(reg2.model, reg3.model)

fstat2 = ftest_res2.fstat[2]
fpval2 = ftest_res2.pval[2]
println("fstat2 = $fstat2\n")
println("fpval2 = $fpval2")
Outliers.jl
using WooldridgeDatasets, GLM, DataFrames, LinearAlgebra, Plots

rdchem = DataFrame(wooldridge("rdchem"))

# create dummys for each observation with an identity matrix:
n = nrow(rdchem)
dummys = DataFrame(Matrix(1I, n, n), Symbol.(:d, 1:n)) # colnames d1, ..., d32

# studentized residuals for all observations:
studres = zeros(n)
for i in 1:n
    rdchem.di = dummys[:, i]
    reg_i = lm(@formula(rdintens ~ sales + profmarg + di), rdchem)
    # save t statistic (3rd column) of di (4th element):
    studres[i] = coeftable(reg_i).cols[3][4]
end

# display extreme values:
studres_max = maximum(studres)
studres_min = minimum(studres)
println("studres_max = $studres_max\n")
println("studres_min = $studres_min")

# histogram:
histogram(studres, color="grey", legend=false)
xlabel!("studres")
savefig("JlGraphs/Outliers.pdf")
Sim-ME-Dep.jl
using Random, Distributions, Statistics, GLM, DataFrames

# set the random seed:
Random.seed!(12345)

# set sample size and number of simulations:
n = 1000
r = 10000

# set true parameters (betas):
beta0 = 1
beta1 = 0.5

# initialize arrays to store results later (b1 without ME, b1_me with ME):
b1 = zeros(r)
b1_me = zeros(r)

# draw a sample of x, fixed over replications:
x = rand(Normal(4, 1), n)

# repeat r times:
for i in 1:r
        # draw a sample of u:
        u = rand(Normal(0, 1), n)

        # draw a sample of ystar:
        ystar = beta0 .+ beta1 * x .+ u

        # measurement error and mismeasured y:
        e0 = rand(Normal(0, 1), n)
        y = ystar .+ e0
        df = DataFrame(ystar=ystar, y=y, x=x)

        # regress ystar on x and store slope estimate at position i:
        reg_star = lm(@formula(ystar ~ x), df)
        b1[i] = coef(reg_star)[2]

        # regress y on x and store slope estimate at position i:
        reg_me = lm(@formula(y ~ x), df)
        b1_me[i] = coef(reg_me)[2]
end

# mean with and without ME:
b1_mean = mean(b1)
b1_me_mean = mean(b1_me)
println("b1_mean = $b1_mean\n")
println("b1_me_mean = $b1_me_mean\n")

# variance with and without ME:
b1_var = var(b1)
b1_me_var = var(b1_me)
println("b1_var = $b1_var\n")
println("b1_me_var = $b1_me_var")
Sim-ME-Explan.jl
using Random, Distributions, Statistics, GLM, DataFrames

# set the random seed:
Random.seed!(12345)

# set sample size and number of simulations:
n = 1000
r = 10000

# set true parameters (betas):
beta0 = 1
beta1 = 0.5

# initialize arrays to store results later (b1 without ME, b1_me with ME):
b1 = zeros(r)
b1_me = zeros(r)

# draw a sample of x, fixed over replications:
xstar = rand(Normal(4, 1), n)

# repeat r times:
for i in 1:r
        # draw a sample of u:
        u = rand(Normal(0, 1), n)

        # draw a sample of y:
        y = beta0 .+ beta1 * xstar .+ u

        # measurement error and mismeasured x:
        e1 = rand(Normal(0, 1), n)
        x = xstar .+ e1
        df = DataFrame(y=y, xstar=xstar, x=x)

        # regress y on xstar and store slope estimate at position i:
        reg_star = lm(@formula(y ~ xstar), df)
        b1[i] = coef(reg_star)[2]

        # regress y on x and store slope estimate at position i:
        reg_me = lm(@formula(y ~ x), df)
        b1_me[i] = coef(reg_me)[2]
end

# mean with and without ME:
b1_mean = mean(b1)
b1_me_mean = mean(b1_me)
println("b1_mean = $b1_mean\n")
println("b1_me_mean = $b1_me_mean\n")

# variance with and without ME:
b1_var = var(b1)
b1_me_var = var(b1_me)
println("b1_var = $b1_var\n")
println("b1_me_var = $b1_me_var")