Example-9-2-automatic.R
data(hprice1, package='wooldridge')
# original linear regression
orig <- lm(price ~ lotsize+sqrft+bdrms, data=hprice1)
# RESET test
library(lmtest)
resettest(orig)
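# by default, resettest adds the squares and cubes of the fitted values;
# other powers can be chosen with the "power" argument, e.g.:
resettest(orig, power=2:4)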
data(hprice1, package='wooldridge')
# original linear regression
orig <- lm(price ~ lotsize+sqrft+bdrms, data=hprice1)
# regression for RESET test
RESETreg <- lm(price ~ lotsize+sqrft+bdrms+I(fitted(orig)^2)+
I(fitted(orig)^3), data=hprice1)
RESETreg
# RESET test. H0: all coefficients involving "fitted" are zero
library(car)
linearHypothesis(RESETreg, matchCoefs(RESETreg,"fitted"))
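# (matchCoefs selects all coefficients whose names contain "fitted",
# i.e. those on the squared and cubed fitted values)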
data(lawsch85, package='wooldridge')
# extract LSAT
lsat <- lawsch85$LSAT
# Create logical indicator for missings
missLSAT <- is.na(lawsch85$LSAT)
# LSAT and indicator for Schools No. 120-129:
rbind(lsat,missLSAT)[,120:129]
# Frequencies of indicator
table(missLSAT)
# Missings for all variables in data frame (counts)
colSums(is.na(lawsch85))
# Indicator for complete cases
compl <- complete.cases(lawsch85)
table(compl)
data(rdchem, package='wooldridge')
# Regression
reg <- lm(rdintens~sales+profmarg, data=rdchem)
# Studentized residuals for all observations:
studres <- rstudent(reg)
# Display extreme values:
min(studres)
max(studres)
# Histogram (and overlaid density plot):
hist(studres, freq=FALSE)
lines(density(studres), lwd=2)
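# a common rule of thumb flags observations with |studres| > 1.96
# (a rough screen for potential outliers, not a formal test):
which(abs(studres) > 1.96)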
import wooldridge as woo
import statsmodels.formula.api as smf
import statsmodels.stats.outliers_influence as smo
hprice1 = woo.dataWoo('hprice1')
# original linear regression:
reg = smf.ols(formula='price ~ lotsize + sqrft + bdrms', data=hprice1)
results = reg.fit()
# automated RESET test:
reset_output = smo.reset_ramsey(res=results, degree=3)
fstat_auto = reset_output.statistic[0][0]
fpval_auto = reset_output.pvalue
print(f'fstat_auto: {fstat_auto}\n')
print(f'fpval_auto: {fpval_auto}\n')
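# note: degree=3 adds the squares and cubes of the fitted values, so the
# automated test replicates the manual RESET regression in the next script.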
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf
hprice1 = woo.dataWoo('hprice1')
# original OLS:
reg = smf.ols(formula='price ~ lotsize + sqrft + bdrms', data=hprice1)
results = reg.fit()
# regression for RESET test:
hprice1['fitted_sq'] = results.fittedvalues ** 2
hprice1['fitted_cub'] = results.fittedvalues ** 3
reg_reset = smf.ols(formula='price ~ lotsize + sqrft + bdrms +'
' fitted_sq + fitted_cub', data=hprice1)
results_reset = reg_reset.fit()
# print regression table:
table = pd.DataFrame({'b': round(results_reset.params, 4),
'se': round(results_reset.bse, 4),
't': round(results_reset.tvalues, 4),
'pval': round(results_reset.pvalues, 4)})
print(f'table: \n{table}\n')
# RESET test (H0: all coefficients involving "fitted" are zero):
hypotheses = ['fitted_sq = 0', 'fitted_cub = 0']
ftest_man = results_reset.f_test(hypotheses)
fstat_man = ftest_man.statistic[0][0]
fpval_man = ftest_man.pvalue
print(f'fstat_man: {fstat_man}\n')
print(f'fpval_man: {fpval_man}\n')
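# the manual and the automated RESET test use the same auxiliary regression,
# so the F statistic and p value match those of reset_ramsey above.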
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf
rdchem = woo.dataWoo('rdchem')
# OLS regression:
reg_ols = smf.ols(formula='rdintens ~ I(sales/1000) + profmarg', data=rdchem)
results_ols = reg_ols.fit()
table_ols = pd.DataFrame({'b': round(results_ols.params, 4),
'se': round(results_ols.bse, 4),
't': round(results_ols.tvalues, 4),
'pval': round(results_ols.pvalues, 4)})
print(f'table_ols: \n{table_ols}\n')
# LAD regression:
reg_lad = smf.quantreg(formula='rdintens ~ I(sales/1000) + profmarg', data=rdchem)
results_lad = reg_lad.fit(q=.5)
table_lad = pd.DataFrame({'b': round(results_lad.params, 4),
'se': round(results_lad.bse, 4),
't': round(results_lad.tvalues, 4),
'pval': round(results_lad.pvalues, 4)})
print(f'table_lad: \n{table_lad}\n')
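# q=0.5 yields the median (LAD) regression; other quantiles only require a
# different q, e.g. the first quartile:
results_q25 = reg_lad.fit(q=0.25)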
import wooldridge as woo
import numpy as np
import statsmodels.formula.api as smf
lawsch85 = woo.dataWoo('lawsch85')
# missings in numpy:
x_np = np.array(lawsch85['LSAT'])
x_np_bar1 = np.mean(x_np)
x_np_bar2 = np.nanmean(x_np)
print(f'x_np_bar1: {x_np_bar1}\n')
print(f'x_np_bar2: {x_np_bar2}\n')
# missings in pandas:
x_pd = lawsch85['LSAT']
x_pd_bar1 = np.mean(x_pd)
x_pd_bar2 = np.nanmean(x_pd)
print(f'x_pd_bar1: {x_pd_bar1}\n')
print(f'x_pd_bar2: {x_pd_bar2}\n')
# observations and variables:
print(f'lawsch85.shape: {lawsch85.shape}\n')
# regression (missings are taken care of by default):
reg = smf.ols(formula='np.log(salary) ~ LSAT + cost + age', data=lawsch85)
results = reg.fit()
print(f'results.nobs: {results.nobs}\n')
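# statsmodels' formula API drops incomplete rows by default (missing='drop'),
# which is why results.nobs is smaller than the number of rows in lawsch85.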
import wooldridge as woo
import pandas as pd
lawsch85 = woo.dataWoo('lawsch85')
lsat_pd = lawsch85['LSAT']
# create boolean indicator for missings:
missLSAT = lsat_pd.isna()
# LSAT and indicator for Schools No. 120-129:
preview = pd.DataFrame({'lsat_pd': lsat_pd[119:129],
'missLSAT': missLSAT[119:129]})
print(f'preview: \n{preview}\n')
# frequencies of indicator:
freq_missLSAT = pd.crosstab(missLSAT, columns='count')
print(f'freq_missLSAT: \n{freq_missLSAT}\n')
# missings for all variables in data frame (counts):
miss_all = lawsch85.isna()
colsums = miss_all.sum(axis=0)
print(f'colsums: \n{colsums}\n')
# computing amount of complete cases:
complete_cases = (miss_all.sum(axis=1) == 0)
freq_complete_cases = pd.crosstab(complete_cases, columns='count')
print(f'freq_complete_cases: \n{freq_complete_cases}\n')
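# equivalently, pandas can drop incomplete rows directly:
n_complete = len(lawsch85.dropna())
print(f'n_complete: {n_complete}\n')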
import numpy as np
import pandas as pd
import scipy.stats as stats
# nan and inf handling in numpy:
x = np.array([-1, 0, 1, np.nan, np.inf, -np.inf])
logx = np.log(x)
invx = np.array(1 / x)
ncdf = np.array(stats.norm.cdf(x))
isnanx = np.isnan(x)
results = pd.DataFrame({'x': x, 'logx': logx, 'invx': invx,
                        'ncdf': ncdf, 'isnanx': isnanx})
print(f'results: \n{results}\n')
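# note: numpy signals invalid operations only with a RuntimeWarning and
# returns nan (e.g. np.log(-1)) or -inf (np.log(0)) instead of an error.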
import wooldridge as woo
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
hprice1 = woo.dataWoo('hprice1')
# two alternative models:
reg1 = smf.ols(formula='price ~ lotsize + sqrft + bdrms', data=hprice1)
results1 = reg1.fit()
reg2 = smf.ols(formula='price ~ np.log(lotsize) +'
'np.log(sqrft) + bdrms', data=hprice1)
results2 = reg2.fit()
# encompassing test of Davidson & MacKinnon:
# comprehensive model:
reg3 = smf.ols(formula='price ~ lotsize + sqrft + bdrms + '
'np.log(lotsize) + np.log(sqrft)', data=hprice1)
results3 = reg3.fit()
# model 1 vs. comprehensive model:
anovaResults1 = sm.stats.anova_lm(results1, results3)
print(f'anovaResults1: \n{anovaResults1}\n')
# model 2 vs. comprehensive model:
anovaResults2 = sm.stats.anova_lm(results2, results3)
print(f'anovaResults2: \n{anovaResults2}\n')
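# each anova_lm call is an F test of the restricted model against the
# comprehensive model; a small p value rejects the restricted specification.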
import wooldridge as woo
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
rdchem = woo.dataWoo('rdchem')
# OLS regression:
reg = smf.ols(formula='rdintens ~ sales + profmarg', data=rdchem)
results = reg.fit()
# studentized residuals for all observations:
studres = results.get_influence().resid_studentized_external
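# (externally studentized: each residual is scaled by an error variance
# estimated with that observation left out)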
# display extreme values:
studres_max = np.max(studres)
studres_min = np.min(studres)
print(f'studres_max: {studres_max}\n')
print(f'studres_min: {studres_min}\n')
# histogram (and overlaid density plot):
kde = sm.nonparametric.KDEUnivariate(studres)
kde.fit()
plt.hist(studres, color='grey', density=True)
plt.plot(kde.support, kde.density, color='black', linewidth=2)
plt.ylabel('density')
plt.xlabel('studres')
plt.savefig('PyGraphs/Outliers.pdf')
import numpy as np
import scipy.stats as stats
import pandas as pd
import statsmodels.formula.api as smf
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas):
beta0 = 1
beta1 = 0.5
# initialize arrays to store results later (b1 without ME, b1_me with ME):
b1 = np.empty(r)
b1_me = np.empty(r)
# draw a sample of x, fixed over replications:
x = stats.norm.rvs(4, 1, size=n)
# repeat r times:
for i in range(r):
    # draw a sample of u:
    u = stats.norm.rvs(0, 1, size=n)
    # draw a sample of ystar:
    ystar = beta0 + beta1 * x + u
    # measurement error and mismeasured y:
    e0 = stats.norm.rvs(0, 1, size=n)
    y = ystar + e0
    df = pd.DataFrame({'ystar': ystar, 'y': y, 'x': x})
    # regress ystar on x and store slope estimate at position i:
    reg_star = smf.ols(formula='ystar ~ x', data=df)
    results_star = reg_star.fit()
    b1[i] = results_star.params['x']
    # regress y on x and store slope estimate at position i:
    reg_me = smf.ols(formula='y ~ x', data=df)
    results_me = reg_me.fit()
    b1_me[i] = results_me.params['x']
# mean with and without ME:
b1_mean = np.mean(b1)
b1_me_mean = np.mean(b1_me)
print(f'b1_mean: {b1_mean}\n')
print(f'b1_me_mean: {b1_me_mean}\n')
# variance with and without ME:
b1_var = np.var(b1, ddof=1)
b1_me_var = np.var(b1_me, ddof=1)
print(f'b1_var: {b1_var}\n')
print(f'b1_me_var: {b1_me_var}\n')
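# interpretation: classical measurement error in y leaves b1 unbiased, but
# the composite error u + e0 has variance 1 + 1 = 2, so the sampling
# variance of b1_me is roughly twice that of b1.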
import numpy as np
import scipy.stats as stats
import pandas as pd
import statsmodels.formula.api as smf
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas):
beta0 = 1
beta1 = 0.5
# initialize b1 arrays to store results later:
b1 = np.empty(r)
b1_me = np.empty(r)
# draw a sample of x, fixed over replications:
xstar = stats.norm.rvs(4, 1, size=n)
# repeat r times:
for i in range(r):
    # draw a sample of u:
    u = stats.norm.rvs(0, 1, size=n)
    # draw a sample of y:
    y = beta0 + beta1 * xstar + u
    # measurement error and mismeasured x:
    e1 = stats.norm.rvs(0, 1, size=n)
    x = xstar + e1
    df = pd.DataFrame({'y': y, 'xstar': xstar, 'x': x})
    # regress y on xstar and store slope estimate at position i:
    reg_star = smf.ols(formula='y ~ xstar', data=df)
    results_star = reg_star.fit()
    b1[i] = results_star.params['xstar']
    # regress y on x and store slope estimate at position i:
    reg_me = smf.ols(formula='y ~ x', data=df)
    results_me = reg_me.fit()
    b1_me[i] = results_me.params['x']
# mean with and without ME:
b1_mean = np.mean(b1)
b1_me_mean = np.mean(b1_me)
print(f'b1_mean: {b1_mean}\n')
print(f'b1_me_mean: {b1_me_mean}\n')
# variance with and without ME:
b1_var = np.var(b1, ddof=1)
b1_me_var = np.var(b1_me, ddof=1)
print(f'b1_var: {b1_var}\n')
print(f'b1_me_var: {b1_me_var}\n')
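# interpretation: classical measurement error in x attenuates the slope:
# plim b1_me = beta1 * Var(xstar) / (Var(xstar) + Var(e1)) = 0.5 * 1/2 = 0.25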
using WooldridgeDatasets, GLM, DataFrames
hprice1 = DataFrame(wooldridge("hprice1"))
# original OLS:
reg = lm(@formula(price ~ lotsize + sqrft + bdrms), hprice1)
# regression for RESET test:
hprice1.fitted_sq = predict(reg) .^ 2 ./ 1000
hprice1.fitted_cub = predict(reg) .^ 3 ./ 1000
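# (the powers of the fitted values are scaled by 1/1000; rescaling a
# regressor does not affect the RESET F test below)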
reg_reset = lm(@formula(price ~ lotsize + sqrft + bdrms +
fitted_sq + fitted_cub), hprice1)
table_reg_reset = coeftable(reg_reset)
println("table_reg_reset: \n$table_reg_reset\n")
# RESET test (H0: all coefficients involving "fitted" are zero):
ftest_res = ftest(reg.model, reg_reset.model)
fstat = ftest_res.fstat[2]
fpval = ftest_res.pval[2]
println("fstat = $fstat\n")
println("fpval = $fpval")
using WooldridgeDatasets, DataFrames, GLM, QuantileRegressions
rdchem = DataFrame(wooldridge("rdchem"))
# OLS regression:
reg_ols = lm(@formula(rdintens ~ sales / 1000 + profmarg), rdchem)
table_reg_ols = coeftable(reg_ols)
println("table_reg_ols: \n$table_reg_ols\n")
# LAD regression:
reg_lad = qreg(@formula(rdintens ~ sales / 1000 + profmarg), rdchem, 0.5)
table_reg_lad = coeftable(reg_lad)
println("table_reg_lad: \n$table_reg_lad")
using WooldridgeDatasets, GLM, DataFrames, Statistics
lawsch85 = DataFrame(wooldridge("lawsch85"))
# missings:
x = lawsch85.LSAT
x_bar1 = mean(x)
x_bar2 = mean(skipmissing(x))
println("x_bar1 = $x_bar1\n")
println("x_bar2 = $x_bar2\n")
# observations and variables:
nrows = nrow(lawsch85)
ncols = ncol(lawsch85)
println("nrows = $nrows\n")
println("ncols = $ncols\n")
# regression (missings are taken care of by default):
reg = lm(@formula(log(salary) ~ LSAT + cost + age), lawsch85)
n = nobs(reg)
println("n = $n")
using WooldridgeDatasets, GLM, DataFrames
lawsch85 = DataFrame(wooldridge("lawsch85"))
lsat = lawsch85.LSAT
# create boolean indicator for missings:
missLSAT = ismissing.(lsat)
# LSAT and indicator for Schools No. 120-129:
preview = DataFrame(lsat=lsat[120:129],
missLSAT=missLSAT[120:129])
println("preview: \n$preview\n")
# frequencies of indicator:
tot_missing = count(missLSAT) # same as sum(missLSAT)
tot_nonmissings = count(.!missLSAT)
println("tot_missing = $tot_missing\n")
println("tot_nonmissings = $tot_nonmissings\n")
# missings for all variables in data frame (counts):
miss_all = ismissing.(lawsch85)
freq_miss_all = mapcols(count, miss_all)
freq_miss_all_preview = freq_miss_all[:, 1:9] # print only first nine columns
println("freq_miss_all_preview: \n$freq_miss_all_preview\n")
# computing amount of complete cases:
lsat_compl_cases1 = dropmissing(lawsch85)
complete_cases1 = nrow(lsat_compl_cases1)
println("complete_cases1 = $complete_cases1\n")
lsat_compl_cases2 = completecases(lawsch85)
complete_cases2 = count(lsat_compl_cases2)
println("complete_cases2 = $complete_cases2")
using Distributions, DataFrames, Statistics
# NaN, missings and infinite values in Julia:
x1 = [0, 2, NaN, Inf, missing]
logx = log.(x1)
invx = 1 ./ x1
isnanx = isnan.(x1)
isinfx = isinf.(x1)
ismissingx = ismissing.(x1)
results = DataFrame(x1=x1, logx=logx, invx=invx, ismissingx=ismissingx,
isnanx=isnanx, isinfx=isinfx)
println("results = $results\n")
# unlike in R or Python, mathematically undefined operations such as log(-1)
# throw an error in Julia instead of returning NaN:
test = try
    log(-1) # throws a DomainError
catch e
    NaN
end
println("test = $test\n")
# handling missings:
x2 = [4, 2, missing, 3]
meanx2_1 = mean(x2)
println("meanx2_1 = $meanx2_1\n")
meanx2_2 = mean(skipmissing(x2))
println("meanx2_2 = $meanx2_2\n")
x3 = [4, 2, NaN, 3]
meanx3_1 = mean(x3)
println("meanx3_1 = $meanx3_1\n")
meanx3_2 = mean(x3[.!isnan.(x3)])
println("meanx3_2 = $meanx3_2")
using WooldridgeDatasets, GLM, DataFrames
hprice1 = DataFrame(wooldridge("hprice1"))
# two alternative models:
reg1 = lm(@formula(price ~ lotsize + sqrft + bdrms), hprice1)
reg2 = lm(@formula(price ~ log(lotsize) + log(sqrft) + bdrms), hprice1)
# encompassing test of Davidson & MacKinnon:
# comprehensive model:
reg3 = lm(@formula(price ~ lotsize + sqrft + bdrms +
log(lotsize) + log(sqrft)), hprice1)
# test model 1:
ftest_res1 = ftest(reg1.model, reg3.model)
fstat1 = ftest_res1.fstat[2]
fpval1 = ftest_res1.pval[2]
println("fstat1 = $fstat1\n")
println("fpval1 = $fpval1\n")
# test model 2:
ftest_res2 = ftest(reg2.model, reg3.model)
fstat2 = ftest_res2.fstat[2]
fpval2 = ftest_res2.pval[2]
println("fstat2 = $fstat2\n")
println("fpval2 = $fpval2")
using WooldridgeDatasets, GLM, DataFrames, LinearAlgebra, Plots
rdchem = DataFrame(wooldridge("rdchem"))
# create dummies for each observation with an identity matrix:
n = nrow(rdchem)
dummys = DataFrame(Matrix(1I, n, n), Symbol.(:d, 1:n)) # colnames d1, ..., d32
# studentized residuals for all observations: the t statistic on the dummy
# for observation i equals the (externally) studentized residual of i:
studres = zeros(n)
for i in 1:n
    rdchem.di = dummys[:, i]
    reg_i = lm(@formula(rdintens ~ sales + profmarg + di), rdchem)
    # save t statistic (3rd column) of di (4th element):
    studres[i] = coeftable(reg_i).cols[3][4]
end
# display extreme values:
studres_max = maximum(studres)
studres_min = minimum(studres)
println("studres_max = $studres_max\n")
println("studres_min = $studres_min")
# histogram:
histogram(studres, color="grey", legend=false)
xlabel!("studres")
savefig("JlGraphs/Outliers.pdf")
using Random, Distributions, Statistics, GLM, DataFrames
# set the random seed:
Random.seed!(12345)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas):
beta0 = 1
beta1 = 0.5
# initialize arrays to store results later (b1 without ME, b1_me with ME):
b1 = zeros(r)
b1_me = zeros(r)
# draw a sample of x, fixed over replications:
x = rand(Normal(4, 1), n)
# repeat r times:
for i in 1:r
    # draw a sample of u:
    u = rand(Normal(0, 1), n)
    # draw a sample of ystar:
    ystar = beta0 .+ beta1 * x .+ u
    # measurement error and mismeasured y:
    e0 = rand(Normal(0, 1), n)
    y = ystar .+ e0
    df = DataFrame(ystar=ystar, y=y, x=x)
    # regress ystar on x and store slope estimate at position i:
    reg_star = lm(@formula(ystar ~ x), df)
    b1[i] = coef(reg_star)[2]
    # regress y on x and store slope estimate at position i:
    reg_me = lm(@formula(y ~ x), df)
    b1_me[i] = coef(reg_me)[2]
end
# mean with and without ME:
b1_mean = mean(b1)
b1_me_mean = mean(b1_me)
println("b1_mean = $b1_mean\n")
println("b1_me_mean = $b1_me_mean\n")
# variance with and without ME:
b1_var = var(b1)
b1_me_var = var(b1_me)
println("b1_var = $b1_var\n")
println("b1_me_var = $b1_me_var")
using Random, Distributions, Statistics, GLM, DataFrames
# set the random seed:
Random.seed!(12345)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas):
beta0 = 1
beta1 = 0.5
# initialize arrays to store results later (b1 without ME, b1_me with ME):
b1 = zeros(r)
b1_me = zeros(r)
# draw a sample of x, fixed over replications:
xstar = rand(Normal(4, 1), n)
# repeat r times:
for i in 1:r
    # draw a sample of u:
    u = rand(Normal(0, 1), n)
    # draw a sample of y:
    y = beta0 .+ beta1 * xstar .+ u
    # measurement error and mismeasured x:
    e1 = rand(Normal(0, 1), n)
    x = xstar .+ e1
    df = DataFrame(y=y, xstar=xstar, x=x)
    # regress y on xstar and store slope estimate at position i:
    reg_star = lm(@formula(y ~ xstar), df)
    b1[i] = coef(reg_star)[2]
    # regress y on x and store slope estimate at position i:
    reg_me = lm(@formula(y ~ x), df)
    b1_me[i] = coef(reg_me)[2]
end
# mean with and without ME:
b1_mean = mean(b1)
b1_me_mean = mean(b1_me)
println("b1_mean = $b1_mean\n")
println("b1_me_mean = $b1_me_mean\n")
# variance with and without ME:
b1_var = var(b1)
b1_me_var = var(b1_me)
println("b1_var = $b1_var\n")
println("b1_me_var = $b1_me_var")