# Example-2-10.R
data(wage1, package='wooldridge')
# Estimate log-level model
lm( log(wage) ~ educ, data=wage1 )
data(meap93, package='wooldridge')
# Estimate the model and save the results as "results"
results <- lm(math10 ~ lnchprg, data=meap93)
# Number of obs.
( n <- nobs(results) )
# SER:
(SER <- sd(resid(results)) * sqrt((n-1)/(n-2)) )
# SE of b0hat & b1hat, respectively:
SER / sd(meap93$lnchprg) / sqrt(n-1) * sqrt(mean(meap93$lnchprg^2))
SER / sd(meap93$lnchprg) / sqrt(n-1)
# Automatic calculations:
summary(results)
data(ceosal1, package='wooldridge')
# extract variables as vectors:
sal <- ceosal1$salary
roe <- ceosal1$roe
# regression with vectors:
CEOregres <- lm( sal ~ roe )
# obtain predicted values and residuals
sal.hat <- fitted(CEOregres)
u.hat <- resid(CEOregres)
# Wooldridge, Table 2.2:
cbind(roe, sal, sal.hat, u.hat)[1:15,]
data(wage1, package='wooldridge')
WAGEregres <- lm(wage ~ educ, data=wage1)
# obtain coefficients, predicted values and residuals
b.hat <- coef(WAGEregres)
wage.hat <- fitted(WAGEregres)
u.hat <- resid(WAGEregres)
# Confirm property (1):
mean(u.hat)
# Confirm property (2):
cor(wage1$educ, u.hat)
# Confirm property (3):
mean(wage1$wage)
b.hat[1] + b.hat[2] * mean(wage1$educ)
data(ceosal1, package='wooldridge')
CEOregres <- lm( salary ~ roe, data=ceosal1 )
# Calculate predicted values & residuals:
sal.hat <- fitted(CEOregres)
u.hat <- resid(CEOregres)
# Calculate R^2 in three different ways:
sal <- ceosal1$salary
var(sal.hat) / var(sal)
1 - var(u.hat) / var(sal)
cor(sal, sal.hat)^2
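# optional cross-check: summary() reports the same value:
summary(CEOregres)$r.squared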
data(ceosal1, package='wooldridge')
# Usual OLS regression:
(reg1 <- lm( salary ~ roe, data=ceosal1))
# Regression without intercept (through origin):
(reg2 <- lm( salary ~ 0 + roe, data=ceosal1))
# Regression without slope (on a constant):
(reg3 <- lm( salary ~ 1 , data=ceosal1))
# average y:
mean(ceosal1$salary)
# Scatter Plot with all 3 regression lines
plot(ceosal1$roe, ceosal1$salary, ylim=c(0,4000))
abline(reg1, lwd=2, lty=1)
abline(reg2, lwd=2, lty=2)
abline(reg3, lwd=2, lty=3)
legend("topleft",c("full","through origin","const only"),lwd=2,lty=1:3)
# Set the random seed
set.seed(1234567)
# set sample size and number of simulations
n<-1000; r<-10000
# set true parameters: betas and sd of u
b0<-1; b1<-0.5; su<-2
# initialize b0hat and b1hat to store results later:
b0hat <- numeric(r)
b1hat <- numeric(r)
# Draw a sample of x, fixed over replications:
x <- rnorm(n,4,1)
# repeat r times:
for(j in 1:r) {
  # Draw a sample of y:
  u <- rnorm(n,0,su)
  y <- b0 + b1*x + u
  # estimate parameters by OLS and store them in the vectors
  bhat <- coefficients( lm(y~x) )
  b0hat[j] <- bhat["(Intercept)"]
  b1hat[j] <- bhat["x"]
}
# MC estimate of the expected values:
mean(b0hat)
mean(b1hat)
# MC estimate of the variances:
var(b0hat)
var(b1hat)
# Initialize empty plot
plot( NULL, xlim=c(0,8), ylim=c(0,6), xlab="x", ylab="y")
# add OLS regression lines
for (j in 1:10) abline(b0hat[j],b1hat[j],col="gray")
# add population regression line
abline(b0,b1,lwd=2)
# add legend
legend("topleft",c("Population","OLS regressions"),
lwd=c(2,1),col=c("black","gray"))
# Set the random seed
set.seed(1234567)
# set sample size
n<-1000
# set true parameters: betas and sd of u
b0<-1; b1<-0.5; su<-2
# Draw a sample of size n:
x <- rnorm(n,4,1)
u <- rnorm(n,0,su)
y <- b0 + b1*x + u
# estimate parameters by OLS
(olsres <- lm(y~x))
# features of the sample for the variance formula:
mean(x^2)
sum((x-mean(x))^2)
# Graph
plot(x, y, col="gray", xlim=c(0,8) )
abline(b0,b1,lwd=2)
abline(olsres,col="gray",lwd=2)
legend("topleft",c("pop. regr. fct.","OLS regr. fct."),
lwd=2,col=c("black","gray"))
# Set the random seed
set.seed(1234567)
# set sample size and number of simulations
n<-1000; r<-10000
# set true parameters: betas and sd of u
b0<-1; b1<-0.5; su<-2
# initialize b0hat and b1hat to store results later:
b0hat <- numeric(r)
b1hat <- numeric(r)
# Draw a sample of x, fixed over replications:
x <- rnorm(n,4,1)
# repeat r times:
for(j in 1:r) {
  # Draw a sample of y:
  u <- rnorm(n, (x-4)/5, su)
  y <- b0 + b1*x + u
  # estimate parameters by OLS and store them in the vectors
  bhat <- coefficients( lm(y~x) )
  b0hat[j] <- bhat["(Intercept)"]
  b1hat[j] <- bhat["x"]
}
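# MC estimate of the expected values and variances
# (same summary as in the baseline simulation): since E(u|x) = (x-4)/5,
# the estimates center on b0 - 0.8 = 0.2 and b1 + 0.2 = 0.7
# rather than on the true parameters:
mean(b0hat)
mean(b1hat)
var(b0hat)
var(b1hat)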
# Set the random seed
set.seed(1234567)
# set sample size and number of simulations
n<-1000; r<-10000
# set true parameters: betas and sd of u
b0<-1; b1<-0.5; su<-2
# initialize b0hat and b1hat to store results later:
b0hat <- numeric(r)
b1hat <- numeric(r)
# Draw a sample of x, fixed over replications:
x <- rnorm(n,4,1)
# repeat r times:
for(j in 1:r) {
  # Draw a sample of y:
  varu <- 4/exp(4.5) * exp(x)
  u <- rnorm(n, 0, sqrt(varu))
  y <- b0 + b1*x + u
  # estimate parameters by OLS and store them in the vectors
  bhat <- coefficients( lm(y~x) )
  b0hat[j] <- bhat["(Intercept)"]
  b1hat[j] <- bhat["x"]
}
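# MC estimate of the expected values and variances
# (same summary as in the baseline simulation): the error is
# heteroscedastic here, so the estimators remain unbiased but their
# sampling variances differ from the homoscedastic case:
mean(b0hat)
mean(b1hat)
var(b0hat)
var(b1hat)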
import numpy as np
import wooldridge as woo
import statsmodels.formula.api as smf
meap93 = woo.dataWoo('meap93')
# estimate the model and save the results as "results":
reg = smf.ols(formula='math10 ~ lnchprg', data=meap93)
results = reg.fit()
# number of obs.:
n = results.nobs
# SER:
u_hat_var = np.var(results.resid, ddof=1)
SER = np.sqrt(u_hat_var) * np.sqrt((n - 1) / (n - 2))
print(f'SER: {SER}\n')
# SE of b0 & b1, respectively:
lnchprg_sq_mean = np.mean(meap93['lnchprg'] ** 2)
lnchprg_var = np.var(meap93['lnchprg'], ddof=1)
b1_se = SER / (np.sqrt(lnchprg_var)
* np.sqrt(n - 1)) * np.sqrt(lnchprg_sq_mean)
b0_se = SER / (np.sqrt(lnchprg_var) * np.sqrt(n - 1))
print(f'b1_se: {b1_se}\n')
print(f'b0_se: {b0_se}\n')
# automatic calculations:
print(f'results.summary(): \n{results.summary()}\n')
import wooldridge as woo
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
ceosal1 = woo.dataWoo('ceosal1')
# OLS regression:
reg = smf.ols(formula='salary ~ roe', data=ceosal1)
results = reg.fit()
# scatter plot and fitted values:
plt.plot('roe', 'salary', data=ceosal1, color='grey', marker='o', linestyle='')
plt.plot(ceosal1['roe'], results.fittedvalues, color='black', linestyle='-')
plt.ylabel('salary')
plt.xlabel('roe')
plt.savefig('PyGraphs/Example-2-3-3.pdf')
import wooldridge as woo
import numpy as np
ceosal1 = woo.dataWoo('ceosal1')
x = ceosal1['roe']
y = ceosal1['salary']
# ingredients to the OLS formulas:
cov_xy = np.cov(x, y)[1, 0]  # off-diagonal element (row 2, column 1) of the covariance matrix
var_x = np.var(x, ddof=1)
x_bar = np.mean(x)
y_bar = np.mean(y)
# manual calculation of OLS coefficients:
b1 = cov_xy / var_x
b0 = y_bar - b1 * x_bar
print(f'b1: {b1}\n')
print(f'b0: {b0}\n')
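# optional cross-check: np.polyfit with degree 1 returns [slope, intercept]:
b_check = np.polyfit(x, y, 1)
print(f'b_check: {b_check}\n')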
import wooldridge as woo
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
vote1 = woo.dataWoo('vote1')
# OLS regression:
reg = smf.ols(formula='voteA ~ shareA', data=vote1)
results = reg.fit()
b = results.params
print(f'b: \n{b}\n')
# scatter plot and fitted values:
plt.plot('shareA', 'voteA', data=vote1, color='grey', marker='o', linestyle='')
plt.plot(vote1['shareA'], results.fittedvalues, color='black', linestyle='-')
plt.ylabel('voteA')
plt.xlabel('shareA')
plt.savefig('PyGraphs/Example-2-5.pdf')
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf
ceosal1 = woo.dataWoo('ceosal1')
# OLS regression:
reg = smf.ols(formula='salary ~ roe', data=ceosal1)
results = reg.fit()
# obtain predicted values and residuals:
salary_hat = results.fittedvalues
u_hat = results.resid
# Wooldridge, Table 2.2:
table = pd.DataFrame({'roe': ceosal1['roe'],
                      'salary': ceosal1['salary'],
                      'salary_hat': salary_hat,
                      'u_hat': u_hat})
print(f'table.head(15): \n{table.head(15)}\n')
import wooldridge as woo
import numpy as np
import statsmodels.formula.api as smf
wage1 = woo.dataWoo('wage1')
reg = smf.ols(formula='wage ~ educ', data=wage1)
results = reg.fit()
# obtain coefficients, predicted values and residuals:
b = results.params
wage_hat = results.fittedvalues
u_hat = results.resid
# confirm property (1):
u_hat_mean = np.mean(u_hat)
print(f'u_hat_mean: {u_hat_mean}\n')
# confirm property (2):
educ_u_cov = np.cov(wage1['educ'], u_hat)[1, 0]
print(f'educ_u_cov: {educ_u_cov}\n')
# confirm property (3):
educ_mean = np.mean(wage1['educ'])
wage_pred = b['Intercept'] + b['educ'] * educ_mean
print(f'wage_pred: {wage_pred}\n')
wage_mean = np.mean(wage1['wage'])
print(f'wage_mean: {wage_mean}\n')
import wooldridge as woo
import numpy as np
import statsmodels.formula.api as smf
ceosal1 = woo.dataWoo('ceosal1')
# OLS regression:
reg = smf.ols(formula='salary ~ roe', data=ceosal1)
results = reg.fit()
# calculate predicted values & residuals:
sal_hat = results.fittedvalues
u_hat = results.resid
# calculate R^2 in three different ways:
sal = ceosal1['salary']
R2_a = np.var(sal_hat, ddof=1) / np.var(sal, ddof=1)
R2_b = 1 - np.var(u_hat, ddof=1) / np.var(sal, ddof=1)
R2_c = np.corrcoef(sal, sal_hat)[1, 0] ** 2
print(f'R2_a: {R2_a}\n')
print(f'R2_b: {R2_b}\n')
print(f'R2_c: {R2_c}\n')
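# optional cross-check: statsmodels reports the same value:
print(f'results.rsquared: {results.rsquared}\n')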
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf
vote1 = woo.dataWoo('vote1')
# OLS regression:
reg = smf.ols(formula='voteA ~ shareA', data=vote1)
results = reg.fit()
# print results using summary:
print(f'results.summary(): \n{results.summary()}\n')
# print regression table:
table = pd.DataFrame({'b': round(results.params, 4),
                      'se': round(results.bse, 4),
                      't': round(results.tvalues, 4),
                      'pval': round(results.pvalues, 4)})
print(f'table: \n{table}\n')
import wooldridge as woo
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
ceosal1 = woo.dataWoo('ceosal1')
# usual OLS regression:
reg1 = smf.ols(formula='salary ~ roe', data=ceosal1)
results1 = reg1.fit()
b_1 = results1.params
print(f'b_1: \n{b_1}\n')
# regression without intercept (through origin):
reg2 = smf.ols(formula='salary ~ 0 + roe', data=ceosal1)
results2 = reg2.fit()
b_2 = results2.params
print(f'b_2: \n{b_2}\n')
# regression without slope (on a constant):
reg3 = smf.ols(formula='salary ~ 1', data=ceosal1)
results3 = reg3.fit()
b_3 = results3.params
print(f'b_3: \n{b_3}\n')
# average y:
sal_mean = np.mean(ceosal1['salary'])
print(f'sal_mean: {sal_mean}\n')
# scatter plot and fitted values:
plt.plot('roe', 'salary', data=ceosal1, color='grey', marker='o',
         linestyle='', label='')
plt.plot(ceosal1['roe'], results1.fittedvalues, color='black',
         linestyle='-', label='full')
plt.plot(ceosal1['roe'], results2.fittedvalues, color='black',
         linestyle=':', label='through origin')
plt.plot(ceosal1['roe'], results3.fittedvalues, color='black',
         linestyle='-.', label='const only')
plt.ylabel('salary')
plt.xlabel('roe')
plt.legend()
plt.savefig('PyGraphs/SLR-Origin-Const.pdf')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
import matplotlib.pyplot as plt
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = np.empty(r)
b1 = np.empty(r)
# draw a sample of x, fixed over replications:
x = stats.norm.rvs(4, 1, size=n)
# repeat r times:
for i in range(r):
    # draw a sample of y:
    u = stats.norm.rvs(0, su, size=n)
    y = beta0 + beta1 * x + u
    df = pd.DataFrame({'y': y, 'x': x})
    # estimate and store parameters by OLS:
    reg = smf.ols(formula='y ~ x', data=df)
    results = reg.fit()
    b0[i] = results.params['Intercept']
    b1[i] = results.params['x']
# MC estimate of the expected values:
b0_mean = np.mean(b0)
b1_mean = np.mean(b1)
print(f'b0_mean: {b0_mean}\n')
print(f'b1_mean: {b1_mean}\n')
# MC estimate of the variances:
b0_var = np.var(b0, ddof=1)
b1_var = np.var(b1, ddof=1)
print(f'b0_var: {b0_var}\n')
print(f'b1_var: {b1_var}\n')
# graph:
x_range = np.linspace(0, 8, num=100)
plt.ylim([0, 6])
# add population regression line:
plt.plot(x_range, beta0 + beta1 * x_range, color='black',
         linestyle='-', linewidth=2, label='Population')
# add first OLS regression line (to attach a label):
plt.plot(x_range, b0[0] + b1[0] * x_range, color='grey',
         linestyle='-', linewidth=0.5, label='OLS regressions')
# add OLS regression lines no. 2 to 10:
for i in range(1, 10):
    plt.plot(x_range, b0[i] + b1[i] * x_range, color='grey',
             linestyle='-', linewidth=0.5)
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/SLR-Sim-Model-Condx.pdf')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
# set the random seed:
np.random.seed(123456)
# set sample size and number of simulations:
n = 10
r = 10000
# set true parameters:
beta0 = 1
beta1 = 2
su = 0.2
sx = 1
ex = 5
# initialize b0 and b1 to store results later:
b0_uc = np.empty(r)
b1_uc = np.empty(r)
b0_c = np.empty(r)
b1_c = np.empty(r)
# draw a sample of conditional x, fixed over replications:
xc = stats.norm.rvs(ex, sx, size=n)
# repeat r times:
for i in range(r):
    # draw a sample:
    x = stats.norm.rvs(ex, sx, size=n)
    u = stats.norm.rvs(0, su, size=n)
    y = beta0 + beta1 * x + u
    yc = beta0 + beta1 * xc + u
    df = pd.DataFrame({'y': y, 'yc': yc, 'x': x, 'xc': xc})
    # estimate unconditional OLS:
    reg_uc = smf.ols(formula='y ~ x', data=df)
    results_uc = reg_uc.fit()
    b0_uc[i] = results_uc.params['Intercept']
    b1_uc[i] = results_uc.params['x']
    # estimate conditional OLS:
    reg_c = smf.ols(formula='yc ~ xc', data=df)
    results_c = reg_c.fit()
    b0_c[i] = results_c.params['Intercept']
    b1_c[i] = results_c.params['xc']
    # print progress every 100 replications:
    if (i % 100) == 0:
        print(i)
# comparing theoretical and empirical moments (I):
b0_uc_mean = np.mean(b0_uc)
b1_uc_mean = np.mean(b1_uc)
b0_c_mean = np.mean(b0_c)
b1_c_mean = np.mean(b1_c)
print(f'b0_uc_mean: {b0_uc_mean}\n')
print(f'b0_c_mean: {b0_c_mean}\n')
print(f'beta0: {beta0}\n')
print(f'b1_uc_mean: {b1_uc_mean}\n')
print(f'b1_c_mean: {b1_c_mean}\n')
print(f'beta1: {beta1}\n')
# comparing theoretical and empirical moments (II):
b0_uc_var = np.var(b0_uc, ddof=1)
b1_uc_var = np.var(b1_uc, ddof=1)
b0_c_var = np.var(b0_c, ddof=1)
b1_c_var = np.var(b1_c, ddof=1)
x_sq_mean = (sx ** 2) + (ex ** 2)
b0_var = 1 / (n - 1) * (su ** 2) / (sx ** 2) * x_sq_mean
b1_var = 1 / (n - 1) * (su ** 2) / (sx ** 2)
print(f'b0_uc_var: {b0_uc_var}\n')
print(f'b0_c_var: {b0_c_var}\n')
print(f'b0_var: {b0_var}\n')
print(f'b1_uc_var: {b1_uc_var}\n')
print(f'b1_c_var: {b1_c_var}\n')
print(f'b1_var: {b1_var}\n')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = np.empty(r)
b1 = np.empty(r)
# draw a sample of x, fixed over replications:
x = stats.norm.rvs(4, 1, size=n)
# repeat r times:
for i in range(r):
    # draw a sample of y:
    u_mean = np.array((x - 4) / 5)
    u = stats.norm.rvs(u_mean, su, size=n)
    y = beta0 + beta1 * x + u
    df = pd.DataFrame({'y': y, 'x': x})
    # estimate and store parameters by OLS:
    reg = smf.ols(formula='y ~ x', data=df)
    results = reg.fit()
    b0[i] = results.params['Intercept']
    b1[i] = results.params['x']
# MC estimate of the expected values:
b0_mean = np.mean(b0)
b1_mean = np.mean(b1)
print(f'b0_mean: {b0_mean}\n')
print(f'b1_mean: {b1_mean}\n')
# MC estimate of the variances:
b0_var = np.var(b0, ddof=1)
b1_var = np.var(b1, ddof=1)
print(f'b0_var: {b0_var}\n')
print(f'b1_var: {b1_var}\n')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas):
beta0 = 1
beta1 = 0.5
# initialize b0 and b1 to store results later:
b0 = np.empty(r)
b1 = np.empty(r)
# draw a sample of x, fixed over replications:
x = stats.norm.rvs(4, 1, size=n)
# repeat r times:
for i in range(r):
    # draw a sample of y:
    u_var = np.array(4 / np.exp(4.5) * np.exp(x))
    u = stats.norm.rvs(0, np.sqrt(u_var), size=n)
    y = beta0 + beta1 * x + u
    df = pd.DataFrame({'y': y, 'x': x})
    # estimate and store parameters by OLS:
    reg = smf.ols(formula='y ~ x', data=df)
    results = reg.fit()
    b0[i] = results.params['Intercept']
    b1[i] = results.params['x']
# MC estimate of the expected values:
b0_mean = np.mean(b0)
b1_mean = np.mean(b1)
print(f'b0_mean: {b0_mean}\n')
print(f'b1_mean: {b1_mean}\n')
# MC estimate of the variances:
b0_var = np.var(b0, ddof=1)
b1_var = np.var(b1, ddof=1)
print(f'b0_var: {b0_var}\n')
print(f'b1_var: {b1_var}\n')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations
n = 1000
r = 10000
# set true parameters:
beta0 = 1
beta1 = 0.5
su = 2
sx = 1
ex = 4
# initialize b0 and b1 to store results later:
b0 = np.empty(r)
b1 = np.empty(r)
# repeat r times:
for i in range(r):
    # draw a sample:
    x = stats.norm.rvs(ex, sx, size=n)
    u = stats.norm.rvs(0, su, size=n)
    y = beta0 + beta1 * x + u
    df = pd.DataFrame({'y': y, 'x': x})
    # estimate OLS:
    reg = smf.ols(formula='y ~ x', data=df)
    results = reg.fit()
    b0[i] = results.params['Intercept']
    b1[i] = results.params['x']
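# MC estimate of the expected values and variances
# (same summary as in the fixed-x simulations):
b0_mean = np.mean(b0)
b1_mean = np.mean(b1)
print(f'b0_mean: {b0_mean}\n')
print(f'b1_mean: {b1_mean}\n')
b0_var = np.var(b0, ddof=1)
b1_var = np.var(b1, ddof=1)
print(f'b0_var: {b0_var}\n')
print(f'b1_var: {b1_var}\n')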
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
import matplotlib.pyplot as plt
# set the random seed:
np.random.seed(1234567)
# set sample size:
n = 1000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# draw a sample of size n:
x = stats.norm.rvs(4, 1, size=n)
u = stats.norm.rvs(0, su, size=n)
y = beta0 + beta1 * x + u
df = pd.DataFrame({'y': y, 'x': x})
# estimate parameters by OLS:
reg = smf.ols(formula='y ~ x', data=df)
results = reg.fit()
b = results.params
print(f'b: \n{b}\n')
# features of the sample for the variance formula:
x_sq_mean = np.mean(x ** 2)
print(f'x_sq_mean: {x_sq_mean}\n')
x_var = np.sum((x - np.mean(x)) ** 2)  # sum of squared deviations from the mean (SST_x)
print(f'x_var: {x_var}\n')
# graph:
x_range = np.linspace(0, 8, num=100)
plt.ylim([-2, 10])
plt.plot(x, y, color='lightgrey', marker='o', linestyle='')
plt.plot(x_range, beta0 + beta1 * x_range, color='black',
         linestyle='-', linewidth=2, label='pop. regr. fct.')
plt.plot(x_range, b['Intercept'] + b['x'] * x_range, color='grey',
         linestyle='-', linewidth=2, label='OLS regr. fct.')
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/SLR-Sim-Sample.pdf')
using WooldridgeDatasets, DataFrames, GLM, Statistics
meap93 = DataFrame(wooldridge("meap93"))
# estimate the model and save the results as reg:
reg = lm(@formula(math10 ~ lnchprg), meap93)
# number of obs.:
n = nobs(reg)
# SER:
u_hat_var = var(residuals(reg))
SER = sqrt(u_hat_var) * sqrt((n - 1) / (n - 2))
println("SER = $SER\n")
# SE of b0 and b1, respectively:
lnchprg_sq_mean = mean(meap93.lnchprg .^ 2)
lnchprg_var = var(meap93.lnchprg)
b0_se = SER / (sqrt(lnchprg_var) * sqrt(n - 1)) * sqrt(lnchprg_sq_mean)
b1_se = SER / (sqrt(lnchprg_var) * sqrt(n - 1))
println("b0_se = $b0_se\n")
println("b1_se = $b1_se\n")
# automatic calculations:
table_reg = coeftable(reg)
println("table_reg: \n$table_reg")
using WooldridgeDatasets, DataFrames, GLM, Plots
ceosal1 = DataFrame(wooldridge("ceosal1"))
reg = lm(@formula(salary ~ roe), ceosal1)
# scatter plot and fitted values:
fitted_values = predict(reg)
scatter(ceosal1.roe, ceosal1.salary, color=:grey80, label="observations")
plot!(ceosal1.roe, fitted_values, color=:black, linewidth=3, label="OLS")
xlabel!("roe")
ylabel!("salary")
savefig("JlGraphs/Example-2-3-3.pdf")
# instead of scatter, you can also use:
# plot(ceosal1.roe, ceosal1.salary, label="observations", seriestype=:scatter)
using WooldridgeDatasets, DataFrames, Statistics
ceosal1 = DataFrame(wooldridge("ceosal1"))
x = ceosal1.roe
y = ceosal1.salary
# ingredients to the OLS formulas:
cov_xy = cov(x, y)
var_x = var(x)
x_bar = mean(x)
y_bar = mean(y)
# manual calculation of OLS coefficients:
b1 = cov_xy / var_x
b0 = y_bar - b1 * x_bar
println("b1 = $b1\n")
println("b0 = $b0")
using WooldridgeDatasets, DataFrames, GLM, Plots
vote1 = DataFrame(wooldridge("vote1"))
# OLS regression:
reg = lm(@formula(voteA ~ shareA), vote1)
b = coef(reg)
println("b = $b")
# scatter plot and fitted values:
fitted_values = predict(reg)
scatter(vote1.shareA, vote1.voteA,
color=:grey, label="observations", legend=:topleft)
plot!(vote1.shareA, fitted_values, color=:black, linewidth=3, label="OLS")
xlabel!("shareA")
ylabel!("voteA")
savefig("JlGraphs/Example-2-5.pdf")
using WooldridgeDatasets, DataFrames, GLM
ceosal1 = DataFrame(wooldridge("ceosal1"))
# OLS regression:
reg = lm(@formula(salary ~ roe), ceosal1)
table_reg = coeftable(reg)
println("table_reg: \n$table_reg\n")
# obtain predicted values and residuals:
salary_hat = predict(reg)
u_hat = residuals(reg)
# Wooldridge, Table 2.2:
table = DataFrame(roe=ceosal1.roe,
salary=ceosal1.salary,
salary_hat=salary_hat,
u_hat=u_hat)
table_preview = first(table, 10)
println("table_preview: \n$table_preview")
using WooldridgeDatasets, DataFrames, GLM, Statistics
wage1 = DataFrame(wooldridge("wage1"))
reg = lm(@formula(wage ~ educ), wage1)
# obtain coefficients, predicted values and residuals:
b = coef(reg)
wage_hat = predict(reg)
u_hat = residuals(reg)
# confirm property (1):
u_hat_mean = mean(u_hat)
println("u_hat_mean = $u_hat_mean\n")
# confirm property (2):
educ_u_cov = cov(wage1.educ, u_hat)
println("educ_u_cov = $educ_u_cov\n")
# confirm property (3):
educ_mean = mean(wage1.educ)
wage_pred = b[1] + b[2] * educ_mean
println("wage_pred = $wage_pred\n")
wage_mean = mean(wage1.wage)
println("wage_mean = $wage_mean")
using WooldridgeDatasets, DataFrames, GLM, Statistics
ceosal1 = DataFrame(wooldridge("ceosal1"))
# OLS regression:
reg = lm(@formula(salary ~ roe), ceosal1)
# obtain predicted values and residuals:
sal_hat = predict(reg)
u_hat = residuals(reg)
# calculate R^2 in three different ways:
sal = ceosal1.salary
R2_a = var(sal_hat) / var(sal)
R2_b = 1 - var(u_hat) / var(sal)
R2_c = cor(sal, sal_hat)^2
println("R2_a = $R2_a\n")
println("R2_b = $R2_b\n")
println("R2_c = $R2_c")
using WooldridgeDatasets, DataFrames, GLM
vote1 = DataFrame(wooldridge("vote1"))
# OLS regression:
reg = lm(@formula(voteA ~ shareA), vote1)
# print results using coeftable:
table_reg = coeftable(reg)
println("table_reg: \n$table_reg\n")
# accessing R^2:
r2_automatic = r2(reg)
println("r2_automatic = $r2_automatic")
using WooldridgeDatasets, DataFrames, GLM, Plots, Statistics
ceosal1 = DataFrame(wooldridge("ceosal1"))
# usual OLS regression:
reg1 = lm(@formula(salary ~ roe), ceosal1)
b1 = coef(reg1)
println("b1 = $b1\n")
# regression without intercept (through origin):
reg2 = lm(@formula(salary ~ 0 + roe), ceosal1)
b2 = coef(reg2)
println("b2 = $b2\n")
# regression without slope (on a constant):
reg3 = lm(@formula(salary ~ 1), ceosal1)
b3 = coef(reg3)
println("b3 = $b3\n")
# average y:
sal_mean = mean(ceosal1.salary)
println("sal_mean = $sal_mean")
# scatter plot and fitted values:
scatter(ceosal1.roe, ceosal1.salary, color="grey85", label="observations")
plot!(ceosal1.roe, predict(reg1), linewidth=2,
color="black", label="full")
plot!(ceosal1.roe, predict(reg2), linewidth=2,
color="dimgrey", label="trough origin")
plot!(ceosal1.roe, predict(reg3), linewidth=2,
color="lightgrey", label="const only")
xlabel!("roe")
ylabel!("salary")
savefig("JlGraphs/SLR-Origin-Const.pdf")
using Random, GLM, DataFrames, Distributions, Statistics, Plots
# set the random seed:
Random.seed!(12345)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = zeros(r)
b1 = zeros(r)
# draw a sample of x, fixed over replications:
x = rand(Normal(4, 1), n)
# repeat r times:
for i in 1:r
    # draw a sample of y:
    u = rand(Normal(0, su), n)
    y = beta0 .+ beta1 .* x .+ u
    df = DataFrame(y=y, x=x)
    # estimate and store parameters by OLS:
    reg = lm(@formula(y ~ x), df)
    b0[i] = coef(reg)[1]
    b1[i] = coef(reg)[2]
end
# MC estimate of the expected values:
b0_mean = mean(b0)
b1_mean = mean(b1)
println("b0_mean = $b0_mean\n")
println("b1_mean = $b1_mean\n")
# MC estimate of the variances:
b0_var = var(b0)
b1_var = var(b1)
println("b0_var = $b0_var\n")
println("b1_var = $b1_var")
# graph:
x_range = range(0, 8, length=100)
# add population regression line:
plot(x_range, beta0 .+ beta1 .* x_range, ylim=[0, 6],
    color="black", linewidth=2, label="Population")
# add first OLS regression line (to attach a label):
plot!(x_range, b0[1] .+ b1[1] .* x_range,
    color="grey", linewidth=0.5, label="OLS regressions")
# add OLS regression lines no. 2 to 10:
for i in 2:10
    plot!(x_range, b0[i] .+ b1[i] .* x_range,
        color="grey", linewidth=0.5, label=false)
end
ylabel!("y")
xlabel!("x")
savefig("JlGraphs/SLR-Sim-Model-Condx.pdf")
using Random, GLM, DataFrames, Distributions, Statistics
# set the random seed:
Random.seed!(12345)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = zeros(r)
b1 = zeros(r)
# draw a sample of x, fixed over replications:
x = rand(Normal(4, 1), n)
# repeat r times:
for i in 1:r
    # draw a sample of y (broadcasting one draw per observation):
    u_mean = (x .- 4) ./ 5
    u = rand.(Normal.(u_mean, su))
    y = beta0 .+ beta1 .* x .+ u
    df = DataFrame(y=y, x=x)
    # estimate and store parameters by OLS:
    reg = lm(@formula(y ~ x), df)
    b0[i] = coef(reg)[1]
    b1[i] = coef(reg)[2]
end
# MC estimate of the expected values:
b0_mean = mean(b0)
b1_mean = mean(b1)
println("b0_mean = $b0_mean\n")
println("b1_mean = $b1_mean\n")
# MC estimate of the variances:
b0_var = var(b0)
b1_var = var(b1)
println("b0_var = $b0_var\n")
println("b1_var = $b1_var")
using Random, GLM, DataFrames, Distributions, Statistics
# set the random seed:
Random.seed!(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = zeros(r)
b1 = zeros(r)
# draw a sample of x, fixed over replications:
x = rand(Normal(4, 1), n)
# repeat r times:
for i in 1:r
    # draw a sample of y (broadcasting one draw per observation):
    u_var = 4 / exp(4.5) .* exp.(x)
    u = rand.(Normal.(0, sqrt.(u_var)))
    y = beta0 .+ beta1 .* x .+ u
    df = DataFrame(y=y, x=x)
    # estimate and store parameters by OLS:
    reg = lm(@formula(y ~ x), df)
    b0[i] = coef(reg)[1]
    b1[i] = coef(reg)[2]
end
# MC estimate of the expected values:
b0_mean = mean(b0)
b1_mean = mean(b1)
println("b0_mean = $b0_mean\n")
println("b1_mean = $b1_mean\n")
# MC estimate of the variances:
b0_var = var(b0)
b1_var = var(b1)
println("b0_var = $b0_var\n")
println("b1_var = $b1_var")
using Random, GLM, DataFrames, Distributions, Statistics
# set the random seed:
Random.seed!(12345)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
sx = 1
ex = 4
# initialize b0 and b1 to store results later:
b0 = zeros(r)
b1 = zeros(r)
# repeat r times:
for i in 1:r
    # draw a sample:
    x = rand(Normal(ex, sx), n)
    u = rand(Normal(0, su), n)
    y = beta0 .+ beta1 .* x .+ u
    df = DataFrame(y=y, x=x)
    # estimate OLS:
    reg = lm(@formula(y ~ x), df)
    b0[i] = coef(reg)[1]
    b1[i] = coef(reg)[2]
end
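# MC estimate of the expected values and variances
# (same summary as in the fixed-x simulations):
b0_mean = mean(b0)
b1_mean = mean(b1)
println("b0_mean = $b0_mean\n")
println("b1_mean = $b1_mean\n")
b0_var = var(b0)
b1_var = var(b1)
println("b0_var = $b0_var\n")
println("b1_var = $b1_var")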
using Random, GLM, DataFrames, Distributions, Statistics, Plots
# set the random seed:
Random.seed!(12345)
# set sample size:
n = 1000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# draw a sample of size n:
x = rand(Normal(4, 1), n)
u = rand(Normal(0, su), n)
y = beta0 .+ beta1 .* x .+ u
df = DataFrame(y=y, x=x)
# estimate parameters by OLS:
reg = lm(@formula(y ~ x), df)
b = coef(reg)
println("b = $b\n")
# features of the sample for the variance formula:
x_sq_mean = mean(x .^ 2)
println("x_sq_mean = $x_sq_mean\n")
x_var = sum((x .- mean(x)) .^ 2)  # sum of squared deviations from the mean (SST_x)
println("x_var = $x_var")
# graph:
x_range = range(0, 8, length=100)
scatter(x, y, color="lightgrey", ylim=[-2, 10],
label="sample", alpha=0.7, markerstrokecolor=:white)
plot!(x_range, beta0 .+ beta1 .* x_range, color="black",
linestyle=:solid, linewidth=2, label="pop. regr. fct.")
plot!(x_range, coef(reg)[1] .+ coef(reg)[2] .* x_range, color="grey",
linestyle=:solid, linewidth=2, label="OLS regr. fct.")
xlabel!("x")
ylabel!("y")
savefig("JlGraphs/SLR-Sim-Sample.pdf")