# Example-2-10.R
data(wage1, package='wooldridge')
# Estimate log-level model
lm( log(wage) ~ educ, data=wage1 )
data(meap93, package='wooldridge')
# Estimate the model and save the results as "results"
results <- lm(math10 ~ lnchprg, data=meap93)
# Number of obs.
( n <- nobs(results) )
# SER:
(SER <- sd(resid(results)) * sqrt((n-1)/(n-2)) )
# SE of b0hat & b1hat, respectively:
SER / sd(meap93$lnchprg) / sqrt(n-1) * sqrt(mean(meap93$lnchprg^2))
SER / sd(meap93$lnchprg) / sqrt(n-1)
# Automatic calculations:
summary(results)
data(ceosal1, package='wooldridge')
# extract variables as vectors:
sal <- ceosal1$salary
roe <- ceosal1$roe
# regression with vectors:
CEOregres <- lm( sal ~ roe )
# obtain predicted values and residuals
sal.hat <- fitted(CEOregres)
u.hat <- resid(CEOregres)
# Wooldridge, Table 2.2:
cbind(roe, sal, sal.hat, u.hat)[1:15,]
data(wage1, package='wooldridge')
WAGEregres <- lm(wage ~ educ, data=wage1)
# obtain coefficients, predicted values and residuals
b.hat <- coef(WAGEregres)
wage.hat <- fitted(WAGEregres)
u.hat <- resid(WAGEregres)
# Confirm property (1):
mean(u.hat)
# Confirm property (2):
cor(wage1$educ, u.hat)
# Confirm property (3):
mean(wage1$wage)
b.hat[1] + b.hat[2] * mean(wage1$educ)
data(ceosal1, package='wooldridge')
CEOregres <- lm( salary ~ roe, data=ceosal1 )
# Calculate predicted values & residuals:
sal.hat <- fitted(CEOregres)
u.hat <- resid(CEOregres)
# Calculate R^2 in three different ways:
sal <- ceosal1$salary
var(sal.hat) / var(sal)
1 - var(u.hat) / var(sal)
cor(sal, sal.hat)^2
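# optional cross-check: summary() reports the same value:
summary(CEOregres)$r.squared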
data(ceosal1, package='wooldridge')
# Usual OLS regression:
(reg1 <- lm( salary ~ roe, data=ceosal1))
# Regression without intercept (through origin):
(reg2 <- lm( salary ~ 0 + roe, data=ceosal1))
# Regression without slope (on a constant):
(reg3 <- lm( salary ~ 1 , data=ceosal1))
# average y:
mean(ceosal1$salary)
# Scatter Plot with all 3 regression lines
plot(ceosal1$roe, ceosal1$salary, ylim=c(0,4000))
abline(reg1, lwd=2, lty=1)
abline(reg2, lwd=2, lty=2)
abline(reg3, lwd=2, lty=3)
legend("topleft",c("full","through origin","const only"),lwd=2,lty=1:3)
# Set the random seed
set.seed(1234567)
# set sample size and number of simulations
n<-1000; r<-10000
# set true parameters: betas and sd of u
b0<-1; b1<-0.5; su<-2
# initialize b0hat and b1hat to store results later:
b0hat <- numeric(r)
b1hat <- numeric(r)
# Draw a sample of x, fixed over replications:
x <- rnorm(n,4,1)
# repeat r times:
for(j in 1:r) {
  # Draw a sample of y:
  u <- rnorm(n,0,su)
  y <- b0 + b1*x + u
  # estimate parameters by OLS and store them in the vectors
  bhat <- coefficients( lm(y~x) )
  b0hat[j] <- bhat["(Intercept)"]
  b1hat[j] <- bhat["x"]
}
# MC estimate of the expected values:
mean(b0hat)
mean(b1hat)
# MC estimate of the variances:
var(b0hat)
var(b1hat)
# Initialize empty plot
plot( NULL, xlim=c(0,8), ylim=c(0,6), xlab="x", ylab="y")
# add OLS regression lines
for (j in 1:10) abline(b0hat[j],b1hat[j],col="gray")
# add population regression line
abline(b0,b1,lwd=2)
# add legend
legend("topleft",c("Population","OLS regressions"),
lwd=c(2,1),col=c("black","gray"))
# Set the random seed
set.seed(1234567)
# set sample size
n<-1000
# set true parameters: betas and sd of u
b0<-1; b1<-0.5; su<-2
# Draw a sample of size n:
x <- rnorm(n,4,1)
u <- rnorm(n,0,su)
y <- b0 + b1*x + u
# estimate parameters by OLS
(olsres <- lm(y~x))
# features of the sample for the variance formula:
mean(x^2)
sum((x-mean(x))^2)
# Graph
plot(x, y, col="gray", xlim=c(0,8) )
abline(b0,b1,lwd=2)
abline(olsres,col="gray",lwd=2)
legend("topleft",c("pop. regr. fct.","OLS regr. fct."),
lwd=2,col=c("black","gray"))
# Set the random seed
set.seed(1234567)
# set sample size and number of simulations
n<-1000; r<-10000
# set true parameters: betas and sd of u
b0<-1; b1<-0.5; su<-2
# initialize b0hat and b1hat to store results later:
b0hat <- numeric(r)
b1hat <- numeric(r)
# Draw a sample of x, fixed over replications:
x <- rnorm(n,4,1)
# repeat r times:
for(j in 1:r) {
  # Draw a sample of y:
  u <- rnorm(n, (x-4)/5, su)
  y <- b0 + b1*x + u
  # estimate parameters by OLS and store them in the vectors
  bhat <- coefficients( lm(y~x) )
  b0hat[j] <- bhat["(Intercept)"]
  b1hat[j] <- bhat["x"]
}
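# MC estimate of the expected values and variances
# (same summary as in the baseline simulation): since E(u|x) = (x-4)/5,
# the estimates center on b0 - 0.8 = 0.2 and b1 + 0.2 = 0.7
# rather than on the true parameters:
mean(b0hat)
mean(b1hat)
var(b0hat)
var(b1hat)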
# Set the random seed
set.seed(1234567)
# set sample size and number of simulations
n<-1000; r<-10000
# set true parameters: betas and sd of u
b0<-1; b1<-0.5; su<-2
# initialize b0hat and b1hat to store results later:
b0hat <- numeric(r)
b1hat <- numeric(r)
# Draw a sample of x, fixed over replications:
x <- rnorm(n,4,1)
# repeat r times:
for(j in 1:r) {
  # Draw a sample of y:
  varu <- 4/exp(4.5) * exp(x)
  u <- rnorm(n, 0, sqrt(varu))
  y <- b0 + b1*x + u
  # estimate parameters by OLS and store them in the vectors
  bhat <- coefficients( lm(y~x) )
  b0hat[j] <- bhat["(Intercept)"]
  b1hat[j] <- bhat["x"]
}
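# MC estimate of the expected values and variances
# (same summary as in the baseline simulation): the error is
# heteroscedastic here, so the estimators remain unbiased but their
# sampling variances differ from the homoscedastic case:
mean(b0hat)
mean(b1hat)
var(b0hat)
var(b1hat)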
import numpy as np
import wooldridge as woo
import statsmodels.formula.api as smf
meap93 = woo.dataWoo('meap93')
# estimate the model and save the results as "results":
reg = smf.ols(formula='math10 ~ lnchprg', data=meap93)
results = reg.fit()
# number of obs.:
n = results.nobs
# SER:
u_hat_var = np.var(results.resid, ddof=1)
SER = np.sqrt(u_hat_var) * np.sqrt((n - 1) / (n - 2))
print(f'SER: {SER}\n')
# SE of b0 & b1, respectively:
lnchprg_sq_mean = np.mean(meap93['lnchprg'] ** 2)
lnchprg_var = np.var(meap93['lnchprg'], ddof=1)
b1_se = SER / (np.sqrt(lnchprg_var)
* np.sqrt(n - 1)) * np.sqrt(lnchprg_sq_mean)
b0_se = SER / (np.sqrt(lnchprg_var) * np.sqrt(n - 1))
print(f'b1_se: {b1_se}\n')
print(f'b0_se: {b0_se}\n')
# automatic calculations:
print(f'results.summary(): \n{results.summary()}\n')
import wooldridge as woo
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
ceosal1 = woo.dataWoo('ceosal1')
# OLS regression:
reg = smf.ols(formula='salary ~ roe', data=ceosal1)
results = reg.fit()
# scatter plot and fitted values:
plt.plot('roe', 'salary', data=ceosal1, color='grey', marker='o', linestyle='')
plt.plot(ceosal1['roe'], results.fittedvalues, color='black', linestyle='-')
plt.ylabel('salary')
plt.xlabel('roe')
plt.savefig('PyGraphs/Example-2-3-3.pdf')
import wooldridge as woo
import numpy as np
ceosal1 = woo.dataWoo('ceosal1')
x = ceosal1['roe']
y = ceosal1['salary']
# ingredients to the OLS formulas:
cov_xy = np.cov(x, y)[1, 0]  # off-diagonal element (row 2, column 1) of the covariance matrix
var_x = np.var(x, ddof=1)
x_bar = np.mean(x)
y_bar = np.mean(y)
# manual calculation of OLS coefficients:
b1 = cov_xy / var_x
b0 = y_bar - b1 * x_bar
print(f'b1: {b1}\n')
print(f'b0: {b0}\n')
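# optional cross-check: np.polyfit with degree 1 returns [slope, intercept]:
b_check = np.polyfit(x, y, 1)
print(f'b_check: {b_check}\n')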
import wooldridge as woo
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
vote1 = woo.dataWoo('vote1')
# OLS regression:
reg = smf.ols(formula='voteA ~ shareA', data=vote1)
results = reg.fit()
b = results.params
print(f'b: \n{b}\n')
# scatter plot and fitted values:
plt.plot('shareA', 'voteA', data=vote1, color='grey', marker='o', linestyle='')
plt.plot(vote1['shareA'], results.fittedvalues, color='black', linestyle='-')
plt.ylabel('voteA')
plt.xlabel('shareA')
plt.savefig('PyGraphs/Example-2-5.pdf')
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf
ceosal1 = woo.dataWoo('ceosal1')
# OLS regression:
reg = smf.ols(formula='salary ~ roe', data=ceosal1)
results = reg.fit()
# obtain predicted values and residuals:
salary_hat = results.fittedvalues
u_hat = results.resid
# Wooldridge, Table 2.2:
table = pd.DataFrame({'roe': ceosal1['roe'],
                      'salary': ceosal1['salary'],
                      'salary_hat': salary_hat,
                      'u_hat': u_hat})
print(f'table.head(15): \n{table.head(15)}\n')
import wooldridge as woo
import numpy as np
import statsmodels.formula.api as smf
wage1 = woo.dataWoo('wage1')
reg = smf.ols(formula='wage ~ educ', data=wage1)
results = reg.fit()
# obtain coefficients, predicted values and residuals:
b = results.params
wage_hat = results.fittedvalues
u_hat = results.resid
# confirm property (1):
u_hat_mean = np.mean(u_hat)
print(f'u_hat_mean: {u_hat_mean}\n')
# confirm property (2):
educ_u_cov = np.cov(wage1['educ'], u_hat)[1, 0]
print(f'educ_u_cov: {educ_u_cov}\n')
# confirm property (3):
educ_mean = np.mean(wage1['educ'])
wage_pred = b['Intercept'] + b['educ'] * educ_mean
print(f'wage_pred: {wage_pred}\n')
wage_mean = np.mean(wage1['wage'])
print(f'wage_mean: {wage_mean}\n')
import wooldridge as woo
import numpy as np
import statsmodels.formula.api as smf
ceosal1 = woo.dataWoo('ceosal1')
# OLS regression:
reg = smf.ols(formula='salary ~ roe', data=ceosal1)
results = reg.fit()
# calculate predicted values & residuals:
sal_hat = results.fittedvalues
u_hat = results.resid
# calculate R^2 in three different ways:
sal = ceosal1['salary']
R2_a = np.var(sal_hat, ddof=1) / np.var(sal, ddof=1)
R2_b = 1 - np.var(u_hat, ddof=1) / np.var(sal, ddof=1)
R2_c = np.corrcoef(sal, sal_hat)[1, 0] ** 2
print(f'R2_a: {R2_a}\n')
print(f'R2_b: {R2_b}\n')
print(f'R2_c: {R2_c}\n')
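# optional cross-check: statsmodels reports the same value:
print(f'results.rsquared: {results.rsquared}\n')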
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf
vote1 = woo.dataWoo('vote1')
# OLS regression:
reg = smf.ols(formula='voteA ~ shareA', data=vote1)
results = reg.fit()
# print results using summary:
print(f'results.summary(): \n{results.summary()}\n')
# print regression table:
table = pd.DataFrame({'b': round(results.params, 4),
                      'se': round(results.bse, 4),
                      't': round(results.tvalues, 4),
                      'pval': round(results.pvalues, 4)})
print(f'table: \n{table}\n')
import wooldridge as woo
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
ceosal1 = woo.dataWoo('ceosal1')
# usual OLS regression:
reg1 = smf.ols(formula='salary ~ roe', data=ceosal1)
results1 = reg1.fit()
b_1 = results1.params
print(f'b_1: \n{b_1}\n')
# regression without intercept (through origin):
reg2 = smf.ols(formula='salary ~ 0 + roe', data=ceosal1)
results2 = reg2.fit()
b_2 = results2.params
print(f'b_2: \n{b_2}\n')
# regression without slope (on a constant):
reg3 = smf.ols(formula='salary ~ 1', data=ceosal1)
results3 = reg3.fit()
b_3 = results3.params
print(f'b_3: \n{b_3}\n')
# average y:
sal_mean = np.mean(ceosal1['salary'])
print(f'sal_mean: {sal_mean}\n')
# scatter plot and fitted values:
plt.plot('roe', 'salary', data=ceosal1, color='grey', marker='o',
         linestyle='', label='')
plt.plot(ceosal1['roe'], results1.fittedvalues, color='black',
         linestyle='-', label='full')
plt.plot(ceosal1['roe'], results2.fittedvalues, color='black',
         linestyle=':', label='through origin')
plt.plot(ceosal1['roe'], results3.fittedvalues, color='black',
         linestyle='-.', label='const only')
plt.ylabel('salary')
plt.xlabel('roe')
plt.legend()
plt.savefig('PyGraphs/SLR-Origin-Const.pdf')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
import matplotlib.pyplot as plt
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = np.empty(r)
b1 = np.empty(r)
# draw a sample of x, fixed over replications:
x = stats.norm.rvs(4, 1, size=n)
# repeat r times:
for i in range(r):
    # draw a sample of y:
    u = stats.norm.rvs(0, su, size=n)
    y = beta0 + beta1 * x + u
    df = pd.DataFrame({'y': y, 'x': x})
    # estimate and store parameters by OLS:
    reg = smf.ols(formula='y ~ x', data=df)
    results = reg.fit()
    b0[i] = results.params['Intercept']
    b1[i] = results.params['x']
# MC estimate of the expected values:
b0_mean = np.mean(b0)
b1_mean = np.mean(b1)
print(f'b0_mean: {b0_mean}\n')
print(f'b1_mean: {b1_mean}\n')
# MC estimate of the variances:
b0_var = np.var(b0, ddof=1)
b1_var = np.var(b1, ddof=1)
print(f'b0_var: {b0_var}\n')
print(f'b1_var: {b1_var}\n')
# graph:
x_range = np.linspace(0, 8, num=100)
plt.ylim([0, 6])
# add population regression line:
plt.plot(x_range, beta0 + beta1 * x_range, color='black',
         linestyle='-', linewidth=2, label='Population')
# add first OLS regression line (to attach a label):
plt.plot(x_range, b0[0] + b1[0] * x_range, color='grey',
         linestyle='-', linewidth=0.5, label='OLS regressions')
# add OLS regression lines no. 2 to 10:
for i in range(1, 10):
    plt.plot(x_range, b0[i] + b1[i] * x_range, color='grey',
             linestyle='-', linewidth=0.5)
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/SLR-Sim-Model-Condx.pdf')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
# set the random seed:
np.random.seed(123456)
# set sample size and number of simulations:
n = 10
r = 10000
# set true parameters:
beta0 = 1
beta1 = 2
su = 0.2
sx = 1
ex = 5
# initialize b0 and b1 to store results later:
b0_uc = np.empty(r)
b1_uc = np.empty(r)
b0_c = np.empty(r)
b1_c = np.empty(r)
# draw a sample of conditional x, fixed over replications:
xc = stats.norm.rvs(ex, sx, size=n)
# repeat r times:
for i in range(r):
    # draw a sample:
    x = stats.norm.rvs(ex, sx, size=n)
    u = stats.norm.rvs(0, su, size=n)
    y = beta0 + beta1 * x + u
    yc = beta0 + beta1 * xc + u
    df = pd.DataFrame({'y': y, 'yc': yc, 'x': x, 'xc': xc})
    # estimate unconditional OLS:
    reg_uc = smf.ols(formula='y ~ x', data=df)
    results_uc = reg_uc.fit()
    b0_uc[i] = results_uc.params['Intercept']
    b1_uc[i] = results_uc.params['x']
    # estimate conditional OLS:
    reg_c = smf.ols(formula='yc ~ xc', data=df)
    results_c = reg_c.fit()
    b0_c[i] = results_c.params['Intercept']
    b1_c[i] = results_c.params['xc']
    # print progress every 100 replications:
    if (i % 100) == 0:
        print(i)
# comparing theoretical and empirical moments (I):
b0_uc_mean = np.mean(b0_uc)
b1_uc_mean = np.mean(b1_uc)
b0_c_mean = np.mean(b0_c)
b1_c_mean = np.mean(b1_c)
print(f'b0_uc_mean: {b0_uc_mean}\n')
print(f'b0_c_mean: {b0_c_mean}\n')
print(f'beta0: {beta0}\n')
print(f'b1_uc_mean: {b1_uc_mean}\n')
print(f'b1_c_mean: {b1_c_mean}\n')
print(f'beta1: {beta1}\n')
# comparing theoretical and empirical moments (II):
b0_uc_var = np.var(b0_uc, ddof=1)
b1_uc_var = np.var(b1_uc, ddof=1)
b0_c_var = np.var(b0_c, ddof=1)
b1_c_var = np.var(b1_c, ddof=1)
x_sq_mean = (sx ** 2) + (ex ** 2)
b0_var = 1 / (n - 1) * (su ** 2) / (sx ** 2) * x_sq_mean
b1_var = 1 / (n - 1) * (su ** 2) / (sx ** 2)
print(f'b0_uc_var: {b0_uc_var}\n')
print(f'b0_c_var: {b0_c_var}\n')
print(f'b0_var: {b0_var}\n')
print(f'b1_uc_var: {b1_uc_var}\n')
print(f'b1_c_var: {b1_c_var}\n')
print(f'b1_var: {b1_var}\n')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = np.empty(r)
b1 = np.empty(r)
# draw a sample of x, fixed over replications:
x = stats.norm.rvs(4, 1, size=n)
# repeat r times:
for i in range(r):
    # draw a sample of y:
    u_mean = np.array((x - 4) / 5)
    u = stats.norm.rvs(u_mean, su, size=n)
    y = beta0 + beta1 * x + u
    df = pd.DataFrame({'y': y, 'x': x})
    # estimate and store parameters by OLS:
    reg = smf.ols(formula='y ~ x', data=df)
    results = reg.fit()
    b0[i] = results.params['Intercept']
    b1[i] = results.params['x']
# MC estimate of the expected values:
b0_mean = np.mean(b0)
b1_mean = np.mean(b1)
print(f'b0_mean: {b0_mean}\n')
print(f'b1_mean: {b1_mean}\n')
# MC estimate of the variances:
b0_var = np.var(b0, ddof=1)
b1_var = np.var(b1, ddof=1)
print(f'b0_var: {b0_var}\n')
print(f'b1_var: {b1_var}\n')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas):
beta0 = 1
beta1 = 0.5
# initialize b0 and b1 to store results later:
b0 = np.empty(r)
b1 = np.empty(r)
# draw a sample of x, fixed over replications:
x = stats.norm.rvs(4, 1, size=n)
# repeat r times:
for i in range(r):
    # draw a sample of y:
    u_var = np.array(4 / np.exp(4.5) * np.exp(x))
    u = stats.norm.rvs(0, np.sqrt(u_var), size=n)
    y = beta0 + beta1 * x + u
    df = pd.DataFrame({'y': y, 'x': x})
    # estimate and store parameters by OLS:
    reg = smf.ols(formula='y ~ x', data=df)
    results = reg.fit()
    b0[i] = results.params['Intercept']
    b1[i] = results.params['x']
# MC estimate of the expected values:
b0_mean = np.mean(b0)
b1_mean = np.mean(b1)
print(f'b0_mean: {b0_mean}\n')
print(f'b1_mean: {b1_mean}\n')
# MC estimate of the variances:
b0_var = np.var(b0, ddof=1)
b1_var = np.var(b1, ddof=1)
print(f'b0_var: {b0_var}\n')
print(f'b1_var: {b1_var}\n')
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
# set the random seed:
np.random.seed(1234567)
# set sample size and number of simulations
n = 1000
r = 10000
# set true parameters:
beta0 = 1
beta1 = 0.5
su = 2
sx = 1
ex = 4
# initialize b0 and b1 to store results later:
b0 = np.empty(r)
b1 = np.empty(r)
# repeat r times:
for i in range(r):
    # draw a sample:
    x = stats.norm.rvs(ex, sx, size=n)
    u = stats.norm.rvs(0, su, size=n)
    y = beta0 + beta1 * x + u
    df = pd.DataFrame({'y': y, 'x': x})
    # estimate OLS:
    reg = smf.ols(formula='y ~ x', data=df)
    results = reg.fit()
    b0[i] = results.params['Intercept']
    b1[i] = results.params['x']
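# MC estimate of the expected values and variances
# (same summary as in the fixed-x simulations):
b0_mean = np.mean(b0)
b1_mean = np.mean(b1)
print(f'b0_mean: {b0_mean}\n')
print(f'b1_mean: {b1_mean}\n')
b0_var = np.var(b0, ddof=1)
b1_var = np.var(b1, ddof=1)
print(f'b0_var: {b0_var}\n')
print(f'b1_var: {b1_var}\n')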
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import scipy.stats as stats
import matplotlib.pyplot as plt
# set the random seed:
np.random.seed(1234567)
# set sample size:
n = 1000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# draw a sample of size n:
x = stats.norm.rvs(4, 1, size=n)
u = stats.norm.rvs(0, su, size=n)
y = beta0 + beta1 * x + u
df = pd.DataFrame({'y': y, 'x': x})
# estimate parameters by OLS:
reg = smf.ols(formula='y ~ x', data=df)
results = reg.fit()
b = results.params
print(f'b: \n{b}\n')
# features of the sample for the variance formula:
x_sq_mean = np.mean(x ** 2)
print(f'x_sq_mean: {x_sq_mean}\n')
x_var = np.sum((x - np.mean(x)) ** 2)  # sum of squared deviations from the mean (SST_x)
print(f'x_var: {x_var}\n')
# graph:
x_range = np.linspace(0, 8, num=100)
plt.ylim([-2, 10])
plt.plot(x, y, color='lightgrey', marker='o', linestyle='')
plt.plot(x_range, beta0 + beta1 * x_range, color='black',
         linestyle='-', linewidth=2, label='pop. regr. fct.')
plt.plot(x_range, b['Intercept'] + b['x'] * x_range, color='grey',
         linestyle='-', linewidth=2, label='OLS regr. fct.')
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/SLR-Sim-Sample.pdf')
using WooldridgeDatasets, DataFrames, GLM, Statistics
meap93 = DataFrame(wooldridge("meap93"))
# estimate the model and save the results as reg:
reg = lm(@formula(math10 ~ lnchprg), meap93)
# number of obs.:
n = nobs(reg)
# SER:
u_hat_var = var(residuals(reg))
SER = sqrt(u_hat_var) * sqrt((n - 1) / (n - 2))
println("SER = $SER\n")
# SE of b0 and b1, respectively:
lnchprg_sq_mean = mean(meap93.lnchprg .^ 2)
lnchprg_var = var(meap93.lnchprg)
b0_se = SER / (sqrt(lnchprg_var) * sqrt(n - 1)) * sqrt(lnchprg_sq_mean)
b1_se = SER / (sqrt(lnchprg_var) * sqrt(n - 1))
println("b0_se = $b0_se\n")
println("b1_se = $b1_se\n")
# automatic calculations:
table_reg = coeftable(reg)
println("table_reg: \n$table_reg")
using WooldridgeDatasets, DataFrames, GLM, Plots
ceosal1 = DataFrame(wooldridge("ceosal1"))
reg = lm(@formula(salary ~ roe), ceosal1)
# scatter plot and fitted values:
fitted_values = predict(reg)
scatter(ceosal1.roe, ceosal1.salary, color=:grey80, label="observations")
plot!(ceosal1.roe, fitted_values, color=:black, linewidth=3, label="OLS")
xlabel!("roe")
ylabel!("salary")
savefig("JlGraphs/Example-2-3-3.pdf")
# instead of scatter, you can also use:
# plot(ceosal1.roe, ceosal1.salary, label="observations", seriestype=:scatter)
using WooldridgeDatasets, DataFrames, Statistics
ceosal1 = DataFrame(wooldridge("ceosal1"))
x = ceosal1.roe
y = ceosal1.salary
# ingredients to the OLS formulas:
cov_xy = cov(x, y)
var_x = var(x)
x_bar = mean(x)
y_bar = mean(y)
# manual calculation of OLS coefficients:
b1 = cov_xy / var_x
b0 = y_bar - b1 * x_bar
println("b1 = $b1\n")
println("b0 = $b0")
using WooldridgeDatasets, DataFrames, GLM, Plots
vote1 = DataFrame(wooldridge("vote1"))
# OLS regression:
reg = lm(@formula(voteA ~ shareA), vote1)
b = coef(reg)
println("b = $b")
# scatter plot and fitted values:
fitted_values = predict(reg)
scatter(vote1.shareA, vote1.voteA,
color=:grey, label="observations", legend=:topleft)
plot!(vote1.shareA, fitted_values, color=:black, linewidth=3, label="OLS")
xlabel!("shareA")
ylabel!("voteA")
savefig("JlGraphs/Example-2-5.pdf")
using WooldridgeDatasets, DataFrames, GLM
ceosal1 = DataFrame(wooldridge("ceosal1"))
# OLS regression:
reg = lm(@formula(salary ~ roe), ceosal1)
table_reg = coeftable(reg)
println("table_reg: \n$table_reg\n")
# obtain predicted values and residuals:
salary_hat = predict(reg)
u_hat = residuals(reg)
# Wooldridge, Table 2.2:
table = DataFrame(roe=ceosal1.roe,
salary=ceosal1.salary,
salary_hat=salary_hat,
u_hat=u_hat)
table_preview = first(table, 10)
println("table_preview: \n$table_preview")
using WooldridgeDatasets, DataFrames, GLM, Statistics
wage1 = DataFrame(wooldridge("wage1"))
reg = lm(@formula(wage ~ educ), wage1)
# obtain coefficients, predicted values and residuals:
b = coef(reg)
wage_hat = predict(reg)
u_hat = residuals(reg)
# confirm property (1):
u_hat_mean = mean(u_hat)
println("u_hat_mean = $u_hat_mean\n")
# confirm property (2):
educ_u_cov = cov(wage1.educ, u_hat)
println("educ_u_cov = $educ_u_cov\n")
# confirm property (3):
educ_mean = mean(wage1.educ)
wage_pred = b[1] + b[2] * educ_mean
println("wage_pred = $wage_pred\n")
wage_mean = mean(wage1.wage)
println("wage_mean = $wage_mean")
using WooldridgeDatasets, DataFrames, GLM, Statistics
ceosal1 = DataFrame(wooldridge("ceosal1"))
# OLS regression:
reg = lm(@formula(salary ~ roe), ceosal1)
# obtain predicted values and residuals:
sal_hat = predict(reg)
u_hat = residuals(reg)
# calculate R^2 in three different ways:
sal = ceosal1.salary
R2_a = var(sal_hat) / var(sal)
R2_b = 1 - var(u_hat) / var(sal)
R2_c = cor(sal, sal_hat)^2
println("R2_a = $R2_a\n")
println("R2_b = $R2_b\n")
println("R2_c = $R2_c")
using WooldridgeDatasets, DataFrames, GLM
vote1 = DataFrame(wooldridge("vote1"))
# OLS regression:
reg = lm(@formula(voteA ~ shareA), vote1)
# print results using coeftable:
table_reg = coeftable(reg)
println("table_reg: \n$table_reg\n")
# accessing R^2:
r2_automatic = r2(reg)
println("r2_automatic = $r2_automatic")
using WooldridgeDatasets, DataFrames, GLM, Plots, Statistics
ceosal1 = DataFrame(wooldridge("ceosal1"))
# usual OLS regression:
reg1 = lm(@formula(salary ~ roe), ceosal1)
b1 = coef(reg1)
println("b1 = $b1\n")
# regression without intercept (through origin):
reg2 = lm(@formula(salary ~ 0 + roe), ceosal1)
b2 = coef(reg2)
println("b2 = $b2\n")
# regression without slope (on a constant):
reg3 = lm(@formula(salary ~ 1), ceosal1)
b3 = coef(reg3)
println("b3 = $b3\n")
# average y:
sal_mean = mean(ceosal1.salary)
println("sal_mean = $sal_mean")
# scatter plot and fitted values:
scatter(ceosal1.roe, ceosal1.salary, color="grey85", label="observations")
plot!(ceosal1.roe, predict(reg1), linewidth=2,
color="black", label="full")
plot!(ceosal1.roe, predict(reg2), linewidth=2,
color="dimgrey", label="trough origin")
plot!(ceosal1.roe, predict(reg3), linewidth=2,
color="lightgrey", label="const only")
xlabel!("roe")
ylabel!("salary")
savefig("JlGraphs/SLR-Origin-Const.pdf")
using Random, GLM, DataFrames, Distributions, Statistics, Plots
# set the random seed:
Random.seed!(12345)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = zeros(r)
b1 = zeros(r)
# draw a sample of x, fixed over replications:
x = rand(Normal(4, 1), n)
# repeat r times:
for i in 1:r
    # draw a sample of y:
    u = rand(Normal(0, su), n)
    y = beta0 .+ beta1 .* x .+ u
    df = DataFrame(y=y, x=x)
    # estimate and store parameters by OLS:
    reg = lm(@formula(y ~ x), df)
    b0[i] = coef(reg)[1]
    b1[i] = coef(reg)[2]
end
# MC estimate of the expected values:
b0_mean = mean(b0)
b1_mean = mean(b1)
println("b0_mean = $b0_mean\n")
println("b1_mean = $b1_mean\n")
# MC estimate of the variances:
b0_var = var(b0)
b1_var = var(b1)
println("b0_var = $b0_var\n")
println("b1_var = $b1_var")
# graph:
x_range = range(0, 8, length=100)
# add population regression line:
plot(x_range, beta0 .+ beta1 .* x_range, ylim=[0, 6],
    color="black", linewidth=2, label="Population")
# add first OLS regression line (to attach a label):
plot!(x_range, b0[1] .+ b1[1] .* x_range,
    color="grey", linewidth=0.5, label="OLS regressions")
# add OLS regression lines no. 2 to 10:
for i in 2:10
    plot!(x_range, b0[i] .+ b1[i] .* x_range,
        color="grey", linewidth=0.5, label=false)
end
ylabel!("y")
xlabel!("x")
savefig("JlGraphs/SLR-Sim-Model-Condx.pdf")
using Random, GLM, DataFrames, Distributions, Statistics
# set the random seed:
Random.seed!(12345)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = zeros(r)
b1 = zeros(r)
# draw a sample of x, fixed over replications:
x = rand(Normal(4, 1), n)
# repeat r times:
for i in 1:r
    # draw a sample of y (broadcasting one draw per observation):
    u_mean = (x .- 4) ./ 5
    u = rand.(Normal.(u_mean, su))
    y = beta0 .+ beta1 .* x .+ u
    df = DataFrame(y=y, x=x)
    # estimate and store parameters by OLS:
    reg = lm(@formula(y ~ x), df)
    b0[i] = coef(reg)[1]
    b1[i] = coef(reg)[2]
end
# MC estimate of the expected values:
b0_mean = mean(b0)
b1_mean = mean(b1)
println("b0_mean = $b0_mean\n")
println("b1_mean = $b1_mean\n")
# MC estimate of the variances:
b0_var = var(b0)
b1_var = var(b1)
println("b0_var = $b0_var\n")
println("b1_var = $b1_var")
using Random, GLM, DataFrames, Distributions, Statistics
# set the random seed:
Random.seed!(1234567)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# initialize b0 and b1 to store results later:
b0 = zeros(r)
b1 = zeros(r)
# draw a sample of x, fixed over replications:
x = rand(Normal(4, 1), n)
# repeat r times:
for i in 1:r
    # draw a sample of y (broadcasting one draw per observation):
    u_var = 4 / exp(4.5) .* exp.(x)
    u = rand.(Normal.(0, sqrt.(u_var)))
    y = beta0 .+ beta1 .* x .+ u
    df = DataFrame(y=y, x=x)
    # estimate and store parameters by OLS:
    reg = lm(@formula(y ~ x), df)
    b0[i] = coef(reg)[1]
    b1[i] = coef(reg)[2]
end
# MC estimate of the expected values:
b0_mean = mean(b0)
b1_mean = mean(b1)
println("b0_mean = $b0_mean\n")
println("b1_mean = $b1_mean\n")
# MC estimate of the variances:
b0_var = var(b0)
b1_var = var(b1)
println("b0_var = $b0_var\n")
println("b1_var = $b1_var")
using Random, GLM, DataFrames, Distributions, Statistics
# set the random seed:
Random.seed!(12345)
# set sample size and number of simulations:
n = 1000
r = 10000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
sx = 1
ex = 4
# initialize b0 and b1 to store results later:
b0 = zeros(r)
b1 = zeros(r)
# repeat r times:
for i in 1:r
    # draw a sample:
    x = rand(Normal(ex, sx), n)
    u = rand(Normal(0, su), n)
    y = beta0 .+ beta1 .* x .+ u
    df = DataFrame(y=y, x=x)
    # estimate OLS:
    reg = lm(@formula(y ~ x), df)
    b0[i] = coef(reg)[1]
    b1[i] = coef(reg)[2]
end
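# MC estimate of the expected values and variances
# (same summary as in the fixed-x simulations):
b0_mean = mean(b0)
b1_mean = mean(b1)
println("b0_mean = $b0_mean\n")
println("b1_mean = $b1_mean\n")
b0_var = var(b0)
b1_var = var(b1)
println("b0_var = $b0_var\n")
println("b1_var = $b1_var")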
using Random, GLM, DataFrames, Distributions, Statistics, Plots
# set the random seed:
Random.seed!(12345)
# set sample size:
n = 1000
# set true parameters (betas and sd of u):
beta0 = 1
beta1 = 0.5
su = 2
# draw a sample of size n:
x = rand(Normal(4, 1), n)
u = rand(Normal(0, su), n)
y = beta0 .+ beta1 .* x .+ u
df = DataFrame(y=y, x=x)
# estimate parameters by OLS:
reg = lm(@formula(y ~ x), df)
b = coef(reg)
println("b = $b\n")
# features of the sample for the variance formula:
x_sq_mean = mean(x .^ 2)
println("x_sq_mean = $x_sq_mean\n")
x_var = sum((x .- mean(x)) .^ 2)  # sum of squared deviations from the mean (SST_x)
println("x_var = $x_var")
# graph:
x_range = range(0, 8, length=100)
scatter(x, y, color="lightgrey", ylim=[-2, 10],
label="sample", alpha=0.7, markerstrokecolor=:white)
plot!(x_range, beta0 .+ beta1 .* x_range, color="black",
linestyle=:solid, linewidth=2, label="pop. regr. fct.")
plot!(x_range, coef(reg)[1] .+ coef(reg)[2] .* x_range, color="grey",
linestyle=:solid, linewidth=2, label="OLS regr. fct.")
xlabel!("x")
ylabel!("y")
savefig("JlGraphs/SLR-Sim-Sample.pdf")