Chapter 17

Example-17-1-1.R
library(car); library(lmtest)  # for robust SE
data(mroz, package='wooldridge')

# Estimate linear probability model
linprob <- lm(inlf~nwifeinc+educ+exper+I(exper^2)+age+kidslt6+kidsge6,data=mroz)
# Regression table with heteroscedasticity-robust SE and t tests:
coeftest(linprob,vcov=hccm)
Example-17-1-2.R
# predictions for two "extreme" women (run Example-17-1-1.R first!):
xpred <- list(nwifeinc=c(100,0),educ=c(5,17),exper=c(0,30),
              age=c(20,52),kidslt6=c(2,0),kidsge6=c(0,0))
predict(linprob,xpred)
Example-17-1-3.R
data(mroz, package='wooldridge')

# Estimate logit model
logitres<-glm(inlf~nwifeinc+educ+exper+I(exper^2)+age+kidslt6+kidsge6,
                                family=binomial(link=logit),data=mroz)
# Summary of results:
summary(logitres)
# Log likelihood value:
logLik(logitres) 
# McFadden's pseudo R2:
1 - logitres$deviance/logitres$null.deviance
Example-17-1-4.R
data(mroz, package='wooldridge')

# Estimate probit model
probitres<-glm(inlf~nwifeinc+educ+exper+I(exper^2)+age+kidslt6+kidsge6,
                                family=binomial(link=probit),data=mroz)
# Summary of results:
summary(probitres)
# Log likelihood value:
logLik(probitres) 
# McFadden's pseudo R2:
1 - probitres$deviance/probitres$null.deviance
Example-17-1-5.R
################################################################
# Test of overall significance (run Example-17-1-4.R first!):
# Manual calculation of the LR test statistic:
probitres$null.deviance - probitres$deviance

# Automatic calculation, including p-values:
library(lmtest)
lrtest(probitres)

################################################################
# Test of H0: experience and age are irrelevant
# (the restricted model must use the same probit link as probitres):
restr <- glm(inlf~nwifeinc+educ+kidslt6+kidsge6, 
                          family=binomial(link=probit),data=mroz)
lrtest(restr,probitres)
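# As a cross-check (a sketch, not part of the original example): the same
# joint hypothesis as a Wald test via car::linearHypothesis (car is loaded
# in Example-17-1-1.R):
linearHypothesis(probitres, c("exper=0", "I(exper^2)=0", "age=0"))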
Example-17-1-6.R
# Predictions from linear probability, probit and logit model:
# (run 17-1-1.R through 17-1-4.R first to define the variables!)
predict(linprob,  xpred,type = "response")
predict(logitres, xpred,type = "response")
predict(probitres,xpred,type = "response")
Example-17-1-7.R
# APEs (run 17-1-1.R through 17-1-4.R first to define the variables!)

# Calculation of linear index at individual values:
xb.log <- predict(logitres)
xb.prob<- predict(probitres)
# APE factors = average(g(xb))
factor.log <- mean( dlogis(xb.log) )
factor.prob<- mean( dnorm(xb.prob) )
cbind(factor.log,factor.prob)

# average partial effects = beta*factor:
APE.lin <- coef(linprob) * 1
APE.log <- coef(logitres) * factor.log
APE.prob<- coef(probitres) * factor.prob

# Table of APEs
cbind(APE.lin, APE.log, APE.prob)
Example-17-1-8.R
# Automatic APE calculations with package mfx
data(mroz, package='wooldridge')
library(mfx)
logitmfx(inlf~nwifeinc+educ+exper+I(exper^2)+age+kidslt6+kidsge6, 
                                              data=mroz, atmean=FALSE)
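# The analogous probit APEs (a sketch; probitmfx comes with the same mfx
# package and takes the same arguments):
probitmfx(inlf~nwifeinc+educ+exper+I(exper^2)+age+kidslt6+kidsge6, 
                                              data=mroz, atmean=FALSE)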
Example-17-2-survreg.R
# Estimate Tobit model using survreg:
library(survival)
data(mroz, package='wooldridge')
res <- survreg(Surv(hours, hours>0, type="left") ~ nwifeinc+educ+exper+
             I(exper^2)+age+kidslt6+kidsge6, data=mroz, dist="gaussian")
summary(res)
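# An alternative wrapper (a sketch, assuming package AER is installed;
# the object name res.aer is illustrative): AER::tobit() also calls
# survreg internally and censors at zero by default.
library(AER)
res.aer <- tobit(hours~nwifeinc+educ+exper+I(exper^2)+age+kidslt6+kidsge6,
                 data=mroz)
summary(res.aer)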
Example-17-2.R
data(mroz, package='wooldridge')

# Estimate Tobit model using censReg:
library(censReg)
TobitRes <- censReg(hours~nwifeinc+educ+exper+I(exper^2)+ 
                                    age+kidslt6+kidsge6, data=mroz )
summary(TobitRes)

# Partial Effects at the average x:
margEff(TobitRes)
Example-17-3-1.R
data(crime1, package='wooldridge')

# Estimate linear model
lm.res      <-  lm(narr86~pcnv+avgsen+tottime+ptime86+qemp86+inc86+
                    black+hispan+born60, data=crime1)
# Estimate Poisson model
Poisson.res <- glm(narr86~pcnv+avgsen+tottime+ptime86+qemp86+inc86+
                    black+hispan+born60, data=crime1, family=poisson)
# Estimate Quasi-Poisson model
QPoisson.res<- glm(narr86~pcnv+avgsen+tottime+ptime86+qemp86+inc86+
                    black+hispan+born60, data=crime1, family=quasipoisson)
Example-17-3-2.R
# Example 17.3: Regression table (run Example-17-3-1.R first!)
library(stargazer) # package for regression output
stargazer(lm.res,Poisson.res,QPoisson.res,type="text",keep.stat="n")
Example-17-4.R
library(survival)
data(recid, package='wooldridge')

# Define dummy for uncensored observations:
recid$uncensored <- recid$cens==0
# Estimate censored regression model:
res<-survreg(Surv(log(durat),uncensored, type="right") ~ workprg+priors+
                     tserved+felon+alcohol+drugs+black+married+educ+age, 
                     data=recid, dist="gaussian")
# Output:
summary(res)
Example-17-5.R
library(sampleSelection)
data(mroz, package='wooldridge')

# Estimate Heckman selection model (2 step version)
res<-selection(inlf~educ+exper+I(exper^2)+nwifeinc+age+kidslt6+kidsge6,
           log(wage)~educ+exper+I(exper^2), data=mroz, method="2step" )
# Summary of results:
summary(res)
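# For comparison (a sketch; the object name res.ml is illustrative): the
# same model by full maximum likelihood, using selection() with
# method="ml" instead of the two-step estimator:
res.ml<-selection(inlf~educ+exper+I(exper^2)+nwifeinc+age+kidslt6+kidsge6,
           log(wage)~educ+exper+I(exper^2), data=mroz, method="ml" )
summary(res.ml)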
Binary-Margeff.py
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import scipy.stats as stats

# set the random seed:
np.random.seed(1234567)

y = stats.binom.rvs(1, 0.5, size=100)
x = stats.norm.rvs(0, 1, size=100) + 2 * y
sim_data = pd.DataFrame({'y': y, 'x': x})

# estimation:
reg_lin = smf.ols(formula='y ~ x', data=sim_data)
results_lin = reg_lin.fit()
reg_logit = smf.logit(formula='y ~ x', data=sim_data)
results_logit = reg_logit.fit(disp=0)
reg_probit = smf.probit(formula='y ~ x', data=sim_data)
results_probit = reg_probit.fit(disp=0)

# calculate partial effects:
PE_lin = np.repeat(results_lin.params['x'], 100)

xb_logit = results_logit.fittedvalues
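# fittedvalues of smf.logit/smf.probit is the linear index x*b, so the
# link density evaluated there gives the scale factor g(x*b):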
factor_logit = stats.logistic.pdf(xb_logit)
PE_logit = results_logit.params['x'] * factor_logit

xb_probit = results_probit.fittedvalues
factor_probit = stats.norm.pdf(xb_probit)
PE_probit = results_probit.params['x'] * factor_probit


# plot partial effects:
plt.plot(x, PE_lin, color='black',
         marker='o', linestyle='', label='linear')
plt.plot(x, PE_logit, color='black',
         marker='+', linestyle='', label='logit')
plt.plot(x, PE_probit, color='black',
         marker='*', linestyle='', label='probit')
plt.ylabel('partial effects')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/Binary-margeff.pdf')
Binary-Predictions.py
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# set the random seed:
np.random.seed(1234567)

y = stats.binom.rvs(1, 0.5, size=100)
x = stats.norm.rvs(0, 1, size=100) + 2 * y
sim_data = pd.DataFrame({'y': y, 'x': x})

# estimation:
reg_lin = smf.ols(formula='y ~ x', data=sim_data)
results_lin = reg_lin.fit()
reg_logit = smf.logit(formula='y ~ x', data=sim_data)
results_logit = reg_logit.fit(disp=0)
reg_probit = smf.probit(formula='y ~ x', data=sim_data)
results_probit = reg_probit.fit(disp=0)

# prediction for regular grid of x values:
X_new = pd.DataFrame({'x': np.linspace(min(x), max(x), 50)})
predictions_lin = results_lin.predict(X_new)
predictions_logit = results_logit.predict(X_new)
predictions_probit = results_probit.predict(X_new)

# scatter plot and fitted values:
plt.plot(x, y, color='grey', marker='o', linestyle='')
plt.plot(X_new['x'], predictions_lin,
         color='black', linestyle='-.', label='linear')
plt.plot(X_new['x'], predictions_logit,
         color='black', linestyle='-', linewidth=0.5, label='logit')
plt.plot(X_new['x'], predictions_probit,
         color='black', linestyle='--', label='probit')
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/Binary-Predictions.pdf')
Example-17-1-1.py
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

mroz = woo.dataWoo('mroz')

# estimate linear probability model:
reg_lin = smf.ols(formula='inlf ~ nwifeinc + educ + exper +'
                          'I(exper**2) + age + kidslt6 + kidsge6',
                  data=mroz)
results_lin = reg_lin.fit(cov_type='HC3')

# print regression table:
table = pd.DataFrame({'b': round(results_lin.params, 4),
                      'se': round(results_lin.bse, 4),
                      't': round(results_lin.tvalues, 4),
                      'pval': round(results_lin.pvalues, 4)})
print(f'table: \n{table}\n')
Example-17-1-2.py
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

mroz = woo.dataWoo('mroz')

# estimate linear probability model:
reg_lin = smf.ols(formula='inlf ~ nwifeinc + educ + exper +'
                          'I(exper**2) + age + kidslt6 + kidsge6',
                  data=mroz)
results_lin = reg_lin.fit(cov_type='HC3')

# predictions for two "extreme" women:
X_new = pd.DataFrame(
    {'nwifeinc': [100, 0], 'educ': [5, 17],
     'exper': [0, 30], 'age': [20, 52],
     'kidslt6': [2, 0], 'kidsge6': [0, 0]})
predictions = results_lin.predict(X_new)

print(f'predictions: \n{predictions}\n')
Example-17-1-3.py
import wooldridge as woo
import statsmodels.formula.api as smf

mroz = woo.dataWoo('mroz')

# estimate logit model:
reg_logit = smf.logit(formula='inlf ~ nwifeinc + educ + exper +'
                              'I(exper**2) + age + kidslt6 + kidsge6',
                      data=mroz)

# disp = 0 avoids printing out information during the estimation:
results_logit = reg_logit.fit(disp=0)
print(f'results_logit.summary(): \n{results_logit.summary()}\n')

# log likelihood value:
print(f'results_logit.llf: {results_logit.llf}\n')

# McFadden's pseudo R2:
print(f'results_logit.prsquared: {results_logit.prsquared}\n')
Example-17-1-4.py
import wooldridge as woo
import statsmodels.formula.api as smf

mroz = woo.dataWoo('mroz')

# estimate probit model:
reg_probit = smf.probit(formula='inlf ~ nwifeinc + educ + exper +'
                                'I(exper**2) + age + kidslt6 + kidsge6',
                        data=mroz)
results_probit = reg_probit.fit(disp=0)
print(f'results_probit.summary(): \n{results_probit.summary()}\n')

# log likelihood value:
print(f'results_probit.llf: {results_probit.llf}\n')

# McFadden's pseudo R2:
print(f'results_probit.prsquared: {results_probit.prsquared}\n')
Example-17-1-5.py
import wooldridge as woo
import statsmodels.formula.api as smf
import scipy.stats as stats

mroz = woo.dataWoo('mroz')

# estimate probit model:
reg_probit = smf.probit(formula='inlf ~ nwifeinc + educ + exper +'
                                'I(exper**2) + age + kidslt6 + kidsge6',
                        data=mroz)
results_probit = reg_probit.fit(disp=0)

# test of overall significance (test statistic and p-value):
llr1_manual = 2 * (results_probit.llf - results_probit.llnull)
print(f'llr1_manual: {llr1_manual}\n')
print(f'results_probit.llr: {results_probit.llr}\n')
print(f'results_probit.llr_pvalue: {results_probit.llr_pvalue}\n')

# automatic Wald test of H0 (experience and age are irrelevant):
hypotheses = ['exper=0', 'I(exper ** 2)=0', 'age=0']
waldstat = results_probit.wald_test(hypotheses)
teststat2_autom = waldstat.statistic
pval2_autom = waldstat.pvalue
print(f'teststat2_autom: {teststat2_autom}\n')
print(f'pval2_autom: {pval2_autom}\n')

# manual likelihood ratio test
# of H0 (experience and age are irrelevant):
reg_probit_restr = smf.probit(formula='inlf ~ nwifeinc + educ +'
                                      'kidslt6 + kidsge6',
                              data=mroz)
results_probit_restr = reg_probit_restr.fit(disp=0)

llr2_manual = 2 * (results_probit.llf - results_probit_restr.llf)
pval2_manual = 1 - stats.chi2.cdf(llr2_manual, 3)
print(f'llr2_manual: {llr2_manual}\n')
print(f'pval2_manual: {pval2_manual}\n')
Example-17-1-6.py
import wooldridge as woo
import pandas as pd
import statsmodels.formula.api as smf

mroz = woo.dataWoo('mroz')

# estimate models:
reg_lin = smf.ols(formula='inlf ~ nwifeinc + educ + exper +'
                          'I(exper**2) + age + kidslt6 + kidsge6',
                  data=mroz)
results_lin = reg_lin.fit(cov_type='HC3')

reg_logit = smf.logit(formula='inlf ~ nwifeinc + educ + exper +'
                              'I(exper**2) + age + kidslt6 + kidsge6',
                      data=mroz)
results_logit = reg_logit.fit(disp=0)

reg_probit = smf.probit(formula='inlf ~ nwifeinc + educ + exper +'
                                'I(exper**2) + age + kidslt6 + kidsge6',
                        data=mroz)
results_probit = reg_probit.fit(disp=0)

# predictions for two "extreme" women:
X_new = pd.DataFrame(
    {'nwifeinc': [100, 0], 'educ': [5, 17],
     'exper': [0, 30], 'age': [20, 52],
     'kidslt6': [2, 0], 'kidsge6': [0, 0]})
predictions_lin = results_lin.predict(X_new)
predictions_logit = results_logit.predict(X_new)
predictions_probit = results_probit.predict(X_new)

print(f'predictions_lin: \n{predictions_lin}\n')
print(f'predictions_logit: \n{predictions_logit}\n')
print(f'predictions_probit: \n{predictions_probit}\n')
Example-17-1-7.py
import wooldridge as woo
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy.stats as stats

mroz = woo.dataWoo('mroz')

# estimate models:
reg_lin = smf.ols(formula='inlf ~ nwifeinc + educ + exper + I(exper**2) +'
                          'age + kidslt6 + kidsge6', data=mroz)
results_lin = reg_lin.fit(cov_type='HC3')

reg_logit = smf.logit(formula='inlf ~ nwifeinc + educ + exper + I(exper**2) +'
                              'age + kidslt6 + kidsge6', data=mroz)
results_logit = reg_logit.fit(disp=0)

reg_probit = smf.probit(formula='inlf ~ nwifeinc + educ + exper + I(exper**2) +'
                                'age + kidslt6 + kidsge6', data=mroz)
results_probit = reg_probit.fit(disp=0)

# manual average partial effects:
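# APE_j = beta_j * mean(g(x*b)); for the linear probability model the
# scale factor is simply 1: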
APE_lin = np.array(results_lin.params)

xb_logit = results_logit.fittedvalues
factor_logit = np.mean(stats.logistic.pdf(xb_logit))
APE_logit_manual = results_logit.params * factor_logit

xb_probit = results_probit.fittedvalues
factor_probit = np.mean(stats.norm.pdf(xb_probit))
APE_probit_manual = results_probit.params * factor_probit

table_manual = pd.DataFrame({'APE_lin': np.round(APE_lin, 4),
                             'APE_logit_manual': np.round(APE_logit_manual, 4),
                             'APE_probit_manual': np.round(APE_probit_manual, 4)})
print(f'table_manual: \n{table_manual}\n')

# automatic average partial effects:
coef_names = np.array(results_lin.model.exog_names)
coef_names = np.delete(coef_names, 0)  # drop Intercept

APE_logit_autom = results_logit.get_margeff().margeff
APE_probit_autom = results_probit.get_margeff().margeff

table_auto = pd.DataFrame({'coef_names': coef_names,
                           'APE_logit_autom': np.round(APE_logit_autom, 4),
                           'APE_probit_autom': np.round(APE_probit_autom, 4)})
print(f'table_auto: \n{table_auto}\n')
Example-17-2.py
import wooldridge as woo
import numpy as np
import patsy as pt
import scipy.stats as stats
import statsmodels.formula.api as smf
import statsmodels.base.model as smclass

mroz = woo.dataWoo('mroz')
y, X = pt.dmatrices('hours ~ nwifeinc + educ + exper +'
                    'I(exper**2)+ age + kidslt6 + kidsge6',
                    data=mroz, return_type='dataframe')

# generate starting solution:
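# (OLS coefficients plus the log of the residual variance, a rough proxy
#  for log(sigma), serve as starting values for the ML optimizer)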
reg_ols = smf.ols(formula='hours ~ nwifeinc + educ + exper + I(exper**2) +'
                          'age + kidslt6 + kidsge6', data=mroz)
results_ols = reg_ols.fit()
sigma_start = np.log(sum(results_ols.resid ** 2) / len(results_ols.resid))
params_start = np.concatenate((np.array(results_ols.params), sigma_start),
                              axis=None)

# extend statsmodels class by defining nloglikeobs:
class Tobit(smclass.GenericLikelihoodModel):
    # define a function that returns the negative log likelihood per observation
    # for a set of parameters that is provided by the argument "params":
    def nloglikeobs(self, params):
        # objects in "self" are defined in the parent class:
        X = self.exog
        y = self.endog
        p = X.shape[1]
        # for details on the implementation see Wooldridge (2019), formula 17.22:
        beta = params[0:p]
        sigma = np.exp(params[p])
        y_hat = np.dot(X, beta)
        y_eq = (y == 0)
        y_g = (y > 0)
        ll = np.empty(len(y))
        ll[y_eq] = np.log(stats.norm.cdf(-y_hat[y_eq] / sigma))
        ll[y_g] = np.log(stats.norm.pdf((y - y_hat)[y_g] / sigma)) - np.log(sigma)
        # return an array of negative log likelihoods, one per observation:
        return -ll

# results of MLE:
reg_tobit = Tobit(endog=y, exog=X)
results_tobit = reg_tobit.fit(start_params=params_start, maxiter=10000, disp=0)
print(f'results_tobit.summary(): \n{results_tobit.summary()}\n')
Example-17-3.py
import wooldridge as woo
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

crime1 = woo.dataWoo('crime1')

# estimate linear model:
reg_lin = smf.ols(formula='narr86 ~ pcnv + avgsen + tottime + ptime86 +'
                          'qemp86 + inc86 + black + hispan + born60',
                  data=crime1)
results_lin = reg_lin.fit()

# print regression table:
table_lin = pd.DataFrame({'b': round(results_lin.params, 4),
                          'se': round(results_lin.bse, 4),
                          't': round(results_lin.tvalues, 4),
                          'pval': round(results_lin.pvalues, 4)})
print(f'table_lin: \n{table_lin}\n')

# estimate Poisson model:
reg_poisson = smf.poisson(formula='narr86 ~ pcnv + avgsen + tottime +'
                                  'ptime86 + qemp86 + inc86 + black +'
                                  'hispan + born60',
                          data=crime1)
results_poisson = reg_poisson.fit(disp=0)

# print regression table:
table_poisson = pd.DataFrame({'b': round(results_poisson.params, 4),
                              'se': round(results_poisson.bse, 4),
                              't': round(results_poisson.tvalues, 4),
                              'pval': round(results_poisson.pvalues, 4)})
print(f'table_poisson: \n{table_poisson}\n')

# estimate Quasi-Poisson model:
reg_qpoisson = smf.glm(formula='narr86 ~ pcnv + avgsen + tottime + ptime86 +'
                               'qemp86 + inc86 + black + hispan + born60',
                       family=sm.families.Poisson(),
                       data=crime1)
# the argument scale controls the dispersion in exponential dispersion models;
# see the module documentation for more details:
results_qpoisson = reg_qpoisson.fit(scale='X2', disp=0)

# print regression table:
table_qpoisson = pd.DataFrame({'b': round(results_qpoisson.params, 4),
                               'se': round(results_qpoisson.bse, 4),
                               't': round(results_qpoisson.tvalues, 4),
                               'pval': round(results_qpoisson.pvalues, 4)})
print(f'table_qpoisson: \n{table_qpoisson}\n')
Example-17-4.py
import wooldridge as woo
import numpy as np
import patsy as pt
import scipy.stats as stats
import statsmodels.formula.api as smf
import statsmodels.base.model as smclass

recid = woo.dataWoo('recid')

# define dummy for censored observations:
censored = recid['cens'] != 0
y, X = pt.dmatrices('ldurat ~ workprg + priors + tserved + felon +'
                    'alcohol + drugs + black + married + educ + age',
                    data=recid, return_type='dataframe')

# generate starting solution:
reg_ols = smf.ols(formula='ldurat ~ workprg + priors + tserved + felon +'
                          'alcohol + drugs + black + married + educ + age',
                  data=recid)
results_ols = reg_ols.fit()
sigma_start = np.log(sum(results_ols.resid ** 2) / len(results_ols.resid))
params_start = np.concatenate((np.array(results_ols.params), sigma_start),
                              axis=None)


# extend statsmodels class by defining nloglikeobs:
class CensReg(smclass.GenericLikelihoodModel):
    def __init__(self, endog, cens, exog):
        self.cens = cens
        super(smclass.GenericLikelihoodModel, self).__init__(endog, exog,
                                                             missing='none')

    def nloglikeobs(self, params):
        X = self.exog
        y = self.endog
        cens = self.cens
        p = X.shape[1]
        beta = params[0:p]
        sigma = np.exp(params[p])
        y_hat = np.dot(X, beta)
        ll = np.empty(len(y))
        # uncensored:
        ll[~cens] = np.log(stats.norm.pdf((y - y_hat)[~cens] /
                                          sigma)) - np.log(sigma)
        # censored:
        ll[cens] = np.log(stats.norm.cdf(-(y - y_hat)[cens] / sigma))
        return -ll


# results of MLE:
reg_censReg = CensReg(endog=y, exog=X, cens=censored)
results_censReg = reg_censReg.fit(start_params=params_start,
                                  maxiter=10000, method='BFGS', disp=0)
print(f'results_censReg.summary(): \n{results_censReg.summary()}\n')
Example-17-5.py
import wooldridge as woo
import statsmodels.formula.api as smf
import scipy.stats as stats

mroz = woo.dataWoo('mroz')

# step 1 (use all n observations to estimate a probit model of s_i on z_i):
reg_probit = smf.probit(formula='inlf ~ educ + exper + I(exper**2) +'
                                'nwifeinc + age + kidslt6 + kidsge6',
                        data=mroz)
results_probit = reg_probit.fit(disp=0)
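# fittedvalues of a probit model is the linear index x*b, so pdf/cdf of it
# yields the inverse Mills ratio: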
pred_inlf = results_probit.fittedvalues
mroz['inv_mills'] = stats.norm.pdf(pred_inlf) / stats.norm.cdf(pred_inlf)

# step 2 (regress y_i on x_i and inv_mills in sample selection):
reg_heckit = smf.ols(formula='lwage ~ educ + exper + I(exper**2) + inv_mills',
                     subset=(mroz['inlf'] == 1), data=mroz)
results_heckit = reg_heckit.fit()

# print results:
print(f'results_heckit.summary(): \n{results_heckit.summary()}\n')
Tobit-CondMean-Simulation.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.base.model as smclass

# set the random seed:
np.random.seed(1234567)

x = np.sort(stats.norm.rvs(0, 1, size=100) + 4)
xb = -4 + 1 * x
y_star = xb + stats.norm.rvs(0, 1, size=100)
y = np.copy(y_star)
y[y_star < 0] = 0

x_wc = pd.DataFrame({'const': 1, 'x': x})


# extend statsmodel class:
class Tobit(smclass.GenericLikelihoodModel):
    def nloglikeobs(self, params):
        X = self.exog
        y = self.endog
        p = X.shape[1]
        beta = params[0:p]
        sigma = np.exp(params[p])
        y_hat = np.dot(X, beta)
        y_eq = (y == 0)
        y_g = (y > 0)
        ll = np.empty(len(y))
        ll[y_eq] = np.log(stats.norm.cdf(-y_hat[y_eq] / sigma))
        ll[y_g] = np.log(stats.norm.pdf((y - y_hat)[y_g] / sigma)) - np.log(sigma)
        return -ll


# predictions:
reg_ols = sm.OLS(endog=y, exog=x_wc)
results_ols = reg_ols.fit()
yhat_ols = results_ols.fittedvalues

sigma_start = np.log(sum(results_ols.resid ** 2) / len(results_ols.resid))
params_start = np.concatenate((np.array(results_ols.params), sigma_start), axis=None)
reg_tobit = Tobit(endog=y, exog=x_wc)
results_tobit = reg_tobit.fit(start_params=params_start, disp=0)
yhat_tobit = np.dot(x_wc, results_tobit.params[0:2])

# plot data and model predictions:
plt.axhline(y=0, linewidth=0.5, linestyle='-', color='grey')
plt.plot(x, y_star, color='black', marker='x',
         linestyle='', label='latent data')
plt.plot(x, y, color='black', marker='+',
         linestyle='', label='censored data')
plt.plot(x, yhat_ols, color='black', marker='',
         linestyle='-', label='OLS fit')
plt.plot(x, yhat_tobit, color='black', marker='',
         linestyle='--', label='Tobit fit')
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/Tobit-CondMean-Simulation.pdf')
Tobit-CondMean.py
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# set the random seed:
np.random.seed(1234567)

x = np.sort(stats.norm.rvs(0, 1, size=100) + 4)
xb = -4 + 1 * x
y_star = xb + stats.norm.rvs(0, 1, size=100)
y = np.copy(y_star)
y[y_star < 0] = 0

# conditional means:
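# E(y*|x) = x*b and E(y|x) = Phi(x*b/sigma)*x*b + sigma*phi(x*b/sigma),
# here with sigma = 1: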
Eystar = xb
Ey = stats.norm.cdf(xb / 1) * xb + 1 * stats.norm.pdf(xb / 1)

# plot data and conditional means:
plt.axhline(y=0, linewidth=0.5,
            linestyle='-', color='grey')
plt.plot(x, y_star, color='black',
         marker='x', linestyle='', label='y*')
plt.plot(x, y, color='black', marker='+',
         linestyle='', label='y')
plt.plot(x, Eystar, color='black', marker='',
         linestyle='-', label='E(y*)')
plt.plot(x, Ey, color='black', marker='',
         linestyle='--', label='E(y)')
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/Tobit-CondMean.pdf')
TruncReg-Simulation.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import scipy.stats as stats

# set the random seed:
np.random.seed(1234567)

x = np.sort(stats.norm.rvs(0, 1, size=100) + 4)
y = -4 + 1 * x + stats.norm.rvs(0, 1, size=100)

# complete observations and observed sample:
compl = pd.DataFrame({'x': x, 'y': y})
sample = compl.loc[y > 0]

# predictions OLS:
reg_ols = smf.ols(formula='y ~ x', data=sample)
results_ols = reg_ols.fit()
yhat_ols = results_ols.fittedvalues

# benchmark: OLS on the complete data (what truncated regression should recover):
reg_tr = smf.ols(formula='y ~ x', data=compl)
results_tr = reg_tr.fit()
yhat_tr = results_tr.fittedvalues

# plot data and conditional means:
plt.axhline(y=0, linewidth=0.5, linestyle='-', color='grey')
plt.plot(compl['x'], compl['y'], color='black',
         marker='o', fillstyle='none', linestyle='', label='all data')
plt.plot(sample['x'], sample['y'], color='black',
         marker='o', fillstyle='full', linestyle='', label='sample data')
plt.plot(sample['x'], yhat_ols, color='black',
         marker='', linestyle='--', label='OLS fit')
plt.plot(compl['x'], yhat_tr, color='black',
         marker='', linestyle='-', label='Trunc. Reg. fit')
plt.ylabel('y')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/TruncReg-Simulation.pdf')
Binary-Margeff.jl
using Distributions, GLM, Random, Plots, DataFrames

# set the random seed:
Random.seed!(12345)

y = rand(Binomial(1, 0.5), 100)
x = rand(Normal(), 100) + 2 * y
sim_data = DataFrame(y=y, x=x)

# estimation:
reg_lin = lm(@formula(y ~ x), sim_data)
reg_logit = glm(@formula(y ~ x), sim_data, Binomial(), LogitLink())
reg_probit = glm(@formula(y ~ x), sim_data, Binomial(), ProbitLink())

# partial effects:
PE_lin = fill(coef(reg_lin)[2], 100)

coefs_logit = coeftable(reg_logit).cols[1]
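# mm.m is the model matrix stored in the fitted GLM object,
# so mm.m * coefs gives the linear index x*b: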
xb_logit = reg_logit.mm.m * coefs_logit
factor_logit = pdf.(Logistic(), xb_logit)
PE_logit = coefs_logit[2] * factor_logit

coefs_probit = coeftable(reg_probit).cols[1]
xb_probit = reg_probit.mm.m * coefs_probit
factor_probit = pdf.(Normal(), xb_probit)
PE_probit = coefs_probit[2] * factor_probit

# plot PE's:
scatter(x, PE_lin, markershape=:circle, label="linear",
    color="black", legend=:topleft)
scatter!(x, PE_logit, markershape=:cross, label="logit",
    color="black", legend=:topleft)
scatter!(x, PE_probit, markershape=:star, label="probit",
    color="black", legend=:topleft)
ylabel!("partial effects")
xlabel!("x")
savefig("JlGraphs/Binary-margeff.pdf")
Binary-Predictions.jl
using Distributions, GLM, Random, Plots, DataFrames

# set the random seed:
Random.seed!(12345)

y = rand(Binomial(1, 0.5), 100)
x = rand(Normal(), 100) + 2 * y
sim_data = DataFrame(y=y, x=x)

# estimation:
reg_lin = lm(@formula(y ~ x), sim_data)
reg_logit = glm(@formula(y ~ x), sim_data, Binomial(), LogitLink())
reg_probit = glm(@formula(y ~ x), sim_data, Binomial(), ProbitLink())

# prediction for regular grid of x values:
X_new = DataFrame(x=range(minimum(x), maximum(x), length=50))
predictions_lin = predict(reg_lin, X_new)
predictions_logit = predict(reg_logit, X_new)
predictions_probit = predict(reg_probit, X_new)

# scatter plot and fitted values:
scatter(x, y, label=false, color="black", legend=:topleft)
plot!(X_new.x, predictions_lin, linewidth=2,
    label="linear", color="black")
plot!(X_new.x, predictions_logit, linewidth=2,
    label="logit", color="black", linestyle=:dash)
plot!(X_new.x, predictions_probit, linewidth=2,
    label="probit", color="black", linestyle=:dot)
ylabel!("y")
xlabel!("x")
savefig("JlGraphs/Binary-Predictions.pdf")
Example-17-1-1.jl
using WooldridgeDatasets, GLM, DataFrames
include("../08/calc-white-se.jl")

mroz = DataFrame(wooldridge("mroz"))

# estimate linear probability model:
reg_lin = lm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) +
                             age + kidslt6 + kidsge6), mroz)
hc0 = calc_white_se(reg_lin, mroz)

table_reg_lin = DataFrame(
    coefficients=coeftable(reg_lin).rownms,
    b=round.(coef(reg_lin), digits=5),
    se_white=hc0)
println("table_reg_lin: \n$table_reg_lin")
Example-17-1-2.jl
using WooldridgeDatasets, GLM, DataFrames

mroz = DataFrame(wooldridge("mroz"))

# estimate linear probability model:
reg_lin = lm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) +
                             age + kidslt6 + kidsge6), mroz)

# predictions for two "extreme" women:
X_new = DataFrame(nwifeinc=[100, 0], educ=[5, 17],
     exper=[0, 30], age=[20, 52],
     kidslt6=[2, 0], kidsge6=[0, 0])
predictions = round.(predict(reg_lin, X_new), digits=5)

print("predictions = $predictions")
Example-17-1-3.jl
using WooldridgeDatasets, GLM, DataFrames

mroz = DataFrame(wooldridge("mroz"))

# estimate logit model:
reg_logit = glm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                                kidslt6 + kidsge6),
    mroz, Binomial(), LogitLink())
table_reg_logit = coeftable(reg_logit)
println("table_reg_logit: \n$table_reg_logit\n")

# log likelihood value:
ll = deviance(reg_logit) / -2
println("ll = $ll\n")

# McFadden's pseudo R2:
reg_logit_null = glm(@formula(inlf ~ 1), mroz, Binomial(), LogitLink())
ll_null = deviance(reg_logit_null) / -2
pr2 = 1 - ll / ll_null
println("pr2 = $pr2")
Example-17-1-4.jl
using WooldridgeDatasets, GLM, DataFrames

mroz = DataFrame(wooldridge("mroz"))

# estimate probit model:
reg_probit = glm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                                 kidslt6 + kidsge6),
    mroz, Binomial(), ProbitLink())
table_reg_probit = coeftable(reg_probit)
println("table_reg_probit: \n$table_reg_probit\n")

# log likelihood value:
ll = deviance(reg_probit) / -2
println("ll=$ll\n")

# McFadden's pseudo R2:
reg_probit_null = glm(@formula(inlf ~ 1), mroz, Binomial(), ProbitLink())
ll_null = deviance(reg_probit_null) / -2
pr2 = 1 - ll / ll_null
println("pr2 = $pr2")
Example-17-1-5.jl
using WooldridgeDatasets, GLM, DataFrames, Distributions

mroz = DataFrame(wooldridge("mroz"))

# estimate probit model:
reg_probit = glm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                                 kidslt6 + kidsge6),
    mroz, Binomial(), ProbitLink())
ll = deviance(reg_probit) / -2

# test of overall significance (test statistic and p value):
reg_probit_null = glm(@formula(inlf ~ 1), mroz, Binomial(), ProbitLink())
ll_null = deviance(reg_probit_null) / -2
lr1 = 2 * (ll - ll_null)
pval_all = 1 - cdf(Chisq(7), lr1)
println("lr1 = $lr1\n")
println("pval_all = $pval_all\n")

# likelihood ratio statistic test of H0 (experience and age are irrelevant):
reg_probit_hyp = glm(@formula(inlf ~ nwifeinc + educ + kidslt6 + kidsge6),
    mroz, Binomial(), ProbitLink())
ll_hyp = deviance(reg_probit_hyp) / -2
lr2 = 2 * (ll - ll_hyp)
pval_hyp = 1 - cdf(Chisq(3), lr2)
println("lr2 = $lr2\n")
println("pval_hyp = $pval_hyp")
Example-17-1-6.jl
using WooldridgeDatasets, GLM, DataFrames

mroz = DataFrame(wooldridge("mroz"))

# estimate models:
reg_lin = lm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                             kidslt6 + kidsge6), mroz)
reg_logit = glm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                                kidslt6 + kidsge6),
     mroz, Binomial(), LogitLink())
reg_probit = glm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                                 kidslt6 + kidsge6),
     mroz, Binomial(), ProbitLink())

# predictions for two "extreme" women:
X_new = DataFrame(nwifeinc=[100, 0], educ=[5, 17],
     exper=[0, 30], age=[20, 52],
     kidslt6=[2, 0], kidsge6=[0, 0])
predictions_lin = round.(predict(reg_lin, X_new), digits=5)
predictions_logit = round.(predict(reg_logit, X_new), digits=5)
predictions_probit = round.(predict(reg_probit, X_new), digits=5)

println("predictions_lin = $predictions_lin\n")
println("predictions_logit = $predictions_logit\n")
println("predictions_probit = $predictions_probit")
Example-17-1-7.jl
using WooldridgeDatasets, GLM, DataFrames, Statistics,
    Distributions, LinearAlgebra

mroz = DataFrame(wooldridge("mroz"))

# estimate models:
reg_lin = lm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                             kidslt6 + kidsge6), mroz)
reg_logit = glm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                                kidslt6 + kidsge6),
    mroz, Binomial(), LogitLink())
reg_probit = glm(@formula(inlf ~ nwifeinc + educ + exper + (exper^2) + age +
                                 kidslt6 + kidsge6),
    mroz, Binomial(), ProbitLink())

# average partial effects:
APE_lin = coef(reg_lin)

coefs_logit = coef(reg_logit)
xb_logit = reg_logit.mm.m * coefs_logit
factor_logit = mean(pdf.(Logistic(), xb_logit))
APE_logit = coefs_logit * factor_logit

coefs_probit = coef(reg_probit)
xb_probit = reg_probit.mm.m * coefs_probit
factor_probit = mean(pdf.(Normal(), xb_probit))
APE_probit = coefs_probit * factor_probit

# print results:
table_manual = DataFrame(
    coef_names=coeftable(reg_lin).rownms,
    APE_lin=round.(APE_lin, digits=5),
    APE_logit=round.(APE_logit, digits=5),
    APE_probit=round.(APE_probit, digits=5))
println("table_manual: \n$table_manual")
Example-17-2.jl
using WooldridgeDatasets, GLM, DataFrames, Statistics, Distributions,
    LinearAlgebra, Optim
include("../03/getMats.jl")

# load data and build data matrices:
mroz = DataFrame(wooldridge("mroz"))
f = @formula(hours ~ 1 + nwifeinc + educ + exper +
                     (exper^2) + age + kidslt6 + kidsge6)
xy = getMats(f, mroz)
y = xy[1]
X = xy[2]

# define a function that returns the negative sum of the log likelihoods
# (for details on the implementation see Wooldridge (2019), formula 17.22):
function ll_tobit(params, y, X)
    p = size(X, 2)
    beta = params[1:p]
    sigma = exp(params[p+1])
    y_hat = X * beta
    y_eq = (y .== 0)
    y_g = (y .> 0)
    ll = zeros(length(y))
    ll[y_eq] = log.(cdf.(Normal(), -y_hat[y_eq] / sigma))
    ll[y_g] = log.(pdf.(Normal(), (y.-y_hat)[y_g] / sigma)) .- log(sigma)
    # return the negative sum of the log likelihoods:
    return -sum(ll)
end


# generate starting solution:
reg_ols = lm(@formula(hours ~ nwifeinc + educ + exper + (exper^2) +
                              age + kidslt6 + kidsge6), mroz)
resid_ols = residuals(reg_ols)
sigma_start = log(sum(resid_ols .^ 2) / length(resid_ols))
params_start = vcat(coef(reg_ols), sigma_start)

# maximize the log likelihood = minimize the negative of the log likelihood:
optimum = optimize(par -> ll_tobit(par, y, X), params_start, Newton())
mle_est = Optim.minimizer(optimum)
ll = -optimum.minimum

# print results:
table_mle = DataFrame(
    coef_names=vcat(coeftable(reg_ols).rownms, "exp_sigma"),
    mle_est=round.(mle_est, digits=5))
println("table_mle: \n$table_mle\n")
println("ll = $ll")
Example-17-3.jl
using WooldridgeDatasets, GLM, DataFrames

crime1 = DataFrame(wooldridge("crime1"))

# estimate linear model:
reg_lin = lm(@formula(narr86 ~ pcnv + avgsen + tottime + ptime86 + qemp86 +
                               inc86 + black + hispan + born60), crime1)
table_lin = coeftable(reg_lin)
println("table_lin: \n$table_lin\n")

# estimate Poisson model:
reg_poisson = glm(@formula(narr86 ~ pcnv + avgsen + tottime + ptime86 + qemp86 +
                                    inc86 + black + hispan + born60),
    crime1, Poisson())
table_poisson = coeftable(reg_poisson)
println("table_poisson: \n$table_poisson\n")

# estimate Quasi-Poisson model:
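# (quasi-Poisson keeps the Poisson point estimates and rescales the
#  standard errors by the square root of the estimated dispersion)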
yhat = predict(reg_poisson)
resid = crime1.narr86 .- yhat
n = nrow(crime1)  # 2725 observations, 9 regressors
sigma_sq = 1 / (n - 9 - 1) * sum(resid .^ 2 ./ yhat)
table_qpoisson = coeftable(reg_poisson)
table_qpoisson.cols[2] = table_qpoisson.cols[2] * sqrt(sigma_sq)
println("table_qpoisson: \n$table_qpoisson")
Example-17-4.jl
using WooldridgeDatasets, GLM, DataFrames, Statistics, Distributions,
        LinearAlgebra, Optim
include("../03/getMats.jl")

# load data and build data matrices:
recid = DataFrame(wooldridge("recid"))
f = @formula(ldurat ~ 1 + workprg + priors + tserved +
                      felon + alcohol + drugs + black +
                      married + educ + age)
xy = getMats(f, recid)
y = xy[1]
X = xy[2]

# define dummy for censored observations:
censored = recid.cens .!= 0

# generate starting solution:
reg_ols = lm(@formula(ldurat ~ workprg + priors + tserved +
                               felon + alcohol + drugs +
                               black + married +
                               educ + age), recid)
resid_ols = residuals(reg_ols)
sigma_start = log(sum(resid_ols .^ 2) / length(resid_ols))
params_start = vcat(coef(reg_ols), sigma_start)

# define a function that returns the negative sum of the log likelihoods:
function ll_censreg(params, y, X, cens)
        p = size(X, 2)
        beta = params[1:p]
        sigma = exp(params[p+1])
        y_hat = X * beta
        ll = zeros(length(y))
        # uncensored:
        ll[.!cens] = log.(pdf.(Normal(),
                (y.-y_hat)[.!cens] / sigma)) .- log(sigma)
        # censored:
        ll[cens] = log.(cdf.(Normal(), -(y.-y_hat)[cens] / sigma))

        # return the negative sum of the log likelihoods:
        return -sum(ll)
end

# maximize the log likelihood = minimize the negative of the log likelihood:
optimum = optimize(par -> ll_censreg(par, y, X, censored), params_start, Newton())
mle_est = Optim.minimizer(optimum)
ll = -optimum.minimum

# print results of MLE:
table_mle = DataFrame(
        coef_names=vcat(coeftable(reg_ols).rownms, "exp_sigma"),
        mle_est=round.(mle_est, digits=5))
println("table_mle: \n$table_mle\n")
println("ll = $ll")
Example-17-5.jl
using WooldridgeDatasets, GLM, DataFrames, Distributions

# load data and build data matrices:
mroz = DataFrame(wooldridge("mroz"))

# step 1 (use all n observations to estimate a probit model of s_i on z_i):
reg_probit = glm(@formula(inlf ~ educ + exper +
                                 (exper^2) + nwifeinc +
                                 age + kidslt6 + kidsge6),
    mroz, Binomial(), ProbitLink())
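# fitted() returns predicted probabilities; applying the Normal quantile
# recovers the linear index needed for the inverse Mills ratio: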
pred_inlf_linpart = quantile.(Normal(), fitted(reg_probit))
mroz.inv_mills = pdf.(Normal(), pred_inlf_linpart) ./
                 cdf.(Normal(), pred_inlf_linpart)

# step 2 (regress y_i on x_i and inv_mills in sample selection):
mroz_subset = subset(mroz, :inlf => ByRow(==(1)))
reg_heckit = lm(@formula(lwage ~ educ + exper + (exper^2) +
                                 inv_mills), mroz_subset)

# print results:
table_reg_heckit = coeftable(reg_heckit)
println("table_reg_heckit: \n$table_reg_heckit")
Tobit-CondMean-Simulation.jl
using Random, Distributions, Statistics, Plots, GLM, DataFrames, LinearAlgebra, Optim

# set the random seed:
Random.seed!(12345)

x = sort(rand(Normal(), 100) .+ 4)
xb = -4 .+ 1 * x
y_star = xb .+ rand(Normal(), 100)
y = copy(y_star)
y[y_star.<0] .= 0
sim_data = DataFrame(y_star=y_star, y=y, x=x)


# define likelihood:
function ll_tobit(params, y, X)
    p = size(X, 2)
    beta = params[1:p]
    sigma = exp(params[p+1])
    y_hat = X * beta
    y_eq = (y .== 0)
    y_g = (y .> 0)
    ll = zeros(length(y))
    ll[y_eq] = log.(cdf.(Normal(), -y_hat[y_eq] / sigma))
    ll[y_g] = log.(pdf.(Normal(), (y-y_hat)[y_g] / sigma)) .- log(sigma)
    # return the negative sum of the log likelihoods:
    return -sum(ll)
end


# predictions:
reg_ols = lm(@formula(y ~ x), sim_data)
yhat_ols = fitted(reg_ols)
resid_ols = residuals(reg_ols)
sigma_start = log(sum(resid_ols .^ 2) / length(resid_ols))
params_start = vcat(coef(reg_ols), sigma_start)

# maximize the log likelihood = minimize the negative of the log likelihood:
X = hcat(ones(100), x)
optimum = optimize(par -> ll_tobit(par, y, X), params_start, Newton())
mle_est = Optim.minimizer(optimum)
yhat_tobit = X * mle_est[1:2]

# plot data and model predictions:

hline([0], color="grey", label=false, legend=:topleft)
scatter!(x, y_star, color="black", markershape=:xcross, label="latent data")
scatter!(x, y, color="black", markershape=:cross, label="censored data")
plot!(x, yhat_ols, color="black", linewidth=2, linestyle=:solid, label="OLS fit")
plot!(x, yhat_tobit, color="black", linewidth=2, linestyle=:dash, label="Tobit fit")
ylabel!("y")
xlabel!("x")
savefig("JlGraphs/Tobit-CondMean-Simulation.pdf")
Tobit-CondMean.jl
using Random, Distributions, Statistics, Plots

# set the random seed:
Random.seed!(12345)

x = sort(rand(Normal(), 100) .+ 4)
xb = -4 .+ 1 * x
y_star = xb .+ rand(Normal(), 100)
y = copy(y_star)
y[y_star.<0] .= 0

# conditional means:
Eystar = xb
Ey = cdf.(Normal(), xb) .* xb .+ pdf.(Normal(), xb)

# plot data and conditional means:
hline([0], color="grey", label=false, legend=:topleft)
scatter!(x, y_star, color="black", markershape=:xcross, label="y*")
scatter!(x, y, color="black", markershape=:cross, label="y")
plot!(x, Eystar, color="black", linewidth=2, linestyle=:solid, label="E(y*)")
plot!(x, Ey, color="black", linewidth=2, linestyle=:dash, label="E(y)")
ylabel!("y")
xlabel!("x")
savefig("JlGraphs/Tobit-CondMean.pdf")
TruncReg-Simulation.jl
using GLM, Random, Distributions, Statistics, Plots, DataFrames

# set the random seed:
Random.seed!(12345)

x = sort(rand(Normal(), 100) .+ 4)
y = -4 .+ 1 * x .+ rand(Normal(), 100)
compl = DataFrame(x=x, y=y)

mask = (y .> 0)

# complete observations and observed sample:
sample = DataFrame(x=x[mask], y=y[mask])

# predictions OLS:
reg_ols = lm(@formula(y ~ x), sample)
yhat_ols = fitted(reg_ols)

# benchmark: OLS on the complete data (what truncated regression should recover):
reg_tr = lm(@formula(y ~ x), compl)
yhat_tr = fitted(reg_tr)


# plot data and conditional means:
hline([0], color="grey", label=false, legend=:topleft)
scatter!(compl.x, compl.y, color="black", markershape=:circle,
    markercolor=:white, label="all data")
scatter!(sample.x, sample.y, color="black", markershape=:circle,
    label="sample data")
plot!(sample.x, yhat_ols, color="black", linewidth=2,
    linestyle=:dash, label="OLS fit")
plot!(compl.x, yhat_tr, color="black", linewidth=2,
    linestyle=:solid, label="Trunc. Reg. fit")
ylabel!("y")
xlabel!("x")
savefig("JlGraphs/TruncReg-Simulation.pdf")