* tekst2

*-------------------------------------------
* grameen 3
*-------------------------------------------

* Start from an empty session and make the data folder the working directory.
clear all
global path "C:\Users\Miha\Desktop\impact evaluation\problem_sets\data"
cd "$path"

* Load the household panel file.
use hh_9198, clear

* Balance check: count the rows recorded for each household id (nh) and
* tabulate those counts. A single value in the table means that every
* household contributes the same number of rows.
by nh, sort: egen check = sum(1)
tabulate check
drop check


*----------------------
* DID
*----------------------

* Simple difference-in-differences from group means and the ttest command.

* Copy each wave's expenditure onto every row of the household:
* exptot91 carries the year==0 value, exptot98 the year==1 value.
generate exptot0 = exptot if year == 0
by nh, sort: egen exptot91 = max(exptot0)

generate exptot1 = exptot if year == 1
by nh, sort: egen exptot98 = max(exptot1)

* Log transform, ln(1 + x), of both wave-specific variables.
generate lexptot91 = ln(1 + exptot91)
generate lexptot98 = ln(1 + exptot98)

* Group means are taken on the year==1 rows only, so each household counts once.
* Households with dfmfd == 1: second-wave mean, then first-wave mean.
summarize lexptot98 if dfmfd == 1 & year == 1
global yt1p1 = r(mean)
summarize lexptot91 if dfmfd == 1 & year == 1
global yt0p1 = r(mean)

* Households with dfmfd == 0: second-wave mean, then first-wave mean.
summarize lexptot98 if dfmfd == 0 & year == 1
global yt1p0 = r(mean)
summarize lexptot91 if dfmfd == 0 & year == 1
global yt0p0 = r(mean)

* DID = (change for dfmfd==1) minus (change for dfmfd==0).
display ($yt1p1-$yt0p1)-($yt1p0-$yt0p0)


* Same comparison through a t test on the within-household change in log expenditure,
* first by dmmfd and then by dfmfd.
generate lexptot9891 = lexptot98 - lexptot91
ttest lexptot9891 if year == 1, by(dmmfd)
ttest lexptot9891 if year == 1, by(dfmfd)

* Alternatively: the same DID estimate from a regression with an interaction term.
generate y = ln(1 + exptot)                 /* outcome variable in logs */

generate dfmfd1 = (dfmfd == 1 & year == 1)  /* dfmfd equal to 1 on the year==1 row */
bysort nh: egen p = max(dfmfd1)             /* same flag copied to every row of the household */
generate yearp = year * p                   /* participation x time interaction; its coefficient is the DID estimate */

* Pooled OLS. Attention to the standard errors: the same household (nh) appears
* on more than one row, and this regression treats all rows as independent.
regress y year p yearp

* Declare the panel identifier before calling xtreg. Without a declared panel
* variable xtreg stops with an error unless the dataset was saved already xtset.
xtset nh
xtreg y year p yearp

* Same model with additional controls; land is rescaled by 100 and logged.
generate lnland = ln(1 + hhland/100)
xtreg y year p yearp sexhead agehead educhead lnland vaccess pcirr rice wheat milk oil egg

* Accounting for unobserved heterogeneity.

* Within-household change in the outcome: second row minus first row once the
* data are ordered by household and year.
sort nh year
by nh: generate dy = y[2] - y[1]

* First-difference regression on one row per household, followed by the
* fixed-effects estimator on the stacked data.
regress dy p if year == 0
xtreg y year p yearp, fe i(nh)

* First-difference regression with the controls entered in levels.
regress dy p sexhead agehead educhead lnland vaccess pcirr rice wheat milk oil egg if year == 0

* Build the within-household change of every control (prefix d + variable name).
sort nh year
foreach x in sexhead agehead educhead lnland vaccess pcirr rice wheat milk oil egg {
    by nh: generate d`x' = `x'[2] - `x'[1]
}

* First-difference regression with the controls both in levels and in changes.
regress dy p sexhead agehead educhead lnland vaccess pcirr rice wheat milk oil egg ///
    dsexhead dagehead deduchead dlnland dvaccess dpcirr drice dwheat dmilk doil degg if year == 0


*-------------------------------------------
* grameen 2
*-------------------------------------------

* Set the working directory through a Stata global.
clear all
global path "C:\Users\Miha\Desktop\impact evaluation\problem_sets\data"
cd "$path"

* Note: a global is a simple way to save time and space in a do-file. It is a
* name (here "path") that stays in Stata's memory and stands for a piece of
* text, here the folder given on the global line above. Writing the name with
* a dollar sign in front makes Stata substitute that text.

* Note: once the directory is set, the data files you generate are saved in that folder.

* Load the 1998 household file. No folder has to be typed because the working
* directory already points at it.
use hh_98.dta, clear

* List every variable in the data set.
describe

* Look at the first ten observations in the data browser.
browse in 1/10

* Mean and standard deviation of family size for households with dfmfd = 0 and dfmfd = 1.
table dfmfd, contents(mean famsize sd famsize) row

* Value labels for the two participation dummies.
label define dfmfd_l 1 Yes 0 No
label define dmmfd_l 1 Yes 0 No

label values dfmfd dfmfd_l
* dmmfd gets its own label set (it was previously attached to dfmfd_l, which
* left dmmfd_l defined but unused).
label values dmmfd dmmfd_l


* Mean and standard deviation of household characteristics by dfmfd.
foreach v in famsize hhasset hhland agehead educhead sexhead {
    table dfmfd, contents(mean `v' sd `v') row f(%4.3f)
}


* Other variables can be labelled in the same way.
label define sexlabel 0 Female 1 Male
label values sexhead sexlabel

* Family size, education and age of the head, by sex of the head.
table sexhead, contents(mean famsize sd famsize) row f(%3.2f)
table sexhead, contents(mean educhead sd educhead) row f(%4.3f)
table sexhead, contents(mean agehead sd agehead) row f(%4.3f)

summarize vaccess
summarize pcirr

* Households where both participation dummies are zero.
generate no_participants = (dfmfd == 0 & dmmfd == 0)
tabulate no_participants

table no_participants, contents(mean rice sd rice) row f(%3.2f)
table no_participants, contents(mean wheat sd wheat) row f(%3.2f)

* Grouping dummies for the expenditure tables below.
* Note that a missing value compares as larger than any number in Stata, so
* rows with a missing input are coded 1 by these three lines.
generate some_educ = (educhead > 0)
generate large_hh = (famsize > 5)
generate large_land = (hhland > 50)

* Means, then standard deviations, of total, food and non-food expenditure
* for each grouping variable.
foreach g in sexhead some_educ large_hh large_land {
    table `g', contents(mean exptot mean expfd mean expnfd) row f(%4.3f)
    table `g', contents(sd exptot sd expfd sd expnfd) row f(%4.3f)
}


*---------------*
* Randomisation *
*---------------*
/*
Assume that microcredit programs are randomly assigned to households (in reality no
such random assignment takes place; the assumption is made only to demonstrate how a
randomised impact evaluation is implemented).
The aim is to measure the impact of program participation on the household's
per capita total expenditure.
*/

* Use the 1998 household data:
use hh_98, clear

* Log form of the outcome variable (exptot) and of the household's land (hhland),
* the latter converted from decimals to acres by dividing by 100:
generate lexptot = ln(1 + exptot)
generate lnland = ln(1 + hhland/100)

/*
Start with the simplest way to estimate the average effect of program participation:
the ttest command, which compares the mean outcome between the two groups defined
by the by() variable.
*/

* Impact of program participation, t test
ttest lexptot, by(dmmfd)
ttest lexptot, by(dfmfd)

* The standard error and the confidence interval reported by Stata can be replicated by hand.
* NOTE(review): the constants below are typed in from earlier output (presumably the
* standard deviation from summarize and the group sizes and difference from ttest);
* check them against your own output before relying on the result.
* Grouping uses parentheses: in Stata expressions square brackets are reserved for
* subscripts and equation references.
summarize lexptot
display sqrt(.5138679^2/((534/1129)*(595/1129)*1129))
display  -.005102+1.965*sqrt(.5138679^2/((534/1129)*(595/1129)*1129))
display  -.005102-1.965*sqrt(.5138679^2/((534/1129)*(595/1129)*1129))

* Regression implementation of the same comparison
regress lexptot dmmfd
regress lexptot dfmfd

* Expanded regression with household and village controls
regress lexptot dmmfd sexhead agehead educhead lnland vaccess pcirr rice wheat milk oil egg
regress lexptot dfmfd sexhead agehead educhead lnland vaccess pcirr rice wheat milk oil egg


*-------------------------------------------
* grameen 1
*-------------------------------------------

* Install the user-written pscore command from SSC only when Stata cannot
* already find it. This removes the need to comment the line out by hand after
* the first run and lets the script run without a network connection.
capture which pscore
if _rc {
    ssc install pscore, replace
}

clear all
global path "/Users/danielepacifico/Dropbox/Lectures/Impact Evaluation/ImpactEvaluation_Data_files"
cd "$path"

* 1998 household data
use hh_98, clear

* Outcome and land in logs; land is divided by 100 before taking logs.
generate lexptot = ln(1 + exptot)
generate lnland = ln(1 + hhland/100)


*----------------------
* Female participants |
*----------------------
tabulate dfmfd   /* this is the treatment variable */
* Attach a value label to the treatment variable:
label define treatment 1 "treated" 0 "untreated"
label values dfmfd treatment
tabulate dfmfd

* Propensity-score equation (logit), first without and then with oil among the
* covariates. ps98 and blockf1 are dropped in between so that the second call
* can create them again.
pscore dfmfd sexhead agehead educhead lnland vaccess pcirr rice wheat milk egg, ///
    pscore(ps98) blockid(blockf1) comsup level(0.01) logit
capture drop ps98 blockf1
pscore dfmfd sexhead agehead educhead lnland vaccess pcirr rice wheat milk egg oil, ///
    pscore(ps98) blockid(blockf1) level(0.01) comsup logit

table dfmfd, c(sum ps98 min ps98 max ps98)
tabulate comsup dfmfd

/* From the output of this command you can see that the variable comsup generated
by pscore includes, by default, the whole set of treated units. */

* However, compare the estimated score densities of the two groups
* (the second legend entry previously read "Unreated"):
twoway (kdensity ps98 if dfmfd==1, color(red) xtitle(propensity score) ///
            legend(label(1 "Treated") label(2 "Untreated"))) ///
       (kdensity ps98 if dfmfd==0, color(blue))

/* As the figure shows, some treated units have a propensity score so high that
no similar untreated unit can be found for them. */


* So build a second common-support flag that also leaves those treated units out.
capture drop comsup2
summarize ps98 if dfmfd==1
* Lower bound: the smallest score among the treated.
* NOTE(review): the strict > also sets the flag to 0 for the treated unit(s)
* sitting exactly at that minimum; confirm this is intended.
generate comsup2 = (ps98 > r(min))
summarize comsup*   /* at this point the new flag should match the one pscore creates */

summarize ps98 if dfmfd==0
* Upper bound: the largest score among the untreated.
replace comsup2 = 0 if ps98 > r(max)
tabulate comsup2 dfmfd   /* author's run: imposing common support excludes 5 treated and 5 untreated units */

*---------------------------
*Nearest Neighbor Matching |
*---------------------------
* ATT by nearest-neighbour matching on the trimmed sample (comsup2 == 1).
attnd lexptot dfmfd if comsup2 == 1, pscore(ps98)

* The run above reports analytical standard errors. The run below derives them
* by bootstrap instead (300 replications; the seed is fixed first).
set seed 123456789
attnd lexptot dfmfd if comsup2 == 1, pscore(ps98) bootstrap reps(300)

*---------------------
* Stratified matching
*---------------------
* Treated and untreated counts inside each block created by pscore, then the ATT by block.
tabulate blockf1 dfmfd
atts lexptot dfmfd, pscore(ps98) blockid(blockf1)
* Recorded output:
*n.treat. n.contr. ATT Std.Err. t
*590 529 0.100 0.030 3.334

*-----------------
* Radius matching
*-----------------
* Two radii, 0.001 and 0.01. Add the options bootstrap reps(300) to either
* line to obtain bootstrapped standard errors.
attr lexptot dfmfd if comsup2 == 1, pscore(ps98) radius(0.001) comsup
attr lexptot dfmfd if comsup2 == 1, pscore(ps98) radius(0.01) comsup

*-----------------
* Kernel matching
*-----------------
tabulate dfmfd comsup2

* Trimmed sample, bootstrapped standard errors.
set seed 123456789
attk lexptot dfmfd if comsup2 == 1, pscore(ps98) bootstrap reps(300)
* Recorded output:
*n. treat. n.contr. ATT Std.Err. t
*590 529 0.112 .0305959 3.634

* Without trimming the data.
set seed 123456789
attk lexptot dfmfd, pscore(ps98) bootstrap reps(300)
* Recorded output:
*n. treat. n.contr. ATT Std.Err. t
*595 534 0.110 .0304021 3.634

* Trimmed data with the epan option (Epanechnikov kernel instead of the Gaussian), bandwidth 0.001.
set seed 123456789
attk lexptot dfmfd if comsup2 == 1, pscore(ps98) comsup epan bwidth(0.001) bootstrap reps(300)
* Recorded output:
*n. treat. n.contr. ATT Std.Err. t
*595 530 0.111 0.042 2.654

* Same with bandwidth 0.01.
set seed 123456789
attk lexptot dfmfd if comsup2 == 1, pscore(ps98) comsup epan bwidth(0.01) bootstrap reps(300)
* Recorded output:
*n. treat. n.contr. ATT Std.Err. t
*595 530 0.102 0.032 3.198



*-------------------------------------------
* outcome equation
*-------------------------------------------

clear all
global path "/Users/danielepacifico/Dropbox/Lectures/Impact Evaluation/ImpactEvaluation_Data_files"
cd "$path"

use card, clear

* Outcome equation by OLS with robust standard errors.
regress lwage educ, vce(robust)

* Choosing an instrument: correlation of educ with each candidate.
correlate educ nearc2
correlate educ nearc4

* IV estimator by hand: cov(lwage, nearc4) divided by cov(educ, nearc4).
correlate lwage nearc4, covariance
global cov_yz = r(cov_12)

correlate educ nearc4, covariance
global cov_tz = r(cov_12)

display $cov_yz/$cov_tz

* The two-stage procedure gives the same point estimate: regress educ on the
* instrument, then regress lwage on the fitted values.
regress educ nearc4
predict fv
regress lwage fv

/* With a single (endogenous) regressor in the outcome equation and a dummy
instrument, the Wald estimator gives the same result: the difference in mean
lwage between nearc4 = 1 and nearc4 = 0, divided by the difference in mean educ. */

summarize lwage if nearc4 == 1
global y1 = r(mean)
summarize lwage if nearc4 == 0
global y0 = r(mean)

summarize educ if nearc4 == 1
global t1 = r(mean)
summarize educ if nearc4 == 0
global t0 = r(mean)

display ($y1-$y0)/($t1-$t0)

* IV estimate with Stata's built-in command.
ivregress 2sls lwage (educ = nearc4)

/* estat endogenous tests whether the regressors treated as endogenous are in
fact exogenous (Wu-Hausman test for endogeneity). */

estat endogenous
* Author's reading of the output: the null of exogeneity of educ is rejected.
*we reject the null of exogeneity of EDUC

* Just-identified model with additional covariates.
ivregress 2sls lwage (educ = nearc4) ///
    exper expersq black reg661-reg668 smsa smsa66 south

* 2SLS in an over-identified model: two instruments for one endogenous regressor.
ivregress 2sls lwage (educ = nearc4 nearc2) ///
    exper expersq black reg661-reg668 smsa smsa66 south

* The same in two explicit steps: first stage, fitted values, second stage.
regress educ nearc4 nearc2 exper expersq black reg661-reg668 smsa smsa66 south
predict fv1
regress lwage fv1 exper expersq black reg661-reg668 smsa smsa66 south

* estat overid after ivregress reports tests of the over-identifying restrictions
* (Sargan's and Basmann's (1960) tests). Under the null hypothesis all
* instruments are valid.
ivregress 2sls lwage (educ = nearc4 nearc2) ///
    exper expersq black reg661-reg668 smsa smsa66 south
estat overid
* Author's reading of the output: the null that all instruments are valid is not rejected.

* Over-identified and just-identified IV estimates side by side.
ivregress 2sls lwage (educ = nearc4 nearc2) ///
    exper expersq black reg661-reg668 smsa smsa66 south
ivregress 2sls lwage (educ = nearc4) ///
    exper expersq black reg661-reg668 smsa smsa66 south




*-------------------------------------------
* unemployment benefit duration
*-------------------------------------------

clear all
global path "/Users/danielepacifico/Dropbox/Lectures/Impact Evaluation/ImpactEvaluation_Data_files"
cd "$path"

use AER_Lalive_Subsample.dta, clear

* First look at the running variable (age) and the outcome.
summarize age unemployment_duration
table age50, contents(mean unemployment_duration) row
tabulate age

* Group age into bins of width 0.25 between 46 and 54.
egen age_bins = cut(age), at(46(0.25)54)
tabulate age_bins
summarize age_bins

* Mean outcome inside each bin, copied to every observation of the bin.
by age_bins, sort: egen mean_unempdur = mean(unemployment_duration)

summarize mean_unempdur

* Linear fit on each side of the cutoff at age 50.
twoway (scatter mean_unempdur age_bins) ///
       (lfit mean_unempdur age_bins if age_bins < 50) ///
       (lfit mean_unempdur age_bins if age_bins >= 50), ///
       xlabel(46(1)54) ylabel(0(10)40) xline(50)

* Quadratic fit on each side of the cutoff.
twoway (scatter mean_unempdur age_bins) ///
       (qfit mean_unempdur age_bins if age_bins < 50) ///
       (qfit mean_unempdur age_bins if age_bins >= 50), ///
       xlabel(46(1)54) ylabel(0(10)40) xline(50)

* Linear and quadratic fits in one graph.
twoway (scatter mean_unempdur age_bins) ///
       (lfit mean_unempdur age_bins if age_bins < 50) ///
       (lfit mean_unempdur age_bins if age_bins >= 50) ///
       (qfit mean_unempdur age_bins if age_bins < 50) ///
       (qfit mean_unempdur age_bins if age_bins >= 50), ///
       xlabel(46(1)54) ylabel(0(10)40) xline(50)


* Treatment dummy: 1 for age 50 and above.
generate t = (age >= 50)
tabulate t

/* Age centred at the cutoff (agesc), its powers 2 to 4 and their interactions
with the treatment dummy. These allow pooled polynomial regressions of order
1, 2, 3 and 4 with a different polynomial on each side of the cutoff. */

generate agesc = age - 50
generate t_agesc = t*agesc

forvalues k = 2/4 {
    * capture lets the block be rerun when the variables already exist
    capture generate agesc`k' = agesc^`k'
    capture generate t_agesc`k' = t*agesc`k'
}

* Pooled regressions of order 1 to 4; the coefficient on t is the jump at the cutoff.
regress unemployment_duration t agesc t_agesc, robust
regress unemployment_duration t agesc agesc2 t_agesc t_agesc2, robust
regress unemployment_duration t agesc agesc2 agesc3 t_agesc t_agesc2 t_agesc3, robust
regress unemployment_duration t agesc agesc2 agesc3 agesc4 t_agesc t_agesc2 t_agesc3 t_agesc4, robust

* Fitted values of the last (4th-order) regression.
predict fitq4

* Binned means with the quadratic fit of the bin means and the 4th-order fitted
* values on each side of the cutoff. The sort option makes each fitted line
* connect its points in increasing order of age; the data are ordered by
* age_bins only, so without it the line can double back on itself inside a bin.
twoway (scatter mean_unempdur age_bins) ///
       (qfit mean_unempdur age_bins if age_bins < 50) ///
       (qfit mean_unempdur age_bins if age_bins >= 50) ///
       (line fitq4 age if age_bins < 50, sort) ///
       (line fitq4 age if age_bins >= 50, sort), ///
       xlabel(46(1)54) ylabel(0(10)40) xline(50)


* Local polynomial regressions, one on each side of the cutoff, evaluated at
* the observed values of age. degree(3) matches the specification used in the
* bootstrap that follows, so that the bootstrapped standard error refers to
* the same estimator as the point estimate displayed here.
capture drop output*
lpoly unemployment_duration age if age<50, degree(3) kernel(epan2) generate(output0) at(age) nograph
lpoly unemployment_duration age if age>=50, degree(3) kernel(epan2) generate(output1) at(age) nograph

* Average smoothed outcome in the year just below the cutoff ...
summarize output0 if age>=49 & age<50
scalar outcome0 = r(mean)

* ... and in the year just above it.
summarize output1 if age>=50 & age<51
scalar outcome1 = r(mean)

* Estimated discontinuity: difference between the two averages.
scalar diff_outcome = outcome1 - outcome0
display diff_outcome

* Binned means with the two local polynomial smooths. The sort option makes
* each line connect its points in increasing order of age.
twoway (scatter mean_unempdur age_bins) ///
       (line output0 age if age<50, sort) ///
       (line output1 age if age>=50, sort), ///
       xlabel(46(1)54) ylabel(5(10)40) xline(50)

* Linear fit, quadratic fit and local polynomial smooth in one graph.
twoway (scatter mean_unempdur age_bins) ///
       (lfit mean_unempdur age_bins if age_bins < 50) ///
       (lfit mean_unempdur age_bins if age_bins >= 50) ///
       (qfit mean_unempdur age_bins if age_bins < 50) ///
       (qfit mean_unempdur age_bins if age_bins >= 50) ///
       (line output0 age if age_bins < 50, sort) ///
       (line output1 age if age_bins >= 50, sort), ///
       xlabel(46(1)54) ylabel(0(10)40) xline(50)



* Bootstrap standard error for the local polynomial estimate of the jump.
* rdd_effect receives one bootstrap estimate per replication, stored in
* observations 1 to 50 of the original data.
capture drop rdd_effect
generate rdd_effect = .

set seed 123456789
forvalues rep = 1/50 {
    quietly {
        * Work on a resample drawn with replacement; the original data come back at restore.
        preserve
        bsample
        capture drop output*
        lpoly unemployment_duration age if age<50, degree(3) kernel(epan2) generate(output0) at(age) nograph
        lpoly unemployment_duration age if age>=50, degree(3) kernel(epan2) generate(output1) at(age) nograph

        * Average smoothed outcome in the year below and in the year above the cutoff.
        summarize output0 if age>=49 & age<50
        scalar outcome0 = r(mean)

        summarize output1 if age>=50 & age<51
        scalar outcome1 = r(mean)

        scalar diff_outcome = outcome1 - outcome0
        restore

        * Scalars survive restore, so the estimate can be written into the original data.
        replace rdd_effect = diff_outcome if _n == `rep'
    }
    * progress counter
    display `rep'
}

* Mean and standard deviation of the 50 bootstrap estimates.
summarize rdd_effect
* Recorded output:
*   N. Effect     SE
*  rdd_effect |    50    10.33221    2.924757




*-------------------------------------------
* unemployment benefit duration
*-------------------------------------------


clear all
global path "/Users/danielepacifico/Dropbox/Lectures/Impact Evaluation/ImpactEvaluation_Data_files"
* The folder name contains a space ("Impact Evaluation"), so every use of the
* global must be wrapped in double quotes; unquoted, the file name given to
* use is cut at the space.
cd "$path"

use "$path/AER_Lalive_Subsample.dta", clear

* First look at the running variable (age) and the outcome.
summarize age unemployment_duration
table age50, c(mean unemployment_duration) row
tabulate age

* Group age into bins of width 0.25 between 46 and 54.
egen age_bins = cut(age), at(46(0.25)54)
tabulate age_bins
summarize age_bins

* Mean outcome inside each bin, copied to every observation of the bin.
bysort age_bins: egen mean_unempdur = mean(unemployment_duration)

summarize mean_unempdur

* Linear fit on each side of the cutoff at age 50.
scatter mean_unempdur age_bins || lfit mean_unempdur age_bins if age_bins < 50 ///
    || lfit mean_unempdur age_bins if age_bins >= 50, xlabel(46(1)54) ylabel(0(10)40) xline(50)

* Quadratic fit on each side of the cutoff.
scatter mean_unempdur age_bins || qfit mean_unempdur age_bins if age_bins < 50 ///
    || qfit mean_unempdur age_bins if age_bins >= 50, xlabel(46(1)54) ylabel(0(10)40) xline(50)

* Treatment dummy: 1 for age 50 and above.
generate t = (age >= 50)
tabulate t

/* Age centred at the cutoff (agesc), its powers 2 to 4 and the interaction of
each with the treatment dummy, for pooled polynomial regressions of order
1, 2, 3 and 4. */

generate agesc = age - 50
generate t_agesc = t*agesc

forvalues power = 2/4 {
    capture generate agesc`power' = agesc^`power'
    capture generate t_agesc`power' = t*agesc`power'
}

* Pooled regressions of increasing order, robust standard errors.
regress unemployment_duration t agesc t_agesc, robust
regress unemployment_duration t agesc agesc2 t_agesc t_agesc2, robust
regress unemployment_duration t agesc agesc2 agesc3 t_agesc t_agesc2 t_agesc3, robust
regress unemployment_duration t agesc agesc2 agesc3 agesc4 t_agesc t_agesc2 t_agesc3 t_agesc4, robust


* Local polynomial regressions of degree 3, one on each side of the cutoff,
* evaluated at the observed values of age.
capture drop output*
lpoly unemployment_duration age if age<50, degree(3) kernel(epan2) generate(output0) at(age) nograph
lpoly unemployment_duration age if age>=50, degree(3) kernel(epan2) generate(output1) at(age) nograph

* Average smoothed outcome in the year just below the cutoff ...
summarize output0 if age>=49 & age<50
scalar outcome0 = r(mean)

* ... and in the year just above it.
summarize output1 if age>=50 & age<51
scalar outcome1 = r(mean)

* Estimated discontinuity: difference between the two averages.
scalar diff_outcome = outcome1 - outcome0
display diff_outcome

* Binned means with the two smooths. The sort option makes each line connect
* its points in increasing order of age; the data are ordered by age_bins only,
* so without it the line can double back on itself inside a bin.
twoway (scatter mean_unempdur age_bins) ///
       (line output0 age if age<50, sort) ///
       (line output1 age if age>=50, sort), ///
       xlabel(46(1)54) ylabel(5(10)40) xline(50)

* Standard error of the local polynomial estimate by bootstrap: 50 resamples,
* one estimate of the jump per resample, stored in rdd_effect.
capture drop rdd_effect
generate rdd_effect = .

set seed 123456789
forvalues b = 1/50 {
    quietly {
        * Resample with replacement inside preserve/restore so the original data are kept.
        preserve
        bsample
        capture drop output*
        lpoly unemployment_duration age if age<50, degree(3) kernel(epan2) generate(output0) at(age) nograph
        lpoly unemployment_duration age if age>=50, degree(3) kernel(epan2) generate(output1) at(age) nograph

        * Left side: mean of the smooth for ages in [49, 50).
        summarize output0 if age>=49 & age<50
        scalar outcome0 = r(mean)

        * Right side: mean of the smooth for ages in [50, 51).
        summarize output1 if age>=50 & age<51
        scalar outcome1 = r(mean)

        scalar diff_outcome = outcome1 - outcome0
        restore

        * Store this replication's estimate in row b of the original data.
        replace rdd_effect = diff_outcome if _n == `b'
    }
    * progress counter
    display `b'
}

* The standard deviation reported here is the bootstrap standard error.
summarize rdd_effect
* Recorded output:
*   N. Effect     SE
*  rdd_effect |    50    10.33221    2.924757

* Comments