// ---------------------------------------------------------------------------
// Demonstration: coefficients of a logit model change when an independent
// covariate is dropped, while coefficients of a linear regression do not.
//
// Part 1: four independent uniform covariates, each with a true logit
// coefficient of 1. Nested logit models (4, 3, 2, 1 covariates) are fitted
// and compared, followed by the same comparison for an OLS regression.
//
// Requires the user-written command -coefplot- (not part of official Stata).
// ---------------------------------------------------------------------------
clear all
set obs 1000

forvalues i = 1/4 {
    generate x`i' = runiform(-3,3) // x's independent and balanced
}
summarize x*
pwcorr x*, sig // no substantial correlation

// Linear predictor (log odds): the sum of the four covariates.
egen ylo = rowtotal(x*)
//histogram ylo // normal distribution

// Convert the log odds into Pr(y = 1).
generate ypr = invlogit(ylo)
//histogram ypr // beta distribution

// Draw the binary outcome so that Pr(y = 1) = ypr. The comparison must be
// "<": with ">" the outcome would have probability 1 - ypr and every fitted
// coefficient would carry the opposite sign to the generating model.
generate y = runiform() < ypr
tabulate y // should be roughly "fair"

// Nested logit models, dropping one covariate at a time.
quietly logit y x*
estimates store four
quietly logit y x1-x3
estimates store three
quietly logit y x1-x2
estimates store two
quietly logit y x1
estimates store one

estimates table four three two one, se stats(bic r2_p) // Note se's become smaller
coefplot (four)(three)(two)(one), drop(_cons) // Decreasing se's is less obvious here

// Same comparison on the odds-ratio scale.
estimates table four three two one, se stats(bic r2_p) eform
coefplot (four)(three)(two)(one), drop(_cons) eform

// in contrast, in a regression we expect to see no change
// where the x's are perfectly independent/balanced.
// With randomly sampled x's we'll almost see this.
generate yreg = ylo + rnormal()
quietly regress yreg x1-x4
estimates store regfour
quietly regress yreg x1-x3
estimates store regthree
quietly regress yreg x1-x2
estimates store regtwo
quietly regress yreg x1
estimates store regone

estimates table regfour regthree regtwo regone, se stats(bic r2)
coefplot (regfour) (regthree)(regtwo)(regone), drop(_cons)
// We'll demonstrate this in perfectly balanced data, below.
// in perfectly balanced data, they should not change at all, see below

// ---------------------------------------------------------------------------
// Part 2: you see the same phenomenon in other distributions of x's
// ---------------------------------------------------------------------------

// Normally distributed covariates.
clear all
set obs 1000
forvalues i = 1/4 {
    generate x`i' = rnormal(0,3)
}
egen ylo = rowtotal(x*)
generate ypr = invlogit(ylo)
generate y = runiform() < ypr // Pr(y = 1) = ypr
tabulate y

quietly logit y x*
estimates store logitnorm
quietly logit y x1-x3
// "." refers to the active (most recently fitted) estimates.
estimates table logitnorm ., se stats(bic r2_p)

// less obvious in non-centered distributions
clear
set obs 1000
forvalues i = 1/4 {
    generate x`i' = runiform(-5,1)
}
egen ylo = rowtotal(x*)
generate y = runiform() < invlogit(ylo) // Pr(y = 1) = invlogit(ylo)

quietly logit y x*
estimates store noncentered
quietly logit y x1-x3
estimates table noncentered . , se stats(bic r2_p)

// ---------------------------------------------------------------------------
// Part 3: an example with coarse measures, where it is easier to
// examine the marginal distributions
// ---------------------------------------------------------------------------
clear
set obs 1000
forvalues i = 1/2 {
    generate x`i' = runiformint(-2,2)
}
tab x1 x2, chi2 // approx uniform and independent, 5 x 5 = 25 combinations

egen ylo = rowtotal(x*)
tab ylo // 9 possible values (-4 through 4)
generate ypr = invlogit(ylo)
tab ypr // converted to probabilities
graph twoway contourline ypr x2 x1, interp(none) levels(20)

generate y = runiform() < ypr // generate observed values

// NOTE(review): contents() is the pre-Stata 17 -table- syntax; on Stata 17 or
// newer these need version control (e.g. "version 16: table ...") - confirm
// the target Stata version.
table x1 x2, contents(mean ypr) // expected proportions
table x1, contents(mean ypr) // marginal expectation, given the x2 distribution

// the key point in all of this is that the expected proportions of
// x1 when x2==0 are not the same as the marginal expectation for x1
// (the conditional expectation is not the same as the marginal expectation).
// Because proportions are bounded [0,1], if we collapse our model
// (drop a variable), each point along the resulting marginal distribution
// will have shifted away from the boundary it was closest to - each point
// will have shifted toward the center.
// ---------------------------------------------------------------------------
// Coarse-measures example, continued: compare expected and observed
// proportions/counts per (x1, x2) cell, then fit the full and collapsed
// logit models. Uses x1, x2, ypr and y already in memory.
//
// NOTE(review): contents() is the pre-Stata 17 -table- syntax; on Stata 17 or
// newer these need version control (e.g. "version 16: table ...") - confirm
// the target Stata version.
// ---------------------------------------------------------------------------
graph bar (mean) ypr, over(x1) by(x2, row(1) title("Expected proportions")) ///
    ytitle("Fraction with y=1")

// Expected count of y = 1 per cell under a balanced design: the cell
// probability times (observations / number of cells). The number of cells is
// taken from the data rather than hard-coded, so it stays correct whatever
// range the x's were drawn from.
egen long cellid = group(x1 x2)
quietly summarize cellid, meanonly
generate yct = ypr * (_N / r(max))
drop cellid
table x1 x2, contents(mean yct) // expected counts

table x1 x2, contents(mean y) // observed proportions
graph bar (mean) y, over(x1) by(x2, row(1) title("Observed proportions")) ///
    ytitle("Fraction with y=1")
table x1 x2, contents(sum y) // observed counts

quietly logit y x1 x2
estimates store log2x
quietly logit y x1
// we see the coefficient for x1 is biased toward 0 (OR=1)
estimates table log2x ., se stats(bic r2_p)

// ---------------------------------------------------------------------------
// Suppose the data are exactly/perfectly balanced
// ---------------------------------------------------------------------------
clear
input x1 x2
-1 -1
0 0
1 1
end
// Expand the three diagonal rows to all 3 x 3 combinations of x1 and x2.
fillin x1 x2

generate ypr = invlogit(x1 + x2)
table x1 x2, contents(mean ypr)
table x1, contents(mean ypr)
graph twoway contourline ypr x2 x1, levels(10)

// No change in a regression with perfect balance
expand 10 // 10 replicates of every (x1, x2) combination
generate yreg = x1 + x2 + rnormal()
quietly regress yreg x1 x2
estimates store regbal
quietly regress yreg x1
// "." refers to the active (most recently fitted) estimates.
estimates table regbal ., se stats(bic r2)
coefplot (regbal) ( . ), drop(_cons)