Skip to content

Commit

Permalink
Revert "Revert "Allow missings in the modelmatrix, necessary to use…
Browse files Browse the repository at this point in the history
… `lead`/`lag` in `@formula` (#109)""

This reverts commit f68e456.
merged
  • Loading branch information
greimel committed Jun 29, 2022
1 parent 4887c70 commit 434acd4
Show file tree
Hide file tree
Showing 4 changed files with 223 additions and 34 deletions.
113 changes: 90 additions & 23 deletions src/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -139,27 +139,15 @@ function reg(
if nobs == N
esample = Colon()
end

# Compute weights
if has_weights
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
all(isfinite, weights) || throw("Weights are not finite")
else
weights = uweights(nobs)
end

# Compute feM, an AbstractFixedEffectSolver

has_intercept = hasintercept(formula)
has_fe_intercept = false
if has_fes
if any(fe.interaction isa UnitWeights for fe in fes)
has_fe_intercept = true
end
fes = FixedEffect[fe[esample] for fe in fes]
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads)
end
# Compute data for std errors
vcov_method = Vcov.materialize(view(df, esample, :), vcov)

##############################################################################
##
## Dataframe --> Matrix
Expand All @@ -171,14 +159,23 @@ function reg(
formula_schema = apply_schema(formula, s, FixedEffectModel, has_fe_intercept)

# Obtain y
# for a Vector{Float64}, convert(Vector{Float64}, y) aliases y
y = convert(Vector{Float64}, response(formula_schema, subdf))
all(isfinite, y) || throw("Some observations for the dependent variable are infinite")
# for a Vector{Float64}, conver(Vector{Float64}, y) aliases y
y = response(formula_schema, subdf)

# added in PR #109 to handle cases where formula terms introduce missings
# to be removed when fixed in StatsModels
esample2 = .!ismissing.(y)

# Obtain X
Xexo = convert(Matrix{Float64}, modelmatrix(formula_schema, subdf))
all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite")

Xexo = modelmatrix(formula_schema, subdf)

# PR #109, to be removed when fixed in StatsModels
if size(Xexo, 2) > 0
for c in eachcol(Xexo)
esample2 .&= .!ismissing.(c)
end
end

response_name, coef_names = coefnames(formula_schema)
if !(coef_names isa Vector)
coef_names = typeof(coef_names)[coef_names]
Expand All @@ -187,19 +184,89 @@ function reg(
if has_iv
subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in endo_vars)...))
formula_endo_schema = apply_schema(formula_endo, schema(formula_endo, subdf, contrasts), StatisticalModel)
Xendo = convert(Matrix{Float64}, modelmatrix(formula_endo_schema, subdf))
all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite")
Xendo = modelmatrix(formula_endo_schema, subdf)

# PR #109, to be removed when fixed in StatsModels
if size(Xendo, 2) > 0
for c in eachcol(Xendo)
esample2 .&= .!ismissing.(c)
end
end

_, coefendo_names = coefnames(formula_endo_schema)
append!(coef_names, coefendo_names)

subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in iv_vars)...))
formula_iv_schema = apply_schema(formula_iv, schema(formula_iv, subdf, contrasts), StatisticalModel)
Z = convert(Matrix{Float64}, modelmatrix(formula_iv_schema, subdf))
Z = modelmatrix(formula_iv_schema, subdf)

for c in eachcol(Z)
esample2 .&= .!ismissing.(c)
end

# PR #109, to be removed when fixed in StatsModels
if any(esample2 .== false)
Xendo = Xendo[esample2,:]
Z = Z[esample2,:]
end

Xendo = convert(Matrix{Float64}, Xendo)
all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite")

Z = convert(Matrix{Float64}, Z)
all(isfinite, Z) || throw("Some observations for the instrumental variables are infinite")

if size(Z, 2) < size(Xendo, 2)
throw("Model not identified. There must be at least as many ivs as endogeneneous variables")
end

# modify formula to use in predict
formula_schema = FormulaTerm(formula_schema.lhs, (tuple(eachterm(formula_schema.rhs)..., (term for term in eachterm(formula_endo_schema.rhs) if term != ConstantTerm(0))...)))
end

esample0 = esample == Colon() ? trues(size(df,1)) : copy(esample)

# PR #109, to be removed when fixed in StatsModels
if any(esample2 .== false)
if any(esample0 .== 0)
throw(ArgumentError("You passed a dataset missing observations and used formula terms that introduce missings. This is not yet supported."))
end
Xexo = Xexo[esample2,:]
y = y[esample2]
esample = copy(esample0)
esample[esample] .= esample2
nobs = sum(esample2)
end

y = convert(Vector{Float64}, y)
all(isfinite, y) || throw("Some observations for the dependent variable are infinite")

Xexo = convert(Matrix{Float64}, Xexo)
all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite")

# Compute weights
if has_weights
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
else
weights = uweights(nobs)
end
all(isfinite, weights) || throw("Weights are not finite")
sqrtw = sqrt.(weights)

# Compute feM, an AbstractFixedEffectSolver
has_intercept = hasintercept(formula)
has_fe_intercept = false
if has_fes
if any(fe.interaction isa UnitWeights for fe in fes)
has_fe_intercept = true
end
fes = FixedEffect[fe[esample] for fe in fes]
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads)
end
# Compute data for std errors
vcov_method = Vcov.materialize(view(df, esample, :), vcov)


# compute tss now before potentially demeaning y
tss_total = tss(y, has_intercept | has_fe_intercept, weights)
# create unitilaized
Expand Down
56 changes: 45 additions & 11 deletions src/partial_out.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,52 @@ function partial_out(
fes, ids, ids_fes, formula = parse_fixedeffect(df, formula)
has_fes = !isempty(fes)


nobs = sum(esample)
(nobs > 0) || throw("sample is empty")

if has_fes
# in case some FixedEffect does not have interaction, remove the intercept
if any(isa(fe.interaction, UnitWeights) for fe in fes)
formula = FormulaTerm(formula.lhs, tuple(ConstantTerm(0), (t for t in eachterm(formula.rhs) if t!= ConstantTerm(1))...))
has_fes_intercept = true
end
end

# Extract Y
vars = unique(StatsModels.termvars(formula))
subdf = Tables.columntable(disallowmissing!(df[esample, vars]))
formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...))
formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel)
Y = modelmatrix(formula_y_schema, subdf)

# Extract X
formula_x = FormulaTerm(ConstantTerm(0), formula.rhs)
formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel)
X = modelmatrix(formula_x_schema, subdf)

# added in PR #109 to handle cases where formula terms introduce missings
# to be removed when fixed in StatsModels
esample2 = trues(size(Y, 1))
for c in eachcol(Y)
esample2 .&= .!ismissing.(c)
end
if size(X, 2) > 0 # X can have zero rows if all regressors are fixed effects
for c in eachcol(X)
esample2 .&= .!ismissing.(c)
end
end

if any(!, esample2)
esample = esample2
Y = Y[esample,:]
X = X[esample,:]
nobs = sum(esample)
end

# Disallow missings
Y = convert(Matrix{Float64}, Y)
X = convert(Matrix{Float64}, X)

# Compute weights
if has_weights
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
Expand All @@ -85,14 +128,8 @@ function partial_out(
fes = FixedEffect[fe[esample] for fe in fes]
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method})
end

# Compute residualized Y
vars = unique(StatsModels.termvars(formula))
subdf = Tables.columntable(disallowmissing!(df[esample, vars]))
formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...))
formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel)
Y = convert(Matrix{Float64}, modelmatrix(formula_y_schema, subdf))

ynames = coefnames(formula_y_schema)[2]
if !isa(ynames, Vector)
ynames = [ynames]
Expand All @@ -107,9 +144,6 @@ function partial_out(
end

# Compute residualized X
formula_x = FormulaTerm(ConstantTerm(0), formula.rhs)
formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel)
X = convert(Matrix{Float64}, modelmatrix(formula_x_schema, subdf))
if has_fes
X, b, c = solve_residuals!(X, feM; maxiter = maxiter, tol = tol, progress_bar = false)
append!(iterations, b)
Expand Down
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
FixedEffects = "c8885935-8500-56a7-9867-7708b20db0eb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
Expand Down
87 changes: 87 additions & 0 deletions test/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,93 @@ df.Price_zero = copy(df.Price)
df.Price_zero[1] = 0.0
m = @formula Sales ~ log(Price_zero)
@test_throws "Some observations for the regressor are infinite" reg(df, m)

# function that introduces missing
using StatsModels: lag
df.id1 = df.State
df.y = df.Sales
df.x1 = df.Price
df.z1 = df.Pimin
df.w = df.Pop
df.x1_lagged = lag(df.x1)
df.z1_lagged = lag(df.z1)
df.y_lagged = lag(df.y)


df.x1_m1 = Array{Union{Float64,Missing}}(copy(df.x1))
df.x1_m1[end-20:end] .= missing

df.x1_m2 = Array{Union{Float64,Missing}}(copy(df.x1))
df.x1_m2[5:10:end] .= missing

df.x1_lagged = lag(df.x1)
df.z1_lagged = lag(df.z1)


function test_lags(m0, m1, descr)
@testset "$descr" begin
x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w)
x1 = reg(df, m1, Vcov.cluster(:id1), weights=:w)

@test x0.residuals == x1.residuals
@test x0.coef == x1.coef
@test x0.nobs == x1.nobs
@test x0.vcov == x1.vcov
end
end

function test_lags_broken(m0, m1, descr)
@testset "$descr" begin
x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w)
@test_throws ArgumentError reg(df, m1, Vcov.cluster(:id1), weights=:w)

#@test_ x0.coef != x1.coef
#@test_ x0.vcov != x1.vcov
end
end

# NOTE: This is a "dumb" lag function,
# it doesn't take into account time and group indices!
@testset "missings from @formula" begin
m0 = @formula y ~ x1_lagged + fe(id1)
m1 = @formula y ~ lag(x1) + fe(id1)
test_lags(m0, m1, "ols: _ ~ lag")

m0 = @formula y_lagged ~ x1 + fe(id1)
m1 = @formula lag(y) ~ x1 + fe(id1)
test_lags(m0, m1, "ols: lag ~ _")

m0 = @formula y ~ (x1_lagged ~ z1_lagged) + fe(id1)
m1 = @formula y ~ (lag(x1) ~ lag(z1)) + fe(id1)
test_lags(m0, m1, "iv: _ ~ (lag ~ lag)")

m0 = @formula y ~ (x1_lagged ~ z1) + fe(id1)
m1 = @formula y ~ (lag(x1) ~ z1) + fe(id1)
test_lags(m0, m1, "iv: _ ~ (lag ~ _)")

m0 = @formula y ~ (x1 ~ z1_lagged) + fe(id1)
m1 = @formula y ~ (x1 ~ lag(z1)) + fe(id1)
test_lags(m0, m1, "iv: _ ~ (_ ~ lag)")

# NOTE: The case where the df contains missings and the formula contains missings cannot be handled yet. The case with :x1_m1 would actually work, but the case with :x1_m2 would not. This because the missings in x1_m1 and x1_m2 are removed BEFORE the the lag is applied.

m0 = @formula y_lagged ~ x1_m1 + fe(id1)
m1 = @formula lag(y) ~ x1_m1 + fe(id1)
test_lags_broken(m0, m1, "ols: lag ~ _, with missings")

m0 = @formula y_lagged ~ x1_m2 + fe(id1)
m1 = @formula lag(y) ~ x1_m2 + fe(id1)
test_lags_broken(m0, m1, "ols: lag ~ _, with missings")

m0 = @formula y ~ (x1_m1 ~ z1_lagged) + fe(id1)
m1 = @formula y ~ (x1_m1 ~ lag(z1)) + fe(id1)
test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings")

m0 = @formula y ~ (x1_m2 ~ z1_lagged) + fe(id1)
m1 = @formula y ~ (x1_m2 ~ lag(z1)) + fe(id1)
test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings")
end

##############################################################################
##
## collinearity
Expand Down

0 comments on commit 434acd4

Please sign in to comment.