From 434acd4aa9ae3690b8275d080ea916004e96f7c5 Mon Sep 17 00:00:00 2001 From: Fabian Greimel Date: Wed, 29 Jun 2022 13:59:13 +0200 Subject: [PATCH] Revert "Revert "Allow `missings` in the modelmatrix, necessary to use `lead`/`lag` in `@formula` (#109)"" This reverts commit f68e456fc33485079038a9af7af520be05654282. merged --- src/fit.jl | 113 ++++++++++++++++++++++++++++++++++++--------- src/partial_out.jl | 56 +++++++++++++++++----- test/Project.toml | 1 + test/fit.jl | 87 ++++++++++++++++++++++++++++++++++ 4 files changed, 223 insertions(+), 34 deletions(-) diff --git a/src/fit.jl b/src/fit.jl index f561ea6..0bd6157 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -139,27 +139,15 @@ function reg( if nobs == N esample = Colon() end - - # Compute weights - if has_weights - weights = Weights(convert(Vector{Float64}, view(df, esample, weights))) - all(isfinite, weights) || throw("Weights are not finite") - else - weights = uweights(nobs) - end - - # Compute feM, an AbstractFixedEffectSolver + has_intercept = hasintercept(formula) has_fe_intercept = false if has_fes if any(fe.interaction isa UnitWeights for fe in fes) has_fe_intercept = true end - fes = FixedEffect[fe[esample] for fe in fes] - feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads) end - # Compute data for std errors - vcov_method = Vcov.materialize(view(df, esample, :), vcov) + ############################################################################## ## ## Dataframe --> Matrix @@ -171,14 +159,23 @@ function reg( formula_schema = apply_schema(formula, s, FixedEffectModel, has_fe_intercept) # Obtain y - # for a Vector{Float64}, convert(Vector{Float64}, y) aliases y - y = convert(Vector{Float64}, response(formula_schema, subdf)) - all(isfinite, y) || throw("Some observations for the dependent variable are infinite") + # for a Vector{Float64}, conver(Vector{Float64}, y) aliases y + y = response(formula_schema, subdf) + # added in PR #109 to handle cases where formula terms introduce missings + # to be removed when fixed in StatsModels + esample2 = .!ismissing.(y) + # Obtain X - Xexo = convert(Matrix{Float64}, modelmatrix(formula_schema, subdf)) - all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite") - + Xexo = modelmatrix(formula_schema, subdf) + + # PR #109, to be removed when fixed in StatsModels + if size(Xexo, 2) > 0 + for c in eachcol(Xexo) + esample2 .&= .!ismissing.(c) + end + end + response_name, coef_names = coefnames(formula_schema) if !(coef_names isa Vector) coef_names = typeof(coef_names)[coef_names] @@ -187,19 +184,89 @@ function reg( if has_iv subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in endo_vars)...)) formula_endo_schema = apply_schema(formula_endo, schema(formula_endo, subdf, contrasts), StatisticalModel) - Xendo = convert(Matrix{Float64}, modelmatrix(formula_endo_schema, subdf)) - all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite") + Xendo = modelmatrix(formula_endo_schema, subdf) + + # PR #109, to be removed when fixed in StatsModels + if size(Xendo, 2) > 0 + for c in eachcol(Xendo) + esample2 .&= .!ismissing.(c) + end + end + _, coefendo_names = coefnames(formula_endo_schema) append!(coef_names, coefendo_names) subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in iv_vars)...)) formula_iv_schema = apply_schema(formula_iv, schema(formula_iv, subdf, contrasts), StatisticalModel) - Z = convert(Matrix{Float64}, modelmatrix(formula_iv_schema, subdf)) + Z = modelmatrix(formula_iv_schema, subdf) + + for c in eachcol(Z) + esample2 .&= .!ismissing.(c) + end + + # PR #109, to be removed when fixed in StatsModels + if any(esample2 .== false) + Xendo = Xendo[esample2,:] + Z = Z[esample2,:] + end + + Xendo = convert(Matrix{Float64}, Xendo) + all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite") + + Z = convert(Matrix{Float64}, Z) all(isfinite, Z) || throw("Some observations for the instrumental variables are infinite") + + if size(Z, 2) < size(Xendo, 2) + throw("Model not identified. There must be at least as many ivs as endogeneneous variables") + end # modify formula to use in predict formula_schema = FormulaTerm(formula_schema.lhs, (tuple(eachterm(formula_schema.rhs)..., (term for term in eachterm(formula_endo_schema.rhs) if term != ConstantTerm(0))...))) end + + esample0 = esample == Colon() ? trues(size(df,1)) : copy(esample) + + # PR #109, to be removed when fixed in StatsModels + if any(esample2 .== false) + if any(esample0 .== 0) + throw(ArgumentError("You passed a dataset missing observations and used formula terms that introduce missings. This is not yet supported.")) + end + Xexo = Xexo[esample2,:] + y = y[esample2] + esample = copy(esample0) + esample[esample] .= esample2 + nobs = sum(esample2) + end + + y = convert(Vector{Float64}, y) + all(isfinite, y) || throw("Some observations for the dependent variable are infinite") + + Xexo = convert(Matrix{Float64}, Xexo) + all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite") + + # Compute weights + if has_weights + weights = Weights(convert(Vector{Float64}, view(df, esample, weights))) + else + weights = uweights(nobs) + end + all(isfinite, weights) || throw("Weights are not finite") + sqrtw = sqrt.(weights) + + # Compute feM, an AbstractFixedEffectSolver + has_intercept = hasintercept(formula) + has_fe_intercept = false + if has_fes + if any(fe.interaction isa UnitWeights for fe in fes) + has_fe_intercept = true + end + fes = FixedEffect[fe[esample] for fe in fes] + feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads) + end + # Compute data for std errors + vcov_method = Vcov.materialize(view(df, esample, :), vcov) + + # compute tss now before potentially demeaning y tss_total = tss(y, has_intercept | has_fe_intercept, weights) # create unitilaized diff --git a/src/partial_out.jl b/src/partial_out.jl index 4f25834..572386a 100644 --- a/src/partial_out.jl +++ b/src/partial_out.jl @@ -65,9 +65,52 @@ function partial_out( fes, ids, ids_fes, formula = parse_fixedeffect(df, formula) has_fes = !isempty(fes) - nobs = sum(esample) (nobs > 0) || throw("sample is empty") + + if has_fes + # in case some FixedEffect does not have interaction, remove the intercept + if any(isa(fe.interaction, UnitWeights) for fe in fes) + formula = FormulaTerm(formula.lhs, tuple(ConstantTerm(0), (t for t in eachterm(formula.rhs) if t!= ConstantTerm(1))...)) + has_fes_intercept = true + end + end + + # Extract Y + vars = unique(StatsModels.termvars(formula)) + subdf = Tables.columntable(disallowmissing!(df[esample, vars])) + formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...)) + formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel) + Y = modelmatrix(formula_y_schema, subdf) + + # Extract X + formula_x = FormulaTerm(ConstantTerm(0), formula.rhs) + formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel) + X = modelmatrix(formula_x_schema, subdf) + + # added in PR #109 to handle cases where formula terms introduce missings + # to be removed when fixed in StatsModels + esample2 = trues(size(Y, 1)) + for c in eachcol(Y) + esample2 .&= .!ismissing.(c) + end + if size(X, 2) > 0 # X can have zero rows if all regressors are fixed effects + for c in eachcol(X) + esample2 .&= .!ismissing.(c) + end + end + + if any(!, esample2) + esample = esample2 + Y = Y[esample,:] + X = X[esample,:] + nobs = sum(esample) + end + + # Disallow missings + Y = convert(Matrix{Float64}, Y) + X = convert(Matrix{Float64}, X) + # Compute weights if has_weights weights = Weights(convert(Vector{Float64}, view(df, esample, weights))) @@ -85,14 +128,8 @@ function partial_out( fes = FixedEffect[fe[esample] for fe in fes] feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}) end - + # Compute residualized Y - vars = unique(StatsModels.termvars(formula)) - subdf = Tables.columntable(disallowmissing!(df[esample, vars])) - formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...)) - formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel) - Y = convert(Matrix{Float64}, modelmatrix(formula_y_schema, subdf)) - ynames = coefnames(formula_y_schema)[2] if !isa(ynames, Vector) ynames = [ynames] @@ -107,9 +144,6 @@ function partial_out( end # Compute residualized X - formula_x = FormulaTerm(ConstantTerm(0), formula.rhs) - formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel) - X = convert(Matrix{Float64}, modelmatrix(formula_x_schema, subdf)) if has_fes X, b, c = solve_residuals!(X, feM; maxiter = maxiter, tol = tol, progress_bar = false) append!(iterations, b) diff --git a/test/Project.toml b/test/Project.toml index 44882ea..7f1b8c0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -6,6 +6,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" FixedEffects = "c8885935-8500-56a7-9867-7708b20db0eb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] diff --git a/test/fit.jl b/test/fit.jl index 3a58c90..423b1ea 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -228,6 +228,93 @@ df.Price_zero = copy(df.Price) df.Price_zero[1] = 0.0 m = @formula Sales ~ log(Price_zero) @test_throws "Some observations for the regressor are infinite" reg(df, m) + +# function that introduces missing +using StatsModels: lag +df.id1 = df.State +df.y = df.Sales +df.x1 = df.Price +df.z1 = df.Pimin +df.w = df.Pop +df.x1_lagged = lag(df.x1) +df.z1_lagged = lag(df.z1) +df.y_lagged = lag(df.y) + + +df.x1_m1 = Array{Union{Float64,Missing}}(copy(df.x1)) +df.x1_m1[end-20:end] .= missing + +df.x1_m2 = Array{Union{Float64,Missing}}(copy(df.x1)) +df.x1_m2[5:10:end] .= missing + +df.x1_lagged = lag(df.x1) +df.z1_lagged = lag(df.z1) + + +function test_lags(m0, m1, descr) + @testset "$descr" begin + x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w) + x1 = reg(df, m1, Vcov.cluster(:id1), weights=:w) + + @test x0.residuals == x1.residuals + @test x0.coef == x1.coef + @test x0.nobs == x1.nobs + @test x0.vcov == x1.vcov + end +end + +function test_lags_broken(m0, m1, descr) + @testset "$descr" begin + x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w) + @test_throws ArgumentError reg(df, m1, Vcov.cluster(:id1), weights=:w) + + #@test_ x0.coef != x1.coef + #@test_ x0.vcov != x1.vcov + end +end + +# NOTE: This is a "dumb" lag function, +# it doesn't take into account time and group indices! +@testset "missings from @formula" begin + m0 = @formula y ~ x1_lagged + fe(id1) + m1 = @formula y ~ lag(x1) + fe(id1) + test_lags(m0, m1, "ols: _ ~ lag") + + m0 = @formula y_lagged ~ x1 + fe(id1) + m1 = @formula lag(y) ~ x1 + fe(id1) + test_lags(m0, m1, "ols: lag ~ _") + + m0 = @formula y ~ (x1_lagged ~ z1_lagged) + fe(id1) + m1 = @formula y ~ (lag(x1) ~ lag(z1)) + fe(id1) + test_lags(m0, m1, "iv: _ ~ (lag ~ lag)") + + m0 = @formula y ~ (x1_lagged ~ z1) + fe(id1) + m1 = @formula y ~ (lag(x1) ~ z1) + fe(id1) + test_lags(m0, m1, "iv: _ ~ (lag ~ _)") + + m0 = @formula y ~ (x1 ~ z1_lagged) + fe(id1) + m1 = @formula y ~ (x1 ~ lag(z1)) + fe(id1) + test_lags(m0, m1, "iv: _ ~ (_ ~ lag)") + + # NOTE: The case where the df contains missings and the formula contains missings cannot be handled yet. The case with :x1_m1 would actually work, but the case with :x1_m2 would not. This because the missings in x1_m1 and x1_m2 are removed BEFORE the the lag is applied. + + m0 = @formula y_lagged ~ x1_m1 + fe(id1) + m1 = @formula lag(y) ~ x1_m1 + fe(id1) + test_lags_broken(m0, m1, "ols: lag ~ _, with missings") + + m0 = @formula y_lagged ~ x1_m2 + fe(id1) + m1 = @formula lag(y) ~ x1_m2 + fe(id1) + test_lags_broken(m0, m1, "ols: lag ~ _, with missings") + + m0 = @formula y ~ (x1_m1 ~ z1_lagged) + fe(id1) + m1 = @formula y ~ (x1_m1 ~ lag(z1)) + fe(id1) + test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings") + + m0 = @formula y ~ (x1_m2 ~ z1_lagged) + fe(id1) + m1 = @formula y ~ (x1_m2 ~ lag(z1)) + fe(id1) + test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings") +end + ############################################################################## ## ## collinearity