From 434acd4aa9ae3690b8275d080ea916004e96f7c5 Mon Sep 17 00:00:00 2001
From: Fabian Greimel <fabgrei@gmail.com>
Date: Wed, 29 Jun 2022 13:59:13 +0200
Subject: [PATCH] Revert "Revert "Allow `missings` in the modelmatrix,
 necessary to use `lead`/`lag` in `@formula` (#109)""

This reverts commit f68e456fc33485079038a9af7af520be05654282.
merged
---
 src/fit.jl         | 113 ++++++++++++++++++++++++++++++++++++---------
 src/partial_out.jl |  56 +++++++++++++++++-----
 test/Project.toml  |   1 +
 test/fit.jl        |  87 ++++++++++++++++++++++++++++++++++
 4 files changed, 223 insertions(+), 34 deletions(-)

diff --git a/src/fit.jl b/src/fit.jl
index f561ea6..0bd6157 100644
--- a/src/fit.jl
+++ b/src/fit.jl
@@ -139,27 +139,15 @@ function reg(
     if nobs == N
         esample = Colon()
     end
-
-    # Compute weights
-    if has_weights
-        weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
-        all(isfinite, weights) || throw("Weights are not finite")
-    else
-        weights = uweights(nobs)
-    end
-
-    # Compute feM, an AbstractFixedEffectSolver
+    
     has_intercept = hasintercept(formula)
     has_fe_intercept = false
     if has_fes
         if any(fe.interaction isa UnitWeights for fe in fes)
             has_fe_intercept = true
         end
-        fes = FixedEffect[fe[esample] for fe in fes]
-        feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads)
     end
-    # Compute data for std errors
-    vcov_method = Vcov.materialize(view(df, esample, :), vcov)
+    
     ##############################################################################
     ##
     ## Dataframe --> Matrix
@@ -171,14 +159,23 @@ function reg(
     formula_schema = apply_schema(formula, s, FixedEffectModel, has_fe_intercept)
 
     # Obtain y
-    # for a Vector{Float64}, convert(Vector{Float64}, y) aliases y
-    y = convert(Vector{Float64}, response(formula_schema, subdf))
-    all(isfinite, y) || throw("Some observations for the dependent variable are infinite")
+    # for a Vector{Float64}, conver(Vector{Float64}, y) aliases y
+    y = response(formula_schema, subdf)
 
+    # added in PR #109 to handle cases where formula terms introduce missings
+    # to be removed when fixed in StatsModels
+    esample2 = .!ismissing.(y)
+    
     # Obtain X
-    Xexo = convert(Matrix{Float64}, modelmatrix(formula_schema, subdf))
-    all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite")
-
+    Xexo = modelmatrix(formula_schema, subdf)
+    
+    # PR #109, to be removed when fixed in StatsModels
+    if size(Xexo, 2) > 0
+        for c in eachcol(Xexo)
+            esample2 .&= .!ismissing.(c)
+        end
+    end
+    
     response_name, coef_names = coefnames(formula_schema)
     if !(coef_names isa Vector)
         coef_names = typeof(coef_names)[coef_names]
@@ -187,19 +184,89 @@ function reg(
     if has_iv
         subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in endo_vars)...))
         formula_endo_schema = apply_schema(formula_endo, schema(formula_endo, subdf, contrasts), StatisticalModel)
-        Xendo = convert(Matrix{Float64}, modelmatrix(formula_endo_schema, subdf))
-        all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite")
+        Xendo = modelmatrix(formula_endo_schema, subdf)
+        
+        # PR #109, to be removed when fixed in StatsModels
+        if size(Xendo, 2) > 0
+            for c in eachcol(Xendo)
+                esample2 .&= .!ismissing.(c)
+            end
+        end
+            
         _, coefendo_names = coefnames(formula_endo_schema)
         append!(coef_names, coefendo_names)
 
         subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in iv_vars)...))
         formula_iv_schema = apply_schema(formula_iv, schema(formula_iv, subdf, contrasts), StatisticalModel)
-        Z = convert(Matrix{Float64}, modelmatrix(formula_iv_schema, subdf))
+        Z = modelmatrix(formula_iv_schema, subdf)
+        
+        for c in eachcol(Z)
+            esample2 .&= .!ismissing.(c)
+        end
+        
+        # PR #109, to be removed when fixed in StatsModels
+        if any(esample2 .== false)
+            Xendo = Xendo[esample2,:]
+            Z = Z[esample2,:]
+        end
+        
+        Xendo = convert(Matrix{Float64}, Xendo)
+        all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite")
+
+        Z = convert(Matrix{Float64}, Z)
         all(isfinite, Z) || throw("Some observations for the instrumental variables are infinite")
+                
+        if size(Z, 2) < size(Xendo, 2)
+            throw("Model not identified. There must be at least as many ivs as endogeneneous variables")
+        end
 
         # modify formula to use in predict
         formula_schema = FormulaTerm(formula_schema.lhs, (tuple(eachterm(formula_schema.rhs)..., (term for term in eachterm(formula_endo_schema.rhs) if term != ConstantTerm(0))...)))
     end
+    
+    esample0 = esample == Colon() ? trues(size(df,1)) : copy(esample)
+    
+    # PR #109, to be removed when fixed in StatsModels
+    if any(esample2 .== false)
+        if any(esample0 .== 0)
+            throw(ArgumentError("You passed a dataset missing observations and used formula terms that introduce missings. This is not yet supported."))
+        end
+        Xexo = Xexo[esample2,:]
+        y = y[esample2]
+        esample = copy(esample0)
+        esample[esample] .= esample2
+        nobs = sum(esample2)
+    end
+
+    y = convert(Vector{Float64}, y)
+    all(isfinite, y) || throw("Some observations for the dependent variable are infinite")
+
+    Xexo = convert(Matrix{Float64}, Xexo)  
+    all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite")
+    
+    # Compute weights
+    if has_weights
+        weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
+    else
+        weights = uweights(nobs)
+    end
+    all(isfinite, weights) || throw("Weights are not finite")
+    sqrtw = sqrt.(weights)
+
+    # Compute feM, an AbstractFixedEffectSolver
+    has_intercept = hasintercept(formula)
+    has_fe_intercept = false
+    if has_fes
+        if any(fe.interaction isa UnitWeights for fe in fes)
+              has_fe_intercept = true
+        end
+        fes = FixedEffect[fe[esample] for fe in fes]
+        feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads)
+    end
+    # Compute data for std errors
+    vcov_method = Vcov.materialize(view(df, esample, :), vcov)
+
+  
     # compute tss now before potentially demeaning y
     tss_total = tss(y, has_intercept | has_fe_intercept, weights)
     # create unitilaized
diff --git a/src/partial_out.jl b/src/partial_out.jl
index 4f25834..572386a 100644
--- a/src/partial_out.jl
+++ b/src/partial_out.jl
@@ -65,9 +65,52 @@ function partial_out(
     fes, ids, ids_fes, formula = parse_fixedeffect(df, formula)
     has_fes = !isempty(fes)
 
-
     nobs = sum(esample)
     (nobs > 0) || throw("sample is empty")
+  
+    if has_fes
+        # in case some FixedEffect does not have interaction, remove the intercept
+        if any(isa(fe.interaction, UnitWeights) for fe in fes)
+            formula = FormulaTerm(formula.lhs, tuple(ConstantTerm(0), (t for t in eachterm(formula.rhs) if t!= ConstantTerm(1))...))
+            has_fes_intercept = true
+        end
+    end
+    
+    # Extract Y
+    vars = unique(StatsModels.termvars(formula))
+    subdf = Tables.columntable(disallowmissing!(df[esample, vars]))
+    formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...))
+    formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel)
+    Y = modelmatrix(formula_y_schema, subdf)
+    
+    # Extract X
+    formula_x = FormulaTerm(ConstantTerm(0), formula.rhs)
+    formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel)
+    X = modelmatrix(formula_x_schema, subdf)
+    
+    # added in PR #109 to handle cases where formula terms introduce missings
+    # to be removed when fixed in StatsModels
+    esample2 = trues(size(Y, 1))
+    for c in eachcol(Y)
+        esample2 .&= .!ismissing.(c)
+    end
+    if size(X, 2) > 0 # X can have zero rows if all regressors are fixed effects
+        for c in eachcol(X)
+            esample2 .&= .!ismissing.(c)
+        end
+    end
+    
+    if any(!, esample2)
+        esample = esample2
+        Y = Y[esample,:]
+        X = X[esample,:]
+        nobs = sum(esample)
+    end
+    
+    # Disallow missings
+    Y = convert(Matrix{Float64}, Y)
+    X = convert(Matrix{Float64}, X)
+    
     # Compute weights
     if has_weights
         weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
@@ -85,14 +128,8 @@ function partial_out(
         fes = FixedEffect[fe[esample] for fe in fes]
         feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method})
     end
-
+    
     # Compute residualized Y
-    vars = unique(StatsModels.termvars(formula))
-    subdf = Tables.columntable(disallowmissing!(df[esample, vars]))
-    formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...))
-    formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel)
-    Y = convert(Matrix{Float64}, modelmatrix(formula_y_schema, subdf))
-
     ynames = coefnames(formula_y_schema)[2]
     if !isa(ynames, Vector)
         ynames = [ynames]
@@ -107,9 +144,6 @@ function partial_out(
     end
 
     # Compute residualized X
-    formula_x = FormulaTerm(ConstantTerm(0), formula.rhs)
-    formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel)
-    X = convert(Matrix{Float64}, modelmatrix(formula_x_schema, subdf))
     if has_fes
         X, b, c = solve_residuals!(X, feM; maxiter = maxiter, tol = tol, progress_bar = false)
         append!(iterations, b)
diff --git a/test/Project.toml b/test/Project.toml
index 44882ea..7f1b8c0 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -6,6 +6,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 FixedEffects = "c8885935-8500-56a7-9867-7708b20db0eb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
diff --git a/test/fit.jl b/test/fit.jl
index 3a58c90..423b1ea 100644
--- a/test/fit.jl
+++ b/test/fit.jl
@@ -228,6 +228,93 @@ df.Price_zero = copy(df.Price)
 df.Price_zero[1] = 0.0
 m = @formula Sales ~ log(Price_zero)
 @test_throws  "Some observations for the regressor are infinite" reg(df, m)
+
+# function that introduces missing
+using StatsModels: lag
+df.id1 = df.State
+df.y = df.Sales
+df.x1 = df.Price
+df.z1 = df.Pimin
+df.w = df.Pop
+df.x1_lagged = lag(df.x1)
+df.z1_lagged = lag(df.z1)
+df.y_lagged = lag(df.y)
+
+
+df.x1_m1 = Array{Union{Float64,Missing}}(copy(df.x1))
+df.x1_m1[end-20:end] .= missing
+
+df.x1_m2 = Array{Union{Float64,Missing}}(copy(df.x1))
+df.x1_m2[5:10:end] .= missing
+
+df.x1_lagged = lag(df.x1)
+df.z1_lagged = lag(df.z1)
+
+
+function test_lags(m0, m1, descr)
+    @testset "$descr" begin
+        x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w)
+        x1 = reg(df, m1, Vcov.cluster(:id1), weights=:w)
+        
+        @test x0.residuals == x1.residuals
+        @test x0.coef == x1.coef
+        @test x0.nobs == x1.nobs
+        @test x0.vcov == x1.vcov
+    end
+end
+
+function test_lags_broken(m0, m1, descr)
+    @testset "$descr" begin
+        x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w)
+        @test_throws ArgumentError reg(df, m1, Vcov.cluster(:id1), weights=:w)
+        
+        #@test_ x0.coef != x1.coef
+        #@test_ x0.vcov != x1.vcov
+    end
+end
+
+# NOTE: This is a "dumb" lag function,
+# it doesn't take into account time and group indices!
+@testset "missings from @formula" begin
+    m0 = @formula y ~ x1_lagged + fe(id1)
+    m1 = @formula y ~ lag(x1) + fe(id1)
+    test_lags(m0, m1, "ols: _ ~ lag")
+
+    m0 = @formula y_lagged ~ x1 + fe(id1)
+    m1 = @formula lag(y) ~ x1 + fe(id1)
+    test_lags(m0, m1, "ols: lag ~ _")
+    
+    m0 = @formula y ~ (x1_lagged ~ z1_lagged) + fe(id1)
+    m1 = @formula y ~ (lag(x1) ~ lag(z1)) + fe(id1)
+    test_lags(m0, m1, "iv: _ ~ (lag ~ lag)")
+    
+    m0 = @formula y ~ (x1_lagged ~ z1) + fe(id1)
+    m1 = @formula y ~ (lag(x1) ~ z1) + fe(id1)
+    test_lags(m0, m1, "iv: _ ~ (lag ~ _)")
+    
+    m0 = @formula y ~ (x1 ~ z1_lagged) + fe(id1)
+    m1 = @formula y ~ (x1 ~ lag(z1)) + fe(id1)
+    test_lags(m0, m1, "iv: _ ~ (_ ~ lag)")
+  
+    # NOTE: The case where the df contains missings and the formula contains missings cannot be handled yet. The case with :x1_m1 would actually work, but the case with :x1_m2 would not. This because the missings in x1_m1 and x1_m2 are removed BEFORE the the lag is applied.
+     
+    m0 = @formula y_lagged ~ x1_m1 + fe(id1)
+    m1 = @formula lag(y) ~ x1_m1 + fe(id1)
+    test_lags_broken(m0, m1, "ols: lag ~ _, with missings")
+
+    m0 = @formula y_lagged ~ x1_m2 + fe(id1)
+    m1 = @formula lag(y) ~ x1_m2 + fe(id1)
+    test_lags_broken(m0, m1, "ols: lag ~ _, with missings")
+      
+    m0 = @formula y ~ (x1_m1 ~ z1_lagged) + fe(id1)
+    m1 = @formula y ~ (x1_m1 ~ lag(z1)) + fe(id1)
+    test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings")
+
+    m0 = @formula y ~ (x1_m2 ~ z1_lagged) + fe(id1)
+    m1 = @formula y ~ (x1_m2 ~ lag(z1)) + fe(id1)
+    test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings")
+end
+
 ##############################################################################
 ##
 ## collinearity