Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rebased version of reverted #109 #205

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 84 additions & 20 deletions src/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,26 +140,15 @@ function reg(
esample = Colon()
end

# Compute weights
if has_weights
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
all(isfinite, weights) || throw("Weights are not finite")
else
weights = uweights(nobs)
end

# Compute feM, an AbstractFixedEffectSolver
has_intercept = hasintercept(formula)
has_fe_intercept = false
if has_fes
if any(fe.interaction isa UnitWeights for fe in fes)
has_fe_intercept = true
end
fes = FixedEffect[fe[esample] for fe in fes]
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads)
end
# Compute data for std errors
vcov_method = Vcov.materialize(view(df, esample, :), vcov)

##############################################################################
##
## Dataframe --> Matrix
Expand All @@ -171,13 +160,19 @@ function reg(
formula_schema = apply_schema(formula, s, FixedEffectModel, has_fe_intercept)

# Obtain y
# for a Vector{Float64}, convert(Vector{Float64}, y) aliases y
y = convert(Vector{Float64}, response(formula_schema, subdf))
all(isfinite, y) || throw("Some observations for the dependent variable are infinite")
_y_ = response(formula_schema, subdf)

# Obtain X
Xexo = convert(Matrix{Float64}, modelmatrix(formula_schema, subdf))
all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite")
_Xexo_ = modelmatrix(formula_schema, subdf)

esample2 = .!ismissing.(_y_)

# PR #109, to be removed when fixed in StatsModels
if size(_Xexo_, 2) > 0
for c in eachcol(_Xexo_)
esample2 .&= .!ismissing.(c)
end
end

response_name, coef_names = coefnames(formula_schema)
if !(coef_names isa Vector)
Expand All @@ -187,19 +182,88 @@ function reg(
if has_iv
subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in endo_vars)...))
formula_endo_schema = apply_schema(formula_endo, schema(formula_endo, subdf, contrasts), StatisticalModel)
Xendo = convert(Matrix{Float64}, modelmatrix(formula_endo_schema, subdf))
all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite")
_Xendo_ = modelmatrix(formula_endo_schema, subdf)

_, coefendo_names = coefnames(formula_endo_schema)
append!(coef_names, coefendo_names)

subdf = Tables.columntable((; (x => disallowmissing(view(df[!, x], esample)) for x in iv_vars)...))
formula_iv_schema = apply_schema(formula_iv, schema(formula_iv, subdf, contrasts), StatisticalModel)
Z = convert(Matrix{Float64}, modelmatrix(formula_iv_schema, subdf))
_Z_ = modelmatrix(formula_iv_schema, subdf)

# PR #109, to be removed when fixed in StatsModels
if size(_Xendo_, 2) > 0
for c in eachcol(_Xendo_)
esample2 .&= .!ismissing.(c)
end
end

# PR #109, to be removed when fixed in StatsModels
for c in eachcol(_Z_)
esample2 .&= .!ismissing.(c)
end

# PR #109, to be removed when fixed in StatsModels
if !all(esample2)
_Xendo_ = _Xendo_[esample2,:]
_Z_ = _Z_[esample2,:]
end

# for a Vector{Float64}, convert(Vector{Float64}, y) aliases y
Xendo = convert(Matrix{Float64}, _Xendo_)
all(isfinite, Xendo) || throw("Some observations for the endogenous variables are infinite")

Z = convert(Matrix{Float64}, _Z_)
all(isfinite, Z) || throw("Some observations for the instrumental variables are infinite")

# modify formula to use in predict
formula_schema = FormulaTerm(formula_schema.lhs, (tuple(eachterm(formula_schema.rhs)..., (term for term in eachterm(formula_endo_schema.rhs) if term != ConstantTerm(0))...)))
end

# PR #109, to be removed when fixed in StatsModels
if !all(esample2)
if esample != Colon() && !all(esample)
throw(ArgumentError("You passed a dataset with missing observations and used formula terms that introduce missings. This is not yet supported. See https://github.com/JuliaStats/StatsModels.jl/pull/153."))
end
_y_ = _y_[esample2]
_Xexo_ = _Xexo_[esample2,:]

if esample == Colon()
esample = esample2
else
esample[esample] .= esample2
end
nobs = sum(esample)
end

# for a Vector{Float64}, convert(Vector{Float64}, y) aliases y
y = convert(Vector{Float64}, _y_)
all(isfinite, y) || throw("Some observations for the dependent variable are infinite")

Xexo = convert(Matrix{Float64}, _Xexo_)
all(isfinite, Xexo) || throw("Some observations for the exogeneous variables are infinite")

# Compute weights
if has_weights
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
all(isfinite, weights) || throw("Weights are not finite")
else
weights = uweights(nobs)
end

# Compute feM, an AbstractFixedEffectSolver
has_intercept = hasintercept(formula)
has_fe_intercept = false
if has_fes
if any(fe.interaction isa UnitWeights for fe in fes)
has_fe_intercept = true
end
fes = FixedEffect[fe[esample] for fe in fes]
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method}, nthreads)
end
# Compute data for std errors
vcov_method = Vcov.materialize(view(df, esample, :), vcov)

# compute tss now before potentially demeaning y
tss_total = tss(y, has_intercept | has_fe_intercept, weights)
# create unitilaized
Expand Down
56 changes: 45 additions & 11 deletions src/partial_out.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,52 @@ function partial_out(
fes, ids, ids_fes, formula = parse_fixedeffect(df, formula)
has_fes = !isempty(fes)


nobs = sum(esample)
(nobs > 0) || throw("sample is empty")

if has_fes
# in case some FixedEffect does not have interaction, remove the intercept
if any(isa(fe.interaction, UnitWeights) for fe in fes)
formula = FormulaTerm(formula.lhs, tuple(ConstantTerm(0), (t for t in eachterm(formula.rhs) if t!= ConstantTerm(1))...))
has_fes_intercept = true
end
end

# Extract Y
vars = unique(StatsModels.termvars(formula))
subdf = Tables.columntable(disallowmissing!(df[esample, vars]))
formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...))
formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel)
Y = modelmatrix(formula_y_schema, subdf)

# Extract X
formula_x = FormulaTerm(ConstantTerm(0), formula.rhs)
formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel)
X = modelmatrix(formula_x_schema, subdf)

# added in PR #109 to handle cases where formula terms introduce missings
# to be removed when fixed in StatsModels
esample2 = trues(size(Y, 1))
for c in eachcol(Y)
esample2 .&= .!ismissing.(c)
end
if size(X, 2) > 0 # X can have zero rows if all regressors are fixed effects
for c in eachcol(X)
esample2 .&= .!ismissing.(c)
end
end

if any(!, esample2)
esample = esample2
Y = Y[esample,:]
X = X[esample,:]
nobs = sum(esample)
end

# Disallow missings
Y = convert(Matrix{Float64}, Y)
X = convert(Matrix{Float64}, X)

# Compute weights
if has_weights
weights = Weights(convert(Vector{Float64}, view(df, esample, weights)))
Expand All @@ -85,14 +128,8 @@ function partial_out(
fes = FixedEffect[fe[esample] for fe in fes]
feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, weights, Val{method})
end

# Compute residualized Y
vars = unique(StatsModels.termvars(formula))
subdf = Tables.columntable(disallowmissing!(df[esample, vars]))
formula_y = FormulaTerm(ConstantTerm(0), (ConstantTerm(0), eachterm(formula.lhs)...))
formula_y_schema = apply_schema(formula_y, schema(formula_y, subdf, contrasts), StatisticalModel)
Y = convert(Matrix{Float64}, modelmatrix(formula_y_schema, subdf))

ynames = coefnames(formula_y_schema)[2]
if !isa(ynames, Vector)
ynames = [ynames]
Expand All @@ -107,9 +144,6 @@ function partial_out(
end

# Compute residualized X
formula_x = FormulaTerm(ConstantTerm(0), formula.rhs)
formula_x_schema = apply_schema(formula_x, schema(formula_x, subdf, contrasts), StatisticalModel)
X = convert(Matrix{Float64}, modelmatrix(formula_x_schema, subdf))
if has_fes
X, b, c = solve_residuals!(X, feM; maxiter = maxiter, tol = tol, progress_bar = false)
append!(iterations, b)
Expand Down
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
FixedEffects = "c8885935-8500-56a7-9867-7708b20db0eb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
Expand Down
90 changes: 90 additions & 0 deletions test/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,96 @@ df.Price_zero = copy(df.Price)
df.Price_zero[1] = 0.0
m = @formula Sales ~ log(Price_zero)
@test_throws "Some observations for the regressor are infinite" reg(df, m)

# function that introduces missing
using StatsModels: lag
df.id1 = df.State
df.y = df.Sales
df.x1 = df.Price
df.z1 = df.Pimin
df.w = df.Pop
df.x1_lagged = lag(df.x1)
df.z1_lagged = lag(df.z1)
df.y_lagged = lag(df.y)


df.x1_m1 = Array{Union{Float64,Missing}}(copy(df.x1))
df.x1_m1[end-20:end] .= missing

df.x1_m2 = Array{Union{Float64,Missing}}(copy(df.x1))
df.x1_m2[5:10:end] .= missing

df.x1_lagged = lag(df.x1)
df.z1_lagged = lag(df.z1)


function test_lags(m0, m1, descr)
@testset "$descr" begin
x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w)
x1 = reg(df, m1, Vcov.cluster(:id1), weights=:w)

@test x0.residuals == x1.residuals
@test x0.coef == x1.coef
@test x0.nobs == x1.nobs
@test x0.vcov == x1.vcov
end
end

function test_lags_broken(m0, m1, descr)
@testset "$descr" begin
x0 = reg(df, m0, Vcov.cluster(:id1), weights=:w)
@test_throws ArgumentError reg(df, m1, Vcov.cluster(:id1), weights=:w)

#@test_ x0.coef != x1.coef
#@test_ x0.vcov != x1.vcov
end
end

# NOTE: This is a "dumb" lag function,
# it doesn't take into account time and group indices!
@testset "missings from @formula" begin
m0 = @formula y ~ x1_lagged + fe(id1)
m1 = @formula y ~ lag(x1) + fe(id1)
test_lags(m0, m1, "ols: _ ~ lag")

m0 = @formula y_lagged ~ x1 + fe(id1)
m1 = @formula lag(y) ~ x1 + fe(id1)
test_lags(m0, m1, "ols: lag ~ _")

m0 = @formula y ~ (x1_lagged ~ z1_lagged) + fe(id1)
m1 = @formula y ~ (lag(x1) ~ lag(z1)) + fe(id1)
test_lags(m0, m1, "iv: _ ~ (lag ~ lag)")

m0 = @formula y ~ (x1_lagged ~ z1) + fe(id1)
m1 = @formula y ~ (lag(x1) ~ z1) + fe(id1)
test_lags(m0, m1, "iv: _ ~ (lag ~ _)")

m0 = @formula y ~ (x1 ~ z1_lagged) + fe(id1)
m1 = @formula y ~ (x1 ~ lag(z1)) + fe(id1)
test_lags(m0, m1, "iv: _ ~ (_ ~ lag)")

# NOTE: The case where the df contains missings and the formula contains missings cannot be handled yet.
# The case with :x1_m1 would actually work, but the case with :x1_m2 would not.
# This because the missings in x1_m1 and x1_m2 are removed BEFORE the lag is applied.
# See https://github.com/JuliaStats/StatsModels.jl/pull/153.

m0 = @formula y_lagged ~ x1_m1 + fe(id1)
m1 = @formula lag(y) ~ x1_m1 + fe(id1)
test_lags_broken(m0, m1, "ols: lag ~ _, with missings")

m0 = @formula y_lagged ~ x1_m2 + fe(id1)
m1 = @formula lag(y) ~ x1_m2 + fe(id1)
test_lags_broken(m0, m1, "ols: lag ~ _, with missings")

m0 = @formula y ~ (x1_m1 ~ z1_lagged) + fe(id1)
m1 = @formula y ~ (x1_m1 ~ lag(z1)) + fe(id1)
test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings")

m0 = @formula y ~ (x1_m2 ~ z1_lagged) + fe(id1)
m1 = @formula y ~ (x1_m2 ~ lag(z1)) + fe(id1)
test_lags_broken(m0, m1, "iv: _ ~ (_ ~ lag), with missings")
end

##############################################################################
##
## collinearity
Expand Down