Commit
Merge pull request #258 from Evovest/missings
Missings
jeremiedb authored Oct 5, 2023
2 parents c9eff40 + 50c5691 commit dcc34f7
Showing 14 changed files with 260 additions and 48 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
@@ -31,7 +31,7 @@ jobs:
version: '1'
arch: x64
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
2 changes: 1 addition & 1 deletion .github/workflows/CompatHelper.yml
@@ -10,7 +10,7 @@ jobs:
steps:
- uses: julia-actions/setup-julia@latest
with:
version: 1.3
version: 1.6
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main
2 changes: 1 addition & 1 deletion .github/workflows/Docs.yml
@@ -10,7 +10,7 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@latest
with:
version: '1.6'
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "EvoTrees"
uuid = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
authors = ["jeremiedb <jeremie.db@evovest.com>"]
version = "0.16.1"
version = "0.16.2"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
4 changes: 2 additions & 2 deletions README.md
@@ -103,9 +103,9 @@ preds = m(x_train)

### DataFrames input

When using a DataFrames as input, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
When using a DataFrame as input, features with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used to specify the variables to be used as features.

`Categorical` features are treated accordingly by the algorithm. Ordered variables will be treated as numerical features, using `` split rule, while unordered variables are using `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-levels cat variables.
`Categorical` features are treated accordingly by the algorithm: ordered variables are treated as numerical features, using the `<` split rule, while unordered variables use the `==` split rule. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.
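As a minimal sketch of the three kinds of supported feature columns (the column names are hypothetical):

```julia
using DataFrames, CategoricalArrays

df = DataFrame(
    x_num  = [0.1, 0.2, 0.3],                                    # `Real`: numerical feature
    x_ord  = categorical(["low", "mid", "high"], ordered=true),  # ordered: split with `<`
    x_cat  = categorical(["red", "green", "red"]),               # unordered: split with `==`
    x_bool = [true, false, true],                                # `Bool`: 2-level unordered categorical
)
```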

```julia
dtrain = DataFrame(x_train, :auto)
```
85 changes: 54 additions & 31 deletions benchmarks/Yahoo-LTRC.jl
@@ -59,10 +59,26 @@ x_train = dtrain[:x][:, .!drop_cols]
x_eval = deval[:x][:, .!drop_cols]
x_test = dtest[:x][:, .!drop_cols]

# x_train_miss = x_train .== 0
# x_eval_miss = x_eval .== 0
# x_test_miss = x_test .== 0

# x_train[x_train.==0] .= 0.5
# x_eval[x_eval.==0] .= 0.5
# x_test[x_test.==0] .= 0.5

# x_train = hcat(x_train, x_train_miss)
# x_eval = hcat(x_eval, x_eval_miss)
# x_test = hcat(x_test, x_test_miss)

q_train = dtrain[:q]
q_eval = deval[:q]
q_test = dtest[:q]

y_train = dtrain[:y];
y_eval = deval[:y];
y_test = dtest[:y];

#####################################
# mse regression
#####################################
@@ -98,12 +114,12 @@ p_test = m_mse(x_test);
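# aggregate NDCG per query group `q`, then average across queries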
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5)
@info "ndcg_test MSE" ndcg_test
@info "MSE - test data - MSE model" mean((p_test .- y_test) .^ 2)
@info "NDCG - test data - MSE model" ndcg_test

#####################################
# logistic regression
#####################################

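# relevance labels take values in 0:4; scaling by max_rank brings the logloss target into [0, 1]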
max_rank = 4
y_train = dtrain[:y] ./ max_rank
y_eval = deval[:y] ./ max_rank
@@ -145,57 +161,67 @@ p_test = m_logloss(x_test);
test_df = DataFrame(p=p_test, y=y_test, q=q_test)
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = round(mean(test_df_agg.ndcg), sigdigits=5)
@info "ndcg_test LogLoss" ndcg_test

@info "NDCG - test data - LogLoss model" ndcg_test

#####################################
# logistic regression on DataFrame
#####################################
target_name = "y"

df_train = DataFrame(x_train, :auto)
df_train.y = dtrain[:y]
df_train.y = dtrain[:y] ./ 4
df_train.q = dtrain[:q]

df_eval = DataFrame(x_eval, :auto)
df_eval.y = deval[:y]
df_eval.y = deval[:y] ./ 4
df_eval.q = deval[:q]

df_test = DataFrame(x_test, :auto)
df_test.y = dtest[:y]
df_test.y = dtest[:y] ./ 4
df_test.q = dtest[:q]

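# min-max normalize a target vector into [0, 1]; constant vectors map to 0.5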
function rank_target_norm(y::AbstractVector)
out = similar(y)
if minimum(y) == maximum(y)
# out .= 0.75
out .= 0.75
out .= 0.5
else
# out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y))
out .= 0.5 .* (y .- minimum(y)) ./ (maximum(y) - minimum(y)) .+ 0.5

out .= (y .- minimum(y)) ./ (maximum(y) - minimum(y))
end
return out
end

df_train = transform!(
groupby(df_train, "q"),
"y" => rank_target_norm => "y")
function percent_rank(x::AbstractVector{T}) where {T}
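    # map each value to its tied rank, scaled into (0, 1); tiedrank is assumed to come from StatsBase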
return tiedrank(x) / (length(x) + 1)
end

feature_names_raw = setdiff(names(df_train), ["y", "q"])
feature_names_rel = feature_names_raw .* "_rel"

df_eval = transform!(
groupby(df_eval, "q"),
"y" => rank_target_norm => "y")
transform!(df_train, feature_names_raw .=> percent_rank .=> feature_names_rel)
transform!(df_eval, feature_names_raw .=> percent_rank .=> feature_names_rel)
transform!(df_test, feature_names_raw .=> percent_rank .=> feature_names_rel)

df_test = transform!(
groupby(df_test, "q"),
"y" => rank_target_norm => "y")
feature_names = setdiff(names(df_train), ["y", "q"])

# df_train = transform!(
# groupby(df_train, "q"),
# "y" => rank_target_norm => "y")

# df_eval = transform!(
# groupby(df_eval, "q"),
# "y" => rank_target_norm => "y")

# df_test = transform!(
# groupby(df_test, "q"),
# "y" => rank_target_norm => "y")

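# sanity check: the target used with :logloss should lie within [0, 1]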
minimum(df_eval.y)
maximum(df_eval.y)

config = EvoTreeRegressor(
nrounds=6000,
loss=:logloss,
eta=0.005,
eta=0.01,
nbins=64,
max_depth=11,
rowsample=0.9,
@@ -205,28 +231,25 @@ config = EvoTreeRegressor(
@time m_logloss_df, logger_logloss_df = fit_evotree(
config,
df_train;
target_name="y",
fnames=setdiff(names(df_train), ["y", "q"]),
target_name,
fnames=feature_names_raw,
deval=df_eval,
early_stopping_rounds=200,
print_every_n=50,
metric=:logloss,
return_logger=true
);

# use the original y since NDCG is scale sensitive
y_train = dtrain[:y]
y_eval = deval[:y]
y_test = dtest[:y]

m_logloss_df.info
p_test_df = m_logloss_df(df_test);
p_test_mat = m_logloss_df(x_test);
# p_test_mat = m_logloss_df(x_test);

EvoTrees.importance(m_logloss_df)

p_test = m_logloss_df(df_test);
test_df = DataFrame(p=p_test, y=dtest[:y], q=dtest[:q])
test_df_agg = combine(groupby(test_df, "q"), ["p", "y"] => ndcg => "ndcg")
ndcg_test = mean(test_df_agg.ndcg)
@info "ndcg_test LogLoss DF" ndcg_test
# ndcg_test = 0.8022558972243291
# ndcg_test = 0.8020754563069513
@info "NDCG - test data - LogLoss DF model" ndcg_test
56 changes: 53 additions & 3 deletions docs/src/index.md
@@ -52,9 +52,9 @@ m = fit_evotree(config; x_train, y_train)
preds = m(x_train)
```

### DataFrames and Tables input
### Tables and DataFrames input

When using a Tables compatible input such as DataFrames, features with elements types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, `fnames` kwarg can be used.
When using a `Tables`-compatible input such as `DataFrames`, features with element types `Real` (incl. `Bool`) and `Categorical` are automatically recognized as input features. Alternatively, the `fnames` kwarg can be used.

`Categorical` features are treated accordingly by the algorithm. Ordered variables are treated as numerical features, using the `<` split rule, while unordered variables use `==`. Support is currently limited to a maximum of 255 levels. `Bool` variables are treated as unordered, 2-level categorical variables.

@@ -75,7 +75,6 @@ m = fit_evotree(config, dtrain; target_name="y", device="gpu");
p = m(dtrain; device="gpu")
```


## Reproducibility

EvoTrees models trained on cpu can be fully reproducible.
@@ -107,6 +106,57 @@ Note that in presence of multiple identical or very highly correlated features,

At the moment, there's no reproducibility guarantee on GPU, although this may change in the future.

## Missing values

### Features

EvoTrees does not handle features with missing values, so proper preprocessing of the data is needed (a good general practice regardless of the ML model used).

This includes situations where all values happen to be non-missing, but where the `eltype` is of the form `Union{Missing,Float64}`. A conversion of the element type using `identity` is recommended:

```julia
julia> x = Vector{Union{Missing, Float64}}([1, 2])
2-element Vector{Union{Missing, Float64}}:
1.0
2.0

julia> identity.(x)
2-element Vector{Float64}:
1.0
2.0
```
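To convert a whole `DataFrame` at once, a sketch using `disallowmissing!` from DataFrames (assuming `df` holds the feature columns and no `missing` value actually remains):

```julia
using DataFrames
disallowmissing!(df)  # narrows eltypes, e.g. Union{Missing,Float64} -> Float64; errors if a missing remains
```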

For numerical or ordered categorical features containing missing values, a common approach is to first create a `Bool` indicator variable capturing whether a value is missing:

```julia
transform!(df, :my_feat => ByRow(ismissing) => :my_feat_ismissing)
```

Then, the missing values can be imputed (replaced by a default value such as the `mean` or `median`, or using a more sophisticated approach such as predictions from another model):

```julia
transform!(df, :my_feat => (x -> coalesce.(x, median(skipmissing(x)))) => :my_feat);
```

For unordered categorical variables, recoding `missing` into a non-missing level is sufficient:
```julia
julia> x = categorical(["a", "b", missing])
3-element CategoricalArray{Union{Missing, String},1,UInt32}:
"a"
"b"
missing

julia> x = recode(x, missing => "missing value")
3-element CategoricalArray{String,1,UInt32}:
"a"
"b"
"missing value"
```
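Note that `categorical` and `recode` are provided by the CategoricalArrays package.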

### Target

The target variable must have an element type `<:Real`. The only exception is `EvoTreeClassifier`, for which `CategoricalValue`, `Integer`, `String` and `Char` element types are supported.
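As a minimal sketch (made-up data, default hyper-parameters otherwise):

```julia
using EvoTrees

x_train = randn(100, 3)

# a String target is supported by EvoTreeClassifier only
y_class = rand(["yes", "no"], 100)
m_class = fit_evotree(EvoTreeClassifier(nrounds=10); x_train, y_train=y_class)

# regression losses require eltype(y) <: Real: narrow a Union{Missing,Float64} target first
y_raw = Vector{Union{Missing,Float64}}(rand(100))
m_reg = fit_evotree(EvoTreeRegressor(nrounds=10); x_train, y_train=identity.(y_raw))
```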

## Save/Load

```julia
```
6 changes: 3 additions & 3 deletions docs/src/tutorials/logistic-regression-titanic.md
@@ -21,11 +21,11 @@ df = MLDatasets.Titanic().dataframe

A first step in data processing is to prepare the input features in a model-compatible format.

EvoTrees' Tables API supports input that are either `Real`, `Bool` or `Categorical`.
EvoTrees' Tables API supports inputs that are either `Real` (incl. `Bool`) or `Categorical`. `Bool` variables are treated as unordered, 2-level categorical variables.
A recommended approach for `String` features such as `Sex` is to convert them into an unordered `Categorical`.

For dealing with features withh missing values such as `Age`, a common approach is to first create an `Bool` indicator variable capturing the info on whether a value is missing.
Then, the missing values can be inputed (replaced by some default values such as `mean` or `median`, or more sophisticated approach such as predictions from another model).
For dealing with features with missing values such as `Age`, a common approach is to first create a `Bool` indicator variable capturing whether a value is missing.
Then, the missing values can be imputed (replaced by a default value such as the `mean` or `median`, or by a more sophisticated approach such as predictions from another model).
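As a minimal sketch of these two steps for the `Age` feature (assuming DataFrames and Statistics are loaded):

```julia
transform!(df, :Age => ByRow(ismissing) => :Age_ismissing)                   # missingness indicator
transform!(df, :Age => (x -> coalesce.(x, median(skipmissing(x)))) => :Age)  # median imputation
```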

```julia
# convert string feature to Categorical
Expand Down
5 changes: 4 additions & 1 deletion src/fit-utils.jl
@@ -2,9 +2,10 @@
get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.TaskLocalRNG()) where {T}
get_edges(df; fnames, nbins, rng=Random.TaskLocalRNG())
Get the braking points of the feature data.
Get the histogram breaking points of the feature data.
"""
function get_edges(X::AbstractMatrix{T}; fnames, nbins, rng=Random.MersenneTwister()) where {T}
@assert T <: Real
nobs = min(size(X, 1), 1000 * nbins)
idx = sample(rng, 1:size(X, 1), nobs, replace=false, ordered=true)
nfeats = size(X, 2)
@@ -80,6 +81,8 @@ function binarize(df; fnames, edges)
x_bin[:, j] .= levelcode.(col)
elseif eltype(col) <: Real
x_bin[:, j] .= searchsortedfirst.(Ref(edges[j]), col)
else
@error "Invalid feature eltype: $(fnames[j]) is $(eltype(col))"
end
end
return x_bin
9 changes: 8 additions & 1 deletion src/gpu/init.jl
@@ -8,11 +8,13 @@ function init_core(params::EvoTypes{L}, ::Type{GPU}, data, fnames, y_train, w, o

target_levels = nothing
if L == Logistic
@assert eltype(y_train) <: Real && minimum(y_train) >= 0 && maximum(y_train) <= 1
K = 1
y = T.(y_train)
μ = [logit(mean(y))]
!isnothing(offset) && (offset .= logit.(offset))
elseif L in [Poisson, Gamma, Tweedie]
@assert eltype(y_train) <: Real
K = 1
y = T.(y_train)
μ = fill(log(mean(y)), 1)
@@ -21,26 +23,31 @@
if eltype(y_train) <: CategoricalValue
target_levels = CategoricalArrays.levels(y_train)
y = UInt32.(CategoricalArrays.levelcode.(y_train))
else
elseif eltype(y_train) <: Integer || eltype(y_train) <: Bool || eltype(y_train) <: String || eltype(y_train) <: Char
target_levels = sort(unique(y_train))
yc = CategoricalVector(y_train, levels=target_levels)
y = UInt32.(CategoricalArrays.levelcode.(yc))
else
@error "Invalid target eltype: $(eltype(y_train))"
end
K = length(target_levels)
μ = T.(log.(proportions(y, UInt32(1):UInt32(K))))
μ .-= maximum(μ)
!isnothing(offset) && (offset .= log.(offset))
elseif L == GaussianMLE
@assert eltype(y_train) <: Real
K = 2
y = T.(y_train)
μ = [mean(y), log(std(y))]
!isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
elseif L == LogisticMLE
@assert eltype(y_train) <: Real
K = 2
y = T.(y_train)
μ = [mean(y), log(std(y) * sqrt(3) / π)]
!isnothing(offset) && (offset[:, 2] .= log.(offset[:, 2]))
else
@assert eltype(y_train) <: Real
K = 1
y = T.(y_train)
μ = [mean(y)]