diff --git a/CHANGELOG.md b/CHANGELOG.md index c343d68..7f0e3bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## v1.1.0 - 2021-09-23 +### Added +- Option to remove R seed from distance computation +- `most_variable_features` method +- `characteristic_features` method +- `most_correlated` method +- MissingFilter structures +- `freqtable` support + ## v1.0.1 - 2021-09-23 ### Changed - Correct intermittent error due to singular covariant matrices in helliger distance computation @@ -46,7 +55,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added - Experiment structures and normalization functions -[1.0.1] https://github.com/menchelab/BioProfiling.jl/compare/v0.4.1...HEAD +[1.1.0] https://github.com/menchelab/BioProfiling.jl/compare/v1.0.1...HEAD +[1.0.1] https://github.com/menchelab/BioProfiling.jl/compare/v1.0.0...v1.0.1 [1.0.0] https://github.com/menchelab/BioProfiling.jl/compare/v0.4.1...v1.0.0 [0.4] https://github.com/menchelab/BioProfiling.jl/compare/v0.3.4...v0.4.1 [0.3] https://github.com/menchelab/BioProfiling.jl/compare/v0.2.1...v0.3.4 diff --git a/Project.toml b/Project.toml index 6111cf5..c3187b9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,12 +1,13 @@ name = "BioProfiling" uuid = "cef322dc-4d82-11ea-04a7-113231db804d" authors = ["Loan Vulliard "] -version = "1.0.1" +version = "1.1.0" [deps] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" +FreqTables = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1" ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1" Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -21,6 +22,7 @@ UMAP = "c4f8c510-2410-5be4-91d7-4fbaeb39457e" [compat] DataFrames = "0.20 - 0.22" FileIO = "1" +FreqTables = "0.3.3, 0.4" ImageMagick = "0.7, 1" Images = "0.17 - 0.24" MultipleTesting = "0.4" diff --git a/README.md b/README.md index bcf86a0..948d7a3 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ This package allows to perform robust multidimensional profiling in 'Julia' and ### Installation from Julia's package repository (easiest option) -You can simply add this package from the Julia repository like for any other package: +You can simply add this package from the Julia repository like any other package: ```julia import Pkg @@ -45,7 +45,7 @@ import Pkg Pkg.add(Pkg.PackageSpec(url = "https://github.com/menchelab/BioProfiling.jl.git")) ``` -Use can then import the package: +You can then import the package normally: ```julia using BioProfiling diff --git a/src/BioProfiling.jl b/src/BioProfiling.jl index b89ae89..1b5e05e 100644 --- a/src/BioProfiling.jl +++ b/src/BioProfiling.jl @@ -10,6 +10,7 @@ export logtransform, hellinger, Filter, CombinationFilter, + MissingFilter, Experiment, filter_entries!, filter_entries, @@ -34,17 +35,23 @@ export logtransform, shuffled_distance_mahalanobis_median, distance_mahalanobis_median, robust_morphological_perturbation_value, + most_variable_features, + characteristic_features, + most_correlated, + freqtable, umap using Statistics, StatsBase, DataFrames, Images, ImageMagick, UMAP, RCall, MultipleTesting -using Distributed, ParallelDataTransfer +using Distributed, ParallelDataTransfer, FreqTables using LinearAlgebra: det include("struct.jl") +include("internal.jl") include("transform.jl") include("distances.jl") include("diagnostic.jl") include("visu.jl") include("rmpv.jl") +include("interpret.jl") end # module diff --git a/src/diagnostic.jl b/src/diagnostic.jl index 946129c..9bff025 100644 --- a/src/diagnostic.jl +++ b/src/diagnostic.jl @@ -174,4 +174,32 @@ function getColorImage(R::String, G::String, B::String; normalize = true) imgB ./= maximum(imgB) end colorview(RGB, imgR, imgG, imgB) +end + +# Expand freqtable to support Experiment objects, +# either to know how many entries are selected +# by a filter or the values taken by a feature +# for the subset of the entries selected. +""" +Expand `freqtable` to support `Experiment` objects. +Find the frequency of the values taken by feature `s` +in Experiment `e`. +""" +function FreqTables.freqtable(e::Experiment, + s::Symbol; + args...) + return(freqtable(getdata(e)[!,s]; args...)) +end + +""" +Expand `freqtable` to support `Experiment` objects. +Find the frequency of the values taken by feature `s` +in Experiment `e`. +""" +function FreqTables.freqtable(e::Experiment, + f::AbstractFilter; + args...) + entries_kept = [x in filter_entries(e, f) ? "Kept" : "Discarded" + for x in e.selected_entries] + return(freqtable(entries_kept; args...)) end \ No newline at end of file diff --git a/src/internal.jl b/src/internal.jl new file mode 100644 index 0000000..cbcc1ce --- /dev/null +++ b/src/internal.jl @@ -0,0 +1,45 @@ +"""[intended for internal use only] +Make sure an Experiment's data are numerical +and do not include missing values, NaNs or infs. +""" +function _assert_clean_data(e::Experiment) + # Column type cannot be used as for instance a Union type + # could support missing values but the selected data subset + # might contain only numbers + + # This excludes strings and missings (and more) + hasnumbers = getdata(e) |> + x -> isa.(x, Number) |> + eachcol |> + x -> all.(x) |> + all + @assert hasnumbers "Selected data include non-numeric values." + + # Exclude NaNs + hasnonans = getdata(e) |> + x -> isnan.(x) |> + eachcol |> + x -> any.(x) |> + any |> ~ + @assert hasnonans "Selected data include NaNs." + + # Exclude Inf + hasnoinf = getdata(e) |> + x -> isinf.(x) |> + eachcol |> + x -> any.(x) |> + any |> ~ + @assert hasnoinf "Selected data include Inf values." +end + +"""[intended for internal use only] +Convert all selected data columns to floats +""" +function _data_to_float!(e::Experiment) + # Make sure all values are numbers + @assert all( [x <: Number for x in eltype.(eachcol(getdata(e)))] ) + # Convert each column to floats + for colname in names(getdata(e)) + e.data[!,colname] = float.(e.data[:,colname]) + end +end diff --git a/src/interpret.jl b/src/interpret.jl new file mode 100644 index 0000000..0fb23a2 --- /dev/null +++ b/src/interpret.jl @@ -0,0 +1,88 @@ +""" +Return the features of `e` ranked by decreasing +median absolute deviation. Trim to the +first `top` features if a value is provided. +""" +function most_variable_features(e::Experiment; top::Int64 = 0) + e_mad_ind = e |> + getdata |> + eachcol |> + x -> mad.(x, normalize = true) |> + sortperm |> + reverse + + # Get symbols from indices + e_mad_sym = names(getdata(e))[e_mad_ind] + + # Truncate if needed + if (top > 0) && (length(e_mad_sym) > top) + e_mad_sym = e_mad_sym[1:top] + end + + return(e_mad_sym) +end + +""" +Return (all or if provided the `top`) features +varying the most in `e` (largest absolute log +fold change), when comparing entries matching +filters `f1` and `f2`. Columns for which the +fold change is negative come last. +""" +function characteristic_features(e::Experiment, + f1::AbstractFilter, + f2::AbstractFilter; + top::Int64 = 0) + f1_col = filter_entries(e,f1) + f2_col = filter_entries(e,f2) + + lfc_ind = e.data[:, e.selected_features] |> + eachcol |> + y -> map(x -> mean(x[f1_col]) / mean(x[f2_col]), y) |> + y -> map(x -> x <= 0 ? 0 : abs(log2(x)), y) |> + sortperm |> + reverse + + # Get symbols from indices + sym = names(getdata(e))[lfc_ind] + + # Truncate if needed + if (top > 0) && (length(sym) > top) + sym = sym[1:top] + end + + return(sym) +end + +""" +Return (all or if provided the `top`) features +in `e` associated the most with `ref` (absolute +Pearson correlation). +""" +function most_correlated(e::Experiment, + ref::AbstractVector; + top::Int64 = 0) + @assert all( [x <: Number for x in eltype.(eachcol(getdata(e)))] ) + mostcor_ind = e |> getdata |> + x -> cor(ref, Array(x)) |> + x -> abs.(x) |> + x -> sortperm([x...]) |> + reverse + + # Get symbols from indices + mostcor = names(getdata(e))[mostcor_ind] + + # Truncate if needed + if (top > 0) && (length(mostcor) > top) + mostcor = mostcor[1:top] + end + + return(mostcor) +end + +function most_correlated(e::Experiment, + ref::Symbol; + top::Int64 = 0) + ref_vector = e.data[e.selected_entries,ref] + most_correlated(e,ref_vector,top = top) +end \ No newline at end of file diff --git a/src/rmpv.jl b/src/rmpv.jl index 5c06593..90b4a59 100644 --- a/src/rmpv.jl +++ b/src/rmpv.jl @@ -93,8 +93,11 @@ end """ Compute the median Robust Mahalanobis Distance (RMD) in a dataset 'data' for a given perturbation of indices 'indpert' compared to a reference of indices 'indref'. - See https://e-archivo.uc3m.es/bitstream/handle/10016/24613/ws201710.pdf """ -function distance_robust_mahalanobis_median(data, indpert, indref) + See https://e-archivo.uc3m.es/bitstream/handle/10016/24613/ws201710.pdf + This function calls R using RCall and a seed is set by default to ensure + the results are reproducible. If you don't want that, for instance if you + also use RCall and rely on another seed, set 'r_seed' to false.""" +function distance_robust_mahalanobis_median(data, indpert, indref; r_seed = true) setPert = data[indpert,:] setRef = data[indref,:] @@ -106,13 +109,14 @@ function distance_robust_mahalanobis_median(data, indpert, indref) # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance @rput setRef + @rput r_seed R""" if (!require("robustbase")) install.packages("robustbase", repos = "https://cloud.r-project.org") library(robustbase) - set.seed(777) + if (r_seed){set.seed(777)} mcd <- covMcd(setRef) mcdCenter <- mcd$center mcdCov <- mcd$cov @@ -126,8 +130,11 @@ end """ Permute labels and compute the median Robust Mahalanobis Distance (RMD) in a dataset 'data' for a given perturbation of indices 'indpert' - compared to a reference of indices 'indref', to create an empirical distribution.""" -function shuffled_distance_robust_mahalanobis_median(data, indpert, indref; nb_rep = 250) + compared to a reference of indices 'indref', to create an empirical distribution. + This function calls R using RCall and a seed is set by default to ensure + the results are reproducible. If you don't want that, for instance if you + also use RCall and rely on another seed, set 'r_seed' to false.""" +function shuffled_distance_robust_mahalanobis_median(data, indpert, indref; nb_rep = 250, r_seed = true) setPert = data[indpert,:] setRef = data[indref,:] set = vcat(setRef, setPert) @@ -150,13 +157,14 @@ function shuffled_distance_robust_mahalanobis_median(data, indpert, indref; nb_r # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance @rput shuffSetRef - + @rput r_seed + R""" if (!require("robustbase")) install.packages("robustbase", repos = "https://cloud.r-project.org") library(robustbase) - set.seed(3895) + if (r_seed){set.seed(3895)} mcd <- covMcd(shuffSetRef) mcdCenter <- mcd$center mcdCov <- mcd$cov @@ -173,8 +181,11 @@ end """ Compute the Robust Hellinger Distance (RHD) in a dataset `data` for a given perturbation of indices `indpert` - compared to a reference of indices `indref`.""" -function distance_robust_hellinger(data, indpert, indref) + compared to a reference of indices `indref`. + This function calls R using RCall and a seed is set by default to ensure + the results are reproducible. If you don't want that, for instance if you + also use RCall and rely on another seed, set 'r_seed' to false.""" +function distance_robust_hellinger(data, indpert, indref; r_seed = true) setPert = data[indpert,:] setRef = data[indref,:] @@ -187,13 +198,14 @@ function distance_robust_hellinger(data, indpert, indref) # Compute Minimum Covariance Determinant and corresponding Robust Hellinger Distance @rput setRef @rput setPert + @rput r_seed R""" if (!require("robustbase")) install.packages("robustbase", repos = "https://cloud.r-project.org") library(robustbase) - set.seed(777) + if (r_seed){set.seed(777)} mcd1 <- covMcd(setRef) mcdCenter1 <- mcd1$center mcdCov1 <- mcd1$cov @@ -201,7 +213,7 @@ function distance_robust_hellinger(data, indpert, indref) # We set the seed twice to always # find the same estimators given # the same sample - set.seed(777) + if (r_seed){set.seed(777)} mcd2 <- covMcd(setPert) mcdCenter2 <- mcd2$center mcdCov2 <- mcd2$cov @@ -217,8 +229,11 @@ end """ Permute labels and compute the Robust Hellinger Distance (RHD) in a dataset `data` for a given perturbation of indices `indpert` - compared to a reference of indices `indref`, to create an empirical distribution.""" -function shuffled_distance_robust_hellinger(data, indpert, indref; nb_rep = 250) + compared to a reference of indices `indref`, to create an empirical distribution. + This function calls R using RCall and a seed is set by default to ensure + the results are reproducible. If you don't want that, for instance if you + also use RCall and rely on another seed, set 'r_seed' to false.""" +function shuffled_distance_robust_hellinger(data, indpert, indref; nb_rep = 250, r_seed = true) setPert = data[indpert,:] setRef = data[indref,:] set = vcat(setRef, setPert) @@ -242,13 +257,14 @@ function shuffled_distance_robust_hellinger(data, indpert, indref; nb_rep = 250) # Compute Minimum Covariance Determinant and corresponding Robust Mahalanobis Distance @rput shuffSetRef @rput shuffSetPert + @rput r_seed R""" if (!require("robustbase")) install.packages("robustbase", repos = "https://cloud.r-project.org") library(robustbase) - set.seed(777) + if (r_seed){set.seed(777)} mcd <- covMcd(shuffSetRef) mcdCenter1 <- mcd$center mcdCov1 <- mcd$cov @@ -256,7 +272,7 @@ function shuffled_distance_robust_hellinger(data, indpert, indref; nb_rep = 250) # We set the seed twice to always # find the same estimators given # the same sample - set.seed(777) + if (r_seed){set.seed(777)} mcd <- covMcd(shuffSetPert) mcdCenter2 <- mcd$center mcdCov2 <- mcd$cov @@ -289,6 +305,10 @@ end `nb_rep` times. If `process_pool` is a pool of worker processes, they will be used for parallel computation in the permutation test. + This function might call R using RCall and a seed is set by default + to ensure the results are reproducible. If you don't want that, for + instance if you also use RCall and rely on another seed, set 'r_seed' + to false. This returns a DataFrame with three columns: * `Condition`: the levels in `s` * `Distance`: the distance between a condition and the @@ -304,13 +324,18 @@ function robust_morphological_perturbation_value(e::AbstractExperiment, f::AbstractFilter; nb_rep::Int64 = 250, dist::Symbol = :RobHellinger, - process_pool = nothing) + process_pool = nothing, + r_seed = true) if dist == :RobHellinger - selected_distance = distance_robust_hellinger - shuffled_distance = shuffled_distance_robust_hellinger + selected_distance = (x...; kw...)->distance_robust_hellinger(x...; + r_seed = r_seed, kw...) + shuffled_distance = (x...; kw...)->shuffled_distance_robust_hellinger(x...; + r_seed = r_seed, kw...) elseif dist == :RobMedMahalanobis - selected_distance = distance_robust_mahalanobis_median - shuffled_distance = shuffled_distance_robust_mahalanobis_median + selected_distance = (x...; kw...)->distance_robust_mahalanobis_median(x...; + r_seed = r_seed, kw...) + shuffled_distance = (x...; kw...)->shuffled_distance_robust_mahalanobis_median(x...; + r_seed = r_seed, kw...) elseif dist == :MedMahalanobis selected_distance = distance_mahalanobis_median shuffled_distance = shuffled_distance_mahalanobis_median @@ -374,13 +399,15 @@ function robust_morphological_perturbation_value(e::AbstractExperiment, ref; nb_rep::Int64 = 250, dist::Symbol = :RobHellinger, - process_pool = nothing) + process_pool = nothing, + r_seed = true) ref_filter = Filter(ref, s) return(robust_morphological_perturbation_value(e, s, ref_filter; nb_rep=nb_rep, dist=dist, - process_pool=process_pool)) + process_pool=process_pool, + r_seed=r_seed)) end diff --git a/src/struct.jl b/src/struct.jl index 5b10e34..1d5ff3f 100644 --- a/src/struct.jl +++ b/src/struct.jl @@ -32,6 +32,7 @@ abstract type AbstractReduce end abstract type AbstractFilter <: AbstractReduce end abstract type AbstractSimpleFilter <: AbstractFilter end abstract type AbstractCombinationFilter <: AbstractFilter end +abstract type AbstractMissingFilter <: AbstractFilter end mutable struct Filter <: AbstractSimpleFilter value::Any @@ -52,6 +53,14 @@ mutable struct CombinationFilter <: AbstractCombinationFilter operator::Function end +mutable struct MissingFilter <: AbstractMissingFilter end + +# Constructor +function MissingFilter(feature; + description = "No description provided") + return Filter(1, feature, (x,y) -> !ismissing(x), description) +end + # Methods """Return filtered entries in an Experiment `e` based on filter `f` @@ -67,6 +76,10 @@ function filter_entries(e::AbstractExperiment, f::AbstractCombinationFilter) return(sort(f.operator(e1, e2))) end +function filter_entries(e::AbstractExperiment, f::AbstractMissingFilter) + return(sort(∩([filter_entries(e, MissingFilter(f)) + for f in Symbol.(names(e.data)[e.selected_features])]...))) +end """Filter entries in an Experiment `e` based on filter(s) `f`, updating `e.selected_entries` in place accordingly. diff --git a/src/transform.jl b/src/transform.jl index 332daa6..3588e0e 100644 --- a/src/transform.jl +++ b/src/transform.jl @@ -1,15 +1,3 @@ -"""[intended for internal use only] -Convert all selected data columns to floats -""" -function _data_to_float!(e::Experiment) - # Make sure all values are numbers - @assert all( [x <: Number for x in eltype.(eachcol(getdata(e)))] ) - # Convert each column to floats - for colname in names(getdata(e)) - e.data[!,colname] = float.(e.data[:,colname]) - end -end - """Approximate normal distribution""" logtransform(x) = log.(x .+ 1 .- minimum(x)) @@ -82,6 +70,7 @@ decorrelate(data::AbstractMatrix; ordercol = nothing, threshold = 0.8) = a given order 'ordercol' (defaults to left to right). """ function decorrelate!(e::Experiment; ordercol = nothing, threshold = 0.8) + _assert_clean_data(e) e.selected_features = e.selected_features[decorrelate(getdata(e), ordercol=ordercol, threshold=threshold)] @@ -92,6 +81,7 @@ end with largest median absolute deviation. """ function decorrelate_by_mad!(e::Experiment; threshold = 0.8) + _assert_clean_data(e) # Order features from biggest mad to smallest mad # When features have mad(reference) = 1, it means that we rank features # by how more variable they are overall compared to the reference diff --git a/test/runtests.jl b/test/runtests.jl index 4949a38..7b229b1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -9,6 +9,30 @@ using RCall using Distributed using ParallelDataTransfer +@testset "freqtable" begin + d = DataFrame(Any[0.05131 0.32830 "Exp1"; 0.83296 0.97647 "Exp1"; 0.66463 0.66939 "Exp2"; + 0.30651 0.58938 "Exp2"; 0.71313 0.18477 "Exp2"; 0.81810 0.16309 "Exp2"; + 0.05657 0.06012 "Exp1"; 0.02205 0.17055 "Exp2"; 0.49819 0.91871 "Exp1"; + 0.90857 0.18794 "Exp2"; 0.12327 0.00619 "Exp2"; 0.34146 0.62640 "Exp1"]) + rename!(d, [:Ft1, :Ft2, :Experiment]) + + e1 = Experiment(d, description = "Test Experiment") + + d1 = "Reject cells with high Ft2 values" + f1 = Filter(0.9, :Ft2, compare = isless, + description = d1) + + @test freqtable(e1, f1) == [2,10] + filter_entries!(e1, f1) + + d2 = "Select only a single experiment" + f2 = Filter("Exp1", :Experiment, description = d2) + + @test freqtable(e1, f2) == [7,3] + + @test freqtable(e1, :Experiment) == [3,7] +end + @testset "decorrelate" begin X = DataFrame([[1,2,3],[3,2,1],[0,1,2],[1,0,1]]) @test decorrelate(X) == [1,4] @@ -127,6 +151,20 @@ end # Additional checks that could be performed: # Filter.compare::Function -> Make sure it takes 2 arguments and return 1? # CombinationFilter.operator::Function -> Make sure it takes 2 lists and return 1? + + filter_entries!(e3, f2) + + # Add missing values + e3.data.Ft1 = Array{Union{Missing, Float64},1}(e3.data.Ft1) + e3.data.Experiment = Array{Union{Missing, String},1}(e3.data.Experiment) + e3.data.Ft1[4:6] .= missing + e3.data.Experiment[[5,10,12]] .= missing + + mf1 = MissingFilter(:Ft1) + mf2 = MissingFilter() + + @test filter_entries(e3, mf1) == [7,8,10,11] + @test filter_entries(e3, mf2) == [7,8,11] end @testset "Selector" begin @@ -246,6 +284,54 @@ end # output of diagnostic_images end +@testset "interpret" begin + # Define example dataset + d = DataFrame([[0,2,4,6,8,21],[0,1,2,3,4,0],[0,3,6,9,12,3],'A':'F']) + rename!(d, [Symbol.("Ft".*string.(1:3))..., :Class]) + + f1 = Filter('F', :Class, compare = !=, description = "Exclude row") + s1 = NameSelector(x -> occursin("Ft", String(x)), "Keep numerical features") + + e = Experiment(d) + select!(e, [f1,s1]) + + @test most_variable_features(e) == ["Ft3", "Ft1", "Ft2"] + + # Test equivalence to sorting by mad + top1ft = most_variable_features(e, top = 1) + decorrelate_by_mad!(e) + @test names(e.data)[e.selected_features] == top1ft + + d = DataFrame(rand(120,3)) + rename!(d, Symbol.("Ft".*string.(1:3))) + d.Class = repeat(["Ref", "A", "B"], 40) + + # Add specific differences for each class + d[d.Class .== "A",:Ft2] .+= 2; + d[d.Class .== "B",:Ft3] .+= 2; + + f1 = Filter(0.9, :Ft1, compare = <=, description = "Exclude 10% of entries") + s1 = NameSelector(x -> occursin("Ft", String(x)), "Keep numerical features") + + e = Experiment(d) + select!(e, [f1,s1]) + + filt_Ref = Filter("Ref", :Class) + filt_A = Filter("A", :Class) + filt_B = Filter("B", :Class) + + @test characteristic_features(e, filt_Ref, filt_A, top = 1) == ["Ft2"] + @test characteristic_features(e, filt_Ref, filt_B, top = 1) == ["Ft3"] + @test characteristic_features(e, filt_A, filt_B)[3] == "Ft1" + @test characteristic_features(e, filt_A, filt_B) == characteristic_features(e, filt_B, filt_A) + + correlated_ref = 3 .* getdata(e).Ft2; + @test most_correlated(e, correlated_ref, top = 1)[1] == "Ft2" + @test most_correlated(e, :Ft1)[1] == "Ft1" + e.data.Ft2 .-= 2 .* e.data.Ft3 + @test most_correlated(e, :Ft3) == ["Ft3", "Ft2", "Ft1"] +end + @testset "negation" begin # Define example dataset d = DataFrame(Any[0.0513198 0.328301 "Exp1"; 0.832986 0.976474 "Exp1"; 0.664634 0.669392 "Exp2"; @@ -395,6 +481,39 @@ end decorrelate_by_mad!(e5) @test e5.selected_features == [3] + # Test type issues + d.Ft4 = [collect(1:9)..., missing, Inf, NaN] + e5 = Experiment(d) + + # Includes String column + @test_throws AssertionError decorrelate!(e5) + + filter!(e5,[f1,s1]) + decorrelate!(e5) # This should now work + append!(e5.selected_features, 5) + + # Includes Inf values + @test_throws AssertionError decorrelate!(e5) + + e5.data[11,5] = 3 + e6 = deepcopy(e5) + decorrelate!(e6) # This should now work + append!(e5.selected_entries, 10) + + # Includes Missing values + @test_throws AssertionError decorrelate!(e5) + + e5.data[10,5] = 3 + e6 = deepcopy(e5) + decorrelate!(e6) # This should now work + append!(e5.selected_entries, 12) + + # Includes NaN values + @test_throws AssertionError decorrelate!(e5) + + e5.data[12,5] = 3 + decorrelate!(e5) # This should now work + # Test handling of non-float variables d.Ft2 = rand(1:4, size(d,1)) e5 = Experiment(d) @@ -595,6 +714,53 @@ end # A, B and C have the same distribution but D has a different one @test rmpv4.RMPV[rmpv4.Condition .== 'C'][1] > 0.1 @test rmpv4.RMPV[rmpv4.Condition .== 'D'][1] < 0.1 + + # Test reproducibility + Random.seed!(777) + rmpv_run1 = robust_morphological_perturbation_value(e, + :Condition, + 'C', + nb_rep = 4, + dist = :RobMedMahalanobis) + Random.seed!(777) + rmpv_run2 = robust_morphological_perturbation_value(e, + :Condition, + 'C', + nb_rep = 4, + dist = :RobMedMahalanobis) + Random.seed!(777) + rmpv_run3 = robust_morphological_perturbation_value(e, + :Condition, + 'C', + nb_rep = 4, + dist = :RobMedMahalanobis, + r_seed = false) + @test rmpv_run1 == rmpv_run2 + # NB: without changing the seeds, it is possible that + # the MCD converges so the following is a bad test: + # @test rmpv_run1 != rmpv_run3 + + Random.seed!(777) + rmpv_run1 = robust_morphological_perturbation_value(e, + :Condition, + 'C', + nb_rep = 4, + dist = :RobHellinger) + Random.seed!(777) + rmpv_run2 = robust_morphological_perturbation_value(e, + :Condition, + 'C', + nb_rep = 4, + dist = :RobHellinger) + Random.seed!(777) + rmpv_run3 = robust_morphological_perturbation_value(e, + :Condition, + 'C', + nb_rep = 4, + dist = :RobHellinger, + r_seed = false) + @test rmpv_run1 == rmpv_run2 + # @test rmpv_run1 != rmpv_run3 end @testset "parallel_rmpv" begin