Merge pull request #37 from menchelab/v1.1.0

V1.1.0
menchelab · Sep 23, 2021 · 15c0e2d · 15c0e2d · koalive · Sep 23, 2021
2 parents 13b72f6 + d3765f3
commit 15c0e2d
Show file tree

Hide file tree

Showing 11 changed files with 415 additions and 39 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## v1.1.0 - 2021-09-23
+### Added
+- Option to remove R seed from distance computation
+- `most_variable_features` method
+- `characteristic_features` method
+- `most_correlated` method
+- MissingFilter structures 
+- `freqtable` support
+
 ## v1.0.1 - 2021-09-23
 ### Changed
 - Correct intermittent error due to singular covariance matrices in Hellinger distance computation
@@ -46,7 +55,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 - Experiment structures and normalization functions
 
-[1.0.1] https://github.com/menchelab/BioProfiling.jl/compare/v0.4.1...HEAD
+[1.1.0] https://github.com/menchelab/BioProfiling.jl/compare/v1.0.1...HEAD
+[1.0.1] https://github.com/menchelab/BioProfiling.jl/compare/v1.0.0...v1.0.1
 [1.0.0] https://github.com/menchelab/BioProfiling.jl/compare/v0.4.1...v1.0.0
 [0.4] https://github.com/menchelab/BioProfiling.jl/compare/v0.3.4...v0.4.1
 [0.3] https://github.com/menchelab/BioProfiling.jl/compare/v0.2.1...v0.3.4

diff --git a/Project.toml b/Project.toml
@@ -1,12 +1,13 @@
 name = "BioProfiling"
 uuid = "cef322dc-4d82-11ea-04a7-113231db804d"
 authors = ["Loan Vulliard <lvulliard@cemm.oeaw.ac.at>"]
-version = "1.0.1"
+version = "1.1.0"
 
 [deps]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+FreqTables = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1"
 ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
 Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -21,6 +22,7 @@ UMAP = "c4f8c510-2410-5be4-91d7-4fbaeb39457e"
 [compat]
 DataFrames = "0.20 - 0.22"
 FileIO = "1"
+FreqTables = "0.3.3, 0.4"
 ImageMagick = "0.7, 1"
 Images = "0.17 - 0.24"
 MultipleTesting = "0.4"

diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ This package allows to perform robust multidimensional profiling in 'Julia' and
 
 ### Installation from Julia's package repository (easiest option)
 
-You can simply add this package from the Julia repository like for any other package:
+You can simply add this package from the Julia repository like any other package:
 
 ```julia
 import Pkg
@@ -45,7 +45,7 @@ import Pkg
 Pkg.add(Pkg.PackageSpec(url = "https://github.com/menchelab/BioProfiling.jl.git"))
 ```
 
-Use can then import the package:
+You can then import the package normally:
 
 ```julia
 using BioProfiling

diff --git a/src/BioProfiling.jl b/src/BioProfiling.jl
@@ -10,6 +10,7 @@ export  logtransform,
         hellinger, 
         Filter, 
         CombinationFilter, 
+        MissingFilter, 
         Experiment, 
         filter_entries!, 
         filter_entries,
@@ -34,17 +35,23 @@ export  logtransform,
         shuffled_distance_mahalanobis_median,
         distance_mahalanobis_median,
         robust_morphological_perturbation_value,
+        most_variable_features,
+        characteristic_features,
+        most_correlated,
+        freqtable,
         umap
 using Statistics, StatsBase, DataFrames, Images, ImageMagick, UMAP, RCall, MultipleTesting
-using Distributed, ParallelDataTransfer
+using Distributed, ParallelDataTransfer, FreqTables
 
 using LinearAlgebra: det
 
 include("struct.jl")
+include("internal.jl")
 include("transform.jl")
 include("distances.jl")
 include("diagnostic.jl")
 include("visu.jl")
 include("rmpv.jl")
+include("interpret.jl")
 
 end # module
diff --git a/src/diagnostic.jl b/src/diagnostic.jl
@@ -174,4 +174,32 @@ function getColorImage(R::String, G::String, B::String; normalize = true)
         imgB ./= maximum(imgB)
     end
     colorview(RGB, imgR, imgG, imgB)
+end
+
+# Extend freqtable to support Experiment objects,
+# either to know how many entries are selected
+# by a filter or the values taken by a feature
+# for the subset of the entries selected.
+"""
+    freqtable(e::Experiment, s::Symbol; args...)
+
+Extend `freqtable` to support `Experiment` objects.
+Tabulate the frequency of the values taken by feature `s`
+in the data returned by `getdata(e)`. Keyword arguments
+are forwarded unchanged to `freqtable`.
+"""
+function FreqTables.freqtable(e::Experiment, s::Symbol; args...)
+    feature_values = getdata(e)[!, s]
+    return freqtable(feature_values; args...)
+end
+
+"""
+    freqtable(e::Experiment, f::AbstractFilter; args...)
+
+Extend `freqtable` to support `Experiment` objects.
+Count how many of the currently selected entries of `e`
+are "Kept" or "Discarded" by filter `f`. Keyword arguments
+are forwarded unchanged to `freqtable`.
+"""
+function FreqTables.freqtable(e::Experiment,
+                              f::AbstractFilter;
+                              args...)
+    # Evaluate the filter once rather than once per entry, and
+    # store the result in a Set for constant-time membership tests
+    kept = Set(filter_entries(e, f))
+    entries_kept = [x in kept ? "Kept" : "Discarded"
+                    for x in e.selected_entries]
+    return(freqtable(entries_kept; args...))
 end
diff --git a/src/internal.jl b/src/internal.jl
@@ -0,0 +1,45 @@
+"""[intended for internal use only]
+Check that the selected data of an Experiment are numerical
+and free of missing values, NaNs and Infs. An assertion
+fails on the first condition that is violated.
+"""
+function _assert_clean_data(e::Experiment)
+    # The check is done value by value rather than on column types:
+    # a column typed as a Union could accept missing values while
+    # the selected subset only contains numbers
+
+    # Strings, missings and any other non-Number value fail here
+    numbermask = isa.(getdata(e), Number)
+    hasnumbers = all(all.(eachcol(numbermask)))
+    @assert hasnumbers "Selected data include non-numeric values."
+
+    # No NaN in any column
+    nanmask = isnan.(getdata(e))
+    hasnonans = ~any(any.(eachcol(nanmask)))
+    @assert hasnonans "Selected data include NaNs."
+
+    # No Inf in any column
+    infmask = isinf.(getdata(e))
+    hasnoinf = ~any(any.(eachcol(infmask)))
+    @assert hasnoinf "Selected data include Inf values."
+end
+
+"""[intended for internal use only]
+Replace each column returned by `getdata(e)` with its
+floating-point conversion, in place in `e.data`.
+All these columns must have a numeric element type.
+"""
+function _data_to_float!(e::Experiment)
+    # Non-numeric columns cannot be converted
+    @assert all( [x <: Number for x in eltype.(eachcol(getdata(e)))] )
+    # Overwrite the columns one at a time (all rows are converted)
+    selected_columns = names(getdata(e))
+    for col in selected_columns
+        e.data[!, col] = float.(e.data[:, col])
+    end
+    return nothing
+end
diff --git a/src/interpret.jl b/src/interpret.jl
@@ -0,0 +1,88 @@
+"""
+    most_variable_features(e::Experiment; top::Integer = 0)
+
+Return the names of the features of `e` ranked by decreasing
+(normalized) median absolute deviation. Only the first `top`
+names are returned when `top` is positive and smaller than
+the number of features; otherwise all names are returned.
+"""
+function most_variable_features(e::Experiment; top::Integer = 0)
+    # `top` accepts any Integer: an `Int64` annotation rejects the
+    # default value and plain literals where `Int` is 32 bits wide
+    mads = mad.(eachcol(getdata(e)), normalize = true)
+
+    # sortperm is ascending, so reverse to get decreasing MAD
+    e_mad_ind = reverse(sortperm(mads))
+
+    # Get feature names from indices
+    e_mad_sym = names(getdata(e))[e_mad_ind]
+
+    # Truncate if needed
+    if (top > 0) && (length(e_mad_sym) > top)
+        e_mad_sym = e_mad_sym[1:top]
+    end
+
+    return e_mad_sym
+end
+
+"""
+    characteristic_features(e, f1, f2; top::Integer = 0)
+
+Return (all or if provided the `top`) features
+varying the most in `e` (largest absolute log2
+fold change of the means), when comparing entries
+matching filters `f1` and `f2`. Features whose ratio
+of means is zero or negative get a score of 0 and
+therefore come last.
+"""
+function characteristic_features(e::Experiment,
+                                 f1::AbstractFilter,
+                                 f2::AbstractFilter;
+                                 top::Integer = 0)
+    f1_col = filter_entries(e,f1)
+    f2_col = filter_entries(e,f2)
+
+    # Score of one column: absolute log2 ratio of the group means.
+    # `zero(ratio)` keeps the score type identical in both branches.
+    function lfc_score(x)
+        ratio = mean(x[f1_col]) / mean(x[f2_col])
+        return ratio <= 0 ? zero(ratio) : abs(log2(ratio))
+    end
+
+    scores = map(lfc_score, eachcol(e.data[:, e.selected_features]))
+
+    # sortperm is ascending, so reverse to get decreasing scores
+    lfc_ind = reverse(sortperm(scores))
+
+    # Get feature names from indices
+    sym = names(getdata(e))[lfc_ind]
+
+    # Truncate if needed
+    if (top > 0) && (length(sym) > top)
+        sym = sym[1:top]
+    end
+
+    return sym
+end
+
+"""
+    most_correlated(e::Experiment, ref::AbstractVector; top::Integer = 0)
+
+Return (all or if provided the `top`) features
+in `e` associated the most with `ref` (absolute
+Pearson correlation), in decreasing order.
+All selected columns must have a numeric element type.
+"""
+function most_correlated(e::Experiment,
+                         ref::AbstractVector;
+                         top::Integer = 0)
+    @assert all( [x <: Number for x in eltype.(eachcol(getdata(e)))] )
+
+    # Correlation of `ref` with every selected feature
+    abs_cor = abs.(cor(ref, Array(getdata(e))))
+
+    # Flatten with `vec` rather than splatting every element,
+    # then reverse the ascending order given by sortperm
+    mostcor_ind = reverse(sortperm(vec(abs_cor)))
+
+    # Get feature names from indices
+    mostcor = names(getdata(e))[mostcor_ind]
+
+    # Truncate if needed
+    if (top > 0) && (length(mostcor) > top)
+        mostcor = mostcor[1:top]
+    end
+
+    return mostcor
+end
+
+"""
+    most_correlated(e::Experiment, ref::Symbol; top::Integer = 0)
+
+Return (all or if provided the `top`) features in `e`
+associated the most with the column `ref` of `e.data`,
+taken over the selected entries of `e`.
+"""
+function most_correlated(e::Experiment,
+                         ref::Symbol;
+                         top::Integer = 0)
+    # The reference column is read from the full data, so it does
+    # not need to be one of the selected features
+    ref_vector = e.data[e.selected_entries,ref]
+    return most_correlated(e, ref_vector; top = top)
+end