From da2e575b59cf317650a8f44f83b4513474172fbd Mon Sep 17 00:00:00 2001 From: dicook Date: Tue, 12 Mar 2024 16:04:32 +1100 Subject: [PATCH] anomaly index working --- NAMESPACE | 2 ++ R/anomaly-pursuit.r | 4 +-- R/display-xy.r | 48 ++++++++++++++++++++++++++------- R/tour-guided-anomaly.r | 59 +++++++++++++++++++++++++++++------------ R/util.r | 1 + man/display_xy.Rd | 8 ++++-- man/mapShapes.Rd | 2 ++ 7 files changed, 93 insertions(+), 31 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e9235996..dfd15912 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -136,7 +136,9 @@ importFrom(graphics,rect) importFrom(graphics,segments) importFrom(graphics,stars) importFrom(graphics,text) +importFrom(stats,mahalanobis) importFrom(stats,na.omit) +importFrom(stats,qchisq) importFrom(stats,quantile) importFrom(stats,residuals) importFrom(stats,rnorm) diff --git a/R/anomaly-pursuit.r b/R/anomaly-pursuit.r index c3f1ff41..48d86aeb 100644 --- a/R/anomaly-pursuit.r +++ b/R/anomaly-pursuit.r @@ -6,10 +6,10 @@ #' @export anomaly_index <- function() { - function(mat, ell2d) { + function(mat, ell2d, ellmu2d) { mat_tab <- #mean(mahal_dist(mat, ell2d)) - mean(mahalanobis(mat, center=c(0,0), cov=ell2d)) + mean(mahalanobis(mat, center=ellmu2d, cov=ell2d)) } } diff --git a/R/display-xy.r b/R/display-xy.r index 30bad6ee..3cd3963b 100644 --- a/R/display-xy.r +++ b/R/display-xy.r @@ -17,14 +17,17 @@ #' @param obs_labels vector of text labels to display #' @param ellipse pxp variance-covariance matrix defining ellipse, default NULL. Useful for #' comparing data with some null hypothesis -#' @param ellsize This can be considered the equivalent of a critical value, used to +#' @param ellc This can be considered the equivalent of a critical value, used to #' scale the ellipse larger or smaller to capture more or fewer anomalies. Default 3. +#' @param ellmu This is the centre of the ellipse corresponding to the mean of the +#' normal population. Default vector of 0's #' @param palette name of color palette for point colour, used by \code{\link{hcl.colors}}, default "Zissou 1" #' @param shapeset numbers corresponding to shapes in base R points, to use for mapping #' categorical variable to shapes, default=c(15:17, 23:25) #' @param ... other arguments passed on to \code{\link{animate}} and #' \code{\link{display_xy}} #' @importFrom graphics legend +#' @importFrom stats mahalanobis qchisq #' @export #' @examples #' animate_xy(flea[, 1:6]) @@ -68,7 +71,7 @@ display_xy <- function(center = TRUE, axes = "center", half_range = NULL, col = "black", pch = 20, cex = 1, edges = NULL, edges.col = "black", edges.width=1, obs_labels = NULL, - ellipse = NULL, ellsize = 3, + ellipse = NULL, ellc = NULL, ellmu = NULL, palette="Zissou 1", shapeset=c(15:17, 23:25), ...) { # Needed for CRAN checks labels <- NULL @@ -154,19 +157,37 @@ display_xy <- function(center = TRUE, axes = "center", half_range = NULL, if (!is.null(ellipse)) { if (nrow(ellipse) == nrow(proj)) { + if (is.null(ellc)) + ellc <- qchisq(0.95, nrow(proj)) + else + stopifnot(ellc > 0) # Needs to be positive + if (is.null(ellmu)) + ellmu <- rep(0, nrow(proj)) + else + stopifnot(length(ellmu) == nrow(proj)) # Right dimension + message("Using ellc = ", format(ellc, digits = 2)) + # Project ellipse into 2D # Notation in paper: ellipse=A, ellinv=A^(-1), # e2=P^TA^(-1)P, ell2d=B - evc <- eigen(ellipse*ellsize) + # ellipse=var-cov of normal pop + # ellinv for defining pD ellipse, for dist calc + # e2 is projected var-cov + # ell2d is B, used to project ellipse points + evc <- eigen(ellipse) # ellinv <- (evc$vectors) %*% diag(evc$values) %*% t(evc$vectors) - e2 <- t(proj) %*% ellinv %*% proj + e2 <- t(proj) %*% ellipse %*% proj evc2 <- eigen(e2) - ell2d <- (evc2$vectors) %*% sqrt(diag(evc2$values)) %*% t(evc2$vectors) - e3 <- eigen(ell2d) - ell2dinv <- (e3$vectors) %*% diag(e3$values) %*% t(e3$vectors) + ell2d <- (evc2$vectors) %*% sqrt(diag(evc2$values*ellc)) %*% t(evc2$vectors) + #e3 <- eigen(ell2d) + #ell2dinv <- (e3$vectors) %*% diag(e3$values) %*% t(e3$vectors) + # Compute the points on an ellipse + # Generate points on a circle sph <- geozoo::sphere.hollow(2, 200)$points + # Organise so lines connecting consecutive + # points creates the circle sph <- sph[order(sph[,2]),] sph1 <- sph[sph[,2]>=0,] sph2 <- sph[sph[,2]<0,] @@ -174,16 +195,23 @@ display_xy <- function(center = TRUE, axes = "center", half_range = NULL, sph2 <- sph2[order(sph2[,1], decreasing=T),] sph <- rbind(sph1, sph2) sph <- rbind(sph, sph[1,]) - sph2d <- sph%*%ell2dinv/half_range + + # Transform circle points into an ellipse + sph2d <- sph%*%ell2d + # Centre on the given mean + ellmu2d <- t(as.matrix(ellmu)) %*% proj + sph2d <- sweep(sph2d, 2, ellmu2d, `+`) + # Scale ellipse into plot space + sph2d <- sph2d/half_range lines(sph2d) # Colour points outside the pD ellipse mdst <- mahalanobis(data, - center=rep(0, ncol(data)), + center=ellmu, cov=ellipse) #mdst <- mahal_dist(data, ellipse) - anomalies <- which(mdst > ellsize) + anomalies <- which(mdst > ellc) points(x[anomalies,], col = "red", pch = 4, diff --git a/R/tour-guided-anomaly.r b/R/tour-guided-anomaly.r index 5e44a3ad..11625b26 100644 --- a/R/tour-guided-anomaly.r +++ b/R/tour-guided-anomaly.r @@ -19,19 +19,22 @@ #' @param max.i the maximum index value, stop search if a larger value is found #' @param ellipse pxp variance-covariance matrix defining ellipse, default NULL. #' Useful for comparing data with some hypothesized null. -#' @param ellsize This can be considered the equivalent of a critical value, used to +#' @param ellc This can be considered the equivalent of a critical value, used to #' scale the ellipse larger or smaller to capture more or fewer anomalies. Default 3. +#' @param ellmu This is the centre of the ellipse corresponding to the mean of the +#' normal population. Default vector of 0's #' @param ... arguments sent to the search_f #' @seealso \code{\link{slice_index}} for an example of an index functions. #' \code{\link{search_geodesic}}, \code{\link{search_better}}, #' \code{\link{search_better_random}} for different search strategies +#' @importFrom stats mahalanobis qchisq #' @export #' @examples #' animate_xy(flea[, 1:6], guided_anomaly_tour(anomaly_index(), #' ellipse=cov(flea[,1:6])), ellipse=cov(flea[,1:6]), axes="off") guided_anomaly_tour <- function(index_f, d = 2, alpha = 0.5, cooling = 0.99, max.tries = 25, max.i = Inf, - ellipse, ellsize=3, + ellipse, ellc=NULL, ellmu=NULL, search_f = search_geodesic, ...) { h <- NULL @@ -45,22 +48,44 @@ guided_anomaly_tour <- function(index_f, d = 2, alpha = 0.5, cooling = 0.99, } index <- function(proj) { - # Check which observations are outside pD ellipse - mdst <- mahalanobis(data, - center=rep(0, ncol(data)), - cov=ellipse) + if (nrow(ellipse) == nrow(proj)) { + + if (is.null(ellc)) + ellc <- qchisq(0.95, nrow(proj)) + else + stopifnot(ellc > 0) # Needs to be positive + if (is.null(ellmu)) + ellmu <- rep(0, nrow(proj)) + else + stopifnot(length(ellmu) == nrow(proj)) # Right dimension + message("Using ellc = ", format(ellc, digits = 2)) + + # Check which observations are outside pD ellipse + mdst <- mahalanobis(data, + center=ellmu, + cov=ellipse) #mdst <- mahal_dist(data, ellipse) - anomalies <- which(mdst > ellsize) - stopifnot(length(anomalies) > 0) - # Project ellipse into 2D - evc <- eigen(ellipse) - ellinv <- (evc$vectors) %*% diag(evc$values) %*% t(evc$vectors) - e2 <- t(proj) %*% ellinv %*% proj - evc2 <- eigen(e2) - ell2d <- (evc2$vectors) %*% diag(sqrt(evc2$values)) %*% t(evc2$vectors) - e3 <- eigen(ell2d) - ell2dinv <- (e3$vectors) %*% diag(e3$values) %*% t(e3$vectors) - index_f(as.matrix(data[anomalies,]) %*% proj, ell2dinv) + anomalies <- which(mdst > ellc) + stopifnot(length(anomalies) > 0) + + # Project ellipse into 2D + evc <- eigen(ellipse) # + ellinv <- (evc$vectors) %*% diag(evc$values) %*% t(evc$vectors) + e2 <- t(proj) %*% ellipse %*% proj + evc2 <- eigen(e2) + ell2d <- (evc2$vectors) %*% sqrt(diag(evc2$values*ellc)) %*% t(evc2$vectors) + + ell2dinv <- (evc2$vectors) %*% diag(evc2$values*ellc) %*% t(evc2$vectors) + ellmu2d <- t(as.matrix(ellmu)) %*% proj + #evc <- eigen(ellipse) + #ellinv <- (evc$vectors) %*% diag(evc$values) %*% t(evc$vectors) + #e2 <- t(proj) %*% ellinv %*% proj + #evc2 <- eigen(e2) + #ell2d <- (evc2$vectors) %*% diag(sqrt(evc2$values)) %*% t(evc2$vectors) + #e3 <- eigen(ell2d) + #ell2dinv <- (e3$vectors) %*% diag(e3$values) %*% t(e3$vectors) + index_f(as.matrix(data[anomalies,]) %*% proj, e2, ellmu2d) + } } cur_index <- index(current) diff --git a/R/util.r b/R/util.r index 002bd315..c67e039e 100644 --- a/R/util.r +++ b/R/util.r @@ -136,6 +136,7 @@ mapColors <- function(x, palette) { #' Map vector of factors to pch #' #' @param x vector +#' @param shapeset vector of integers indicating point shapes #' @export mapShapes <- function(x, shapeset) { n <- length(unique(x)) diff --git a/man/display_xy.Rd b/man/display_xy.Rd index 7ad22e21..9e4f331d 100644 --- a/man/display_xy.Rd +++ b/man/display_xy.Rd @@ -17,7 +17,8 @@ display_xy( edges.width = 1, obs_labels = NULL, ellipse = NULL, - ellsize = 3, + ellc = NULL, + ellmu = NULL, palette = "Zissou 1", shapeset = c(15:17, 23:25), ... @@ -52,9 +53,12 @@ If not set, defaults to maximum distance from origin to each row of data.} \item{ellipse}{pxp variance-covariance matrix defining ellipse, default NULL. Useful for comparing data with some null hypothesis} -\item{ellsize}{This can be considered the equivalent of a critical value, used to +\item{ellc}{This can be considered the equivalent of a critical value, used to scale the ellipse larger or smaller to capture more or fewer anomalies. Default 3.} +\item{ellmu}{This is the centre of the ellipse corresponding to the mean of the +normal population. Default vector of 0's} + \item{palette}{name of color palette for point colour, used by \code{\link{hcl.colors}}, default "Zissou 1"} \item{shapeset}{numbers corresponding to shapes in base R points, to use for mapping diff --git a/man/mapShapes.Rd b/man/mapShapes.Rd index 7b3ab960..f8edd1f6 100644 --- a/man/mapShapes.Rd +++ b/man/mapShapes.Rd @@ -8,6 +8,8 @@ mapShapes(x, shapeset) } \arguments{ \item{x}{vector} + +\item{shapeset}{vector of integers indicating point shapes} } \description{ Map vector of factors to pch