diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..8af6a08 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^CONDUCT\.md$ diff --git a/.gitignore b/.gitignore index 5b5ff13..3f20077 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,18 @@ inst/extdata/ hum2vec.Rproj src/*.o src/*.so -cookbooks cookbooks.txt cookbooks.vectors cookbooks.zip +cookbooks +cookbooks.txt +cookbooks.vectors +cookbooks.zip cookbooks* etc +cookbook_vectors.bin +tests/testthat/binary.bin +tests/testthat/input.txt +tests/testthat/tmp.txt +tests/testthat/binary.bin +tests/testthat/tmp.bin +vignettes/*.R +vignettes/*.html +vignettes/*_files diff --git a/.travis.yml b/.travis.yml index eacc524..ec2b6b9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,8 @@ language: r cache: packages -warnings_are_errors: false - +warnings_are_errors: true +r_build_args: --no-build-vignettes --no-manual --no-resave-data +r_check_args: --no-build-vignettes --no-manual r: - release - devel diff --git a/CONDUCT.md b/CONDUCT.md new file mode 100644 index 0000000..52a673e --- /dev/null +++ b/CONDUCT.md @@ -0,0 +1,25 @@ +# Contributor Code of Conduct + +As contributors and maintainers of this project, we pledge to respect all people who +contribute through reporting issues, posting feature requests, updating documentation, +submitting pull requests or patches, and other activities. + +We are committed to making participation in this project a harassment-free experience for +everyone, regardless of level of experience, gender, gender identity and expression, +sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. + +Examples of unacceptable behavior by participants include the use of sexual language or +imagery, derogatory comments or personal attacks, trolling, public or private harassment, +insults, or other unprofessional conduct. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, +commits, code, wiki edits, issues, and other contributions that are not aligned to this +Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed +from the project team. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by +opening an issue or contacting one or more of the project maintainers. + +This Code of Conduct is adapted from the Contributor Covenant +(http:contributor-covenant.org), version 1.0.0, available at +http://contributor-covenant.org/version/1/0/0/ diff --git a/DESCRIPTION b/DESCRIPTION index 7539317..bcd6775 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,23 +1,38 @@ Package: wordVectors Type: Package Title: Tools for creating and analyzing vector-space models of texts -Version: 1.3 -Date: 2015-09-10 +Version: 2.0 Author: Ben Schmidt, Jian Li Maintainer: Ben Schmidt -Description: wordVectors wraps Google's word2vec code for creating vector-space - models of texts, and defines a new class "VectorSpaceModel" (extending the native matrix class) - with a number of functions that make it easier to perform useful operations in a - word-vector space. +Description: + wordVectors wraps Google's implementation in C for training word2vec models, + and provides several R functions for exploratory data analysis of word2vec + and other related models. 
These include import-export from the binary format,
+ some useful linear algebra operations missing from R, and a streamlined
+ syntax for working with models and performing vector arithmetic that make it
+ easier to perform useful operations in a word-vector space.
License: Apache License (== 2.0)
+URL: http://github.com/bmschmidt/wordVectors
+BugReports: https://github.com/bmschmidt/wordVectors/issues
Depends: R (>= 2.14.0)
LazyData: TRUE
Imports: + magrittr, graphics, methods, - utils + utils, + stats, + readr, + stringr, + stringi
Suggests: - stringi, - tsne -RoxygenNote: 5.0.1 + tsne, + testthat, + ggplot2, + knitr, + dplyr, + rmarkdown, + devtools
+RoxygenNote: 6.0.1
+VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE index e119550..4b47db1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,13 @@ # Generated by roxygen2: do not edit by hand
+export("%>%")
export(as.VectorSpaceModel)
+export(closest_to)
export(cosineDist)
export(cosineSimilarity)
+export(distend)
export(filter_to_rownames)
+export(improve_vectorspace)
export(magnitudes)
export(nearest_to)
export(normalize_lengths)
@@ -17,4 +21,5 @@ export(word2phrase)
export(write.binary.word2vec)
exportClasses(VectorSpaceModel)
exportMethods(plot)
+importFrom(magrittr,"%>%")
useDynLib(wordVectors)
diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..ac03212 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,137 @@
+# VERSION 2.0
+
+Upgrade focusing on ease of use and CRAN-ability. Bumping major version because of a breaking change in the behavior of `closest_to`, which now returns a data.frame.
+
+# Changes
+
+## New default function: closest_to.
+
+`nearest_to` was previously the easiest way to interact with cosine similarity functions. That's been deprecated
+in favor of a new function, `closest_to`. (I would have simply changed `nearest_to` itself, but for back-compatibility reasons it survives as a thin wrapper.)
+The data.frame columns have elaborate names so they can easily be manipulated with dplyr, and/or plotted with ggplot.
+`nearest_to` is now just a wrapped version of the new function.
+
+## New syntax for vector addition.
+
+This package now allows formula scoping for the most common operations, and string inputs to access words in the context of a particular matrix. This makes it much nicer to handle the bread-and-butter word2vec operations.
+
+For instance, instead of writing
+```R
+vectors %>% closest_to(vectors[rownames(vectors)=="king",] - vectors[rownames(vectors)=="man",] + vectors[rownames(vectors)=="woman",])
+```
+
+(whew!), you can write
+
+```R
+vectors %>% closest_to(~"king" - "man" + "woman")
+```
+
+
+## Reading tweaks.
+
+In keeping with the goal of allowing manipulation of models in low-memory environments, it's now possible to read only rows with words matching certain criteria by passing an argument to read.binary.vectors(); either `rowname_list` for a fixed list, or `rowname_regexp` for a regular expression. (You could, say, read only the gerunds from a file by entering `rowname_regexp = ".*ing$"`).
+
+## Test Suite
+
+The package now includes a test suite.
+
+## Other materials for rOpenSci and JOSS.
+
+This package has enough users that it might be nice to get it on CRAN. I'm trying to do so through rOpenSci. That requires a lot of small files scattered throughout.
+
+
+# VERSION 1.3
+
+Two significant performance improvements.
+1. Row magnitudes for a `VectorSpaceModel` object are now **cached** in an environment that allows some pass-by-reference editing.
This means that the most time-consuming part of any comparison query is only done once for any given vector set; subsequent queries are roughly an order of magnitude (10-20x) faster.
+
+Although this is a big performance improvement, certain edge cases might not clear the cache. **In particular, assignment inside a VSM object might cause incorrect calculations.** I can't see why anyone would be in the habit of manually tweaking a row or block (rather than a whole matrix).
+1. Access to rows in a `VectorSpaceModel` object is now handled through callNextMethod() rather than accessing the element's .Data slot. For reasons opaque to me, hitting the .Data slot seems to require copying the whole huge matrix internally. Now that no longer happens.
+
+
+# VERSION 1.2
+
+This release implements a number of incremental improvements and clears up some errors.
+- The package is now able to read and write in the binary word2vec format; since this is faster and takes much less hard drive space (down by about 66%) than writing out floats as text, it does so internally.
+- Several improvements to the C codebase to avoid warnings by @mukul13, described [here](https://github.com/bmschmidt/wordVectors/pull/9). (Does this address the `long long` issue?)
+- Subsetting with `[[` now takes an argument `average`; if false, rather than collapse a matrix down to a single row, it just extracts the elements that correspond to the words.
+- Added sample data in the object `demo_vectors`: the vectors for the 999 most common words.
+- Began adding examples to the codebase.
+- Tracking build success using Travis.
+- Dodging most warnings from R CMD check.
+
+Bug fixes
+- If the `dir.exists` function is undefined, the package defines one for you. This should allow installation on R 3.1 and some lower versions.
+- `reject` and `project` are better about returning VSM objects, rather than dropping back into a matrix.
+
+# VERSION 1.1
+
+A few changes, primarily to the functions for _training_ vector spaces to produce higher-quality models. A number of these changes are merged back in from the fork of this repo by GitHub user @sunecasp. Thanks!
+
+## Some bug fixes
+
+Filenames can now be up to 1024 characters. Some parameters on alpha decay may be fixed; I'm not entirely sure what sunecasp's changes do.
+
+## Changes to default number of iterations.
+
+Models now default to 5 iterations through the text rather than 1. That means training may take 5 times as long; but particularly for small corpora, the vectors should be of higher quality. See below for an example.
+
+## More training arguments
+
+You can now specify more flags to the word2vec code. `?train_word2vec` gives a full list, but particularly useful (and combined in the short sketch just after this list) are:
+1. `window` now accurately sets the window size.
+2. `iter` sets the number of iterations. For very large corpora, `iter=1` will train most quickly; for very small corpora, `iter=15` will give substantially better vectors. (See below). You should set this as high as you can stand within reason (setting `iter` to a number higher than `window` is probably not that useful). But more text is better than more iterations.
+3. `min_count` gives a cutoff for vocabulary size. Tokens occurring fewer than `min_count` times will be dropped from the model. Setting this high can be useful. (But note that a trained model is sorted in order of frequency, so if you have the RAM to train a big model you can reduce it in size for analysis by just subsetting to the first 10,000 or whatever rows).
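+
+Putting those flags together, a minimal sketch of a combined call (the corpus and output paths here are purely illustrative):
+
+```R
+# Wider window, more passes, and a higher vocabulary cutoff than the defaults.
+# Writes the model to disk and returns it as a VectorSpaceModel object.
+model = train_word2vec("corpus.txt", "corpus_vectors.bin",
+                       vectors = 100, window = 12, iter = 10,
+                       min_count = 10, threads = 2, force = TRUE)
+```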
+ +## Example of vectors + +Here's an example of training on a small set (c. 1000 speeches on the floor of the house of commons from the early 19th century). + +> proc.time({one = train_word2vec("~/tmp2.txt","~/1_iter.vectors",iter = 1)}) +> Error in train_word2vec("~/tmp2.txt", "~/1_iter.vectors", iter = 1) : +> The output file '~/1_iter.vectors' already exists: delete or give a new destination. +> proc.time({one = train_word2vec("~/tmp2.txt","~/1_iter.vectors",iter = 1)}) +> Starting training using file /Users/bschmidt/tmp2.txt +> Vocab size: 4469 +> Words in train file: 407583 +> Alpha: 0.000711 Progress: 99.86% Words/thread/sec: 67.51k +> Error in proc.time({ : 1 argument passed to 'proc.time' which requires 0 +> ?proc.time +> system.time({one = train_word2vec("~/tmp2.txt","~/1_iter.vectors",iter = 1)}) +> Starting training using file /Users/bschmidt/tmp2.txt +> Vocab size: 4469 +> Words in train file: 407583 +> Alpha: 0.000711 Progress: 99.86% Words/thread/sec: 66.93k user system elapsed +> 6.753 0.055 6.796 +> system.time({two = train_word2vec("~/tmp2.txt","~/2_iter.vectors",iter = 3)}) +> Starting training using file /Users/bschmidt/tmp2.txt +> Vocab size: 4469 +> Words in train file: 407583 +> Alpha: 0.000237 Progress: 99.95% Words/thread/sec: 67.15k user system elapsed +> 18.846 0.085 18.896 +> +> two %>% nearest_to(two["debt"]) %>% round(3) +> debt remainder Jan including drawback manufactures prisoners mercantile subsisting +> 0.000 0.234 0.256 0.281 0.291 0.293 0.297 0.314 0.314 +> Dec +> 0.318 +> one %>% nearest_to(one[["debt"]]) %>% round(3) +> debt Christmas exception preventing Indies import remainder eye eighteen labouring +> 0.000 0.150 0.210 0.214 0.215 0.220 0.221 0.223 0.225 0.227 +> +> system.time({ten = train_word2vec("~/tmp2.txt","~/10_iter.vectors",iter = 10)}) +> Starting training using file /Users/bschmidt/tmp2.txt +> Vocab size: 4469 +> Words in train file: 407583 +> Alpha: 0.000071 Progress: 99.98% Words/thread/sec: 66.13k user system elapsed +> 62.070 0.296 62.333 +> +> ten %>% nearest_to(ten[["debt"]]) %>% round(3) +> debt surplus Dec remainder manufacturing grants Jan drawback prisoners +> 0.000 0.497 0.504 0.510 0.519 0.520 0.533 0.536 0.546 +> compelling +> 0.553 + +``` +``` + diff --git a/R/data.R b/R/data.R index 73826c0..67d8350 100644 --- a/R/data.R +++ b/R/data.R @@ -3,7 +3,7 @@ #' A sample VectorSpaceModel object trained on about 15 million #' teaching evaluations, limited to the 999 most common words. #' Included for demonstration purposes only: there's only so much you can -#' do with a 999 dimension vocabulary. +#' do with a 999 length vocabulary. #' #' You're best off downloading a real model to work with, #' such as the precompiled vectors distributed by Google diff --git a/R/functions.R b/R/functions.R deleted file mode 100644 index 0c2fb8c..0000000 --- a/R/functions.R +++ /dev/null @@ -1,5 +0,0 @@ -laziestLoad <- function(path=".") { - files <- list.files(path,recursive=TRUE) - cache_files <- sub(".rdb$", "", files[grepl(".rdb$", files)]) - for (i in cache_files) try(lazyLoad(i, envir = .GlobalEnv)) -} \ No newline at end of file diff --git a/R/matrixFunctions.R b/R/matrixFunctions.R index a898446..faa6836 100644 --- a/R/matrixFunctions.R +++ b/R/matrixFunctions.R @@ -1,3 +1,94 @@ +#' Improve a vectorspace by removing common elements. +#' +#' +#' @param vectorspace A VectorSpacemodel to be improved. +#' @param D The number of principal components to eliminate. +#' +#' @description See reference for a full description. 
Supposedly, these operations will improve performance on analogy tasks. +#' +#' @references Jiaqi Mu, Suma Bhat, Pramod Viswanath. All-but-the-Top: Simple and Effective Postprocessing for Word Representations. https://arxiv.org/abs/1702.01417. +#' @return A VectorSpaceModel object, transformed from the original. +#' @export +#' +#' @examples +#' +#' closest_to(demo_vectors,"great") +#' # stopwords like "and" and "very" are no longer top ten. +#' # I don't know if this is really better, though. +#' +#' closest_to(improve_vectorspace(demo_vectors),"great") +#' +improve_vectorspace = function(vectorspace,D=round(ncol(vectorspace)/100)) { + mean = methods::new("VectorSpaceModel", + matrix(apply(vectorspace,2,mean), + ncol=ncol(vectorspace)) + ) + vectorspace = vectorspace-mean + pca = stats::prcomp(vectorspace) + + # I don't totally understand the recommended operation in the source paper, but this seems to do much + # the same thing using the internal functions of the package to reject the top i dimensions one at a time. + drop_top_i = function(vspace,i) { + if (i<=0) {vspace} else if (i==1) { + reject(vspace,pca$rotation[,i]) + } else { + drop_top_i(reject(vspace,pca$rotation[,i]), i-1) + } + } + better = drop_top_i(vectorspace,D) +} + + +#' Internal function to subsitute strings for a tree. Allows arithmetic on words. +#' +#' @noRd +#' +#' @param tree an expression from a formula +#' @param context the VSM context in which to parse it. +#' +#' @return a tree +sub_out_tree = function(tree, context) { + # This is a whitelist of operators that I think are basic for vector math. + # It's possible it could be expanded. + + # This might fail if you try to pass a reference to a basic + # arithmetic operator, or something crazy like that. + + if (deparse(tree[[1]]) %in% c("+","*","-","/","^","log","sqrt","(")) { + for (i in 2:length(tree)) { + tree[[i]] <- sub_out_tree(tree[[i]],context) + } + } + if (is.character(tree)) { + return(context[[tree]]) + } + return(tree) +} + +#' Internal function to wrap for sub_out_tree. Allows arithmetic on words. +#' +#' @noRd +#' +#' @param formula A formula; string arithmetic on the LHS, no RHS. +#' @param context the VSM context in which to parse it. +#' +#' @return an evaluated formula. + +sub_out_formula = function(formula,context) { + # Despite the name, this will work on something that + # isn't a formula. That's by design: we want to allow + # basic reference passing, and also to allow simple access + # to words. + + if (class(context) != "VectorSpaceModel") {return(formula)} + if (class(formula)=="formula") { + formula[[2]] <- sub_out_tree(formula[[2]],context) + return(eval(formula[[2]])) + } + if (is.character(formula)) {return(context[[formula]])} + return(formula) +} + #' Vector Space Model class #' #' @description A class for describing and accessing Vector Space Models like Word2Vec. @@ -15,7 +106,7 @@ setClass("VectorSpaceModel",slots = c(".cache"="environment"),contains="matrix") # http://r.789695.n4.nabble.com/Change-value-of-a-slot-of-an-S4-object-within-a-method-td2338484.html setMethod("initialize", "VectorSpaceModel", function(.Object, ..., .cache=new.env()) { - callNextMethod(.Object, .cache=.cache, ...) + methods::callNextMethod(.Object, .cache=.cache, ...) 
}) #' Square Magnitudes with caching @@ -27,7 +118,7 @@ setMethod("initialize", "VectorSpaceModel", #' @keywords internal square_magnitudes = function(object) { if (class(object)=="VectorSpaceModel") { - if (.hasSlot(object, ".cache")) { + if (methods::.hasSlot(object, ".cache")) { if (is.null(object@.cache$magnitudes)) { object@.cache$magnitudes = rowSums(object^2) } @@ -50,12 +141,13 @@ square_magnitudes = function(object) { #' @param x The vectorspace model to subset #' @param i The row numbers to extract #' @param j The column numbers to extract -#' @param j Other arguments to extract (unlikely to be useful). +#' @param ... Other arguments passed to extract (unlikely to be useful). +#' #' @param drop Whether to drop columns. This parameter is ignored. #' @return A VectorSpaceModel #' setMethod("[","VectorSpaceModel",function(x,i,j,...,drop) { - nextup = callNextMethod() + nextup = methods::callNextMethod() if (!is.matrix(nextup)) { # A verbose way of effectively changing drop from TRUE to FALSE; # I don't want one-dimensional matrices turned to vectors. @@ -66,7 +158,7 @@ setMethod("[","VectorSpaceModel",function(x,i,j,...,drop) { nextup = matrix(nextup,ncol=j) } } - new("VectorSpaceModel",nextup) + methods::new("VectorSpaceModel",nextup) }) #' VectorSpaceModel subtraction @@ -86,11 +178,11 @@ setMethod("[","VectorSpaceModel",function(x,i,j,...,drop) { #' setMethod("-",signature(e1="VectorSpaceModel",e2="VectorSpaceModel"),function(e1,e2) { if (nrow(e1)==nrow(e2) && ncol(e1)==ncol(e2)) { - return (methods::new("VectorSpaceModel",callNextMethod())) + return (methods::new("VectorSpaceModel",e1@.Data-e2@.Data)) } if (nrow(e2)==1) { return( - new("VectorSpaceModel",e1 - matrix(rep(e2,each=nrow(e1)),nrow=nrow(e1))) + methods::new("VectorSpaceModel",e1 - matrix(rep(e2,each=nrow(e1)),nrow=nrow(e1))) ) } stop("Vector space model subtraction must use models of equal dimensions") @@ -138,7 +230,7 @@ setMethod("[[","VectorSpaceModel",function(x,i,average=TRUE) { setMethod("show","VectorSpaceModel",function(object) { dims = dim(object) cat("A VectorSpaceModel object of ",dims[1]," words and ", dims[2], " vectors\n") - methods::show(unclass(object[1:min(nrow(object),10),1:min(ncol(object),6)])) + methods::show(unclass(object[1:min(nrow(object),10),1:min(ncol(object),6),drop=F])) }) #' Plot a Vector Space Model. @@ -147,26 +239,35 @@ setMethod("show","VectorSpaceModel",function(object) { #' sanest thing to do is reduce the full model down to two dimensions #' using T-SNE, which preserves some of the local clusters. #' +#' For individual subsections, it can make sense to do a principal components +#' plot of the space of just those letters. This is what happens if method +#' is pca. On the full vocab, it's kind of a mess. +#' #' This plots only the first 300 words in the model. #' #' @param x The model to plot -#' @param y (ignored) -#' @param ... Further arguments passed to tsne::tsne. -#' (Note: not to plot.) +#' @param method The method to use for plotting. "pca" is principal components, "tsne" is t-sne +#' @param ... Further arguments passed to the plotting method. #' #' @return The TSNE model (silently.) #' @export -setMethod("plot","VectorSpaceModel",function(x,y,...) { - message("Attempting to use T-SNE to plot the vector representation") - message("Cancel if this is taking too long") - message("Or run 'install.packages' tsne if you don't have it.") - x = as.matrix(x) - short = x[1:min(300,nrow(x)),] - m = tsne::tsne(short,...) 
- plot(m,type='n',main="A two dimensional reduction of the vector space model using t-SNE") - graphics::text(m,rownames(short),cex = ((400:1)/200)^(1/3)) - rownames(m)=rownames(short) - silent = m +setMethod("plot","VectorSpaceModel",function(x,method="tsne",...) { + if (method=="tsne") { + message("Attempting to use T-SNE to plot the vector representation") + message("Cancel if this is taking too long") + message("Or run 'install.packages' tsne if you don't have it.") + x = as.matrix(x) + short = x[1:min(300,nrow(x)),] + m = tsne::tsne(short,...) + graphics::plot(m,type='n',main="A two dimensional reduction of the vector space model using t-SNE") + graphics::text(m,rownames(short),cex = ((400:1)/200)^(1/3)) + rownames(m)=rownames(short) + silent = m + } else if (method=="pca") { + vectors = stats::predict(stats::prcomp(x))[,1:2] + graphics::plot(vectors,type='n') + graphics::text(vectors,labels=rownames(vectors)) + } }) #' Convert to a Vector Space Model @@ -185,18 +286,18 @@ as.VectorSpaceModel = function(matrix) { #' #' @param filename The file to read in. #' @param vectors The number of dimensions word2vec calculated. Imputed automatically if not specified. -#' @param binary Read in the binary word2vec form. (Wraps `read.binary.vectors`) +#' @param binary Read in the binary word2vec form. (Wraps `read.binary.vectors`) By default, function +#' guesses based on file suffix. #' @param ... Further arguments passed to read.table or read.binary.vectors. -#' Note that both accept 'nrow' as an argument. Word2vec produces -#' by default frequency sorted output. Therefore 'read.vectors(...,nrows=500)', for example, +#' Note that both accept 'nrows' as an argument. Word2vec produces +#' by default frequency sorted output. Therefore 'read.vectors("file.bin", nrows=500)', for example, #' will return the vectors for the top 500 words. This can be useful on machines with limited #' memory. #' @export #' @return An matrixlike object of class `VectorSpaceModel` #' -read.vectors <- function(filename,vectors=guess_n_cols(),binary=FALSE,...) { - - if(rev(strsplit(filename,"\\.")[[1]])[1] =="bin") { +read.vectors <- function(filename,vectors=guess_n_cols(),binary=NULL,...) { + if(rev(strsplit(filename,"\\.")[[1]])[1] =="bin" && is.null(binary)) { message("Filename ends with .bin, so reading in binary format") binary=TRUE } @@ -234,14 +335,12 @@ read.vectors <- function(filename,vectors=guess_n_cols(),binary=FALSE,...) { #' @param cols The column numbers to read. Default is "All"; #' if you are in a memory-limited environment, #' you can limit the number of columns you read in by giving a vector of column integers -#' @param name_list A whitelist of words. If you wish to read in only a few dozen words, +#' @param rowname_list A whitelist of words. If you wish to read in only a few dozen words, #' all other rows will be skipped and only these read in. -#' @param name_regexp A regular expression specifying a pattern for rows to read in. Row +#' @param rowname_regexp A regular expression specifying a pattern for rows to read in. Row #' names matching that pattern will be included in the read; all others will be skipped. 
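+#' @examples
+#' \dontrun{
+#' # A hedged sketch, not run: "vectors.bin" stands in for a real binary model file.
+#' # word2vec output is frequency-sorted, so nrows = 500 keeps the 500 most common words.
+#' top_words = read.binary.vectors("vectors.bin", nrows = 500)
+#'
+#' # Read only rows whose names end in "ing".
+#' gerunds = read.binary.vectors("vectors.bin", rowname_regexp = "ing$")
+#' }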
#' @return A VectorSpaceModel object #' @export -#' -#' read.binary.vectors = function(filename,nrows=Inf,cols="All", rowname_list = NULL, rowname_regexp = NULL) { if (!is.null(rowname_list) && !is.null(rowname_regexp)) {stop("Specify a whitelist of names or a regular expression to be applied to all input, not both.")} @@ -277,7 +376,7 @@ read.binary.vectors = function(filename,nrows=Inf,cols="All", rowname_list = NUL returned_columns = col_number - if (is.integer(cols)) { + if (is.numeric(cols)) { returned_columns = length(cols) } @@ -295,7 +394,7 @@ read.binary.vectors = function(filename,nrows=Inf,cols="All", rowname_list = NUL } rownames[i] <<- rowname row = readBin(a,numeric(),size=4,n=col_number,endian="little") - if (is.integer(cols)) { + if (is.numeric(cols)) { return(row[cols]) } return(row) @@ -388,7 +487,12 @@ magnitudes <- function(matrix) { #' @return An object of the same class as matrix #' @export normalize_lengths = function(matrix) { - t(t(matrix)/magnitudes(matrix)) + + val = matrix/magnitudes(matrix) + if (inherits(val,"VectorSpaceModel")) { + val@.cache = new.env() + } + val } #' Reduce by rownames @@ -422,19 +526,28 @@ filter_to_rownames <- function(matrix,words) { #' @return A matrix. Rows correspond to entries in x; columns to entries in y. #' #' @examples +#' +#' # Inspect the similarity of several academic disciplines by hand. #' subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=FALSE]] #' similarities = cosineSimilarity(subjects,subjects) #' +#' # Use 'closest_to' to build up a large list of similar words to a seed set. #' subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=TRUE]] -#' new_subject_list = nearest_to(demo_vectors,subjects,20) -#' new_subjects = demo_vectors[[names(new_subject_list),average=FALSE]] +#' new_subject_list = closest_to(demo_vectors,subjects,20) +#' new_subjects = demo_vectors[[new_subject_list$word,average=FALSE]] +#' +#' # Plot the cosineDistance of these as a dendrogram. #' plot(hclust(as.dist(cosineDist(new_subjects,new_subjects)))) #' #' @export -cosineSimilarity <- function(x,y){ + +cosineSimilarity <- function(x,y) { # The most straightforward definition would be just: # x %*% t(y) / (sqrt(rowSums(x^2) %*% t(rowSums(y^2)))) - # However, we have to do a little type-checking and a few speedups. + # However, we do a little type-checking and a few speedups. + + # Allow non-referenced characters to refer to the original matrix. + y = sub_out_formula(y,x) if (!(is.matrix(x) || is.matrix(y))) { if (length(x)==length(y)) { @@ -458,10 +571,10 @@ cosineSimilarity <- function(x,y){ # triangles of a symmetrical matrix, I think. tcrossprod(x,y)/ (sqrt(tcrossprod(square_magnitudes(x),square_magnitudes(y)))) - # } + #' Cosine Distance #' @description Calculate the cosine distance between two vectors. #' @@ -472,7 +585,8 @@ cosineSimilarity <- function(x,y){ #' @param x A matrix, VectorSpaceModel, or vector. #' @param y A matrix, VectorSpaceModel, or vector. #' -#' @return A matrix whose dimnames are rownames(x), rownames(y) +#' @return A matrix whose dimnames are rownames(x), rownames(y) and whose entires are +#' the associated distance. #' #' @export cosineDist <- function(x,y) { @@ -482,9 +596,12 @@ cosineDist <- function(x,y) { #' Project each row of an input matrix along a vector. 
#' #' @param matrix A matrix or VectorSpaceModel -#' @param vector A vector (or an object coercable to a vector, see project) +#' @param vector A vector (or object coercable to a vector) #' of the same length as the VectorSpaceModel. #' +#' +#' @description As with 'cosineSimilarity +#' #' @return A new matrix or VectorSpaceModel of the same dimensions as `matrix`, #' each row of which is parallel to vector. #' @@ -496,12 +613,13 @@ cosineDist <- function(x,y) { project = function(matrix,vector) { # The matrix is a matrix: # b is a vector to reproject the matrix to be orthogonal to. + vector = sub_out_formula(vector,matrix) b = as.vector(vector) if (length(b)!=ncol(matrix)) { stop("The vector must be the same length as the matrix it is being compared to") } newmat = crossprod(t(matrix %*% b)/as.vector((b %*% b)) , b) - return(new("VectorSpaceModel",newmat)) + return(methods::new("VectorSpaceModel",newmat)) } #' Return a vector rejection for each element in a VectorSpaceModel @@ -519,9 +637,10 @@ project = function(matrix,vector) { #' See `project` for more details. #' #' @examples -#' nearest_to(demo_vectors,demo_vectors[["man"]]) +#' closest_to(demo_vectors,demo_vectors[["man"]]) +#' #' genderless = reject(demo_vectors,demo_vectors[["he"]] - demo_vectors[["she"]]) -#' nearest_to(genderless,genderless[["man"]]) +#' closest_to(genderless,genderless[["man"]]) #' #' @export reject = function(matrix,vector) { @@ -530,33 +649,136 @@ reject = function(matrix,vector) { return(val) } -#' Return the n closest words in a VectorSpaceModel to a given vector. + +#' Compress or expand a vector space model along a vector. #' #' @param matrix A matrix or VectorSpaceModel -#' @param vector Avector (or an object coercable to a vector, see project) +#' @param vector A vector (or an object coercable to a vector, see project) #' of the same length as the VectorSpaceModel. -#' @param n The number of closest words to include. +#' @param multiplier A scaling factor. See below. #' -#' @return A vector of distances, with names corresponding to the words -#' in the parent VectorSpaceModel, of length n. +#' @description This is an experimental function that might be useful sometimes. +#' 'Reject' flatly eliminates a particular dimension from a vectorspace, essentially +#' squashing out a single dimension; 'distend' gives finer grained control, making it +#' possible to stretch out or compress in the same space. High values of 'multiplier' +#' make a given vector more prominent; 1 keeps the original matrix untransformed; values +#' less than one compress distances along the vector; and 0 is the same as "reject," +#' eliminating a vector entirely. Values less than zero will do some type of mirror-image +#' universe thing, but probably aren't useful? +#' +#' +#' @return A new matrix or VectorSpaceModel of the same dimensions as `matrix`, +#' distended along the vector 'vector' by factor 'multiplier'. +#' +#' See `project` for more details and usage. #' #' @examples +#' closest_to(demo_vectors,"sweet") +#' +#' # Stretch out the vectorspace 4x longer along the gender direction. 
+#' more_sexist = distend(demo_vectors, ~ "man" + "he" - "she" -"woman", 4) #' -#' #Synonyms and similar words -#' nearest_to(demo_vectors,demo_vectors[["good"]]) +#' closest_to(more_sexist,"sweet") +#' +#' @export +distend = function(matrix,vector, multiplier) { + parallel_track = project(matrix,vector) + return(methods::new("VectorSpaceModel",matrix - parallel_track*(multiplier-1))) +} + +#' Return the n closest words in a VectorSpaceModel to a given vector. +#' +#' @param matrix A matrix or VectorSpaceModel +#' @param vector A vector (or a string or a formula coercable to a vector) +#' of the same length as the VectorSpaceModel. See below. +#' @param n The number of closest words to include. +#' @param fancy_names If true (the default) the data frame will have descriptive names like +#' 'similarity to "king+queen-man"'; otherwise, just 'similarity.' The default can speed up +#' interactive exploration. +#' +#' @return A sorted data.frame with columns for the words and their similarity +#' to the target vector. (Or, if as_df==FALSE, a named vector of similarities.) +#' +#' @description This is a convenience wrapper around the most common use of +#' 'cosineSimilarity'; the listing of several words similar to a given vector. +#' Unlike cosineSimilarity, it returns a data.frame object instead of a matrix. +#' cosineSimilarity is more powerful, because it can compare two matrices to +#' each other; closest_to can only take a vector or vectorlike object as its second argument. +#' But with (or without) the argument n=Inf, closest_to is often better for +#' plugging directly into a plot. +#' +#' As with cosineSimilarity, the second argument can take several forms. If it's a vector or +#' matrix slice, it will be taken literally. If it's a character string, it will +#' be interpreted as a word and the associated vector from `matrix` will be used. If +#' a formula, any strings in the formula will be converted to rows in the associated `matrix` +#' before any math happens. +#' +#' @examples +#' +#' # Synonyms and similar words +#' closest_to(demo_vectors,demo_vectors[["good"]]) +#' +#' # If 'matrix' is a VectorSpaceModel object, +#' # you can also just enter a string directly, and +#' # it will be evaluated in the context of the passed matrix. +#' +#' closest_to(demo_vectors,"good") +#' +#' # You can also express more complicated formulas. +#' +#' closest_to(demo_vectors,"good") #' #' # Something close to the classic king:man::queen:woman; #' # What's the equivalent word for a female teacher that "guy" is for #' # a male one? -#' nearest_to(demo_vectors,demo_vectors[["guy"]] - demo_vectors[["man"]] + demo_vectors[["woman"]]) +#' +#' closest_to(demo_vectors,~ "guy" - "man" + "woman") #' #' @export +closest_to = function(matrix, vector, n=10, fancy_names = TRUE) { + label = deparse(substitute(vector),width.cutoff=500) + if (substr(label,1,1)=="~") {label = substr(label,2,500)} + + # The actually wrapping. + sims = cosineSimilarity(matrix,vector) + + # Top n shouldn't be greater than the vocab length. + n = min(n,length(sims)) -nearest_to = function(matrix,vector,n=10) { - sims = cosineSimilarity(matrix,matrix(as.vector(vector),ncol=ncol(matrix))) + # For sorting. ords = order(-sims[,1]) - structure( - 1-sims[ords[1:n]], # Convert from similarity to distance. 
- names=rownames(sims)[ords[1:n]]) + + return_val = data.frame(rownames(sims)[ords[1:n]], sims[ords[1:n]],stringsAsFactors=FALSE) + if (fancy_names) { + names(return_val) = c("word", paste("similarity to", label)) + } else { + names(return_val) = c("word","similarity") + } + rownames(return_val) = NULL + return_val } + +#' Nearest vectors to a word +#' +#' @description This a wrapper around closest_to, included for back-compatibility. Use +#' closest_to for new applications. +#' @param ... See `closest_to` +#' +#' @return a names vector of cosine similarities. See 'nearest_to' for more details. +#' @export +#' +#' @examples +#' +#' # Recommended usage in 1.0: +#' nearest_to(demo_vectors, demo_vectors[["good"]]) +#' +#' # Recommended usage in 2.0: +#' demo_vectors %>% closest_to("good") +#' +nearest_to = function(...) { + vals = closest_to(...,fancy_names = F) + returnable = 1 - vals$similarity + names(returnable) = vals$word + returnable +} diff --git a/R/utils.R b/R/utils.R index 5ed31e7..ce0b79e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,12 +1,3 @@ -.is.word2vec <- function(obj) { - if (!inherits(obj, "word2vec")) return(FALSE) - if (!identical(class(obj$model_file), "character")) return(FALSE) - if (!file.exists(obj$model_file)) return(FALSE) - if (length(obj$model_file) > 1) return(FALSE) - if (file.info(obj$model_file)$isdir) return(FALSE) - return(TRUE) -} - - - - +#' @importFrom magrittr %>% +#' @export +magrittr::`%>%` diff --git a/R/word2vec.R b/R/word2vec.R index df12025..ac6a7f6 100644 --- a/R/word2vec.R +++ b/R/word2vec.R @@ -11,15 +11,22 @@ ##' @title Train a model by word2vec. ##' @param train_file Path of a single .txt file for training. Tokens are split on spaces. ##' @param output_file Path of the output file. -##' @param vectors The number of vectors to output. Defaults to 100. More vectors may be useful with large files. -##' @param threads Number of threads to run training process on. Defaults to 1; up to the number of cores on your machine may be useful. +##' @param vectors The number of vectors to output. Defaults to 100. +##' More vectors usually means more precision, but also more random error, higher memory usage, and slower operations. +##' Sensible choices are probably in the range 100-500. +##' @param threads Number of threads to run training process on. +##' Defaults to 1; up to the number of (virtual) cores on your machine may speed things up. ##' @param window The size of the window (in words) to use in training. ##' @param classes Number of classes for k-means clustering. Not documented/tested. -##' @param cbow If 1, use a continuous-bag-of-words model instead of skip-grams. Defaults to false (recommended for newcomers). -##' @param min_count Minimum times a word must appear to be included in the samples. High values help reduce model size. +##' @param cbow If 1, use a continuous-bag-of-words model instead of skip-grams. +##' Defaults to false (recommended for newcomers). +##' @param min_count Minimum times a word must appear to be included in the samples. +##' High values help reduce model size. ##' @param iter Number of passes to make over the corpus in training. -##' @param force Whether to overwrite existing files. -##' @return A word2vec object. +##' @param force Whether to overwrite existing model files. +##' @param negative_samples Number of negative samples to take in skip-gram training. 0 means full sampling, while lower numbers +##' give faster training. For large corpora 2-5 may work; for smaller corpora, 5-15 is reasonable. 
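+##' @details A hedged sketch of the new \code{negative_samples} argument in use
+##' (the file paths are illustrative; 5 is the default):
+##' \preformatted{
+##' # Fewer negative samples gives faster training on a large corpus.
+##' model = train_word2vec("big_corpus.txt", "big_corpus.bin", negative_samples = 3)
+##' }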
+##' @return A VectorSpaceModel object. ##' @author Jian Li <\email{rweibo@@sina.com}>, Ben Schmidt <\email{bmchmidt@@gmail.com}> ##' @references \url{https://code.google.com/p/word2vec/} ##' @export @@ -27,10 +34,10 @@ ##' @useDynLib wordVectors ##' ##' @examples \dontrun{ -##' model = word2vec(system.file("examples", "rfaq.txt", package = "tmcn.word2vec")) +##' model = train_word2vec(system.file("examples", "rfaq.txt", package = "wordVectors")) ##' } train_word2vec <- function(train_file, output_file = "vectors.bin",vectors=100,threads=1,window=12, - classes=0,cbow=0,min_count=5,iter=5,force=F) + classes=0,cbow=0,min_count=5,iter=5,force=F, negative_samples=5) { if (!file.exists(train_file)) stop("Can't find the training file!") if (file.exists(output_file) && !force) stop("The output file '", @@ -64,7 +71,8 @@ train_word2vec <- function(train_file, output_file = "vectors.bin",vectors=100,t classes=as.character(classes), cbow=as.character(cbow), min_count=as.character(min_count), - iter=as.character(iter) + iter=as.character(iter), + neg_samples=as.character(negative_samples) ) read.vectors(output_file) @@ -83,10 +91,6 @@ train_word2vec <- function(train_file, output_file = "vectors.bin",vectors=100,t #' @param origin A text file or a directory of text files #' to be used in training the model #' @param destination The location for output text. -#' @param split_characters If the 'stringi' package is not installed, -#' A list of characters that mark word breaks. By default, -#' any nonword characters according to the perl regex engine. If stringi is installed, -#' this parameter is ignored. #' @param lowercase Logical. Should uppercase characters be converted to lower? #' @param bundle_ngrams Integer. Statistically significant phrases of up to this many words #' will be joined with underscores: e.g., "United States" will usually be changed to "United_States" @@ -98,27 +102,13 @@ train_word2vec <- function(train_file, output_file = "vectors.bin",vectors=100,t #' @export #' #' @return The file name (silently). -prep_word2vec <- function(origin,destination, - split_characters="\\W",lowercase=F, - bundle_ngrams=1,...) +prep_word2vec <- function(origin,destination,lowercase=F, + bundle_ngrams=1, ...) { # strsplit chokes on large lines. I would not have gone down this path if I knew this # to begin with. - non_choking_strsplit <- function(lines,...) { - splitLineIfNecessary = function(line,limit=10000) { - # recursive function. - chars = nchar(line) - if (chars < limit) { - return(line) - } else { - first_half = substr(line,1,nchar(line) %/% 2) - second_half = substr(line,1,nchar(line) %/% 2) - return(c(splitLineIfNecessary(first_half),splitLineIfNecessary(second_half))) - } - } - lines = unlist(lapply(lines,splitLineIfNecessary)) - unlist(strsplit(lines,...)) - } + + message("Beginning tokenization to text file at ", destination) if (!exists("dir.exists")) { @@ -129,36 +119,37 @@ prep_word2vec <- function(origin,destination, stats::setNames(res, x) } } + if (dir.exists(origin)) { origin = list.files(origin,recursive=T,full.names = T) } - cat("",file=destination,append=F) + if (file.exists(destination)) file.remove(destination) - if (require(stringi)) { - using_stringi = TRUE - } else { - warning("Install the stringi package ('install.packages(\"stringi\")') for much more efficient word tokenization") + tokenize_words = function (x, lowercase = TRUE) { + # This is an abbreviated version of the "tokenizers" package version to remove the dependency. + # Sorry, Lincoln, it was failing some tests. 
+ if (lowercase) x <- stringi::stri_trans_tolower(x) + out <- stringi::stri_split_boundaries(x, type = "word", skip_word_none = TRUE) + unlist(out) } - for (filename in origin) { - message("\n",filename,appendLF=F) - con = file(filename,open="r") - while(length(lines <- readLines(con, n = 1000, warn = FALSE))>0) { - message(".",appendLF=F) - if(using_stringi) { - words = unlist(stri_extract_all_words(lines)) - } else { - words = non_choking_strsplit(lines,split_characters,perl=T) - } - if (lowercase) {words=tolower(words)} - cat(c(words," "),file=destination,append=T) - } - close(con) - cat(c("\n"),file=destination,append=T) + prep_single_file <- function(file_in, file_out, lowercase) { + message("Prepping ", file_in) + text <- file_in %>% + readr::read_file() %>% + tokenize_words(lowercase) %>% + stringr::str_c(collapse = " ") + + stopifnot(length(text) == 1) + readr::write_lines(text, file_out, append = TRUE) + return(TRUE) } + + Map(prep_single_file, origin, lowercase=lowercase, file_out=destination) + # Save the ultimate output real_destination_name = destination @@ -195,7 +186,10 @@ prep_word2vec <- function(origin,destination, #' @param min_count Minimum times a word must appear to be included in the samples. #' High values help reduce model size. #' @param threshold Threshold value for determining if pairs of words are phrases. +#' @param force Whether to overwrite existing files at the output location. Default FALSE +#' #' @return The name of output_file, the trained file where common phrases are now joined. +#' #' @export #' @examples #' \dontrun{ diff --git a/README.md b/README.md index 3d432ef..d146275 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,16 @@ An R package for building and exploring word embedding models. # Description -This package does three major things: +This package does three major things to make it easier to work with word2vec and other vectorspace models of language. -1. [Trains word2vec models](#creating-text-vectors) using an extended Jian Li's word2vec code; reads and writes the binary word2vec format so that you can import pre-trained models such as Google's; and provides tools for reading only *part* of a model so you can explore a model in memory-limited situations. -2. [Creates a new `VectorSpaceModel` class in R that gives a better syntax for exploring a word2vec or GloVe model than native matrix methods.](#vectorspacemodel-object) For example, instead of writing `model[rownames(model)=="king",]`, you can write `model[["king"]]`. +1. [Trains word2vec models](#creating-text-vectors) using an extended Jian Li's word2vec code; reads and writes the binary word2vec format so that you can import pre-trained models such as Google's; and provides tools for reading only *part* of a model (rows or columns) so you can explore a model in memory-limited situations. +2. [Creates a new `VectorSpaceModel` class in R that gives a better syntax for exploring a word2vec or GloVe model than native matrix methods.](#vectorspacemodel-object) For example, instead of writing `model[rownames(model)=="king",]`, you can write `model[["king"]]`, and instead of writing `vectors %>% closest_to(vectors[rownames(vectors)=="king",] - vectors[rownames(vectors)=="man",] + vectors[rownames(vectors)=="woman",])` (whew!), you can write +`vectors %>% closest_to(~"king" - "man" + "woman")`. 3. 
[Implements several basic matrix operations that are useful in exploring word embedding models including cosine similarity, nearest neighbor, and vector projection](#useful-matrix-operations) with some caching that makes them much faster than the simplest implementations. ### Quick start -For a step-by-step interactive demo that includes installation and training a model on 77 historical cookbooks from Michigan State University, [jump to the quick-start guide](#quick-start-1). +For a step-by-step interactive demo that includes installation and training a model on 77 historical cookbooks from Michigan State University, [see the introductory vignette.](https://github.com/bmschmidt/wordVectors/blob/master/vignettes/introduction.Rmd). ### Credit @@ -24,6 +25,10 @@ Right now, it [does not (I don't think) install under Windows 8](https://github. It's not extremely fast, but once the data is loaded in most operations happen in suitable time for exploratory data analysis (under a second on my laptop.) +For high-performance analysis of models, C or python's numpy/gensim will likely be better than this package, in part because R doesn't have support for single-precision floats. The goal of this package is to facilitate clear code and exploratory data analysis of models. + +Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. + ## Creating text vectors. One portion of this is an expanded version of the code from Jian Li's `word2vec` package with a few additional parameters enabled as the function `train_word2vec`. @@ -52,14 +57,15 @@ In this package, you can simply access it by using the double brace operators: vector_set[["king"]] - vector_set[["man"]] + vector_set[["woman"]] ``` +(And in the context of the custom functions, as a formula like `~"king" - "man" + "woman"`: see below). + Since frequently an average of two vectors provides a better indication, multiple words can be collapsed into a single vector by specifying multiple labels. For example, this may provide a slightly better gender vector: ```{r} vector_set[["king"]] - vector_set[[c("man","men")]] + vector_set[[c("woman","women")]] ``` -Sometimes you want to subset *without* averaging. You can do this with the argument `average==FALSE` to -the subset. +Sometimes you want to subset *without* averaging. You can do this with the argument `average==FALSE` to the subset. This is particularly useful for comparing slices of the matrix to itself in similarity operations. ```{r} cosineSimilarity(vector_set[[c("man","men","king"),average=F]], vector_set[[c("woman","women","queen"),average=F]] @@ -67,7 +73,8 @@ cosineSimilarity(vector_set[[c("man","men","king"),average=F]], vector_set[[c("w ## A few native functions defined on the VectorSpaceModel object. -The native `show` method just prints the dimensions; the native `print` method does some crazy reductions with the T-SNE package (installation required for functionality) because T-SNE is a nice way to reduce down the size of vectors. +The native `show` method just prints the dimensions; the native `plot` method does some crazy reductions with the T-SNE package (installation required for functionality) because T-SNE is a nice way to reduce down the size of vectors, **or** lets you pass `method='pca'` to array a full set or subset by the first two principal components. + ## Useful matrix operations @@ -78,10 +85,10 @@ Each takes a `VectorSpaceModel` as its first argument. 
Sometimes, it's appropria * `cosineSimilarity(VSM_1,VSM_2)` calculates the cosine similarity of every vector in on vector space model to every vector in another. This is `n^2` complexity. With a vocabulary size of 20,000 or so, it can be reasonable to compare an entire set to itself; or you can compare a larger set to a smaller one to search for particular terms of interest. * `cosineDistance(VSM_1,VSM_2)` is the inverse of cosineSimilarity. It's not really a distance metric, but can be used as one for clustering and the like. - * `nearest_to(VSM,vector,n)` wraps a particularly common use case for `cosineSimilarity`, of finding the top `n` terms in a `VectorSpaceModel` closest to term m + * `closest_to(VSM,vector,n)` wraps a particularly common use case for `cosineSimilarity`, of finding the top `n` terms in a `VectorSpaceModel` closest to term m * `project(VSM,vector)` takes a `VectorSpaceModel` and returns the portion parallel to the vector `vector`. * `reject(VSM,vector)` is the inverse of `project`; it takes a `VectorSpaceModel` and returns the portion orthogonal to the vector `vector`. This makes it possible, for example, to collapse a vector space by removing certain distinctions of meaning. - * `magnitudes` calculated the magnitude of each element in a VSM. This is useful in. + * `magnitudes` calculated the magnitude of each element in a VSM. This is useful in many operations. All of these functions place the VSM object as the first argument. This makes it easy to chain together operations using the `magrittr` package. For example, beginning with a single vector set one could find the nearest words in a set to a version of the vector for "bank" that has been decomposed to remove any semantic similarity to the banking sector. @@ -91,9 +98,18 @@ not_that_kind_of_bank = chronam_vectors[["bank"]] %>% reject(chronam_vectors[["cashier"]]) %>% reject(chronam_vectors[["depositors"]]) %>% reject(chronam_vectors[["check"]]) -chronam_vectors %>% nearest_to(not_that_kind_of_bank) +chronam_vectors %>% closest_to(not_that_kind_of_bank) ``` +These functions also allow an additional layer of syntactic sugar when working with word vectors. + +Or even just as a formula, if you're working entirely with a single model, so you don't have to keep referring to words; instead, you can use a formula interface to reduce typing and increase clarity. + +```{r} +vectors %>% closest_to(~ "king" - "man" + "woman") +``` + + # Quick start ## Install the wordVectors package. @@ -109,72 +125,14 @@ One of the major hurdles to running word2vec for ordinary people is that it requ 4. Install the latest version of this package from Github by pasting in the following. ```R - library(devtools) - install_github("bmschmidt/wordVectors") + devtools::install_github("bmschmidt/wordVectors") ``` Windows users may need to install "Rtools" as well: if so, a message to this effect should appear in red on the screen. This may cycle through a very large number of warnings: so long as it says "warning" and not "error", you're probably OK. -## Testing the setup - -We'll test the setup by running a complete VSM. First, download and extract a zip file of cookbooks from the MSU library by pasting the following lines. - -```{r} -if (!file.exists("cookbooks.zip")) { - download.file("http://archive.lib.msu.edu/dinfo/feedingamerica/cookbook_text.zip","cookbooks.zip") -} -unzip("cookbooks.zip",exdir="cookbooks") -``` - -Then load the wordVectors package you have already installed. 
-```{r} -library(wordVectors) -``` - -Next, we build a single text file consisting of all the cookbooks converted to lowercase with punctuation removed. - -**Note**: this `prep_word2vec` function is *extremely* inefficient compared to text parsing functions written in python or sed or pretty much any language you can think of. I'm only including it for Windows compatibility of examples and non-programmers. If you know how to create a file with punctuation already stripped or separated any other way, I **strongly** recommend doing it that way. But if you're working with a few hundred documents, this will get the job done, slowly. On the cookbooks, it should take a couple minutes. (For reference: in a console, `perl -pe 's/[^A-Za-z_0-9 \n]/ /g;' cookbooks/* > cookbooks.txt` will do the same thing in a couple *seconds*. Seriously, I have no idea how to write fast R text-parsing code.) - -```{r} -prep_word2vec("cookbooks","cookbooks.txt",lowercase=T) -``` - -Now we *train* the model. This can take quite a while. In RStudio I've noticed that this appears to hang, but if you check processors it actually still runs. Try it on smaller portions first, and then let it take time: the training function can take hours for tens of thousands of books. - -The 'threads' parameter is the number of processors to use on your computer. - -```{r} -model = train_word2vec("cookbooks.txt",output="cookbook_vectors.bin",threads = 3,vectors = 100,window=12) -``` - -* NOTE: If at any point you want to *read in* a previously trained model, you can do so by typing `model = read.vectors("cookbook_vectors.bin")` - -Now we have a model in memory, trained on about 10 million words from 77 cookbooks. What can it tell us about food? - -Well, you can run some basic operations to find the nearest elements: - -```{r} -nearest_to(model,model[["fish"]]) -``` - -With that list, you can expand out further to search for multiple words: - -```{r} -nearest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50) -``` - -Now we have a pretty expansive list of potential fish-related words from old cookbooks. This may be useful for something in real life. +## Train a model. -Or we can just arrange them somehow. If you have the tsne package installed, (type `install.packages("tsne")` to download it), you can plot these words in a reduced dimensional space. In this case, it doesn't look like much of anything. +For instructions on training, see the [introductory vignette](https://github.com/bmschmidt/wordVectors/blob/master/vignettes/introduction.Rmd) -```{r} -some_fish = nearest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50) -plot(filter_to_rownames(model,names(some_fish))) -``` - -But this set actually gives a fairly nicely clustered set of results if you plot the top words in the whole thing. - -```{r} -plot(model) -``` +## Explore an existing model. -There's a lot of other stuff you can do besides just measuring nearness: you can do analogies, projection, and more complicated plots. But for that you should read my blog posts on this. 
+For instructions on exploration, see the end of the [introductory vignette](https://github.com/bmschmidt/wordVectors/blob/master/vignettes/introduction.Rmd), or the slower-paced [vignette on exploration](https://github.com/bmschmidt/wordVectors/blob/master/vignettes/exploration.Rmd) diff --git a/data/demo_vectors.rda b/data/demo_vectors.rda index e489a0e..02ed604 100644 Binary files a/data/demo_vectors.rda and b/data/demo_vectors.rda differ diff --git a/inst/doc/exploration.R b/inst/doc/exploration.R new file mode 100644 index 0000000..07b42b9 --- /dev/null +++ b/inst/doc/exploration.R @@ -0,0 +1,64 @@ +## ------------------------------------------------------------------------ +library(wordVectors) +library(magrittr) + +## ------------------------------------------------------------------------ +demo_vectors[["good"]] + +## ------------------------------------------------------------------------ +demo_vectors %>% closest_to(demo_vectors[["good"]]) + +## ------------------------------------------------------------------------ +demo_vectors %>% closest_to("bad") + +## ------------------------------------------------------------------------ + +demo_vectors %>% closest_to(~"good"+"bad") + +# The same thing could be written as: +# demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]]) + +## ------------------------------------------------------------------------ +demo_vectors %>% closest_to(~"good" - "bad") + +## ------------------------------------------------------------------------ +demo_vectors %>% closest_to(~ "bad" - "good") + +## ------------------------------------------------------------------------ +demo_vectors %>% closest_to(~ "he" - "she") +demo_vectors %>% closest_to(~ "she" - "he") + +## ------------------------------------------------------------------------ +demo_vectors %>% closest_to(~ "guy" - "he" + "she") + +## ------------------------------------------------------------------------ +demo_vectors %>% closest_to(~ "guy" + ("she" - "he")) + +## ------------------------------------------------------------------------ + +demo_vectors[[c("lady","woman","man","he","she","guy","man"), average=F]] %>% + plot(method="pca") + + +## ------------------------------------------------------------------------ +top_evaluative_words = demo_vectors %>% + closest_to(~ "good"+"bad",n=75) + +goodness = demo_vectors %>% + closest_to(~ "good"-"bad",n=Inf) + +femininity = demo_vectors %>% + closest_to(~ "she" - "he", n=Inf) + +## ------------------------------------------------------------------------ +library(ggplot2) +library(dplyr) + +top_evaluative_words %>% + inner_join(goodness) %>% + inner_join(femininity) %>% + ggplot() + + geom_text(aes(x=`similarity to "she" - "he"`, + y=`similarity to "good" - "bad"`, + label=word)) + diff --git a/inst/doc/exploration.Rmd b/inst/doc/exploration.Rmd new file mode 100644 index 0000000..fd056b8 --- /dev/null +++ b/inst/doc/exploration.Rmd @@ -0,0 +1,168 @@ +--- +title: "Word2Vec Workshop" +author: "Ben Schmidt" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Vignette Title} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +# Exploring Word2Vec models + +R is a great language for *exploratory data analysis* in particular. If you're going to use a word2vec model in a larger pipeline, it may be important (intellectually or ethically) to spend a little while understanding what kind of model of language you've learned. 
+ +This package makes it easy to do so, both by allowing you to read word2vec models to and from R, and by giving some syntactic sugar that lets you describe vector-space models concisely and clearly. + +Note that these functions may still be useful if you're a data analyst training word2vec models elsewhere (say, in gensim.) I'm also hopeful this can be a good way of interacting with varied vector models in a workshop session. + +If you want to train your own model or need help setting up the package, read the introductory vignette. Aside from the installation, it assumes more knowledge of R than this walkthrough. + +## Why explore? + +In this vignette we're going to look at (a small portion of) a model trained on teaching evaluations. It's an interesting set, but it's also one that shows the importance of exploring vector space models before you use them. Exploration is important because: + +1. If you're a humanist or social scientist, it can tell you something about the *sources* by letting you see how they use language. These co-occurrence patterns can then be better investigated through close reading or more traditional collocation scores, which potentially more reliable but also much slower and less flexible. + +2. If you're an engineer, it can help you understand some of biases built into a model that you're using in a larger pipeline. This can be both technically and ethically important: you don't want, for instance, to build a job-recommendation system which is disinclined to offer programming jobs to women because it has learned that women are unrepresented in CS jobs already. +(On this point in word2vec in particular, see [here](https://freedom-to-tinker.com/blog/randomwalker/language-necessarily-contains-human-biases-and-so-will-machines-trained-on-language-corpora/) and [here](https://arxiv.org/abs/1607.06520).) + +## Getting started. + +First we'll load this package, and the recommended package `magrittr`, which lets us pass these arguments around. + +```{r} +library(wordVectors) +library(magrittr) +``` + +The basic element of any vector space model is a *vectors.* for each word. In the demo data included with this package, an object called 'demo_vectors,' there are 500 numbers: you can start to examine them, if you with, by hand. So let's consider just one of these--the vector for 'good'. + +In R's ordinary matrix syntax, you could write that out laboriously as `demo_vectors[rownames(demo_vectors)=="good",]`. `WordVectors` provides a shorthand using double braces: + +```{r} +demo_vectors[["good"]] +``` + +These numbers are meaningless on their own. But in the vector space, we can find similar words. + +```{r} +demo_vectors %>% closest_to(demo_vectors[["good"]]) +``` + +The `%>%` is the pipe operator from magrittr; it helps to keep things organized, and is particularly useful with some of the things we'll see later. The 'similarity' scores here are cosine similarity in a vector space; 1.0 represents perfect similarity, 0 is no correlation, and -1.0 is complete opposition. In practice, vector "opposition" is different from the colloquial use of "opposite," and very rare. You'll only occasionally see vector scores below 0--as you can see above, "bad" is actually one of the most similar words to "good." + +When interactively exploring a single model (rather than comparing *two* models), it can be a pain to keep retyping words over and over. 
Rather than operate on the vectors, this package also lets you access the word directly by using R's formula notation: putting a tilde in front of it. For a single word, you can even access it directly, as so. + +```{r} +demo_vectors %>% closest_to("bad") +``` + +## Vector math + +The tildes are necessary syntax where things get interesting--you can do **math** on these vectors. So if we want to find the words that are closest to the *combination* of "good" and "bad" (which is to say, words that get used in evaluation) we can write (see where the tilde is?): + +```{r} + +demo_vectors %>% closest_to(~"good"+"bad") + +# The same thing could be written as: +# demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]]) +``` + +Those are words that are common to both "good" and "bad". We could also find words that are shaded towards just good but *not* bad by using subtraction. + +```{r} +demo_vectors %>% closest_to(~"good" - "bad") +``` + +> What does this "subtraction" vector mean? +> In practice, the easiest way to think of it is probably simply as 'similar to +> good and dissimilar to 'bad'. Omer and Levy's papers suggest this interpretation. +> But taking the vectors more seriously means you can think of it geometrically: "good"-"bad" is +> a vector that describes the difference between positive and negative. +> Similarity to this vector means, technically, the portion of a words vectors whose +> whose multidimensional path lies largely along the direction between the two words. + +Again, you can easily switch the order to the opposite: here are a bunch of bad words: + +```{r} +demo_vectors %>% closest_to(~ "bad" - "good") +``` + +All sorts of binaries are captured in word2vec models. One of the most famous, since Mikolov's original word2vec paper, is *gender*. If you ask for similarity to "he"-"she", for example, you get words that appear mostly in a *male* context. Since these examples are from teaching evaluations, after just a few straightforwardly gendered words, we start to get things that only men are ("arrogant") or where there are very few women in the university ("physics") + +```{r} +demo_vectors %>% closest_to(~ "he" - "she") +demo_vectors %>% closest_to(~ "she" - "he") +``` + +## Analogies + +We can expand out the match to perform analogies. Men tend to be called 'guys'. +What's the female equivalent? +In an SAT-style analogy, you might write `he:guy::she:???`. +In vector math, we think of this as moving between points. + +If you're using the mental framework of positive of 'similarity' and +negative as 'dissimilarity,' you can think of this as starting at "guy", +removing its similarity to "he", and additing a similarity to "she". + +This yields the answer: the most similar term to "guy" for a woman is "lady." + +```{r} +demo_vectors %>% closest_to(~ "guy" - "he" + "she") +``` + +If you're using the other mental framework, of thinking of these as real vectors, +you might phrase this in a slightly different way. +You have a gender vector `("female" - "male")` that represents the *direction* of masculinity +to femininity. You can then add this vector to "guy", and that will take you to a new neighborhood. You might phrase that this way: note that the math is exactly equivalent, and +only the grouping is different. + +```{r} +demo_vectors %>% closest_to(~ "guy" + ("she" - "he")) +``` + +Principal components can let you plot a subset of these vectors to see how they relate. 
You can imagine an arrow from "he" to "she", from "guy" to "lady", and from "man" to "woman"; all run in roughly the same direction. + +```{r} + +demo_vectors[[c("lady","woman","man","he","she","guy","man"), average=F]] %>% + plot(method="pca") + +``` + +These lists of ten words at a time are useful for interactive exploration, but sometimes we might want to say 'n=Inf' to return the full list. For instance, we can combine these two methods to look at positive and negative words used to evaluate teachers. + +First we build up three data_frames: first, a list of the 50 top evaluative words, and then complete lists of similarity to `"good" -"bad"` and `"woman" - "man"`. + +```{r} +top_evaluative_words = demo_vectors %>% + closest_to(~ "good"+"bad",n=75) + +goodness = demo_vectors %>% + closest_to(~ "good"-"bad",n=Inf) + +femininity = demo_vectors %>% + closest_to(~ "she" - "he", n=Inf) +``` + +Then we can use tidyverse packages to join and plot these. +An `inner_join` restricts us down to just those top 50 words, and ggplot +can array the words on axes. + +```{r} +library(ggplot2) +library(dplyr) + +top_evaluative_words %>% + inner_join(goodness) %>% + inner_join(femininity) %>% + ggplot() + + geom_text(aes(x=`similarity to "she" - "he"`, + y=`similarity to "good" - "bad"`, + label=word)) +``` + diff --git a/inst/doc/exploration.html b/inst/doc/exploration.html new file mode 100644 index 0000000..eda1e1d --- /dev/null +++ b/inst/doc/exploration.html @@ -0,0 +1,278 @@ + + + + + + + + + + + + + + + + +Word2Vec Workshop + + + + + + + + + + + + + + + + + +

Word2Vec Workshop

+

Ben Schmidt

+

2017-02-21

+ + + +
+

Exploring Word2Vec models

+

R is a great language for exploratory data analysis in particular. If you’re going to use a word2vec model in a larger pipeline, it may be important (intellectually or ethically) to spend a little while understanding what kind of model of language you’ve learned.

+

This package makes it easy to do so, both by allowing you to read word2vec models to and from R, and by giving some syntactic sugar that lets you describe vector-space models concisely and clearly.

+

Note that these functions may still be useful if you’re a data analyst training word2vec models elsewhere (say, in gensim.) I’m also hopeful this can be a good way of interacting with varied vector models in a workshop session.

+
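For instance, vectors trained somewhere else can be pulled into R with `read.vectors` and then explored with all of the same functions. A minimal sketch (the file name is hypothetical, and the file is assumed to be in the standard word2vec binary format, which gensim can export):

```r
library(wordVectors)
library(magrittr)

# Hypothetical file: any vectors saved in the standard word2vec binary
# format can be read this way.
external <- read.vectors("my_gensim_vectors.bin")

# Once loaded, the model behaves just like demo_vectors below.
external %>% closest_to("good")
```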

If you want to train your own model or need help setting up the package, read the introductory vignette. Aside from the installation, it assumes more knowledge of R than this walkthrough.

+
+

Why explore?

+

In this vignette we’re going to look at (a small portion of) a model trained on teaching evaluations. It’s an interesting set, but it’s also one that shows the importance of exploring vector space models before you use them. Exploration is important because:

+
  1. If you’re a humanist or social scientist, it can tell you something about the sources by letting you see how they use language. These co-occurrence patterns can then be better investigated through close reading or more traditional collocation scores, which are potentially more reliable but also much slower and less flexible.

  2. If you’re an engineer, it can help you understand some of the biases built into a model that you’re using in a larger pipeline. This can be both technically and ethically important: you don’t want, for instance, to build a job-recommendation system which is disinclined to offer programming jobs to women because it has learned that women are underrepresented in CS jobs already. (On this point in word2vec in particular, see here and here.)
+
+
+

Getting started.

+

First we’ll load this package, and the recommended package magrittr, which lets us pass these arguments around.

+
library(wordVectors)
+library(magrittr)
+

The basic element of any vector space model is a vector for each word. In the demo data included with this package, an object called ‘demo_vectors,’ each word is represented by 500 numbers: you can start to examine them, if you wish, by hand. So let’s consider just one of these–the vector for ‘good’.

+

In R’s ordinary matrix syntax, you could write that out laboriously as demo_vectors[rownames(demo_vectors)=="good",]. WordVectors provides a shorthand using double braces:

+
demo_vectors[["good"]]
+
## A VectorSpaceModel object of  1  words and  500  vectors
+##         V1    V2     V3     V4    V5    V6
+## [1,] 0.381 0.026 -0.006 -0.004 -0.09 0.075
+## attr(,".cache")
+## <environment: 0xae47980>
+

These numbers are meaningless on their own. But in the vector space, we can find similar words.

+
demo_vectors %>% closest_to(demo_vectors[["good"]])
+
##         word similarity to demo_vectors[["good"]]
+## 1       good                            1.0000000
+## 2      great                            0.7089031
+## 3     decent                            0.6604300
+## 4        bad                            0.5263100
+## 5     really                            0.5206047
+## 6  excellent                            0.5196016
+## 7       nice                            0.5058916
+## 8        but                            0.5056037
+## 9        and                            0.4916693
+## 10         a                            0.4893531
+

The %>% is the pipe operator from magrittr; it helps to keep things organized, and is particularly useful with some of the things we’ll see later. The ‘similarity’ scores here are cosine similarity in a vector space; 1.0 represents perfect similarity, 0 is no correlation, and -1.0 is complete opposition. In practice, vector “opposition” is different from the colloquial use of “opposite,” and very rare. You’ll only occasionally see vector scores below 0–as you can see above, “bad” is actually one of the most similar words to “good.”

+
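For intuition: the similarity score is just the cosine of the angle between two vectors, i.e. their dot product divided by the product of their lengths. A minimal sketch of that arithmetic with two made-up numeric vectors (not package code, just base R):

```r
# Two made-up vectors in a three-dimensional space.
a <- c(1, 2, 0)
b <- c(2, 4, 1)

# Cosine similarity: dot product over the product of the vector lengths.
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
# ~0.98: the vectors point in nearly the same direction.
```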

When interactively exploring a single model (rather than comparing two models), it can be a pain to keep retyping words over and over. Rather than operate on the vectors, this package also lets you access the word directly by using R’s formula notation: putting a tilde in front of it. For a single word, you can even access it directly, as so.

+
demo_vectors %>% closest_to("bad")
+
##         word similarity to "bad"
+## 1        bad           1.0000000
+## 2       good           0.5263100
+## 3       hard           0.4661783
+## 4   terrible           0.4640076
+## 5     either           0.4545856
+## 6   horrible           0.4431282
+## 7         ok           0.4081203
+## 8  difficult           0.4038597
+## 9        but           0.3964500
+## 10  honestly           0.3904238
+
+
+

Vector math

+

The tildes are necessary syntax where things get interesting–you can do math on these vectors. So if we want to find the words that are closest to the combination of “good” and “bad” (which is to say, words that get used in evaluation) we can write (see where the tilde is?):

+
demo_vectors %>% closest_to(~"good"+"bad")
+
##      word similarity to "good" + "bad"
+## 1     bad                    0.8845830
+## 2    good                    0.8621269
+## 3   great                    0.5917829
+## 4  decent                    0.5893969
+## 5    hard                    0.5362420
+## 6     but                    0.5135680
+## 7  really                    0.5025217
+## 8    nice                    0.5004618
+## 9      ok                    0.4751181
+## 10   that                    0.4692515
+
# The same thing could be written as:
+# demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]])
+

Those are words that are common to both “good” and “bad”. We could also find words that are shaded towards just good but not bad by using subtraction.

+
demo_vectors %>% closest_to(~"good" - "bad")
+
##         word similarity to "good" - "bad"
+## 1       good                    0.4205466
+## 2      great                    0.3328308
+## 3  excellent                    0.3093233
+## 4     decent                    0.2418898
+## 5  fantastic                    0.2168332
+## 6   thorough                    0.2148802
+## 7  wonderful                    0.2093082
+## 8        and                    0.1995122
+## 9       very                    0.1979586
+## 10   awesome                    0.1975041
+
+

What does this “subtraction” vector mean? In practice, the easiest way to think of it is probably simply as ‘similar to “good” and dissimilar to “bad”’. Levy and Goldberg’s papers suggest this interpretation. But taking the vectors more seriously means you can think of it geometrically: “good”-“bad” is a vector that describes the difference between positive and negative. Similarity to this vector means, technically, the portion of a word’s vector whose multidimensional path lies largely along the direction between the two words.

+
+
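The formula is just shorthand for building that difference vector yourself. A sketch of the same query written out explicitly (assuming `demo_vectors` is loaded, as above):

```r
# Build the "positive minus negative" direction explicitly...
evaluative <- demo_vectors[["good"]] - demo_vectors[["bad"]]

# ...and ask for the words closest to it, exactly as the formula does.
demo_vectors %>% closest_to(evaluative)

# The similarity of a single word to that direction can also be checked directly.
cosineSimilarity(demo_vectors[["excellent"]], evaluative)
```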

Again, you can easily switch the order to the opposite: here are a bunch of bad words:

+
demo_vectors %>% closest_to(~ "bad" - "good")
+
##        word similarity to "bad" - "good"
+## 1       bad                    0.5501080
+## 2    either                    0.2372618
+## 3     awful                    0.1834758
+## 4     worse                    0.1832953
+## 5    stupid                    0.1745834
+## 6  terrible                    0.1626507
+## 7  horrible                    0.1536159
+## 8  honestly                    0.1466926
+## 9      dumb                    0.1455157
+## 10   unfair                    0.1449083
+

All sorts of binaries are captured in word2vec models. One of the most famous, since Mikolov’s original word2vec paper, is gender. If you ask for similarity to “he”-“she”, for example, you get words that appear mostly in a male context. Since these examples are from teaching evaluations, after just a few straightforwardly gendered words, we start to get things that only men are called (“arrogant”) or fields where there are very few women in the university (“physics”).

+
demo_vectors %>% closest_to(~ "he" - "she")
+
##        word similarity to "he" - "she"
+## 1        he                  0.5014923
+## 2       his                  0.4467857
+## 3       guy                  0.4179970
+## 4       hes                  0.4049624
+## 5       him                  0.3907059
+## 6        mr                  0.3827611
+## 7       man                  0.3713098
+## 8   himself                  0.3436856
+## 9  arrogant                  0.1662236
+## 10  physics                  0.1560129
+
demo_vectors %>% closest_to(~ "she" - "he")
+
##       word similarity to "she" - "he"
+## 1      she                  0.5749598
+## 2      her                  0.5707957
+## 3     lady                  0.5067850
+## 4     shes                  0.5050173
+## 5    woman                  0.4741360
+## 6  herself                  0.4294012
+## 7       ms                  0.3842313
+## 8      mrs                  0.3745640
+## 9    sweet                  0.2067263
+## 10  person                  0.1084187
+
+
+

Analogies

+

We can expand out the match to perform analogies. Men tend to be called ‘guys’. What’s the female equivalent? In an SAT-style analogy, you might write he:guy::she:???. In vector math, we think of this as moving between points.

+

If you’re using the mental framework of positive as ‘similarity’ and negative as ‘dissimilarity,’ you can think of this as starting at “guy”, removing its similarity to “he”, and adding a similarity to “she”.

+

This yields the answer: the most similar term to “guy” for a woman is “lady.”

+
demo_vectors %>% closest_to(~ "guy" - "he" + "she")
+
##       word similarity to "guy" - "he" + "she"
+## 1     lady                          0.8851965
+## 2    woman                          0.7777516
+## 3      she                          0.7025325
+## 4     shes                          0.6502704
+## 5      her                          0.6421576
+## 6      guy                          0.5533376
+## 7   person                          0.5437728
+## 8       ms                          0.4703695
+## 9  herself                          0.4589193
+## 10     mrs                          0.4508955
+

If you’re using the other mental framework, of thinking of these as real vectors, you might phrase this in a slightly different way. You have a gender vector ("female" - "male") that represents the direction of masculinity to femininity. You can then add this vector to “guy”, and that will take you to a new neighborhood. You might phrase that this way: note that the math is exactly equivalent, and only the grouping is different.

+
demo_vectors %>% closest_to(~ "guy" + ("she" - "he"))
+
##       word similarity to "guy" + ("she" - "he")
+## 1     lady                            0.8851965
+## 2    woman                            0.7777516
+## 3      she                            0.7025325
+## 4     shes                            0.6502704
+## 5      her                            0.6421576
+## 6      guy                            0.5533376
+## 7   person                            0.5437728
+## 8       ms                            0.4703695
+## 9  herself                            0.4589193
+## 10     mrs                            0.4508955
+

Principal components can let you plot a subset of these vectors to see how they relate. You can imagine an arrow from “he” to “she”, from “guy” to “lady”, and from “man” to “woman”; all run in roughly the same direction.

+
demo_vectors[[c("lady","woman","man","he","she","guy","man"), average=F]] %>% 
+  plot(method="pca")
+

+

These lists of ten words at a time are useful for interactive exploration, but sometimes we might want to say ‘n=Inf’ to return the full list. For instance, we can combine these two methods to look at positive and negative words used to evaluate teachers.

+

First we build up three data frames: a list of the top 75 evaluative words, and then complete lists of similarity to "good" - "bad" and "she" - "he".

+
top_evaluative_words = demo_vectors %>% 
+   closest_to(~ "good"+"bad",n=75)
+
+goodness = demo_vectors %>% 
+  closest_to(~ "good"-"bad",n=Inf) 
+
+femininity = demo_vectors %>% 
+  closest_to(~ "she" - "he", n=Inf)
+

Then we can use tidyverse packages to join and plot these. An inner_join restricts us down to just those top 75 words, and ggplot can array the words on axes.

+
library(ggplot2)
+library(dplyr)
+
+top_evaluative_words %>%
+  inner_join(goodness) %>%
+  inner_join(femininity) %>%
+  ggplot() + 
+  geom_text(aes(x=`similarity to "she" - "he"`,
+                y=`similarity to "good" - "bad"`,
+                label=word))
+
## Joining, by = "word"
+## Joining, by = "word"
+

+
+
+ + + + + + + + diff --git a/inst/doc/introduction.R b/inst/doc/introduction.R new file mode 100644 index 0000000..8c030f0 --- /dev/null +++ b/inst/doc/introduction.R @@ -0,0 +1,101 @@ +## ------------------------------------------------------------------------ +if (!require(wordVectors)) { + if (!(require(devtools))) { + install.packages("devtools") + } + devtools::install_github("bmschmidt/wordVectors") +} + + + +## ------------------------------------------------------------------------ +library(wordVectors) +library(magrittr) + +## ------------------------------------------------------------------------ +if (!file.exists("cookbooks.zip")) { + download.file("http://archive.lib.msu.edu/dinfo/feedingamerica/cookbook_text.zip","cookbooks.zip") +} +unzip("cookbooks.zip",exdir="cookbooks") + +## ------------------------------------------------------------------------ +if (!file.exists("cookbooks.txt")) prep_word2vec(origin="cookbooks",destination="cookbooks.txt",lowercase=T,bundle_ngrams=2) + +## ------------------------------------------------------------------------ +if (!file.exists("cookbook_vectors.bin")) {model = train_word2vec("cookbooks.txt","cookbook_vectors.bin",vectors=200,threads=4,window=12,iter=5,negative_samples=0)} else model = read.vectors("cookbook_vectors.bin") + + +## ------------------------------------------------------------------------ +model %>% closest_to("fish") + +## ------------------------------------------------------------------------ +model %>% + closest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50) + +## ------------------------------------------------------------------------ +some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150) +fishy = model[[some_fish$word,average=F]] +plot(fishy,method="pca") + +## ------------------------------------------------------------------------ +set.seed(10) +centers = 150 +clustering = kmeans(model,centers=centers,iter.max = 40) + +## ------------------------------------------------------------------------ +sapply(sample(1:centers,10),function(n) { + names(clustering$cluster[clustering$cluster==n][1:10]) +}) + +## ------------------------------------------------------------------------ +ingredients = c("madeira","beef","saucepan","carrots") +term_set = lapply(ingredients, + function(ingredient) { + nearest_words = model %>% closest_to(model[[ingredient]],20) + nearest_words$word + }) %>% unlist + +subset = model[[term_set,average=F]] + +subset %>% + cosineDist(subset) %>% + as.dist %>% + hclust %>% + plot + + +## ------------------------------------------------------------------------ +tastes = model[[c("sweet","salty"),average=F]] + +# model[1:3000,] here restricts to the 3000 most common words in the set. +sweet_and_saltiness = model[1:3000,] %>% cosineSimilarity(tastes) + +# Filter to the top 20 sweet or salty. +sweet_and_saltiness = sweet_and_saltiness[ + rank(-sweet_and_saltiness[,1])<20 | + rank(-sweet_and_saltiness[,2])<20, + ] + +plot(sweet_and_saltiness,type='n') +text(sweet_and_saltiness,labels=rownames(sweet_and_saltiness)) + + +## ------------------------------------------------------------------------ + +tastes = model[[c("sweet","salty","savory","bitter","sour"),average=F]] + +# model[1:3000,] here restricts to the 3000 most common words in the set. 
+common_similarities_tastes = model[1:3000,] %>% cosineSimilarity(tastes) + +common_similarities_tastes[20:30,] + +## ------------------------------------------------------------------------ +high_similarities_to_tastes = common_similarities_tastes[rank(-apply(common_similarities_tastes,1,max)) < 75,] + +high_similarities_to_tastes %>% + prcomp %>% + biplot(main="Fifty words in a\nprojection of flavor space") + +## ------------------------------------------------------------------------ +plot(model,perplexity=50) + diff --git a/vignettes/training.Rmd b/inst/doc/introduction.Rmd similarity index 68% rename from vignettes/training.Rmd rename to inst/doc/introduction.Rmd index df92a9b..6f1185b 100644 --- a/vignettes/training.Rmd +++ b/inst/doc/introduction.Rmd @@ -11,29 +11,29 @@ vignette: > # Intro -This vignette walks you through training a word2vec model, and using that model to search for similarities, to build clusters, and to visualize vocabulary relationships of that model in two dimensions. +This vignette walks you through training a word2vec model, and using that model to search for similarities, to build clusters, and to visualize vocabulary relationships of that model in two dimensions. If you are working with pre-trained vectors, you might want to jump straight to the "exploration" vignette; it is a little slower-paced, but doesn't show off quite so many features of the package. # Package installation -If you have not installed this package, paste the below. +If you have not installed this package, paste the below. More detailed installation instructions are at the end of the [package README](https://github.com/bmschmidt/wordVectors). ```{r} -if (!require(wordVectors) { +if (!require(wordVectors)) { if (!(require(devtools))) { install.packages("devtools") } devtools::install_github("bmschmidt/wordVectors") -}) +} ``` # Building test data -We begin by importing the word2vec library and the `maggritr` package, because its pipe operator makes things much clearer. +We begin by importing the `wordVectors` package and the `magrittr` package, because its pipe operator makes it easier to work with data. ```{r} -library(word2vec) +library(wordVectors) library(magrittr) ``` @@ -59,23 +59,25 @@ You can also do this in another language: particularly for large files, that wil ```{r} -prep_word2vec(origin="cookbooks",destination="cookbooks.txt",lowercase=T,bundle_ngrams=2) +if (!file.exists("cookbooks.txt")) prep_word2vec(origin="cookbooks",destination="cookbooks.txt",lowercase=T,bundle_ngrams=2) ``` To train a word2vec model, use the function `train_word2vec`. This actually builds up the model. It uses an on-disk file as an intermediary and then reads that file into memory. ```{r} -model = train_word2vec("cookbooks.txt","cookbook_vectors.bin",vectors=200,threads=4,window=12,iter=5) +if (!file.exists("cookbook_vectors.bin")) {model = train_word2vec("cookbooks.txt","cookbook_vectors.bin",vectors=200,threads=4,window=12,iter=5,negative_samples=0)} else model = read.vectors("cookbook_vectors.bin") + ``` A few notes: -1. The 'threads' parameter is the number of processors to use on your computer. On a modern laptop, up to 8 threads can be useful. -2. `iter` is how many times to read through the corpus. With fewer than 100 books, it can greatly help to increase the number of passes. -3. Training can take a while. On my laptop, it takes a few minutes to train these cookbooks; larger models (on tens of thousands of books) can take longer. -4. 
One of the best things about the word2vec algorithm is that it *does* work on extremely large corpora in linear time. -5. In RStudio I've noticed that this sometimes appears to hang after a while; the percentage bar stops updating. If you check system activity it actually is still running, and will complete. -6. If at any point you want to *read in* a previously trained model, you can do so by typing `model = read.vectors("cookbook_vectors.bin")`. +1. The `vectors` parameter is the dimensionality of the representation. More vectors usually means more precision, but also more random error and slower operations. Likely choices are probably in the range 100-500. +2. The `threads` parameter is the number of processors to use on your computer. On a modern laptop, the fastest results will probably be between 2 and 8 threads, depending on the number of cores. +3. `iter` is how many times to read through the corpus. With fewer than 100 books, it can greatly help to increase the number of passes; if you're working with billions of words, it probably matters less. One danger of too low a number of iterations is that words that aren't closely related will seem to be closer than they are. +4. Training can take a while. On my laptop, it takes a few minutes to train these cookbooks; larger models take proportionally more time. Because of the importance of more iterations to reducing noise, don't be afraid to set things up to require a lot of training time (as much as a day!) +5. One of the best things about the word2vec algorithm is that it *does* work on extremely large corpora in linear time. +6. In RStudio I've noticed that this sometimes appears to hang after a while; the percentage bar stops updating. If you check system activity it actually is still running, and will complete. +7. If at any point you want to *read in* a previously trained model, you can do so by typing `model = read.vectors("cookbook_vectors.bin")`. Now we have a model in memory, trained on about 10 million words from 77 cookbooks. What can it tell us about food? @@ -84,14 +86,14 @@ Now we have a model in memory, trained on about 10 million words from 77 cookboo Well, you can run some basic operations to find the nearest elements: ```{r} -model %>% nearest_to(model[["fish"]]) +model %>% closest_to("fish") ``` With that list, you can expand out further to search for multiple words: ```{r} model %>% - nearest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50) + closest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50) ``` Now we have a pretty expansive list of potential fish-related words from old cookbooks. This can be useful for a few different things: @@ -100,17 +102,17 @@ Now we have a pretty expansive list of potential fish-related words from old coo 2. As a batch of words to use as seed to some other text mining operation; for example, you could pull all paragraphs surrounding these to find ways that fish are cooked. 3. As a source for visualization. -Or we can just arrange them somehow. If you have the tsne package installed, (type `install.packages("tsne")` to download it), you can plot these words in a reduced dimensional space. In this case, it doesn't look like much of anything. +Or we can just arrange them somehow. In this case, it doesn't look like much of anything. 
```{r} -some_fish = nearest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150) -fishy = model[[names(some_fish),average=F]] -plot(fishy,perplexity=15) +some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150) +fishy = model[[some_fish$word,average=F]] +plot(fishy,method="pca") ``` ## Clustering -We can use standard clustering algorithms, like kmeans, to find groups of terms that fit together. You can think of this as a sort of topic model, although unlike more sophisticated topic modeling algorithms like Latent Direchlet Allocation, each word must be tied to a particular topic. +We can use standard clustering algorithms, like kmeans, to find groups of terms that fit together. You can think of this as a sort of topic model, although unlike more sophisticated topic modeling algorithms like Latent Direchlet Allocation, each word must be tied to single particular topic. ```{r} set.seed(10) @@ -118,7 +120,7 @@ centers = 150 clustering = kmeans(model,centers=centers,iter.max = 40) ``` -Here are a ten random topics produced through this method. Each of the columns are the ten most frequent words in one random cluster. +Here are a ten random "topics" produced through this method. Each of the columns are the ten most frequent words in one random cluster. ```{r} sapply(sample(1:centers,10),function(n) { @@ -135,7 +137,8 @@ the 20 words closest to each of four different kinds of words. ingredients = c("madeira","beef","saucepan","carrots") term_set = lapply(ingredients, function(ingredient) { - nearest_words = model %>% nearest_to(model[[ingredient]],20) %>% names + nearest_words = model %>% closest_to(model[[ingredient]],20) + nearest_words$word }) %>% unlist subset = model[[term_set,average=F]] @@ -163,7 +166,7 @@ tastes = model[[c("sweet","salty"),average=F]] # model[1:3000,] here restricts to the 3000 most common words in the set. sweet_and_saltiness = model[1:3000,] %>% cosineSimilarity(tastes) -# +# Filter to the top 20 sweet or salty. sweet_and_saltiness = sweet_and_saltiness[ rank(-sweet_and_saltiness[,1])<20 | rank(-sweet_and_saltiness[,2])<20, @@ -193,12 +196,14 @@ Now we can filter down to the 50 words that are closest to *any* of these (that' use a PCA biplot to look at just 50 words in a flavor plane. ```{r} -high_similarities_to_tastes = common_similarities_tastes[rank(-apply(common_similarities_tastes,1,max)) < 50,] +high_similarities_to_tastes = common_similarities_tastes[rank(-apply(common_similarities_tastes,1,max)) < 75,] -high_similarities_to_tastes %>% prcomp %>% biplot(main="Fifty words in a\nprojection of flavor space") +high_similarities_to_tastes %>% + prcomp %>% + biplot(main="Fifty words in a\nprojection of flavor space") ``` -This tells us a few things. First is that while each of the tastes is distinct, 'sweet' and 'sour' are much more closely linked in this cooking style. Is this a unique feature of American cooking? A relationship that changes over time? These would require more investigation. +This tells us a few things. One is that (in some runnings of the model, at least--there is some random chance built in here.) "sweet" and "sour" are closely aligned. Is this a unique feature of American cooking? A relationship that changes over time? These would require more investigation. Second is that "savory" really is an acting category in these cookbooks, even without the precision of 'umami' as a word to express it. 
Anchovy, the flavor most closely associated with savoriness, shows up as fairly characteristic of the flavor, along with a variety of herbs. @@ -220,3 +225,4 @@ plot(model,perplexity=50) A few notes on this method: 1. If you don't get local clusters, it is not working. You might need to reduce the perplexity so that clusters are smaller; or you might not have good local similarities. +2. If you're plotting only a small set of words, you're better off trying to plot a `VectorSpaceModel` with `method="pca"`, which locates the points using principal components analysis. diff --git a/inst/doc/introduction.html b/inst/doc/introduction.html new file mode 100644 index 0000000..15510cf --- /dev/null +++ b/inst/doc/introduction.html @@ -0,0 +1,744 @@ + + + + + + + + + + + + + + + + +Word2Vec introduction + + + + + + + + + + + + + + + + + +

Word2Vec introduction

+

Ben Schmidt

+

2017-02-21

+ + + +
+

Intro

+

This vignette walks you through training a word2vec model, and using that model to search for similarities, to build clusters, and to visualize vocabulary relationships of that model in two dimensions. If you are working with pre-trained vectors, you might want to jump straight to the “exploration” vignette; it is a little slower-paced, but doesn’t show off quite so many features of the package.

+
+
+

Package installation

+

If you have not installed this package, paste the below. More detailed installation instructions are at the end of the package README.

+
if (!require(wordVectors)) {
+  if (!(require(devtools))) {
+    install.packages("devtools")
+  }
+  devtools::install_github("bmschmidt/wordVectors")
+}
+
+
+

Building test data

+

We begin by importing the wordVectors package and the magrittr package, because its pipe operator makes it easier to work with data.

+
library(wordVectors)
+library(magrittr)
+

First we build up a test file to train on. As an example, we’ll use a collection of cookbooks from Michigan State University. This will be downloaded from the Internet if it doesn’t already exist.

+
if (!file.exists("cookbooks.zip")) {
+  download.file("http://archive.lib.msu.edu/dinfo/feedingamerica/cookbook_text.zip","cookbooks.zip")
+}
+unzip("cookbooks.zip",exdir="cookbooks")
+

Then we prepare a single file for word2vec to read in. This does a couple things:

+
  1. Creates a single text file with the contents of every file in the original document;
  2. Uses the tokenizers package to clean and lowercase the original text;
  3. If bundle_ngrams is greater than 1, joins together common bigrams into a single word. For example, “olive oil” may be joined together into “olive_oil” wherever it occurs.
+

You can also do this in another language: particularly for large files, that will be much faster. (For reference: in a console, perl -ne 's/[^A-Za-z_0-9 \n]/ /g; print lc $_;' cookbooks/*.txt > cookbooks.txt will do much the same thing on ASCII text in a couple seconds.) If you do this and want to bundle ngrams, you’ll then need to call word2phrase("cookbooks.txt","cookbook_bigrams.txt",...) to build up the bigrams; call it twice if you want 3-grams, and so forth.

+
if (!file.exists("cookbooks.txt")) prep_word2vec(origin="cookbooks",destination="cookbooks.txt",lowercase=T,bundle_ngrams=2)
+
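If you took that shell route, you can still bundle n-grams afterwards; a minimal sketch (file names are illustrative, and the default word2phrase thresholds are assumed):

```r
# First pass joins common bigrams ("olive oil" -> "olive_oil")...
word2phrase("cookbooks.txt", "cookbook_bigrams.txt")

# ...and a second pass over that output joins common trigrams.
word2phrase("cookbook_bigrams.txt", "cookbook_trigrams.txt")
```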

To train a word2vec model, use the function train_word2vec. This actually builds up the model. It uses an on-disk file as an intermediary and then reads that file into memory.

+
if (!file.exists("cookbook_vectors.bin")) {model = train_word2vec("cookbooks.txt","cookbook_vectors.bin",vectors=200,threads=4,window=12,iter=5,negative_samples=0)} else model = read.vectors("cookbook_vectors.bin")
+
## Filename ends with .bin, so reading in binary format
+
## Reading a word2vec binary file of 45820 rows and 200 columns
+
## 
+  |========================================================         |  85%
+  |                                                                       
+  |========================================================         |  86%
+  |                                                                       
+  |========================================================         |  87%
+  |                                                                       
+  |=========================================================        |  87%
+  |                                                                       
+  |=========================================================        |  88%
+  |                                                                       
+  |==========================================================       |  88%
+  |                                                                       
+  |==========================================================       |  89%
+  |                                                                       
+  |==========================================================       |  90%
+  |                                                                       
+  |===========================================================      |  90%
+  |                                                                       
+  |===========================================================      |  91%
+  |                                                                       
+  |===========================================================      |  92%
+  |                                                                       
+  |============================================================     |  92%
+  |                                                                       
+  |============================================================     |  93%
+  |                                                                       
+  |=============================================================    |  93%
+  |                                                                       
+  |=============================================================    |  94%
+  |                                                                       
+  |=============================================================    |  95%
+  |                                                                       
+  |==============================================================   |  95%
+  |                                                                       
+  |==============================================================   |  96%
+  |                                                                       
+  |===============================================================  |  96%
+  |                                                                       
+  |===============================================================  |  97%
+  |                                                                       
+  |===============================================================  |  98%
+  |                                                                       
+  |================================================================ |  98%
+  |                                                                       
+  |================================================================ |  99%
+  |                                                                       
+  |=================================================================|  99%
+  |                                                                       
+  |=================================================================| 100%
+

A few notes:

  1. The `vectors` parameter is the dimensionality of the representation. More vectors usually means more precision, but also more random error and slower operations. Sensible choices are probably in the range 100-500 (a sketch of a full training call follows this list).
  2. The `threads` parameter is the number of processors to use on your computer. On a modern laptop, the fastest results will probably come from between 2 and 8 threads, depending on the number of cores.
  3. `iter` is how many times to read through the corpus. With fewer than 100 books, it can greatly help to increase the number of passes; if you’re working with billions of words, it probably matters less. One danger of too few iterations is that words that aren’t closely related will seem closer than they are.
  4. Training can take a while. On my laptop, it takes a few minutes to train these cookbooks; larger models take proportionally more time. Because more iterations help reduce noise, don’t be afraid to set things up to require a lot of training time (as much as a day!).
  5. One of the best things about the word2vec algorithm is that it does work on extremely large corpora in linear time.
  6. In RStudio I’ve noticed that this sometimes appears to hang after a while; the percentage bar stops updating. If you check system activity it actually is still running, and will complete.
  7. If at any point you want to read in a previously trained model, you can do so by typing `model = read.vectors("cookbook_vectors.bin")`.
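To make those options concrete, here is a hedged sketch of a training call reflecting the notes above. The input file name is an assumption, and the parameter values are illustrative rather than prescriptive; adjust them to your corpus and machine.

```R
# Illustrative values only; tune vectors/threads/iter to your corpus and hardware.
model = train_word2vec(
  "cookbooks.txt",                        # assumed name of the prepped text file
  output_file = "cookbook_vectors.bin",
  vectors = 200,                          # dimensionality of the representation
  threads = 4,                            # roughly match your core count
  window = 12,
  iter = 10,                              # extra passes help on small corpora
  negative_samples = 5,
  force = TRUE                            # overwrite an existing output file
)
```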

Now we have a model in memory, trained on about 10 million words from 77 cookbooks. What can it tell us about food?

+
+

## Similarity searches

+

Well, you can run some basic operations to find the nearest elements:

+
model %>% closest_to("fish")
+
##              word similarity to "fish"
+## 1            fish            1.0000000
+## 2          thames            0.6550232
+## 3           fluke            0.6549175
+## 4  perch_pickerel            0.6455389
+## 5       flounders            0.6411825
+## 6          logger            0.6404159
+## 7  columbia_river            0.6403468
+## 8           lakes            0.6382800
+## 9     great_lakes            0.6320935
+## 10         turbot            0.6302442
+
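As an aside, the formula interface to `closest_to` also works with a trained model like this one: bare strings inside the formula are looked up as rows of the matrix before any arithmetic happens. An illustrative query, not evaluated against this model:

```R
# Illustrative only; output not shown or verified for the cookbook model.
model %>% closest_to(~ "beef" - "cow" + "pig")
```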

With that list, you can expand out further to search for multiple words:

+
model %>% 
+  closest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50)
+
##                word
+## 1             trout
+## 2              carp
+## 3              shad
+## 4              eels
+## 5         flounders
+## 6             perch
+## 7    perch_pickerel
+## 8             skate
+## 9           haddock
+## 10    mackerel_shad
+## 11           salmon
+## 12           turbot
+## 13         mackerel
+## 14             pike
+## 15              cod
+## 16         gudgeons
+## 17           smelts
+## 18           mullet
+## 19      cod_haddock
+## 20       butterfish
+## 21     striped_bass
+## 22         pickerel
+## 23         flounder
+## 24           plaice
+## 25       carp_tench
+## 26       black_bass
+## 27          52fried
+## 28         grayling
+## 29           barbel
+## 30         sea_bass
+## 31       sword_fish
+## 32         whitings
+## 33            roach
+## 34         52boiled
+## 35       redsnapper
+## 36  haddock_halibut
+## 37         52potted
+## 38         weakfish
+## 39            porgy
+## 40    pike_pickerel
+## 41             fish
+## 42     salmon_trout
+## 43         sturgeon
+## 44          halibut
+## 45         haddocks
+## 46             cusk
+## 47        whitefish
+## 48         51boiled
+## 49 spanish_mackerel
+## 50        blackfish
+##    similarity to model[[c("fish", "salmon", "trout", "shad", "flounder", "carp", "roe", "eels")]]
+## 1                                                                                       0.8825309
+## 2                                                                                       0.8493702
+## 3                                                                                       0.8336669
+## 4                                                                                       0.8303654
+## 5                                                                                       0.8185495
+## 6                                                                                       0.8174846
+## 7                                                                                       0.8127453
+## 8                                                                                       0.8059550
+## 9                                                                                       0.8036258
+## 10                                                                                      0.8017232
+## 11                                                                                      0.8008546
+## 12                                                                                      0.7971772
+## 13                                                                                      0.7954075
+## 14                                                                                      0.7948964
+## 15                                                                                      0.7942049
+## 16                                                                                      0.7854212
+## 17                                                                                      0.7801296
+## 18                                                                                      0.7784851
+## 19                                                                                      0.7781567
+## 20                                                                                      0.7773511
+## 21                                                                                      0.7731814
+## 22                                                                                      0.7703981
+## 23                                                                                      0.7694271
+## 24                                                                                      0.7692327
+## 25                                                                                      0.7640110
+## 26                                                                                      0.7637830
+## 27                                                                                      0.7624884
+## 28                                                                                      0.7621161
+## 29                                                                                      0.7613481
+## 30                                                                                      0.7607049
+## 31                                                                                      0.7587526
+## 32                                                                                      0.7582790
+## 33                                                                                      0.7553524
+## 34                                                                                      0.7549496
+## 35                                                                                      0.7522104
+## 36                                                                                      0.7483817
+## 37                                                                                      0.7480971
+## 38                                                                                      0.7477817
+## 39                                                                                      0.7472975
+## 40                                                                                      0.7450452
+## 41                                                                                      0.7438109
+## 42                                                                                      0.7434369
+## 43                                                                                      0.7413590
+## 44                                                                                      0.7402515
+## 45                                                                                      0.7391468
+## 46                                                                                      0.7384402
+## 47                                                                                      0.7379825
+## 48                                                                                      0.7366679
+## 49                                                                                      0.7330924
+## 50                                                                                      0.7310721
+

Now we have a pretty expansive list of potential fish-related words from old cookbooks. This can be useful for a few different things:

  1. As a list of potential query terms for keyword search (a minimal sketch follows this list).
  2. As a batch of words to seed some other text-mining operation; for example, you could pull all paragraphs surrounding these terms to find the ways fish are cooked.
  3. As a source for visualization.
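For the first use, here is a minimal sketch of a keyword filter. It assumes a hypothetical character vector `paragraphs` holding the corpus paragraphs; no such object is created anywhere in this walkthrough.

```R
# Hypothetical: `paragraphs` is assumed to be a character vector of corpus paragraphs.
fish_words = closest_to(model, model[["fish"]], 50)$word
pattern = paste0("\\b(", paste(fish_words, collapse = "|"), ")\\b")
fishy_paragraphs = paragraphs[grepl(pattern, paragraphs)]
```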

Or we can simply plot them. In this case, a PCA projection doesn’t show much structure.

+
some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
+fishy = model[[some_fish$word,average=F]]
+plot(fishy,method="pca")
+

+
+
+

## Clustering

+

We can use standard clustering algorithms, like k-means, to find groups of terms that fit together. You can think of this as a sort of topic model, although unlike more sophisticated topic modeling algorithms such as Latent Dirichlet Allocation, each word must be assigned to a single topic.

+
set.seed(10)
+centers = 150
+clustering = kmeans(model,centers=centers,iter.max = 40)
+

Here are ten random “topics” produced through this method. Each column shows the ten most frequent words in one randomly chosen cluster.

+
sapply(sample(1:centers,10),function(n) {
+  names(clustering$cluster[clustering$cluster==n][1:10])
+})
+
##       [,1]             [,2]       [,3]                      [,4]         
+##  [1,] "ha_wi"          "through"  "new_york"                "dish"       
+##  [2,] "k'uh"           "strain"   "united_states"           "ice"        
+##  [3,] "former_village" "rub"      "philadelphia"            "salad"      
+##  [4,] "na_wa"          "sieve"    "cook_book"               "cheese"     
+##  [5,] "ki"             "press"    "published"               "toast"      
+##  [6,] "wan"            "pulp"     "amp"                     "garnish"    
+##  [7,] "aguico"         "mash"     "printed"                 "tomato"     
+##  [8,] "pas_sa"         "strained" "co"                      "truffles"   
+##  [9,] "ha_lo"          "squeeze"  "chicago"                 "omelet"     
+## [10,] "mat_sa"         "colander" "handwritten_inscription" "hard_boiled"
+##       [,5]          [,6]      [,7]      [,8]           [,9]      
+##  [1,] "tea"         "baking"  "chicken" "mary_frances" "lake"    
+##  [2,] "drink"       "dough"   "rice"    "friend"       "buffalo" 
+##  [3,] "currant"     "meal"    "gravy"   "frances"      "spotted" 
+##  [4,] "raspberry"   "ground"  "baked"   "got"          "frog"    
+##  [5,] "cider"       "stone"   "pie"     "dear"         "striped" 
+##  [6,] "sick"        "coarse"  "fried"   "i'll"         "crane"   
+##  [7,] "beer"        "formed"  "roast"   "it's"         "headed"  
+##  [8,] "gruel"       "toasted" "stew"    "said_mary"    "whale"   
+##  [9,] "blanc_mange" "ashes"   "ham"     "an_animated"  "meadow"  
+## [10,] "sago"        "kneaded" "stewed"  "delight"      "speckled"
+##       [,10]         
+##  [1,] "history"     
+##  [2,] "scientific"  
+##  [3,] "farmer"      
+##  [4,] "ages"        
+##  [5,] "vast"        
+##  [6,] "newspaper"   
+##  [7,] "latest"      
+##  [8,] "post"        
+##  [9,] "agricultural"
+## [10,] "ideas"
+

These can be useful for figuring out, at a glance, what some of the overall common clusters in your corpus are.
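A hedged sketch of one way to use the clustering interactively: look up the cluster a particular word was assigned to and list its neighbors. Assignments will differ from run to run.

```R
# Which cluster did "fish" land in, and what else was assigned to it?
fish_cluster = clustering$cluster[["fish"]]
names(clustering$cluster)[clustering$cluster == fish_cluster][1:20]
```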

+

Clusters need not be derived at the level of the full model. We can take, for instance, the 20 words closest to each of four different kinds of words.

+
ingredients = c("madeira","beef","saucepan","carrots")
+term_set = lapply(ingredients, 
+       function(ingredient) {
+          nearest_words = model %>% closest_to(model[[ingredient]],20)
+          nearest_words$word
+        }) %>% unlist
+
+subset = model[[term_set,average=F]]
+
+subset %>%
+  cosineDist(subset) %>% 
+  as.dist %>%
+  hclust %>%
+  plot
+

+
+
+
+

## Visualization

+
+

### Relationship planes

+

One basic strategy is to project the high-dimensional space down onto a plane you can actually look at.

+

For instance, we can take the words “sweet” and “salty,” find the twenty words most similar to either of them, and plot those in a sweet-salty plane.

+
tastes = model[[c("sweet","salty"),average=F]]
+
+# model[1:3000,] here restricts to the 3000 most common words in the set.
+sweet_and_saltiness = model[1:3000,] %>% cosineSimilarity(tastes)
+
+# Filter to the top 20 sweet or salty.
+sweet_and_saltiness = sweet_and_saltiness[
+  rank(-sweet_and_saltiness[,1])<20 |
+  rank(-sweet_and_saltiness[,2])<20,
+  ]
+
+plot(sweet_and_saltiness,type='n')
+text(sweet_and_saltiness,labels=rownames(sweet_and_saltiness))
+

+

There’s no limit to how complicated this can get. For instance, there are really five tastes: sweet, salty, bitter, sour, and savory. (Savory is usually called ‘umami’ nowadays, but that word will not appear in historic cookbooks.)

+

Rather than use a base matrix of the whole set, we can shrink down to just five dimensions: how similar every word in our set is to each of these five. (I’m using cosine similarity here, so the closer a number is to one, the more similar it is.)

+
tastes = model[[c("sweet","salty","savory","bitter","sour"),average=F]]
+
+# model[1:3000,] here restricts to the 3000 most common words in the set.
+common_similarities_tastes = model[1:3000,] %>% cosineSimilarity(tastes)
+
+common_similarities_tastes[20:30,]
+
##            sweet        sour     bitter     savory     salty
+## then   0.2339259  0.06514445 0.12066119 0.09819144 0.3032770
+## put    0.2700498  0.02393292 0.10902933 0.13097811 0.2040845
+## are    0.1368766 -0.04255840 0.06944761 0.14227197 0.1100167
+## two    0.3219879  0.01819152 0.10420576 0.16770579 0.1608001
+## when   0.2588062  0.08498863 0.15445827 0.17202905 0.2336235
+## half   0.3916728  0.04622267 0.13031899 0.12109115 0.1454105
+## add    0.2962998  0.13567802 0.18950686 0.14410955 0.2902871
+## this   0.2591290  0.06750514 0.11697138 0.20434913 0.2655267
+## butter 0.4379218  0.13391168 0.18708843 0.18367837 0.1874771
+## salt   0.3847879  0.17693223 0.21420540 0.21829522 0.3294425
+## sugar  0.4314809  0.24587647 0.29656475 0.04229497 0.1594757
+

Now we can filter down to the words that are closest to any of these five tastes (that’s what the apply/max step below does), and use a PCA biplot to look at them in a flavor plane.

+
high_similarities_to_tastes = common_similarities_tastes[rank(-apply(common_similarities_tastes,1,max)) < 75,]
+
+high_similarities_to_tastes %>% 
+  prcomp %>% 
+  biplot(main="Fifty words in a\nprojection of flavor space")
+

+

This tells us a few things. One is that, in some runs of the model at least (there is some random chance built in here), “sweet” and “sour” are closely aligned. Is this a unique feature of American cooking? A relationship that changes over time? These questions would require more investigation.

+

Second is that “savory” really is an operative category in these cookbooks, even without the precision of ‘umami’ as a word to express it. Anchovy, the flavor most closely associated with savoriness, shows up as fairly characteristic of it, along with a variety of herbs.

+

Finally, words characteristic of meals seem to show up in the upper region of the plot.

+
+
+
+

### Catchall reduction: TSNE

+

Last but not least, there is a catchall method built into the package to visualize a single, reasonably good plane for viewing the whole model: t-SNE dimensionality reduction.

+

Just calling “plot” will display the equivalent of a word cloud with individual tokens grouped relatively close to each other based on their proximity in the higher dimensional space.

+

“Perplexity” roughly sets the effective number of neighbors considered for each word. By default it is 50; smaller numbers may make clusters appear more dramatically, at the cost of overall coherence.

+
plot(model,perplexity=50)
+
## Attempting to use T-SNE to plot the vector representation
+
## Cancel if this is taking too long
+
## Or run 'install.packages' tsne if you don't have it.
+
## sigma summary: Min. : 0.5386 |1st Qu. : 0.6961 |Median : 0.7434 |Mean : 0.7702 |3rd Qu. : 0.8083 |Max. : 1.206 |
+
## Epoch: Iteration #100 error is: 16.879583703121
+
## Epoch: Iteration #200 error is: 0.856309962004217
+
## Epoch: Iteration #300 error is: 0.836821330832503
+
## Epoch: Iteration #400 error is: 0.831341310092711
+
## Epoch: Iteration #500 error is: 0.823818771867925
+
## Epoch: Iteration #600 error is: 0.823213062308806
+
## Epoch: Iteration #700 error is: 0.822970694425717
+
## Epoch: Iteration #800 error is: 0.822939945020377
+
## Epoch: Iteration #900 error is: 0.822938395371538
+
## Epoch: Iteration #1000 error is: 0.822938284670947
+

+

A few notes on this method:

+
  1. If you don’t get local clusters, it is not working. You might need to reduce the perplexity so that clusters are smaller; or you might not have good local similarities.
  2. If you’re plotting only a small set of words, you’re better off plotting a VectorSpaceModel with method="pca", which locates the points using principal components analysis (a short sketch follows below).
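A minimal sketch of note 2, reusing the subsetting pattern from earlier in this walkthrough:

```R
# PCA plot of a small, hand-picked set of words; method = "pca" skips t-SNE.
small_set = model[[c("sweet", "salty", "sour", "bitter", "savory"), average = FALSE]]
plot(small_set, method = "pca")
```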
+ + + + + + + + diff --git a/inst/paper.md b/inst/paper.md new file mode 100644 index 0000000..4ba30f3 --- /dev/null +++ b/inst/paper.md @@ -0,0 +1,23 @@ +--- + title: 'WordVectors: an R environment for training and exploring word2vec modes' + tags: + - Natural Language Processing + - Vector Space Models + - word2vec + authors: + - name: Benjamin M Schmidt + orcid: 0000-0002-1142-5720 + affiliation: 1 + affiliations: + - name: Northeastern University + index: 1 + date: 24 January 2017 + bibliography: paper.bib + --- + + # Summary + + This is an R package for training and exploring word2vec models. It provides wrappers for the reference word2vec implementation released by Google to enable training of vectors from R.[@mikolov_efficient_2013] It also provides a variety of functions enabling exploratory data analysis of word2vec models in an R environment, including 1) functions for reading and writing word2vec's binary form, 2) standard linear algebra functions not bundled in base R (such as cosine similarity) with speed optimizations, and 3) a streamlined syntax for performing vector arithmetic in a vocabulary space. + + # References + diff --git a/man/VectorSpaceModel-VectorSpaceModel-method.Rd b/man/VectorSpaceModel-VectorSpaceModel-method.Rd index 053896b..fdf3730 100644 --- a/man/VectorSpaceModel-VectorSpaceModel-method.Rd +++ b/man/VectorSpaceModel-VectorSpaceModel-method.Rd @@ -24,4 +24,3 @@ I believe this is necessary, but honestly am not sure. Keep the VSM class when doing subtraction operations; make it possible to subtract a single row from an entire model. } - diff --git a/man/as.VectorSpaceModel.Rd b/man/as.VectorSpaceModel.Rd index daa73ae..37b8682 100644 --- a/man/as.VectorSpaceModel.Rd +++ b/man/as.VectorSpaceModel.Rd @@ -15,4 +15,3 @@ An object of class "VectorSpaceModel" \description{ Convert to a Vector Space Model } - diff --git a/man/closest_to.Rd b/man/closest_to.Rd new file mode 100644 index 0000000..3064393 --- /dev/null +++ b/man/closest_to.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrixFunctions.R +\name{closest_to} +\alias{closest_to} +\title{Return the n closest words in a VectorSpaceModel to a given vector.} +\usage{ +closest_to(matrix, vector, n = 10, fancy_names = TRUE) +} +\arguments{ +\item{matrix}{A matrix or VectorSpaceModel} + +\item{vector}{A vector (or a string or a formula coercable to a vector) +of the same length as the VectorSpaceModel. See below.} + +\item{n}{The number of closest words to include.} + +\item{fancy_names}{If true (the default) the data frame will have descriptive names like +'similarity to "king+queen-man"'; otherwise, just 'similarity.' The default can speed up + interactive exploration.} +} +\value{ +A sorted data.frame with columns for the words and their similarity +to the target vector. (Or, if as_df==FALSE, a named vector of similarities.) +} +\description{ +This is a convenience wrapper around the most common use of +'cosineSimilarity'; the listing of several words similar to a given vector. +Unlike cosineSimilarity, it returns a data.frame object instead of a matrix. +cosineSimilarity is more powerful, because it can compare two matrices to +each other; closest_to can only take a vector or vectorlike object as its second argument. +But with (or without) the argument n=Inf, closest_to is often better for +plugging directly into a plot. + +As with cosineSimilarity, the second argument can take several forms. If it's a vector or +matrix slice, it will be taken literally. 
If it's a character string, it will +be interpreted as a word and the associated vector from `matrix` will be used. If +a formula, any strings in the formula will be converted to rows in the associated `matrix` +before any math happens. +} +\examples{ + +# Synonyms and similar words +closest_to(demo_vectors,demo_vectors[["good"]]) + +# If 'matrix' is a VectorSpaceModel object, +# you can also just enter a string directly, and +# it will be evaluated in the context of the passed matrix. + +closest_to(demo_vectors,"good") + +# You can also express more complicated formulas. + +closest_to(demo_vectors,"good") + +# Something close to the classic king:man::queen:woman; +# What's the equivalent word for a female teacher that "guy" is for +# a male one? + +closest_to(demo_vectors,~ "guy" - "man" + "woman") + +} diff --git a/man/cosineDist.Rd b/man/cosineDist.Rd index f857027..c886e64 100644 --- a/man/cosineDist.Rd +++ b/man/cosineDist.Rd @@ -12,7 +12,8 @@ cosineDist(x, y) \item{y}{A matrix, VectorSpaceModel, or vector.} } \value{ -A matrix whose dimnames are rownames(x), rownames(y) +A matrix whose dimnames are rownames(x), rownames(y) and whose entires are +the associated distance. } \description{ Calculate the cosine distance between two vectors. @@ -21,4 +22,3 @@ Not an actual distance metric, but can be used in similar contexts. It is calculated as simply the inverse of cosine similarity, and falls in a fixed range of 0 (identical) to 2 (completely opposite in direction.) } - diff --git a/man/cosineSimilarity.Rd b/man/cosineSimilarity.Rd index 512ed59..e74263b 100644 --- a/man/cosineSimilarity.Rd +++ b/man/cosineSimilarity.Rd @@ -21,13 +21,17 @@ A matrix. Rows correspond to entries in x; columns to entries in y. Calculate the cosine similarity of two matrices or a matrix and a vector. } \examples{ + +# Inspect the similarity of several academic disciplines by hand. subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=FALSE]] similarities = cosineSimilarity(subjects,subjects) +# Use 'closest_to' to build up a large list of similar words to a seed set. subjects = demo_vectors[[c("history","literature","biology","math","stats"),average=TRUE]] -new_subject_list = nearest_to(demo_vectors,subjects,20) -new_subjects = demo_vectors[[names(new_subject_list),average=FALSE]] +new_subject_list = closest_to(demo_vectors,subjects,20) +new_subjects = demo_vectors[[new_subject_list$word,average=FALSE]] + +# Plot the cosineDistance of these as a dendrogram. plot(hclust(as.dist(cosineDist(new_subjects,new_subjects)))) } - diff --git a/man/demo_vectors.Rd b/man/demo_vectors.Rd index 99efc68..42c2628 100644 --- a/man/demo_vectors.Rd +++ b/man/demo_vectors.Rd @@ -15,7 +15,7 @@ demo_vectors A sample VectorSpaceModel object trained on about 15 million teaching evaluations, limited to the 999 most common words. Included for demonstration purposes only: there's only so much you can -do with a 999 dimension vocabulary. +do with a 999 length vocabulary. 
} \details{ You're best off downloading a real model to work with, @@ -23,4 +23,3 @@ such as the precompiled vectors distributed by Google at https://code.google.com/archive/p/word2vec/ } \keyword{datasets} - diff --git a/man/distend.Rd b/man/distend.Rd new file mode 100644 index 0000000..ef5356c --- /dev/null +++ b/man/distend.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrixFunctions.R +\name{distend} +\alias{distend} +\title{Compress or expand a vector space model along a vector.} +\usage{ +distend(matrix, vector, multiplier) +} +\arguments{ +\item{matrix}{A matrix or VectorSpaceModel} + +\item{vector}{A vector (or an object coercable to a vector, see project) +of the same length as the VectorSpaceModel.} + +\item{multiplier}{A scaling factor. See below.} +} +\value{ +A new matrix or VectorSpaceModel of the same dimensions as `matrix`, +distended along the vector 'vector' by factor 'multiplier'. + +See `project` for more details and usage. +} +\description{ +This is an experimental function that might be useful sometimes. +'Reject' flatly eliminates a particular dimension from a vectorspace, essentially +squashing out a single dimension; 'distend' gives finer grained control, making it +possible to stretch out or compress in the same space. High values of 'multiplier' +make a given vector more prominent; 1 keeps the original matrix untransformed; values +less than one compress distances along the vector; and 0 is the same as "reject," +eliminating a vector entirely. Values less than zero will do some type of mirror-image +universe thing, but probably aren't useful? +} +\examples{ +closest_to(demo_vectors,"sweet") + +# Stretch out the vectorspace 4x longer along the gender direction. +more_sexist = distend(demo_vectors, ~ "man" + "he" - "she" -"woman", 4) + +closest_to(more_sexist,"sweet") + +} diff --git a/man/filter_to_rownames.Rd b/man/filter_to_rownames.Rd index 3b47cb1..cd6332e 100644 --- a/man/filter_to_rownames.Rd +++ b/man/filter_to_rownames.Rd @@ -20,4 +20,3 @@ Deprecated: use instead VSM[[c("word1","word2",...),average=FALSE]] \description{ Reduce by rownames } - diff --git a/man/improve_vectorspace.Rd b/man/improve_vectorspace.Rd new file mode 100644 index 0000000..02e626d --- /dev/null +++ b/man/improve_vectorspace.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrixFunctions.R +\name{improve_vectorspace} +\alias{improve_vectorspace} +\title{Improve a vectorspace by removing common elements.} +\usage{ +improve_vectorspace(vectorspace, D = round(ncol(vectorspace)/100)) +} +\arguments{ +\item{vectorspace}{A VectorSpacemodel to be improved.} + +\item{D}{The number of principal components to eliminate.} +} +\value{ +A VectorSpaceModel object, transformed from the original. +} +\description{ +See reference for a full description. Supposedly, these operations will improve performance on analogy tasks. +} +\examples{ + +closest_to(demo_vectors,"great") +# stopwords like "and" and "very" are no longer top ten. +# I don't know if this is really better, though. + +closest_to(improve_vectorspace(demo_vectors),"great") + +} +\references{ +Jiaqi Mu, Suma Bhat, Pramod Viswanath. All-but-the-Top: Simple and Effective Postprocessing for Word Representations. https://arxiv.org/abs/1702.01417. +} diff --git a/man/magnitudes.Rd b/man/magnitudes.Rd index c8dafbc..688c8b1 100644 --- a/man/magnitudes.Rd +++ b/man/magnitudes.Rd @@ -17,4 +17,3 @@ This is an extraordinarily simple function. 
\description{ Vector Magnitudes } - diff --git a/man/nearest_to.Rd b/man/nearest_to.Rd index 8eede2e..6964066 100644 --- a/man/nearest_to.Rd +++ b/man/nearest_to.Rd @@ -2,34 +2,26 @@ % Please edit documentation in R/matrixFunctions.R \name{nearest_to} \alias{nearest_to} -\title{Return the n closest words in a VectorSpaceModel to a given vector.} +\title{Nearest vectors to a word} \usage{ -nearest_to(matrix, vector, n = 10) +nearest_to(...) } \arguments{ -\item{matrix}{A matrix or VectorSpaceModel} - -\item{vector}{Avector (or an object coercable to a vector, see project) -of the same length as the VectorSpaceModel.} - -\item{n}{The number of closest words to include.} +\item{...}{See `closest_to`} } \value{ -A vector of distances, with names corresponding to the words -in the parent VectorSpaceModel, of length n. +a names vector of cosine similarities. See 'nearest_to' for more details. } \description{ -Return the n closest words in a VectorSpaceModel to a given vector. +This a wrapper around closest_to, included for back-compatibility. Use +closest_to for new applications. } \examples{ -#Synonyms and similar words -nearest_to(demo_vectors,demo_vectors[["good"]]) +# Recommended usage in 1.0: +nearest_to(demo_vectors, demo_vectors[["good"]]) -# Something close to the classic king:man::queen:woman; -# What's the equivalent word for a female teacher that "guy" is for -# a male one? -nearest_to(demo_vectors,demo_vectors[["guy"]] - demo_vectors[["man"]] + demo_vectors[["woman"]]) +# Recommended usage in 2.0: +demo_vectors \%>\% closest_to("good") } - diff --git a/man/normalize_lengths.Rd b/man/normalize_lengths.Rd index 3a3482c..a761667 100644 --- a/man/normalize_lengths.Rd +++ b/man/normalize_lengths.Rd @@ -15,4 +15,3 @@ An object of the same class as matrix \description{ Normalize a matrix so that all rows are of unit length. } - diff --git a/man/plot-VectorSpaceModel-method.Rd b/man/plot-VectorSpaceModel-method.Rd index 7016682..426bf6a 100644 --- a/man/plot-VectorSpaceModel-method.Rd +++ b/man/plot-VectorSpaceModel-method.Rd @@ -5,15 +5,14 @@ \alias{plot,VectorSpaceModel-method} \title{Plot a Vector Space Model.} \usage{ -\S4method{plot}{VectorSpaceModel}(x, y, ...) +\S4method{plot}{VectorSpaceModel}(x, method = "tsne", ...) } \arguments{ \item{x}{The model to plot} -\item{y}{(ignored)} +\item{method}{The method to use for plotting. "pca" is principal components, "tsne" is t-sne} -\item{...}{Further arguments passed to tsne::tsne. -(Note: not to plot.)} +\item{...}{Further arguments passed to the plotting method.} } \value{ The TSNE model (silently.) @@ -24,6 +23,9 @@ sanest thing to do is reduce the full model down to two dimensions using T-SNE, which preserves some of the local clusters. } \details{ +For individual subsections, it can make sense to do a principal components +plot of the space of just those letters. This is what happens if method +is pca. On the full vocab, it's kind of a mess. + This plots only the first 300 words in the model. } - diff --git a/man/prep_word2vec.Rd b/man/prep_word2vec.Rd index 150cb41..30d0301 100644 --- a/man/prep_word2vec.Rd +++ b/man/prep_word2vec.Rd @@ -4,8 +4,7 @@ \alias{prep_word2vec} \title{Prepare documents for word2Vec} \usage{ -prep_word2vec(origin, destination, split_characters = "\\\\W", - lowercase = F, bundle_ngrams = 1, ...) +prep_word2vec(origin, destination, lowercase = F, bundle_ngrams = 1, ...) 
} \arguments{ \item{origin}{A text file or a directory of text files @@ -13,11 +12,6 @@ to be used in training the model} \item{destination}{The location for output text.} -\item{split_characters}{If the 'stringi' package is not installed, -A list of characters that mark word breaks. By default, -any nonword characters according to the perl regex engine. If stringi is installed, -this parameter is ignored.} - \item{lowercase}{Logical. Should uppercase characters be converted to lower?} \item{bundle_ngrams}{Integer. Statistically significant phrases of up to this many words @@ -40,4 +34,3 @@ This function is extraordinarily inefficient: in most real-world cases, you'll b much better off preparing the documents using python, perl, awk, or any other scripting language that can reasonable read things in line-by-line. } - diff --git a/man/project.Rd b/man/project.Rd index 6e002bc..b7acd33 100644 --- a/man/project.Rd +++ b/man/project.Rd @@ -9,7 +9,7 @@ project(matrix, vector) \arguments{ \item{matrix}{A matrix or VectorSpaceModel} -\item{vector}{A vector (or an object coercable to a vector, see project) +\item{vector}{A vector (or object coercable to a vector) of the same length as the VectorSpaceModel.} } \value{ @@ -20,6 +20,5 @@ If the input is a matrix, the output will be a matrix: if a VectorSpaceModel, it will be a VectorSpaceModel. } \description{ -Project each row of an input matrix along a vector. +As with 'cosineSimilarity } - diff --git a/man/read.binary.vectors.Rd b/man/read.binary.vectors.Rd index 8fd639c..7b5202f 100644 --- a/man/read.binary.vectors.Rd +++ b/man/read.binary.vectors.Rd @@ -20,10 +20,10 @@ the whole matrix into memory. This limit is applied BEFORE `name_list` and if you are in a memory-limited environment, you can limit the number of columns you read in by giving a vector of column integers} -\item{name_list}{A whitelist of words. If you wish to read in only a few dozen words, +\item{rowname_list}{A whitelist of words. If you wish to read in only a few dozen words, all other rows will be skipped and only these read in.} -\item{name_regexp}{A regular expression specifying a pattern for rows to read in. Row +\item{rowname_regexp}{A regular expression specifying a pattern for rows to read in. Row names matching that pattern will be included in the read; all others will be skipped.} } \value{ @@ -32,4 +32,3 @@ A VectorSpaceModel object \description{ Read binary word2vec format files } - diff --git a/man/read.vectors.Rd b/man/read.vectors.Rd index 5260cdd..4a866e3 100644 --- a/man/read.vectors.Rd +++ b/man/read.vectors.Rd @@ -4,18 +4,19 @@ \alias{read.vectors} \title{Read VectorSpaceModel} \usage{ -read.vectors(filename, vectors = guess_n_cols(), binary = FALSE, ...) +read.vectors(filename, vectors = guess_n_cols(), binary = NULL, ...) } \arguments{ \item{filename}{The file to read in.} \item{vectors}{The number of dimensions word2vec calculated. Imputed automatically if not specified.} -\item{binary}{Read in the binary word2vec form. (Wraps `read.binary.vectors`)} +\item{binary}{Read in the binary word2vec form. (Wraps `read.binary.vectors`) By default, function +guesses based on file suffix.} \item{...}{Further arguments passed to read.table or read.binary.vectors. -Note that both accept 'nrow' as an argument. Word2vec produces -by default frequency sorted output. Therefore 'read.vectors(...,nrows=500)', for example, +Note that both accept 'nrows' as an argument. Word2vec produces +by default frequency sorted output. 
Therefore 'read.vectors("file.bin", nrows=500)', for example, will return the vectors for the top 500 words. This can be useful on machines with limited memory.} } @@ -25,4 +26,3 @@ An matrixlike object of class `VectorSpaceModel` \description{ Read a VectorSpaceModel from a file exported from word2vec or a similar output format. } - diff --git a/man/reexports.Rd b/man/reexports.Rd new file mode 100644 index 0000000..6c3785c --- /dev/null +++ b/man/reexports.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\docType{import} +\name{reexports} +\alias{reexports} +\alias{\%>\%} +\title{Objects exported from other packages} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{magrittr}{\code{\link[magrittr]{\%>\%}}} +}} + diff --git a/man/reject.Rd b/man/reject.Rd index bff9438..64f9d60 100644 --- a/man/reject.Rd +++ b/man/reject.Rd @@ -25,9 +25,9 @@ See `project` for more details. Return a vector rejection for each element in a VectorSpaceModel } \examples{ -nearest_to(demo_vectors,demo_vectors[["man"]]) +closest_to(demo_vectors,demo_vectors[["man"]]) + genderless = reject(demo_vectors,demo_vectors[["he"]] - demo_vectors[["she"]]) -nearest_to(genderless,genderless[["man"]]) +closest_to(genderless,genderless[["man"]]) } - diff --git a/man/square_magnitudes.Rd b/man/square_magnitudes.Rd index 36e08ed..c4abb72 100644 --- a/man/square_magnitudes.Rd +++ b/man/square_magnitudes.Rd @@ -17,4 +17,3 @@ square_magnitudes Returns the square magnitudes and caches them if necessary } \keyword{internal} - diff --git a/man/sub-VectorSpaceModel-method.Rd b/man/sub-VectorSpaceModel-method.Rd index 257589d..25e3940 100644 --- a/man/sub-VectorSpaceModel-method.Rd +++ b/man/sub-VectorSpaceModel-method.Rd @@ -14,9 +14,9 @@ \item{j}{The column numbers to extract} -\item{drop}{Whether to drop columns. This parameter is ignored.} +\item{...}{Other arguments passed to extract (unlikely to be useful).} -\item{j}{Other arguments to extract (unlikely to be useful).} +\item{drop}{Whether to drop columns. This parameter is ignored.} } \value{ A VectorSpaceModel @@ -24,4 +24,3 @@ A VectorSpaceModel \description{ Reduce a VectorSpaceModel to a smaller one } - diff --git a/man/sub-sub-VectorSpaceModel-method.Rd b/man/sub-sub-VectorSpaceModel-method.Rd index 79cbce7..d3ef098 100644 --- a/man/sub-sub-VectorSpaceModel-method.Rd +++ b/man/sub-sub-VectorSpaceModel-method.Rd @@ -21,4 +21,3 @@ A VectorSpaceModel of a single row. \description{ VectorSpaceModel subsetting } - diff --git a/man/train_word2vec.Rd b/man/train_word2vec.Rd index ecc111a..d1dccf1 100644 --- a/man/train_word2vec.Rd +++ b/man/train_word2vec.Rd @@ -6,31 +6,39 @@ \usage{ train_word2vec(train_file, output_file = "vectors.bin", vectors = 100, threads = 1, window = 12, classes = 0, cbow = 0, min_count = 5, - iter = 5, force = F) + iter = 5, force = F, negative_samples = 5) } \arguments{ \item{train_file}{Path of a single .txt file for training. Tokens are split on spaces.} \item{output_file}{Path of the output file.} -\item{vectors}{The number of vectors to output. Defaults to 100. More vectors may be useful with large files.} +\item{vectors}{The number of vectors to output. Defaults to 100. +More vectors usually means more precision, but also more random error, higher memory usage, and slower operations. 
+Sensible choices are probably in the range 100-500.} -\item{threads}{Number of threads to run training process on. Defaults to 1; up to the number of cores on your machine may be useful.} +\item{threads}{Number of threads to run training process on. +Defaults to 1; up to the number of (virtual) cores on your machine may speed things up.} \item{window}{The size of the window (in words) to use in training.} \item{classes}{Number of classes for k-means clustering. Not documented/tested.} -\item{cbow}{If 1, use a continuous-bag-of-words model instead of skip-grams. Defaults to false (recommended for newcomers).} +\item{cbow}{If 1, use a continuous-bag-of-words model instead of skip-grams. +Defaults to false (recommended for newcomers).} -\item{min_count}{Minimum times a word must appear to be included in the samples. High values help reduce model size.} +\item{min_count}{Minimum times a word must appear to be included in the samples. +High values help reduce model size.} \item{iter}{Number of passes to make over the corpus in training.} -\item{force}{Whether to overwrite existing files.} +\item{force}{Whether to overwrite existing model files.} + +\item{negative_samples}{Number of negative samples to take in skip-gram training. 0 means full sampling, while lower numbers +give faster training. For large corpora 2-5 may work; for smaller corpora, 5-15 is reasonable.} } \value{ -A word2vec object. +A VectorSpaceModel object. } \description{ Train a model by word2vec. @@ -44,13 +52,12 @@ natural language processing and machine learning applications. } \examples{ \dontrun{ -model = word2vec(system.file("examples", "rfaq.txt", package = "tmcn.word2vec")) -} +model = train_word2vec(system.file("examples", "rfaq.txt", package = "wordVectors")) } -\author{ -Jian Li <\email{rweibo@sina.com}>, Ben Schmidt <\email{bmchmidt@gmail.com}> } \references{ \url{https://code.google.com/p/word2vec/} } - +\author{ +Jian Li <\email{rweibo@sina.com}>, Ben Schmidt <\email{bmchmidt@gmail.com}> +} diff --git a/man/word2phrase.Rd b/man/word2phrase.Rd index ce37ef6..3445a1c 100644 --- a/man/word2phrase.Rd +++ b/man/word2phrase.Rd @@ -20,6 +20,8 @@ prints progress regularly.} High values help reduce model size.} \item{threshold}{Threshold value for determining if pairs of words are phrases.} + +\item{force}{Whether to overwrite existing files at the output location. Default FALSE} } \value{ The name of output_file, the trained file where common phrases are now joined. @@ -41,4 +43,3 @@ model=word2phrase("text8","vec.txt") \author{ Tomas Mikolov } - diff --git a/man/write.binary.word2vec.Rd b/man/write.binary.word2vec.Rd index a942065..6c9ba60 100644 --- a/man/write.binary.word2vec.Rd +++ b/man/write.binary.word2vec.Rd @@ -18,4 +18,3 @@ Nothing \description{ Write in word2vec binary format } - diff --git a/src/distance.h b/src/distance.h deleted file mode 100644 index 0bbf9fd..0000000 --- a/src/distance.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -const long long max_size = 2000; // max length of strings -const long long N = 20; // number of closest words that will be shown -const long long max_w = 50; // max length of vocabulary entries - -void distance(char *file_name0, char *word0, char *returnw, double *returnd) { - FILE *f; - char st1[max_size]; - char bestw[N][max_size]; - char file_name[max_size], st[100][max_size]; - float dist, len, bestd[N], vec[max_size]; - long long words, size, a, b, c, d, cn, bi[100]; - char ch; - float *M; - char *vocab; - - strcpy(file_name, file_name0); - f = fopen(file_name, "rb"); - if (f == NULL) { - printf("Input file not found\n"); - - } - if(fscanf(f, "%lld", &words)==1); - if(fscanf(f, "%lld", &size)==1); - vocab = (char *)malloc((long long)words * max_w * sizeof(char)); - M = (float *)malloc((long long)words * (long long)size * sizeof(float)); - if (M == NULL) { - printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); - - } - for (b = 0; b < words; b++) { - if(fscanf(f, "%s%c", &vocab[b * max_w], &ch)==1); - for (a = 0; a < size; a++) if(fread(&M[a + b * size], sizeof(float), 1, f)==1); - len = 0; - for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; - len = sqrt(len); - for (a = 0; a < size; a++) M[a + b * size] /= len; - } - fclose(f); - for (a = 0; a < N; a++) bestd[a] = 0; - for (a = 0; a < N; a++) bestw[a][0] = 0; - a = 0; - - strcpy(st1, word0); - - - cn = 0; - b = 0; - c = 0; - while (1) { - st[cn][b] = st1[c]; - b++; - c++; - st[cn][b] = 0; - if (st1[c] == 0) break; - if (st1[c] == ' ') { - cn++; - b = 0; - c++; - } - } - cn++; - for (a = 0; a < cn; a++) { - for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; - if (b == words) b = -1; - bi[a] = b; - printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); - if (b == -1) { - printf("Out of dictionary word!\n"); - break; - } - } - - for (a = 0; a < size; a++) vec[a] = 0; - for (b = 0; b < cn; b++) { - if (bi[b] == -1) continue; - for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; - } - len = 0; - for (a = 0; a < size; a++) len += vec[a] * vec[a]; - len = sqrt(len); - for (a = 0; a < size; a++) vec[a] /= len; - for (a = 0; a < N; a++) bestd[a] = 0; - for (a = 0; a < N; a++) bestw[a][0] = 0; - for (c = 0; c < words; c++) { - a = 0; - for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; - if (a == 1) continue; - dist = 0; - for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; - for (a = 0; a < N; a++) { - if (dist > bestd[a]) { - for (d = N - 1; d > a; d--) { - bestd[d] = bestd[d - 1]; - strcpy(bestw[d], bestw[d - 1]); - } - bestd[a] = dist; - strcpy(bestw[a], &vocab[c * max_w]); - break; - } - } - } - for (a = 0; a < N; a++) { -// printf("%50s\t\t%f\n", bestw[a], bestd[a]); - returnw = strcat(returnw, " "); - returnw = strcat(returnw, bestw[a]); - returnd[a] = bestd[a]; - } - -} diff --git a/src/tmcn_distance.c b/src/tmcn_distance.c deleted file mode 100644 index db71fd7..0000000 --- a/src/tmcn_distance.c +++ /dev/null @@ -1,8 +0,0 @@ - -#include "distance.h" - -void CWrapper_distance(char **file_name, char **word, char **returnw, double *returnd) -{ - distance(*file_name, *word, *returnw, returnd); -} - diff --git a/src/tmcn_word2vec.c b/src/tmcn_word2vec.c index 34b37ae..62a206d 100644 --- a/src/tmcn_word2vec.c +++ b/src/tmcn_word2vec.c @@ -5,7 +5,7 @@ void tmcn_word2vec(char 
*train_file0, char *output_file0, char *binary0, char *dims0, char *threads, char *window0, char *classes0, char *cbow0, - char *min_count0, char *iter0) + char *min_count0, char *iter0, char *neg_samples0) { int i; layer1_size = atoll(dims0); @@ -16,9 +16,11 @@ void tmcn_word2vec(char *train_file0, char *output_file0, cbow = atoi(cbow0); min_count = atoi(min_count0); iter = atoll(iter0); + negative = atoi(neg_samples0); strcpy(train_file, train_file0); strcpy(output_file, output_file0); + alpha = 0.025; starting_alpha = alpha; word_count_actual = 0; @@ -36,8 +38,8 @@ void tmcn_word2vec(char *train_file0, char *output_file0, void CWrapper_word2vec(char **train_file, char **output_file, char **binary, char **dims, char **threads, - char **window, char **classes, char **cbow, char **min_count, char **iter) + char **window, char **classes, char **cbow, char **min_count, char **iter, char **neg_samples) { - tmcn_word2vec(*train_file, *output_file, *binary, *dims, *threads,*window,*classes,*cbow,*min_count,*iter); + tmcn_word2vec(*train_file, *output_file, *binary, *dims, *threads,*window,*classes,*cbow,*min_count,*iter, *neg_samples); } diff --git a/src/word2vec.h b/src/word2vec.h index b5e46ee..320fbb6 100644 --- a/src/word2vec.h +++ b/src/word2vec.h @@ -18,6 +18,9 @@ #include #include #include +#include "R.h" +#include "Rmath.h" + #define MAX_STRING 100 #define EXP_TABLE_SIZE 1000 @@ -50,6 +53,7 @@ int hs = 1, negative = 0; const int table_size = 1e8; int *table; + void InitUnigramTable() { int a, i; long long train_words_pow = 0; @@ -189,7 +193,7 @@ void ReduceVocab() { while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = a; } - fflush(stdout); + fflush(NULL); min_reduce++; } @@ -267,8 +271,8 @@ void LearnVocabFromTrainFile() { for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { - printf("ERROR: training data file not found!\n"); - exit(1); + Rprintf("ERROR: training data file not found!\n"); + Rf_error("Error!"); } vocab_size = 0; AddWordToVocab((char *)""); @@ -276,9 +280,9 @@ void LearnVocabFromTrainFile() { ReadWord(word, fin); if (feof(fin)) break; train_words++; - if ((debug_mode > 1) && (train_words % 100000 == 0)) { - printf("%lldK%c", train_words / 1000, 13); - fflush(stdout); + if ((debug_mode > 1) && (train_words % 100000 == 0)) { + Rprintf("%lldK%c", train_words / 1000, 13); + fflush(NULL); } i = SearchVocab(word); if (i == -1) { @@ -289,8 +293,8 @@ void LearnVocabFromTrainFile() { } SortVocab(); if (debug_mode > 0) { - printf("Vocab size: %lld\n", vocab_size); - printf("Words in train file: %lld\n", train_words); + Rprintf("Vocab size: %lld\n", vocab_size); + Rprintf("Words in train file: %lld\n", train_words); } file_size = ftell(fin); fclose(fin); @@ -309,8 +313,8 @@ void ReadVocab() { char word[MAX_STRING]; FILE *fin = fopen(read_vocab_file, "rb"); if (fin == NULL) { - printf("Vocabulary file not found\n"); - exit(1); + Rprintf("Vocabulary file not found\n"); + Rf_error("Error!"); } for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; vocab_size = 0; @@ -318,18 +322,19 @@ void ReadVocab() { ReadWord(word, fin); if (feof(fin)) break; a = AddWordToVocab(word); - if(fscanf(fin, "%lld%c", &vocab[a].cn, &c)==1); + if(fscanf(fin, "%lld%c", &vocab[a].cn, &c)==1) + ; i++; } SortVocab(); if (debug_mode > 0) { - printf("Vocab size: %lld\n", vocab_size); - printf("Words in train file: %lld\n", train_words); + Rprintf("Vocab size: %lld\n", vocab_size); + Rprintf("Words in train file: %lld\n", 
train_words); } fin = fopen(train_file, "rb"); if (fin == NULL) { - printf("ERROR: training data file not found!\n"); - exit(1); + Rprintf("ERROR: training data file not found!\n"); + Rf_error("Error!"); } fseek(fin, 0, SEEK_END); file_size = ftell(fin); @@ -346,16 +351,16 @@ void InitNet() { a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); #endif - if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} + if (syn0 == NULL) {Rprintf("Memory allocation failed\n"); Rf_error("Error!");} if (hs) { // a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); #ifdef _WIN32 syn1 = (real *)_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128); #else a = posix_memalign((void **)&(syn1), 128, (long long)vocab_size * layer1_size * sizeof(real)); -#endif +#endif - if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} + if (syn1 == NULL) {Rprintf("Memory allocation failed\n"); Rf_error("Error!");} for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) syn1[a * layer1_size + b] = 0; } @@ -366,8 +371,8 @@ void InitNet() { #else a = posix_memalign((void **)&(syn1neg), 128, (long long)vocab_size * layer1_size * sizeof(real)); #endif - -if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} + +if (syn1neg == NULL) {Rprintf("Memory allocation failed\n"); Rf_error("Error!");} for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) syn1neg[a * layer1_size + b] = 0; } @@ -383,6 +388,8 @@ void *TrainModelThread(void *id) { long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; long long l1, l2, c, target, label, local_iter = iter; unsigned long long next_random = (long long)id; + // real doneness_f, speed_f; + // For writing to R. 
real f, g; clock_t now; real *neu1 = (real *)calloc(layer1_size, sizeof(real)); @@ -392,14 +399,17 @@ void *TrainModelThread(void *id) { while (1) { if (word_count - last_word_count > 10000) { word_count_actual += word_count - last_word_count; - last_word_count = word_count; - if ((debug_mode > 1)) { - now=clock(); - printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, - word_count_actual / (real)(iter * train_words + 1) * 100, - word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); - fflush(stdout); - } + last_word_count = word_count; + /* if ((debug_mode > 1)) { */ + /* now=clock(); */ + /* doneness_f = word_count_actual / (real)(iter * train_words + 1) * 100; */ + /* speed_f = word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000); */ + + /* Rprintf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, */ + /* doneness_f, speed_f); */ + + /* fflush(NULL); */ + /* } */ alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1)); if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; } @@ -559,7 +569,7 @@ void TrainModel() { long a, b, c, d; FILE *fo; pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); - printf("Starting training using file %s\n", train_file); + Rprintf("Starting training using file %s\n", train_file); starting_alpha = alpha; if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); if (save_vocab_file[0] != 0) SaveVocab(); @@ -632,8 +642,8 @@ int ArgPos(char *str, int argc, char **argv) { int a; for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { if (a == argc - 1) { - printf("Argument missing for %s\n", str); - exit(1); + Rprintf("Argument missing for %s\n", str); + Rf_error("Error!"); } return a; } diff --git a/tests/run-all.R b/tests/run-all.R new file mode 100644 index 0000000..a20ba7a --- /dev/null +++ b/tests/run-all.R @@ -0,0 +1,2 @@ +library(testthat) +test_check("wordVectors") diff --git a/tests/testthat/test-linear-algebra-functions.R b/tests/testthat/test-linear-algebra-functions.R new file mode 100644 index 0000000..c862eab --- /dev/null +++ b/tests/testthat/test-linear-algebra-functions.R @@ -0,0 +1,28 @@ +context("VectorSpaceModel Linear Algebra is sensible") + +test_that("Vectors are near to themselves", + expect_lt( + cosineDist(demo_vectors[1,],demo_vectors[1,]), + 1e-07 + ) +) + +test_that("Distance is between 0 and 2 (pt 1)", + expect_gt( + min(cosineDist(demo_vectors,demo_vectors)), + -1e-07 + ) +) + +test_that("Distance is between 0 and 2 (pt 1)", + expect_lt( + max(cosineDist(demo_vectors,demo_vectors)), + 2 + 1e-07) + ) + + +test_that("Distance is between 0 and 2 (pt 1)", + expect_lt( + max(abs(1-square_magnitudes(normalize_lengths(demo_vectors)))), + 1e-07) +) diff --git a/tests/testthat/test-name-collapsing.r b/tests/testthat/test-name-collapsing.r new file mode 100644 index 0000000..45de6aa --- /dev/null +++ b/tests/testthat/test-name-collapsing.r @@ -0,0 +1,60 @@ +context("Name collapsing") + +test_that("name substitution works", + expect_equivalent( + demo_vectors %>% closest_to(~"good") + , + demo_vectors %>% closest_to(demo_vectors[["good"]]) + ) +) + +test_that("character substitution works", + expect_equivalent( + demo_vectors %>% closest_to("good") + , + demo_vectors %>% closest_to(demo_vectors[["good"]]) + ) +) + +test_that("addition works in substitutions", + expect_equivalent( + demo_vectors %>% closest_to(~ "good" + "bad") + , + demo_vectors %>% closest_to(demo_vectors[["good"]] + 
demo_vectors[["bad"]]) + ) +) + +test_that("addition provides correct results", + expect_gt( + demo_vectors[["good"]] %>% cosineSimilarity(demo_vectors[["good"]] + demo_vectors[["bad"]]) + , + .8)) + +test_that("single-argument negation works", + expect_equivalent( + demo_vectors %>% closest_to(~ -("good"-"bad")) + , + demo_vectors %>% closest_to(~ "bad"-"good") + + )) + +test_that("closest_to can wrap in function", + expect_equal( + {function(x) {closest_to(x,~ "class" + "school")}}(demo_vectors), + closest_to(demo_vectors,~ "class" + "school") + ) +) + +test_that("Name substitution is occurring", + expect_equivalent( + cosineSimilarity(demo_vectors,"good"), + cosineSimilarity(demo_vectors,demo_vectors[["good"]]) + )) + +test_that("reference in functional scope is passed along", + expect_equivalent( + lapply(c("good"),function(referenced_word) + {demo_vectors %>% closest_to(demo_vectors[[referenced_word]])})[[1]], + demo_vectors %>% closest_to("good") + ) +) diff --git a/tests/testthat/test-read-write.R b/tests/testthat/test-read-write.R new file mode 100644 index 0000000..071eeba --- /dev/null +++ b/tests/testthat/test-read-write.R @@ -0,0 +1,18 @@ +context("Read and Write works") + +## TODO: Add tests for non-binary format; check actual value of results; test reading of slices. + +test_that("Writing works", + expect_null( + write.binary.word2vec(demo_vectors[1:100,],"binary.bin"), + 1e-07 + ) +) + +test_that("Reading Works", + expect_s4_class( + read.binary.vectors("binary.bin"), + "VectorSpaceModel" + ) +) + diff --git a/tests/testthat/test-rejection.R b/tests/testthat/test-rejection.R new file mode 100644 index 0000000..9d95376 --- /dev/null +++ b/tests/testthat/test-rejection.R @@ -0,0 +1,12 @@ +context("Rejection Works") + +test_that("Rejection works along gender binary", + expect_gt( + { + rejected_frame <- demo_vectors %>% reject(~ "man" - "woman") + cosineDist(demo_vectors[["he"]],demo_vectors[["she"]] ) - + cosineDist(rejected_frame[["he"]],rejected_frame[["she"]] ) + }, + .4 + ) +) diff --git a/tests/testthat/test-train.R b/tests/testthat/test-train.R new file mode 100644 index 0000000..d234542 --- /dev/null +++ b/tests/testthat/test-train.R @@ -0,0 +1,49 @@ +context("Training Functions Work") + +# This fails on Travis. I'll worry about this later. +demo = "Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. + +Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. + +But, in a larger sense, we can not dedicate -- we can not consecrate -- we can not hallow -- this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. 
It is rather for us to be here dedicated to the great task remaining before us -- that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion -- that we here highly resolve that these dead shall not have died in vain -- that this nation, under God, shall have a new birth of freedom -- and that government of the people, by the people, for the people, shall not perish from the earth. +" +message("In directory ", getwd()) +cat(demo,file = "input.txt") +if (file.exists("tmp.txt")) file.remove("tmp.txt") + +test_that("Preparation produces file", + expect_equal( + prep_word2vec("input.txt","tmp.txt"), + "tmp.txt" + ) +) + +test_that("Preparation works when the output file already exists", + expect_equal( + prep_word2vec("input.txt","tmp.txt"), + "tmp.txt" + ) +) + +test_that("Tokenization is the right length", + expect_lt( + 2, + 272 - length(stringr::str_split(readr::read_file("tmp.txt"), " ")) + ) +) +if (FALSE) { +test_that("Bundling works on multiple levels", + expect_equal( + prep_word2vec("input.txt","tmp.txt",bundle_ngrams = 3), + "tmp.txt" + ) +) +} +test_that("Training Works", + expect_s4_class( + train_word2vec("tmp.txt"), + "VectorSpaceModel" + ) +) + diff --git a/tests/testthat/test-types.R b/tests/testthat/test-types.R new file mode 100644 index 0000000..7f7d7c0 --- /dev/null +++ b/tests/testthat/test-types.R @@ -0,0 +1,29 @@ +context("VectorSpaceModel Class Works") + +test_that("Class Exists", + expect_s4_class( + demo_vectors, + "VectorSpaceModel" + ) +) + +test_that("Class inherits addition", + expect_s4_class( + demo_vectors+1, + "VectorSpaceModel" + ) +) + +test_that("Class inherits slices", + expect_s4_class( + demo_vectors[1,], + "VectorSpaceModel" + ) +) + +test_that("Slices aren't dropped in dimensionality", + expect_s4_class( + demo_vectors[1,], + "matrix" + ) +) diff --git a/vignettes/exploration.Rmd b/vignettes/exploration.Rmd new file mode 100644 index 0000000..fd056b8 --- /dev/null +++ b/vignettes/exploration.Rmd @@ -0,0 +1,168 @@ +--- +title: "Word2Vec Workshop" +author: "Ben Schmidt" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Word2Vec Workshop} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +# Exploring Word2Vec models + +R is a great language for *exploratory data analysis* in particular. If you're going to use a word2vec model in a larger pipeline, it may be important (intellectually or ethically) to spend a little while understanding what kind of model of language you've learned. + +This package makes it easy to do so, both by allowing you to move word2vec models into and out of R, and by giving some syntactic sugar that lets you describe vector-space models concisely and clearly. + +Note that these functions may still be useful if you're a data analyst training word2vec models elsewhere (say, in gensim). I'm also hopeful this can be a good way of interacting with varied vector models in a workshop session. + +If you want to train your own model or need help setting up the package, read the introductory vignette. Aside from the installation, it assumes more knowledge of R than this walkthrough. + +## Why explore? + +In this vignette we're going to look at (a small portion of) a model trained on teaching evaluations. It's an interesting set, but it's also one that shows the importance of exploring vector space models before you use them. Exploration is important because: + +1.
If you're a humanist or social scientist, it can tell you something about the *sources* by letting you see how they use language. These co-occurrence patterns can then be better investigated through close reading or more traditional collocation scores, which are potentially more reliable but also much slower and less flexible. + +2. If you're an engineer, it can help you understand some of the biases built into a model that you're using in a larger pipeline. This can be both technically and ethically important: you don't want, for instance, to build a job-recommendation system which is disinclined to offer programming jobs to women because it has learned that women are underrepresented in CS jobs already. +(On this point in word2vec in particular, see [here](https://freedom-to-tinker.com/blog/randomwalker/language-necessarily-contains-human-biases-and-so-will-machines-trained-on-language-corpora/) and [here](https://arxiv.org/abs/1607.06520).) + +## Getting started. + +First we'll load this package, and the recommended package `magrittr`, which provides the pipe operator we'll use to chain operations together. + +```{r} +library(wordVectors) +library(magrittr) +``` + +The basic element of any vector space model is a *vector* for each word. In the demo data included with this package, an object called `demo_vectors`, each word is represented by 500 numbers: you can start to examine them, if you wish, by hand. So let's consider just one of these--the vector for 'good'. + +In R's ordinary matrix syntax, you could write that out laboriously as `demo_vectors[rownames(demo_vectors)=="good",]`. `wordVectors` provides a shorthand using double braces: + +```{r} +demo_vectors[["good"]] +``` + +These numbers are meaningless on their own. But in the vector space, we can find similar words. + +```{r} +demo_vectors %>% closest_to(demo_vectors[["good"]]) +``` + +The `%>%` is the pipe operator from magrittr; it helps to keep things organized, and is particularly useful with some of the things we'll see later. The 'similarity' scores here are cosine similarity in a vector space; 1.0 represents perfect similarity, 0 is no correlation, and -1.0 is complete opposition. In practice, vector "opposition" is different from the colloquial use of "opposite," and very rare. You'll only occasionally see vector scores below 0--as you can see above, "bad" is actually one of the most similar words to "good." + +When interactively exploring a single model (rather than comparing *two* models), it can be a pain to keep retyping words over and over. Rather than operate on the vectors, this package also lets you access the word directly by using R's formula notation: putting a tilde in front of it. For a single word, you can even pass the string directly, like so. + +```{r} +demo_vectors %>% closest_to("bad") +``` + +## Vector math + +The tildes are necessary syntax where things get interesting--you can do **math** on these vectors. So if we want to find the words that are closest to the *combination* of "good" and "bad" (which is to say, words that get used in evaluation) we can write (see where the tilde is?): + +```{r} + +demo_vectors %>% closest_to(~"good"+"bad") + +# The same thing could be written as: +# demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]]) +``` + +Those are words that are common to both "good" and "bad". We could also find words that are shaded towards just good but *not* bad by using subtraction. + +```{r} +demo_vectors %>% closest_to(~"good" - "bad") +``` + +> What does this "subtraction" vector mean?
+> In practice, the easiest way to think of it is probably simply as 'similar to +> "good" and dissimilar to "bad"'. Levy and Goldberg's papers suggest this interpretation. +> But taking the vectors more seriously means you can think of it geometrically: "good"-"bad" is +> a vector that describes the difference between positive and negative. +> Similarity to this vector means, technically, the portion of a word's vector +> whose multidimensional path lies largely along the direction between the two words. + +Again, you can easily switch the order to the opposite: here are a bunch of bad words: + +```{r} +demo_vectors %>% closest_to(~ "bad" - "good") +``` + +All sorts of binaries are captured in word2vec models. One of the most famous, since Mikolov's original word2vec paper, is *gender*. If you ask for similarity to "he"-"she", for example, you get words that appear mostly in a *male* context. Since these examples are from teaching evaluations, after just a few straightforwardly gendered words, we start to get things that only men are called ("arrogant") or fields where there are very few women in the university ("physics"). + +```{r} +demo_vectors %>% closest_to(~ "he" - "she") +demo_vectors %>% closest_to(~ "she" - "he") +``` + +## Analogies + +We can expand out the match to perform analogies. Men tend to be called 'guys'. +What's the female equivalent? +In an SAT-style analogy, you might write `he:guy::she:???`. +In vector math, we think of this as moving between points. + +If you're using the mental framework of positive as 'similarity' and +negative as 'dissimilarity,' you can think of this as starting at "guy", +removing its similarity to "he", and adding a similarity to "she". + +This yields the answer: the most similar term to "guy" for a woman is "lady." + +```{r} +demo_vectors %>% closest_to(~ "guy" - "he" + "she") +``` + +If you're using the other mental framework, of thinking of these as real vectors, +you might phrase this in a slightly different way. +You have a gender vector `("she" - "he")` that represents the *direction* from masculinity +to femininity. You can then add this vector to "guy", and that will take you to a new neighborhood. You might phrase that this way: note that the math is exactly equivalent, and +only the grouping is different. + +```{r} +demo_vectors %>% closest_to(~ "guy" + ("she" - "he")) +``` + +Principal components can let you plot a subset of these vectors to see how they relate. You can imagine an arrow from "he" to "she", from "guy" to "lady", and from "man" to "woman"; all run in roughly the same direction. + +```{r} + +demo_vectors[[c("lady","woman","man","he","she","guy"), average=F]] %>% + plot(method="pca") + +``` + +These lists of ten words at a time are useful for interactive exploration, but sometimes we might want to say 'n=Inf' to return the full list. For instance, we can combine these two methods to look at positive and negative words used to evaluate teachers. + +First we build up three data frames: first, a list of the 75 top evaluative words, and then complete lists of similarity to `"good" - "bad"` and `"she" - "he"`. + +```{r} +top_evaluative_words = demo_vectors %>% + closest_to(~ "good"+"bad",n=75) + +goodness = demo_vectors %>% + closest_to(~ "good"-"bad",n=Inf) + +femininity = demo_vectors %>% + closest_to(~ "she" - "he", n=Inf) +``` + +Then we can use tidyverse packages to join and plot these. +An `inner_join` restricts us down to just those top 75 words, and ggplot +can array the words on axes.
+ +```{r} +library(ggplot2) +library(dplyr) + +top_evaluative_words %>% + inner_join(goodness) %>% + inner_join(femininity) %>% + ggplot() + + geom_text(aes(x=`similarity to "she" - "he"`, + y=`similarity to "good" - "bad"`, + label=word)) +``` + diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd new file mode 100644 index 0000000..6f1185b --- /dev/null +++ b/vignettes/introduction.Rmd @@ -0,0 +1,228 @@ +--- +title: "Word2Vec introduction" +author: "Ben Schmidt" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Word2Vec introduction} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +# Intro + +This vignette walks you through training a word2vec model, and using that model to search for similarities, to build clusters, and to visualize vocabulary relationships of that model in two dimensions. If you are working with pre-trained vectors, you might want to jump straight to the "exploration" vignette; it is a little slower-paced, but doesn't show off quite so many features of the package. + +# Package installation + +If you have not installed this package, run the code below. More detailed installation instructions are at the end of the [package README](https://github.com/bmschmidt/wordVectors). + +```{r} +if (!require(wordVectors)) { + if (!(require(devtools))) { + install.packages("devtools") + } + devtools::install_github("bmschmidt/wordVectors") +} + + +``` + +# Building test data + +We begin by importing the `wordVectors` package and the `magrittr` package, because its pipe operator makes it easier to work with data. + +```{r} +library(wordVectors) +library(magrittr) +``` + +First we build up a test file to train on. +As an example, we'll use a collection of cookbooks from Michigan State University. +The zip file is downloaded from the Internet if it isn't already present. + +```{r} +if (!file.exists("cookbooks.zip")) { + download.file("http://archive.lib.msu.edu/dinfo/feedingamerica/cookbook_text.zip","cookbooks.zip") +} +unzip("cookbooks.zip",exdir="cookbooks") +``` + + +Then we *prepare* a single file for word2vec to read in. This does a couple of things: + +1. Creates a single text file with the contents of every file in the original folder; +2. Uses the `tokenizers` package to clean and lowercase the original text; +3. If `bundle_ngrams` is greater than 1, joins together common bigrams into a single word. For example, "olive oil" may be joined together into "olive_oil" wherever it occurs. + +You can also do this in another language: particularly for large files, that will be **much** faster. (For reference: in a console, `perl -ne 's/[^A-Za-z_0-9 \n]/ /g; print lc $_;' cookbooks/*.txt > cookbooks.txt` will do much the same thing on ASCII text in a couple seconds.) If you do this and want to bundle ngrams, you'll then need to call `word2phrase("cookbooks.txt","cookbook_bigrams.txt",...)` to build up the bigrams; call it twice if you want 3-grams, and so forth. + + +```{r} +if (!file.exists("cookbooks.txt")) prep_word2vec(origin="cookbooks",destination="cookbooks.txt",lowercase=T,bundle_ngrams=2) +``` + +To train a word2vec model, use the function `train_word2vec`. This actually builds up the model. It uses an on-disk file as an intermediary and then reads that file into memory.
+ +```{r} +if (!file.exists("cookbook_vectors.bin")) {model = train_word2vec("cookbooks.txt","cookbook_vectors.bin",vectors=200,threads=4,window=12,iter=5,negative_samples=0)} else model = read.vectors("cookbook_vectors.bin") + +``` + +A few notes: + +1. The `vectors` parameter is the dimensionality of the representation. More vectors usually means more precision, but also more random error and slower operations. Reasonable choices are probably in the range 100-500. +2. The `threads` parameter is the number of processors to use on your computer. On a modern laptop, the fastest results will probably be between 2 and 8 threads, depending on the number of cores. +3. `iter` is how many times to read through the corpus. With fewer than 100 books, it can greatly help to increase the number of passes; if you're working with billions of words, it probably matters less. One danger of too few iterations is that words that aren't closely related will seem to be closer than they are. +4. Training can take a while. On my laptop, it takes a few minutes to train these cookbooks; larger models take proportionally more time. Because of the importance of more iterations to reducing noise, don't be afraid to set things up to require a lot of training time (as much as a day!). +5. One of the best things about the word2vec algorithm is that it *does* work on extremely large corpora in linear time. +6. In RStudio I've noticed that this sometimes appears to hang after a while; the percentage bar stops updating. If you check system activity it actually is still running, and will complete. +7. If at any point you want to *read in* a previously trained model, you can do so by typing `model = read.vectors("cookbook_vectors.bin")`. + +Now we have a model in memory, trained on about 10 million words from 77 cookbooks. What can it tell us about food? + +## Similarity searches + +Well, you can run some basic operations to find the nearest elements: + +```{r} +model %>% closest_to("fish") +``` + +With that list, you can expand out further to search for multiple words: + +```{r} +model %>% + closest_to(model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],50) +``` + +Now we have a pretty expansive list of potential fish-related words from old cookbooks. This can be useful for a few different things: + +1. As a list of potential query terms for keyword search. +2. As a batch of words to use as a seed for some other text mining operation; for example, you could pull all paragraphs surrounding these to find ways that fish are cooked. +3. As a source for visualization. + +Or we can simply plot them to see how they arrange themselves. In this case, the PCA projection doesn't reveal much structure. + +```{r} +some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150) +fishy = model[[some_fish$word,average=F]] +plot(fishy,method="pca") +``` + +## Clustering + +We can use standard clustering algorithms, like kmeans, to find groups of terms that fit together. You can think of this as a sort of topic model, although unlike more sophisticated topic modeling algorithms like Latent Dirichlet Allocation, each word must be tied to a single topic. + +```{r} +set.seed(10) +centers = 150 +clustering = kmeans(model,centers=centers,iter.max = 40) +``` + +Here are ten random "topics" produced through this method. Each column shows the ten most frequent words in one randomly chosen cluster.
+ +```{r} +sapply(sample(1:centers,10),function(n) { + names(clustering$cluster[clustering$cluster==n][1:10]) +}) +``` + +These can be useful for figuring out, at a glance, what some of the most common clusters in your corpus are. + +Clusters need not be derived at the level of the full model. We can take, for instance, +the 20 words closest to each of four different kinds of words. + +```{r} +ingredients = c("madeira","beef","saucepan","carrots") +term_set = lapply(ingredients, + function(ingredient) { + nearest_words = model %>% closest_to(model[[ingredient]],20) + nearest_words$word + }) %>% unlist + +subset = model[[term_set,average=F]] + +subset %>% + cosineDist(subset) %>% + as.dist %>% + hclust %>% + plot + +``` + + +# Visualization + +## Relationship planes. + +One of the basic strategies you can take is to try to project the high-dimensional space here into a plane you can look at. + +For instance, we can take the words "sweet" and "salty," find the twenty words most similar to either of them, and plot those in a sweet-salty plane. + +```{r} +tastes = model[[c("sweet","salty"),average=F]] + +# model[1:3000,] here restricts to the 3000 most common words in the set. +sweet_and_saltiness = model[1:3000,] %>% cosineSimilarity(tastes) + +# Filter to the top 20 sweet or salty. +sweet_and_saltiness = sweet_and_saltiness[ + rank(-sweet_and_saltiness[,1])<20 | + rank(-sweet_and_saltiness[,2])<20, + ] + +plot(sweet_and_saltiness,type='n') +text(sweet_and_saltiness,labels=rownames(sweet_and_saltiness)) + +``` + + +There's no limit to how complicated this can get. For instance, there are really *five* tastes: sweet, salty, bitter, sour, and savory. (Savory is usually called 'umami' nowadays, but that word will not appear in historic cookbooks.) + +Rather than use a base matrix of the whole set, we can shrink down to just five dimensions: how similar every word in our set is to each of these five. (I'm using cosine similarity here, so the closer a number is to one, the more similar it is.) + +```{r} + +tastes = model[[c("sweet","salty","savory","bitter","sour"),average=F]] + +# model[1:3000,] here restricts to the 3000 most common words in the set. +common_similarities_tastes = model[1:3000,] %>% cosineSimilarity(tastes) + +common_similarities_tastes[20:30,] +``` + +Now we can filter down to the roughly 75 words that are closest to *any* of these (that's what the apply-max function below does), and +use a PCA biplot to look at just those words in a flavor plane. + +```{r} +high_similarities_to_tastes = common_similarities_tastes[rank(-apply(common_similarities_tastes,1,max)) < 75,] + +high_similarities_to_tastes %>% + prcomp %>% + biplot(main="Seventy-five words in a\nprojection of flavor space") +``` + +This tells us a few things. One is that (in some runs of the model, at least--there is some random chance built in here) "sweet" and "sour" are closely aligned. Is this a unique feature of American cooking? A relationship that changes over time? These would require more investigation. + +Second is that "savory" really is an active category in these cookbooks, even without the precision of 'umami' as a word to express it. Anchovy, the flavor most closely associated with savoriness, shows up as fairly characteristic of the flavor, along with a variety of herbs. + +Finally, words characteristic of meals seem to show up in the upper regions of the plot.
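+
+If you want a quick numeric companion to the biplot, one option is to tag each of those high-similarity words with the single taste it is most similar to. The sketch below is only a suggestion: it reuses the `high_similarities_to_tastes` matrix computed above, assumes its rows keep their word names, and will return somewhat different words on every run of the model.
+
+```{r}
+# A possible follow-up, a sketch rather than a recipe: label each high-similarity
+# word with the taste it is most similar to.
+taste_names = c("sweet","salty","savory","bitter","sour") # same order as the rows of `tastes`
+
+# which.max picks, for each word (row), the taste column with the largest cosine similarity.
+dominant_taste = taste_names[apply(high_similarities_to_tastes, 1, which.max)]
+names(dominant_taste) = rownames(high_similarities_to_tastes)
+
+# How many of the words fall under each taste, and a few examples of each.
+table(dominant_taste)
+split(names(dominant_taste), dominant_taste) %>% lapply(head, 5)
+```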
+ +# Catchall reduction: TSNE + +Last but not least, there is a catchall method built into the package +to visualize a single, reasonably informative two-dimensional plane for viewing the whole model: t-SNE dimensionality reduction. + +Just calling "plot" will display the equivalent of a word cloud with individual tokens grouped relatively close to each other based on their proximity in the higher dimensional space. + +"Perplexity" is roughly the effective number of neighbors considered for each word. By default it's 50; smaller numbers may cause clusters to appear more dramatically at the cost of overall coherence. + +```{r} +plot(model,perplexity=50) +``` + +A few notes on this method: + +1. If you don't get local clusters, it is not working. You might need to reduce the perplexity so that clusters are smaller; or you might not have good local similarities. +2. If you're plotting only a small set of words, you're better off trying to plot a `VectorSpaceModel` with `method="pca"`, which locates the points using principal components analysis, as sketched below.
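+
+This sketch assumes the `model` object trained above is still in memory; the particular words are arbitrary -- any small, hand-picked list would do.
+
+```{r}
+# A small, hand-picked set of words is usually more legible with PCA than with t-SNE.
+small_set = model[[c("fish","salmon","butter","flour","sugar","salt"), average=F]]
+plot(small_set, method="pca")
+```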