From 4fbd94cd81246f9325f44029ef1e9b24f4aad791 Mon Sep 17 00:00:00 2001 From: Aron Culotta Date: Tue, 12 Apr 2016 16:39:45 -0500 Subject: [PATCH] l22 after class --- lectures/lec22/README.md | 5 + lectures/lec22/WordRep.ipynb | 1546 ++++++++++++++++++++++++++++++++++ 2 files changed, 1551 insertions(+) create mode 100644 lectures/lec22/README.md create mode 100644 lectures/lec22/WordRep.ipynb diff --git a/lectures/lec22/README.md b/lectures/lec22/README.md new file mode 100644 index 0000000..b29882a --- /dev/null +++ b/lectures/lec22/README.md @@ -0,0 +1,5 @@ +See Stanford slides as well: + +http://en.wikipedia.org/wiki/Cosine_similarity#Properties + + diff --git a/lectures/lec22/WordRep.ipynb b/lectures/lec22/WordRep.ipynb new file mode 100644 index 0000000..ed02369 --- /dev/null +++ b/lectures/lec22/WordRep.ipynb @@ -0,0 +1,1546 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# Representing Words" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Get some text data (message board posts).\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "raw_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42,\n", + " remove=('headers', 'footers', 'quotes'))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "read 11314 documents\n", + ">>>for example:\n", + "I was wondering if anyone out there could enlighten me on this car I saw\n", + "the other day. It was a 2-door sports car, looked to be from the late 60s/\n", + "early 70s. It was called a Bricklin. The doors were really small. In addition,\n", + "the front bumper was separate from the rest of the body. This is \n", + "all I know. 
If anyone can tellme a model name, engine specs, years\n", + "of production, where this car is made, history, or whatever info you\n", + "have on this funky looking car, please e-mail.\n", + "\n", + ">>>for example:\n", + "Anybody seen mouse cursor distortion running the Diamond 1024x768x256 driver?\n", + "Sorry, don't know the version of the driver (no indication in the menus) but it's a recently\n", + "delivered Gateway system. Am going to try the latest drivers from Diamond BBS but wondered\n", + "if anyone else had seen this.\n", + "\n", + "post or email\n" + ] + } + ], + "source": [ + "print('read %d documents' % len(raw_data.data))\n", + "print('>>>for example:\\n%s\\n' % raw_data.data[0])\n", + "print('>>>for example:\\n%s' % raw_data.data[1000])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tokenized documents; for example:\n", + "['i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', 'it', 'was', 'a', '2', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', '60s', 'early', '70s', 'it', 'was', 'called', 'a', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'i', 'know', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'e', 'mail']\n" + ] + } + ], + "source": [ + "import re\n", + "tokens = [re.findall('\\w+', s.lower()) for s in raw_data.data]\n", + "# sample 5k docs\n", + "# tokens = tokens[:5000]\n", + "print('tokenized documents; for example:\\n%s' % tokens[0])" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Cluster words by their context" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from collections import Counter, defaultdict\n", + "import numpy as np\n", + "\n", + "# dict from term to context vector.\n", + "contexts = defaultdict(lambda: Counter())\n", + "window = 2\n", + "for toks in tokens:\n", + " for i, token in enumerate(toks):\n", + " features = []\n", + " for j in range(np.amax([0, i-window]), i):\n", + " features.append(toks[j] + \"@\" + str(j-i))\n", + " for j in range(i+1, min(i + window, len(toks))):\n", + " features.append(toks[j] + \"@\" + str(j-i))\n", + " contexts[token].update(features)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "contexts = dict((k,v) for k, v in contexts.items() if sum(v.values()) > 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('have@1', 1133), ('m@1', 1078), ('am@1', 809), ('the@-2', 730), ('think@1', 713), ('don@1', 700), ('and@-1', 686), ('but@-1', 647), ('ve@1', 591), ('can@1', 583)]\n" + ] + } + ], + "source": [ + "print(contexts['i'].most_common(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('the@-1', 87), ('a@-1', 43), ('my@-1', 31), ('a@-2', 28), ('of@-2', 28), ('and@1', 25), ('i@1', 22), ('in@-2', 17), ('is@1', 14), ('this@-1', 14)]\n" + ] + } + ], + "source": [ + "print(contexts['car'].most_common(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('control@1', 87), ('a@-1', 
49), ('the@-1', 39), ('the@-2', 29), ('anti@-1', 23), ('of@-1', 22), ('of@-2', 21), ('a@-2', 20), ('on@-1', 12), ('pro@-1', 11)]\n" + ] + } + ], + "source": [ + "print(contexts['gun'].most_common(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('the@-1', 6502.0),\n", + " ('and@1', 5893.0),\n", + " ('the@-2', 5837.0),\n", + " ('of@-2', 4819.0),\n", + " ('the@1', 4634.0)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compute the number of different contexts each term appears in.\n", + "doc_freq = Counter()\n", + "for context in contexts.values():\n", + " for term in context:\n", + " doc_freq[term] += 1.\n", + "doc_freq.most_common(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "16920" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(contexts)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "to_remove = set([t for t, v in doc_freq.items() if v < 10])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for w, context in contexts.items():\n", + " d = dict()\n", + " for k, v in context.items():\n", + " if k not in to_remove:\n", + " d[k] = v\n", + " contexts[w] = Counter(d)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('have@1', 0.33676022127839716),\n", + " ('ve@1', 0.33081880741285874),\n", + " ('am@1', 0.3245826170925442),\n", + " ('m@1', 0.31135022907131166),\n", + " ('think@1', 0.2871391254258928)]" + ] + }, + "execution_count": 13, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Transform each context vector to be term freq / (1 + log(document frequency)). \n", + "# Also then normalize by length.\n", + "import math\n", + "for term, context in contexts.items():\n", + " for term2, frequency in context.items():\n", + " context[term2] = frequency / (1. + math.log(doc_freq[term2]))\n", + " length = math.sqrt(sum([v*v for v in context.values()]))\n", + " for term2, frequency in context.items():\n", + " context[term2] = 1. * frequency / length\n", + "\n", + "contexts['i'].most_common(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('control@1', 0.7649046244521146),\n", + " ('a@-1', 0.2748044135561928),\n", + " ('anti@-1', 0.24620735646234973),\n", + " ('the@-1', 0.2070426831444815),\n", + " ('the@-2', 0.15567221355940442),\n", + " ('owners@1', 0.13182927982147039),\n", + " ('of@-1', 0.12396703634205102),\n", + " ('of@-2', 0.11500702355700627),\n", + " ('pro@-1', 0.11447448925177982),\n", + " ('ownership@1', 0.11421855663134342)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "contexts['gun'].most_common(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Make a sparse matrix.\n", + "from sklearn.feature_extraction import DictVectorizer\n", + "vec = DictVectorizer()\n", + "X = vec.fit_transform(contexts.values())\n", + "features = np.array(vec.get_feature_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# X is now a sparse matrix where each row is a term and each column is a context feature" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": {
+ "text/plain": [ + "(16920, 19786)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "<1x19786 sparse matrix of type ''\n", + "\twith 427 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "terms = contexts.keys()\n", + "term2id = dict((t, i) for i, t in enumerate(terms))\n", + "X[term2id['gun']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import scale\n", + "# SVD of X\n", + "# Too big!\n", + "# U, s, Vh = np.linalg.svd(X.toarray(), full_matrices=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "help(np.linalg.svd)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## word2vec" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on module gensim.models.word2vec in gensim.models:\n", + "\n", + "NAME\n", + " gensim.models.word2vec\n", + "\n", + "DESCRIPTION\n", + " Deep learning via word2vec's \"skip-gram and CBOW models\", using either\n", + " hierarchical softmax or negative sampling [1]_ [2]_.\n", + " \n", + " The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/\n", + " and extended with additional functionality.\n", + " \n", + " For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, visit http://radimrehurek.com/2014/02/word2vec-tutorial/\n", + " \n", + " 
**Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training**\n", + " (70x speedup compared to plain NumPy implementation [3]_).\n", + " \n", + " Initialize a model with e.g.::\n", + " \n", + " >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)\n", + " \n", + " Persist a model to disk with::\n", + " \n", + " >>> model.save(fname)\n", + " >>> model = Word2Vec.load(fname) # you can continue training with the loaded model!\n", + " \n", + " The model can also be instantiated from an existing file on disk in the word2vec C format::\n", + " \n", + " >>> model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format\n", + " >>> model = Word2Vec.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format\n", + " \n", + " You can perform various syntactic/semantic NLP word tasks with the model. Some of them\n", + " are already built-in::\n", + " \n", + " >>> model.most_similar(positive=['woman', 'king'], negative=['man'])\n", + " [('queen', 0.50882536), ...]\n", + " \n", + " >>> model.doesnt_match(\"breakfast cereal dinner lunch\".split())\n", + " 'cereal'\n", + " \n", + " >>> model.similarity('woman', 'man')\n", + " 0.73723527\n", + " \n", + " >>> model['computer'] # raw numpy vector of a word\n", + " array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32)\n", + " \n", + " and so on.\n", + " \n", + " If you're finished training a model (=no more updates, only querying), you can do\n", + " \n", + " >>> model.init_sims(replace=True)\n", + " \n", + " to trim unneeded model memory = use (much) less RAM.\n", + " \n", + " Note that there is a :mod:`gensim.models.phrases` module which lets you automatically\n", + " detect phrases longer than one word. 
Using phrases, you can learn a word2vec model\n", + " where \"words\" are actually multiword expressions, such as `new_york_times` or `financial_crisis`:\n", + " \n", + " >>> bigram_transformer = gensim.models.Phrases(sentences)\n", + " >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...)\n", + " \n", + " .. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013.\n", + " .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality.\n", + " In Proceedings of NIPS, 2013.\n", + " .. [3] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/\n", + "\n", + "CLASSES\n", + " builtins.object\n", + " BrownCorpus\n", + " LineSentence\n", + " Text8Corpus\n", + " Vocab\n", + " gensim.utils.SaveLoad(builtins.object)\n", + " Word2Vec\n", + " \n", + " class BrownCorpus(builtins.object)\n", + " | Iterate over sentences from the Brown corpus (part of NLTK data).\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(self, dirname)\n", + " | Initialize self. See help(type(self)) for accurate signature.\n", + " | \n", + " | __iter__(self)\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | __dict__\n", + " | dictionary for instance variables (if defined)\n", + " | \n", + " | __weakref__\n", + " | list of weak references to the object (if defined)\n", + " \n", + " class LineSentence(builtins.object)\n", + " | Simple format: one sentence = one line; words already preprocessed and separated by whitespace.\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(self, source, max_sentence_length=10000, limit=None)\n", + " | `source` can be either a string or a file object. 
Clip the file to the first\n", + " | `limit` lines (or no clipped if limit is None, the default).\n", + " | \n", + " | Example::\n", + " | \n", + " | sentences = LineSentence('myfile.txt')\n", + " | \n", + " | Or for compressed files::\n", + " | \n", + " | sentences = LineSentence('compressed_text.txt.bz2')\n", + " | sentences = LineSentence('compressed_text.txt.gz')\n", + " | \n", + " | __iter__(self)\n", + " | Iterate through the lines in the source.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | __dict__\n", + " | dictionary for instance variables (if defined)\n", + " | \n", + " | __weakref__\n", + " | list of weak references to the object (if defined)\n", + " \n", + " class Text8Corpus(builtins.object)\n", + " | Iterate over sentences from the \"text8\" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(self, fname, max_sentence_length=10000)\n", + " | Initialize self. See help(type(self)) for accurate signature.\n", + " | \n", + " | __iter__(self)\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | __dict__\n", + " | dictionary for instance variables (if defined)\n", + " | \n", + " | __weakref__\n", + " | list of weak references to the object (if defined)\n", + " \n", + " class Vocab(builtins.object)\n", + " | A single vocabulary item, used internally for collecting per-word frequency/sampling info,\n", + " | and for constructing binary trees (incl. both word leaves and inner nodes).\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __init__(self, **kwargs)\n", + " | Initialize self. 
See help(type(self)) for accurate signature.\n", + " | \n", + " | __lt__(self, other)\n", + " | Return self>> trained_model['office']\n", + " | array([ -1.40128313e-02, ...])\n", + " | \n", + " | >>> trained_model[['office', 'products']]\n", + " | array([ -1.40128313e-02, ...]\n", + " | [ -1.70425311e-03, ...]\n", + " | ...)\n", + " | \n", + " | __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000)\n", + " | Initialize the model from an iterable of `sentences`. Each sentence is a\n", + " | list of words (unicode strings) that will be used for training.\n", + " | \n", + " | The `sentences` iterable can be simply a list, but for larger corpora,\n", + " | consider an iterable that streams the sentences directly from disk/network.\n", + " | See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in\n", + " | this module for such examples.\n", + " | \n", + " | If you don't supply `sentences`, the model is left uninitialized -- use if\n", + " | you plan to initialize it in some other way.\n", + " | \n", + " | `sg` defines the training algorithm. By default (`sg=0`), CBOW is used.\n", + " | Otherwise (`sg=1`), skip-gram is employed.\n", + " | \n", + " | `size` is the dimensionality of the feature vectors.\n", + " | \n", + " | `window` is the maximum distance between the current and predicted word within a sentence.\n", + " | \n", + " | `alpha` is the initial learning rate (will linearly drop to zero as training progresses).\n", + " | \n", + " | `seed` = for the random number generator. 
Initial vectors for each\n", + " | word are seeded with a hash of the concatenation of word + str(seed).\n", + " | \n", + " | `min_count` = ignore all words with total frequency lower than this.\n", + " | \n", + " | `max_vocab_size` = limit RAM during vocabulary building; if there are more unique\n", + " | words than this, then prune the infrequent ones. Every 10 million word types\n", + " | need about 1GB of RAM. Set to `None` for no limit (default).\n", + " | \n", + " | `sample` = threshold for configuring which higher-frequency words are randomly downsampled;\n", + " | default is 1e-3, useful range is (0, 1e-5).\n", + " | \n", + " | `workers` = use this many worker threads to train the model (=faster training with multicore machines).\n", + " | \n", + " | `hs` = if 1, hierarchical softmax will be used for model training.\n", + " | If set to 0 (default), and `negative` is non-zero, negative sampling will be used.\n", + " | \n", + " | `negative` = if > 0, negative sampling will be used, the int for negative\n", + " | specifies how many \"noise words\" should be drawn (usually between 5-20).\n", + " | Default is 5. If set to 0, no negative samping is used.\n", + " | \n", + " | `cbow_mean` = if 0, use the sum of the context word vectors. If 1 (default), use the mean.\n", + " | Only applies when cbow is used.\n", + " | \n", + " | `hashfxn` = hash function to use to randomly initialize weights, for increased\n", + " | training reproducibility. 
Default is Python's rudimentary built in hash function.\n", + " | \n", + " | `iter` = number of iterations (epochs) over the corpus.\n", + " | \n", + " | `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain\n", + " | in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count).\n", + " | Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and\n", + " | returns either util.RULE_DISCARD, util.RULE_KEEP or util.RULE_DEFAULT.\n", + " | Note: The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part\n", + " | of the model.\n", + " | \n", + " | `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before\n", + " | assigning word indexes.\n", + " | \n", + " | `batch_words` = target size (in words) for batches of examples passed to worker threads (and\n", + " | thus cython routines). Default is 10000. (Larger batches can be passed if individual\n", + " | texts are longer, but the cython code may truncate.)\n", + " | \n", + " | __str__(self)\n", + " | Return str(self).\n", + " | \n", + " | accuracy(self, questions, restrict_vocab=30000, most_similar=)\n", + " | Compute accuracy of the model. 
`questions` is a filename where lines are\n", + " | 4-tuples of words, split into sections by \": SECTION NAME\" lines.\n", + " | See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.\n", + " | \n", + " | The accuracy is reported (=printed to log and returned as a list) for each\n", + " | section separately, plus there's one aggregate summary at the end.\n", + " | \n", + " | Use `restrict_vocab` to ignore all questions containing a word whose frequency\n", + " | is not in the top-N most frequent words (default top 30,000).\n", + " | \n", + " | This method corresponds to the `compute-accuracy` script of the original C word2vec.\n", + " | \n", + " | build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None)\n", + " | Build vocabulary from a sequence of sentences (can be a once-only generator stream).\n", + " | Each sentence must be a list of unicode strings.\n", + " | \n", + " | clear_sims(self)\n", + " | \n", + " | create_binary_tree(self)\n", + " | Create a binary Huffman tree using stored vocabulary word counts. Frequent words\n", + " | will have shorter binary codes. 
Called internally from `build_vocab()`.\n", + " | \n", + " | doesnt_match(self, words)\n", + " | Which word from the given list doesn't go with the others?\n", + " | \n", + " | Example::\n", + " | \n", + " | >>> trained_model.doesnt_match(\"breakfast cereal dinner lunch\".split())\n", + " | 'cereal'\n", + " | \n", + " | estimate_memory(self, vocab_size=None, report=None)\n", + " | Estimate required memory for a model using current settings and provided vocabulary size.\n", + " | \n", + " | finalize_vocab(self)\n", + " | Build tables and model weights based on final vocabulary settings.\n", + " | \n", + " | init_sims(self, replace=False)\n", + " | Precompute L2-normalized vectors.\n", + " | \n", + " | If `replace` is set, forget the original vectors and only keep the normalized\n", + " | ones = saves lots of memory!\n", + " | \n", + " | Note that you **cannot continue training** after doing a replace. The model becomes\n", + " | effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`.\n", + " | \n", + " | intersect_word2vec_format(self, fname, binary=False, encoding='utf8', unicode_errors='strict')\n", + " | Merge the input-hidden weight matrix from the original C word2vec-tool format\n", + " | given, where it intersects with the current vocabulary. 
(No words are added to the\n", + " | existing vocabulary, but intersecting words adopt the file's weights, and\n", + " | non-intersecting words are left alone.)\n", + " | \n", + " | `binary` is a boolean indicating whether the data is in binary word2vec format.\n", + " | \n", + " | make_cum_table(self, power=0.75, domain=2147483647)\n", + " | Create a cumulative-distribution table using stored vocabulary word counts for\n", + " | drawing random words in the negative-sampling training routines.\n", + " | \n", + " | To draw a word index, choose a random integer up to the maximum value in the\n", + " | table (cum_table[-1]), then finding that integer's sorted insertion point\n", + " | (as if by bisect_left or ndarray.searchsorted()). That insertion point is the\n", + " | drawn index, coming up in proportion equal to the increment at that slot.\n", + " | \n", + " | Called internally from 'build_vocab()'.\n", + " | \n", + " | most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None)\n", + " | Find the top-N most similar words. Positive words contribute positively towards the\n", + " | similarity, negative words negatively.\n", + " | \n", + " | This method computes cosine similarity between a simple mean of the projection\n", + " | weight vectors of the given words and the vectors for each word in the model.\n", + " | The method corresponds to the `word-analogy` and `distance` scripts in the original\n", + " | word2vec implementation.\n", + " | \n", + " | If topn is False, most_similar returns the vector of similarity scores.\n", + " | \n", + " | `restrict_vocab` is an optional integer which limits the range of vectors which\n", + " | are searched for most-similar values. For example, restrict_vocab=10000 would\n", + " | only check the first 10000 word vectors in the vocabulary order. 
(This may be\n", + " | meaningful if you've sorted the vocabulary by descending frequency.)\n", + " | \n", + " | Example::\n", + " | \n", + " | >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])\n", + " | [('queen', 0.50882536), ...]\n", + " | \n", + " | most_similar_cosmul(self, positive=[], negative=[], topn=10)\n", + " | Find the top-N most similar words, using the multiplicative combination objective\n", + " | proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute\n", + " | positively towards the similarity, negative words negatively, but with less\n", + " | susceptibility to one large distance dominating the calculation.\n", + " | \n", + " | In the common analogy-solving case, of two positive and one negative examples,\n", + " | this method is equivalent to the \"3CosMul\" objective (equation (4)) of Levy and Goldberg.\n", + " | \n", + " | Additional positive or negative examples contribute to the numerator or denominator,\n", + " | respectively – a potentially sensible but untested extension of the method. (With\n", + " | a single positive example, rankings will be the same as in the default most_similar.)\n", + " | \n", + " | Example::\n", + " | \n", + " | >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])\n", + " | [(u'iraq', 0.8488819003105164), ...]\n", + " | \n", + " | .. [4] Omer Levy and Yoav Goldberg. 
Linguistic Regularities in Sparse and Explicit Word Representations, 2014.\n", + " | \n", + " | n_similarity(self, ws1, ws2)\n", + " | Compute cosine similarity between two sets of words.\n", + " | \n", + " | Example::\n", + " | \n", + " | >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])\n", + " | 0.61540466561049689\n", + " | \n", + " | >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])\n", + " | 1.0000000000000004\n", + " | \n", + " | >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')\n", + " | True\n", + " | \n", + " | reset_from(self, other_model)\n", + " | Borrow shareable pre-built structures (like vocab) from the other_model. Useful\n", + " | if testing multiple models in parallel on the same corpus.\n", + " | \n", + " | reset_weights(self)\n", + " | Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.\n", + " | \n", + " | save(self, *args, **kwargs)\n", + " | Save the object to file (also see `load`).\n", + " | \n", + " | `fname_or_handle` is either a string specifying the file name to\n", + " | save to, or an open file-like object which can be written to. If\n", + " | the object is a file handle, no special array handling will be\n", + " | performed; all attributes will be saved to the same file.\n", + " | \n", + " | If `separately` is None, automatically detect large\n", + " | numpy/scipy.sparse arrays in the object being stored, and store\n", + " | them into separate files. This avoids pickle memory errors and\n", + " | allows mmap'ing large arrays back on load efficiently.\n", + " | \n", + " | You can also set `separately` manually, in which case it must be\n", + " | a list of attribute names to be stored in separate files. 
The\n", + " | automatic check is not performed in this case.\n", + " | \n", + " | `ignore` is a set of attribute names to *not* serialize (file\n", + " | handles, caches etc). On subsequent load() these attributes will\n", + " | be set to None.\n", + " | \n", + " | `pickle_protocol` defaults to 2 so the pickled object can be imported\n", + " | in both Python 2 and 3.\n", + " | \n", + " | save_word2vec_format(self, fname, fvocab=None, binary=False)\n", + " | Store the input-hidden weight matrix in the same format used by the original\n", + " | C word2vec-tool, for compatibility.\n", + " | \n", + " | scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None)\n", + " | Apply vocabulary settings for `min_count` (discarding less-frequent words)\n", + " | and `sample` (controlling the downsampling of more-frequent words).\n", + " | \n", + " | Calling with `dry_run=True` will only simulate the provided settings and\n", + " | report the size of the retained vocabulary, effective corpus length, and\n", + " | estimated memory requirements. 
Results are both printed via logging and\n", + " | returned as a dict.\n", + " | \n", + " | Delete the raw vocabulary after the scaling is done to free up RAM,\n", + " | unless `keep_raw_vocab` is set.\n", + " | \n", + " | scan_vocab(self, sentences, progress_per=10000, trim_rule=None)\n", + " | Do an initial scan of all words appearing in sentences.\n", + " | \n", + " | score(self, sentences, total_sentences=1000000, chunksize=100, queue_factor=2, report_delay=1)\n", + " | Score the log probability for a sequence of sentences (can be a once-only generator stream).\n", + " | Each sentence must be a list of unicode strings.\n", + " | This does not change the fitted model in any way (see Word2Vec.train() for that)\n", + " | \n", + " | Note that you should specify total_sentences; we'll run into problems if you ask to\n", + " | score more than this number of sentences but it is inefficient to set the value too high.\n", + " | \n", + " | See the article by [taddy]_ and the gensim demo at [deepir]_ for examples of how to use such scores in document classification.\n", + " | \n", + " | .. [taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.\n", + " | .. 
[deepir] https://github.com/TaddyLab/gensim/blob/deepir/docs/notebooks/deepir.ipynb\n", + " | \n", + " | seeded_vector(self, seed_string)\n", + " | Create one 'random' vector (but deterministic by seed_string)\n", + " | \n", + " | similarity(self, w1, w2)\n", + " | Compute cosine similarity between two words.\n", + " | \n", + " | Example::\n", + " | \n", + " | >>> trained_model.similarity('woman', 'man')\n", + " | 0.73723527\n", + " | \n", + " | >>> trained_model.similarity('woman', 'woman')\n", + " | 1.0\n", + " | \n", + " | sort_vocab(self)\n", + " | Sort the vocabulary so the most frequent words have the lowest indexes.\n", + " | \n", + " | train(self, sentences, total_words=None, word_count=0, total_examples=None, queue_factor=2, report_delay=1.0)\n", + " | Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).\n", + " | For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)\n", + " | \n", + " | To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples\n", + " | (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the\n", + " | sentences are the same as those that were used to initially build the vocabulary.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Class methods defined here:\n", + " | \n", + " | load(*args, **kwargs) from builtins.type\n", + " | Load a previously saved object from file (also see `save`).\n", + " | \n", + " | If the object was saved with large arrays stored separately, you can load\n", + " | these arrays via mmap (shared memory) using `mmap='r'`. Default: don't use\n", + " | mmap, load large arrays as normal objects.\n", + " | \n", + " | If the file being loaded is compressed (either '.gz' or '.bz2'), then\n", + " | `mmap=None` must be set. 
Load will raise an `IOError` if this condition\n", + " | is encountered.\n", + " | \n", + " | load_word2vec_format(fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict') from builtins.type\n", + " | Load the input-hidden weight matrix from the original C word2vec-tool format.\n", + " | \n", + " | Note that the information stored in the file is incomplete (the binary tree is missing),\n", + " | so while you can query for word similarity etc., you cannot continue training\n", + " | with a model loaded this way.\n", + " | \n", + " | `binary` is a boolean indicating whether the data is in binary word2vec format.\n", + " | `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.\n", + " | Word counts are read from `fvocab` filename, if set (this is the file generated\n", + " | by `-save-vocab` flag of the original C tool).\n", + " | \n", + " | If you trained the C model using non-utf8 encoding for words, specify that\n", + " | encoding in `encoding`.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Static methods defined here:\n", + " | \n", + " | log_accuracy(section)\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors inherited from gensim.utils.SaveLoad:\n", + " | \n", + " | __dict__\n", + " | dictionary for instance variables (if defined)\n", + " | \n", + " | __weakref__\n", + " | list of weak references to the object (if defined)\n", + "\n", + "FUNCTIONS\n", + " array(...)\n", + " array(object, dtype=None, copy=True, order=None, subok=False, ndmin=0)\n", + " \n", + " Create an array.\n", + " \n", + " Parameters\n", + " ----------\n", + " object : array_like\n", + " An array, any object exposing the array interface, an\n", + " object whose __array__ method returns an array, or any\n", + " (nested) sequence.\n", + " dtype : data-type, optional\n", + " The desired data-type for the array. 
If not given, then\n", + " the type will be determined as the minimum type required\n", + " to hold the objects in the sequence. This argument can only\n", + " be used to 'upcast' the array. For downcasting, use the\n", + " .astype(t) method.\n", + " copy : bool, optional\n", + " If true (default), then the object is copied. Otherwise, a copy\n", + " will only be made if __array__ returns a copy, if obj is a\n", + " nested sequence, or if a copy is needed to satisfy any of the other\n", + " requirements (`dtype`, `order`, etc.).\n", + " order : {'C', 'F', 'A'}, optional\n", + " Specify the order of the array. If order is 'C', then the array\n", + " will be in C-contiguous order (last-index varies the fastest).\n", + " If order is 'F', then the returned array will be in\n", + " Fortran-contiguous order (first-index varies the fastest).\n", + " If order is 'A' (default), then the returned array may be\n", + " in any order (either C-, Fortran-contiguous, or even discontiguous),\n", + " unless a copy is required, in which case it will be C-contiguous.\n", + " subok : bool, optional\n", + " If True, then sub-classes will be passed-through, otherwise\n", + " the returned array will be forced to be a base-class array (default).\n", + " ndmin : int, optional\n", + " Specifies the minimum number of dimensions that the resulting\n", + " array should have. 
Ones will be pre-pended to the shape as\n", + " needed to meet this requirement.\n", + " \n", + " Returns\n", + " -------\n", + " out : ndarray\n", + " An array object satisfying the specified requirements.\n", + " \n", + " See Also\n", + " --------\n", + " empty, empty_like, zeros, zeros_like, ones, ones_like, fill\n", + " \n", + " Examples\n", + " --------\n", + " >>> np.array([1, 2, 3])\n", + " array([1, 2, 3])\n", + " \n", + " Upcasting:\n", + " \n", + " >>> np.array([1, 2, 3.0])\n", + " array([ 1., 2., 3.])\n", + " \n", + " More than one dimension:\n", + " \n", + " >>> np.array([[1, 2], [3, 4]])\n", + " array([[1, 2],\n", + " [3, 4]])\n", + " \n", + " Minimum dimensions 2:\n", + " \n", + " >>> np.array([1, 2, 3], ndmin=2)\n", + " array([[1, 2, 3]])\n", + " \n", + " Type provided:\n", + " \n", + " >>> np.array([1, 2, 3], dtype=complex)\n", + " array([ 1.+0.j, 2.+0.j, 3.+0.j])\n", + " \n", + " Data-type consisting of more than one element:\n", + " \n", + " >>> x = np.array([(1,2),(3,4)],dtype=[('a','<i4'),('b','<i4')])\n", + " >>> x['a']\n", + " array([1, 3])\n", + " \n", + " Creating an array from sub-classes:\n", + " \n", + " >>> np.array(np.mat('1 2; 3 4'))\n", + " array([[1, 2],\n", + " [3, 4]])\n", + " \n", + " >>> np.array(np.mat('1 2; 3 4'), subok=True)\n", + " matrix([[1, 2],\n", + " [3, 4]])\n", + " \n", + " default_timer = perf_counter(...)\n", + " perf_counter() -> float\n", + " \n", + " Performance counter for benchmarking.\n", + " \n", + " dot(...)\n", + " dot(a, b, out=None)\n", + " \n", + " Dot product of two arrays.\n", + " \n", + " For 2-D arrays it is equivalent to matrix multiplication, and for 1-D\n", + " arrays to inner product of vectors (without complex conjugation). 
For\n", + " N dimensions it is a sum product over the last axis of `a` and\n", + " the second-to-last of `b`::\n", + " \n", + " dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m])\n", + " \n", + " Parameters\n", + " ----------\n", + " a : array_like\n", + " First argument.\n", + " b : array_like\n", + " Second argument.\n", + " out : ndarray, optional\n", + " Output argument. This must have the exact kind that would be returned\n", + " if it was not used. In particular, it must have the right type, must be\n", + " C-contiguous, and its dtype must be the dtype that would be returned\n", + " for `dot(a,b)`. This is a performance feature. Therefore, if these\n", + " conditions are not met, an exception is raised, instead of attempting\n", + " to be flexible.\n", + " \n", + " Returns\n", + " -------\n", + " output : ndarray\n", + " Returns the dot product of `a` and `b`. If `a` and `b` are both\n", + " scalars or both 1-D arrays then a scalar is returned; otherwise\n", + " an array is returned.\n", + " If `out` is given, then it is returned.\n", + " \n", + " Raises\n", + " ------\n", + " ValueError\n", + " If the last dimension of `a` is not the same size as\n", + " the second-to-last dimension of `b`.\n", + " \n", + " See Also\n", + " --------\n", + " vdot : Complex-conjugating dot product.\n", + " tensordot : Sum products over arbitrary axes.\n", + " einsum : Einstein summation convention.\n", + " matmul : '@' operator as method with out parameter.\n", + " \n", + " Examples\n", + " --------\n", + " >>> np.dot(3, 4)\n", + " 12\n", + " \n", + " Neither argument is complex-conjugated:\n", + " \n", + " >>> np.dot([2j, 3j], [2j, 3j])\n", + " (-13+0j)\n", + " \n", + " For 2-D arrays it is the matrix product:\n", + " \n", + " >>> a = [[1, 0], [0, 1]]\n", + " >>> b = [[4, 1], [2, 2]]\n", + " >>> np.dot(a, b)\n", + " array([[4, 1],\n", + " [2, 2]])\n", + " \n", + " >>> a = np.arange(3*4*5*6).reshape((3,4,5,6))\n", + " >>> b = np.arange(3*4*5*6)[::-1].reshape((5,4,6,3))\n", + " >>> 
np.dot(a, b)[2,3,2,1,2,2]\n", + " 499128\n", + " >>> sum(a[2,3,2,:] * b[1,2,:,2])\n", + " 499128\n", + " \n", + " empty(...)\n", + " empty(shape, dtype=float, order='C')\n", + " \n", + " Return a new array of given shape and type, without initializing entries.\n", + " \n", + " Parameters\n", + " ----------\n", + " shape : int or tuple of int\n", + " Shape of the empty array\n", + " dtype : data-type, optional\n", + " Desired output data-type.\n", + " order : {'C', 'F'}, optional\n", + " Whether to store multi-dimensional data in row-major\n", + " (C-style) or column-major (Fortran-style) order in\n", + " memory.\n", + " \n", + " Returns\n", + " -------\n", + " out : ndarray\n", + " Array of uninitialized (arbitrary) data with the given\n", + " shape, dtype, and order.\n", + " \n", + " See Also\n", + " --------\n", + " empty_like, zeros, ones\n", + " \n", + " Notes\n", + " -----\n", + " `empty`, unlike `zeros`, does not set the array values to zero,\n", + " and may therefore be marginally faster. On the other hand, it requires\n", + " the user to manually set all the values in the array, and should be\n", + " used with caution.\n", + " \n", + " Examples\n", + " --------\n", + " >>> np.empty([2, 2])\n", + " array([[ -9.74499359e+001, 6.69583040e-309],\n", + " [ 2.13182611e-314, 3.06959433e-309]]) #random\n", + " \n", + " >>> np.empty([2, 2], dtype=int)\n", + " array([[-1073741821, -1067949133],\n", + " [ 496041986, 19249760]]) #random\n", + " \n", + " fromstring(...)\n", + " fromstring(string, dtype=float, count=-1, sep='')\n", + " \n", + " A new 1-D array initialized from raw binary or text data in a string.\n", + " \n", + " Parameters\n", + " ----------\n", + " string : str\n", + " A string containing the data.\n", + " dtype : data-type, optional\n", + " The data type of the array; default: float. For binary input data,\n", + " the data must be in exactly this format.\n", + " count : int, optional\n", + " Read this number of `dtype` elements from the data. 
If this is\n", + " negative (the default), the count will be determined from the\n", + " length of the data.\n", + " sep : str, optional\n", + " If not provided or, equivalently, the empty string, the data will\n", + " be interpreted as binary data; otherwise, as ASCII text with\n", + " decimal numbers. Also in this latter case, this argument is\n", + " interpreted as the string separating numbers in the data; extra\n", + " whitespace between elements is also ignored.\n", + " \n", + " Returns\n", + " -------\n", + " arr : ndarray\n", + " The constructed array.\n", + " \n", + " Raises\n", + " ------\n", + " ValueError\n", + " If the string is not the correct size to satisfy the requested\n", + " `dtype` and `count`.\n", + " \n", + " See Also\n", + " --------\n", + " frombuffer, fromfile, fromiter\n", + " \n", + " Examples\n", + " --------\n", + " >>> np.fromstring('\\x01\\x02', dtype=np.uint8)\n", + " array([1, 2], dtype=uint8)\n", + " >>> np.fromstring('1 2', dtype=int, sep=' ')\n", + " array([1, 2])\n", + " >>> np.fromstring('1, 2', dtype=int, sep=',')\n", + " array([1, 2])\n", + " >>> np.fromstring('\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3)\n", + " array([1, 2, 3], dtype=uint8)\n", + " \n", + " score_cbow_pair(model, word, word2_indices, l1)\n", + " \n", + " score_sentence_cbow(...)\n", + " \n", + " score_sentence_sg(...)\n", + " \n", + " score_sg_pair(model, word, word2)\n", + " \n", + " train_batch_cbow(...)\n", + " \n", + " train_batch_sg(...)\n", + " \n", + " train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True)\n", + " \n", + " train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, context_vectors=None, context_locks=None)\n", + " \n", + " zeros(...)\n", + " zeros(shape, dtype=float, order='C')\n", + " \n", + " Return a new array of given shape and type, filled with zeros.\n", + " \n", + " Parameters\n", + " ----------\n", + " shape : int or sequence of ints\n", + " 
Shape of the new array, e.g., ``(2, 3)`` or ``2``.\n", + " dtype : data-type, optional\n", + " The desired data-type for the array, e.g., `numpy.int8`. Default is\n", + " `numpy.float64`.\n", + " order : {'C', 'F'}, optional\n", + " Whether to store multidimensional data in C- or Fortran-contiguous\n", + " (row- or column-wise) order in memory.\n", + " \n", + " Returns\n", + " -------\n", + " out : ndarray\n", + " Array of zeros with the given shape, dtype, and order.\n", + " \n", + " See Also\n", + " --------\n", + " zeros_like : Return an array of zeros with shape and type of input.\n", + " ones_like : Return an array of ones with shape and type of input.\n", + " empty_like : Return an empty array with shape and type of input.\n", + " ones : Return a new array setting values to one.\n", + " empty : Return a new uninitialized array.\n", + " \n", + " Examples\n", + " --------\n", + " >>> np.zeros(5)\n", + " array([ 0., 0., 0., 0., 0.])\n", + " \n", + " >>> np.zeros((5,), dtype=np.int)\n", + " array([0, 0, 0, 0, 0])\n", + " \n", + " >>> np.zeros((2, 1))\n", + " array([[ 0.],\n", + " [ 0.]])\n", + " \n", + " >>> s = (2,2)\n", + " >>> np.zeros(s)\n", + " array([[ 0., 0.],\n", + " [ 0., 0.]])\n", + " \n", + " >>> np.zeros((2,), dtype=[('x', 'i4'), ('y', 'i4')]) # custom dtype\n", + " array([(0, 0), (0, 0)],\n", + " dtype=[('x', '\n", + " log = \n", + " logger = \n", + " newaxis = None\n", + " sqrt = \n", + " string_types = (,)\n", + "\n", + "FILE\n", + " /Users/awculott/.local/lib/python3.5/site-packages/gensim/models/word2vec.py\n", + "\n", + "\n" + ] + } + ], + "source": [ + "from gensim.models import word2vec\n", + "# Tutorial: http://rare-technologies.com/deep-learning-with-word2vec-and-gensim/\n", + "help(word2vec)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2407167" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + 
} + ], + "source": [ + "sum(len(t) for t in tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", + "model = word2vec.Word2Vec(tokens, size=100, window=5, min_count=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "model.init_sims(replace=True) # free unneeded variables and precompute normalized vectors." + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('bikes', 0.8544794321060181),\n", + " ('spots', 0.8441545367240906),\n", + " ('decades', 0.8351261615753174),\n", + " ('mice', 0.8316686749458313),\n", + " ('plants', 0.8287297487258911),\n", + " ('seats', 0.822428286075592),\n", + " ('masters', 0.8212233781814575),\n", + " ('antibiotics', 0.8196142315864563),\n", + " ('adults', 0.8149713277816772),\n", + " ('buses', 0.812675952911377)]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.most_similar(positive=['cars', 'trucks'])" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('crime', 0.882451593875885),\n", + " ('military', 0.8497803211212158),\n", + " ('criminal', 0.8445150852203369),\n", + " ('safety', 0.7943019866943359),\n", + " ('defense', 0.7925150394439697),\n", + " ('community', 0.7909201979637146),\n", + " ('armed', 0.7859578132629395),\n", + " ('drug', 0.780666708946228),\n", + " ('violent', 0.777873158454895),\n", + " ('self', 0.7714388370513916)]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"model.most_similar(positive=['gun'])" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'engine'" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.doesnt_match(['mouse', 'engine', 'cpu'])" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.79739832314678638" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.n_similarity(['chip', 'cpu'], ['software', 'algorithm'])" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.11561933793959725" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.n_similarity(['chip', 'cpu'], ['religion', 'belief'])" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('items', 0.6851003170013428),\n", + " ('vendors', 0.667102038860321),\n", + " ('inanimate', 0.6526215672492981),\n", + " ('prophesied', 0.6507330536842346),\n", + " ('models', 0.6335091590881348),\n", + " ('solicited', 0.625587522983551),\n", + " ('places', 0.6187167167663574),\n", + " ('sizes', 0.6169173717498779),\n", + " ('assemblers', 0.6047966480255127),\n", + " ('tapes', 0.6001349091529846)]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.most_similar(positive=['cars', 'guns', 'prices'], negative=['car', 'gun'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}