From 4fbd94cd81246f9325f44029ef1e9b24f4aad791 Mon Sep 17 00:00:00 2001
From: Aron Culotta <aronwc@gmail.com>
Date: Tue, 12 Apr 2016 16:39:45 -0500
Subject: [PATCH] l22 after class

---
 lectures/lec22/README.md     |    5 +
 lectures/lec22/WordRep.ipynb | 1546 ++++++++++++++++++++++++++++++++++
 2 files changed, 1551 insertions(+)
 create mode 100644 lectures/lec22/README.md
 create mode 100644 lectures/lec22/WordRep.ipynb

diff --git a/lectures/lec22/README.md b/lectures/lec22/README.md
new file mode 100644
index 0000000..b29882a
--- /dev/null
+++ b/lectures/lec22/README.md
@@ -0,0 +1,5 @@
+See the Stanford slides as well, along with this reference on cosine similarity:
+
+http://en.wikipedia.org/wiki/Cosine_similarity#Properties
+
+
diff --git a/lectures/lec22/WordRep.ipynb b/lectures/lec22/WordRep.ipynb
new file mode 100644
index 0000000..ed02369
--- /dev/null
+++ b/lectures/lec22/WordRep.ipynb
@@ -0,0 +1,1546 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "# Representing Words"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Get some text data (message board posts).\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "raw_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42,\n",
+    "                              remove=('headers', 'footers', 'quotes'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "read 11314 documents\n",
+      ">>>for example:\n",
+      "I was wondering if anyone out there could enlighten me on this car I saw\n",
+      "the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
+      "early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
+      "the front bumper was separate from the rest of the body. This is \n",
+      "all I know. If anyone can tellme a model name, engine specs, years\n",
+      "of production, where this car is made, history, or whatever info you\n",
+      "have on this funky looking car, please e-mail.\n",
+      "\n",
+      ">>>for example:\n",
+      "Anybody seen mouse cursor distortion running the Diamond 1024x768x256 driver?\n",
+      "Sorry, don't know the version of the driver (no indication in the menus) but it's a recently\n",
+      "delivered Gateway system.  Am going to try the latest drivers from Diamond BBS but wondered\n",
+      "if anyone else had seen this.\n",
+      "\n",
+      "post or email\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('read %d documents' % len(raw_data.data))\n",
+    "print('>>>for example:\\n%s\\n' % raw_data.data[0])\n",
+    "print('>>>for example:\\n%s' % raw_data.data[1000])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tokenized documents; for example:\n",
+      "['i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', 'it', 'was', 'a', '2', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', '60s', 'early', '70s', 'it', 'was', 'called', 'a', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'i', 'know', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'e', 'mail']\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "# Lowercase each post and split it into runs of word characters (raw-string regex).\n",
+    "tokens = [re.findall(r'\\w+', s.lower()) for s in raw_data.data]\n",
+    "# Optional: keep only the first 5k docs -> tokens = tokens[:5000]\n",
+    "print('tokenized documents; for example:\\n%s' % tokens[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cluster words by their context"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from collections import Counter, defaultdict\n",
+    "import numpy as np\n",
+    "\n",
+    "# dict from term to context vector (a Counter of 'word@offset' features).\n",
+    "contexts = defaultdict(Counter)\n",
+    "window = 2  # number of tokens considered on each side of the target\n",
+    "for toks in tokens:\n",
+    "    for i, token in enumerate(toks):\n",
+    "        lo = max(0, i - window)\n",
+    "        # range() excludes its end, so add 1 to reach offset +window on the right.\n",
+    "        hi = min(i + window + 1, len(toks))\n",
+    "        features = [toks[j] + \"@\" + str(j - i)\n",
+    "                    for j in range(lo, hi) if j != i]\n",
+    "        contexts[token].update(features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "contexts = {k: v for k, v in contexts.items() if sum(v.values()) > 10}  # keep terms with > 10 context observations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('have@1', 1133), ('m@1', 1078), ('am@1', 809), ('the@-2', 730), ('think@1', 713), ('don@1', 700), ('and@-1', 686), ('but@-1', 647), ('ve@1', 591), ('can@1', 583)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(contexts['i'].most_common(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('the@-1', 87), ('a@-1', 43), ('my@-1', 31), ('a@-2', 28), ('of@-2', 28), ('and@1', 25), ('i@1', 22), ('in@-2', 17), ('is@1', 14), ('this@-1', 14)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(contexts['car'].most_common(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('control@1', 87), ('a@-1', 49), ('the@-1', 39), ('the@-2', 29), ('anti@-1', 23), ('of@-1', 22), ('of@-2', 21), ('a@-2', 20), ('on@-1', 12), ('pro@-1', 11)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(contexts['gun'].most_common(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('the@-1', 6502.0),\n",
+       " ('and@1', 5893.0),\n",
+       " ('the@-2', 5837.0),\n",
+       " ('of@-2', 4819.0),\n",
+       " ('the@1', 4634.0)]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# For each context feature, count how many distinct terms have it in their context.\n",
+    "# Counts are stored as floats (each term contributes 1.).\n",
+    "doc_freq = Counter()\n",
+    "for ctx in contexts.values():\n",
+    "    doc_freq.update(dict.fromkeys(ctx, 1.))\n",
+    "doc_freq.most_common(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "16920"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(contexts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "to_remove = {t for t, v in doc_freq.items() if v < 10}  # features shared by fewer than 10 terms"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Drop the rare context features (those in to_remove) from every term's vector.\n",
+    "for w in contexts:\n",
+    "    kept = {k: v for k, v in contexts[w].items()\n",
+    "            if k not in to_remove}\n",
+    "    # Re-assigning an existing key does not resize the dict, so this is safe mid-iteration.\n",
+    "    contexts[w] = Counter(kept)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('have@1', 0.33676022127839716),\n",
+       " ('ve@1', 0.33081880741285874),\n",
+       " ('am@1', 0.3245826170925442),\n",
+       " ('m@1', 0.31135022907131166),\n",
+       " ('think@1', 0.2871391254258928)]"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Down-weight each context count by 1 + log(number of terms sharing that feature),\n",
+    "# then scale every context vector to unit (L2) length.\n",
+    "import math\n",
+    "for term, context in contexts.items():\n",
+    "    for feat in context:\n",
+    "        context[feat] /= 1. + math.log(doc_freq[feat])\n",
+    "    length = math.sqrt(sum(w * w for w in context.values()))\n",
+    "    for feat in context:\n",
+    "        context[feat] = 1. * context[feat] / length\n",
+    "\n",
+    "contexts['i'].most_common(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('control@1', 0.7649046244521146),\n",
+       " ('a@-1', 0.2748044135561928),\n",
+       " ('anti@-1', 0.24620735646234973),\n",
+       " ('the@-1', 0.2070426831444815),\n",
+       " ('the@-2', 0.15567221355940442),\n",
+       " ('owners@1', 0.13182927982147039),\n",
+       " ('of@-1', 0.12396703634205102),\n",
+       " ('of@-2', 0.11500702355700627),\n",
+       " ('pro@-1', 0.11447448925177982),\n",
+       " ('ownership@1', 0.11421855663134342)]"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "contexts['gun'].most_common(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Make a sparse matrix.\n",
+    "from sklearn.feature_extraction import DictVectorizer\n",
+    "vec = DictVectorizer()\n",
+    "X = vec.fit_transform(contexts.values())\n",
+    "features = np.array(vec.get_feature_names())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# X is now a sparse matrix where each row is a term and each column is a context feature"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(16920, 19786)"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<1x19786 sparse matrix of type '<class 'numpy.float64'>'\n",
+       "\twith 427 stored elements in Compressed Sparse Row format>"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "terms = contexts.keys()  # iterates in the same order as the rows of X\n",
+    "term2id = {t: i for i, t in enumerate(terms)}\n",
+    "X[term2id['gun']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false,
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.preprocessing import scale\n",
+    "# SVD of X\n",
+    "# Too big!\n",
+    "# U, s, Vh = np.linalg.svd(X.toarray(), full_matrices=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "help(np.linalg.svd)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## word2vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Help on module gensim.models.word2vec in gensim.models:\n",
+      "\n",
+      "NAME\n",
+      "    gensim.models.word2vec\n",
+      "\n",
+      "DESCRIPTION\n",
+      "    Deep learning via word2vec's \"skip-gram and CBOW models\", using either\n",
+      "    hierarchical softmax or negative sampling [1]_ [2]_.\n",
+      "    \n",
+      "    The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/\n",
+      "    and extended with additional functionality.\n",
+      "    \n",
+      "    For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, visit http://radimrehurek.com/2014/02/word2vec-tutorial/\n",
+      "    \n",
+      "    **Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training**\n",
+      "    (70x speedup compared to plain NumPy implementation [3]_).\n",
+      "    \n",
+      "    Initialize a model with e.g.::\n",
+      "    \n",
+      "    >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)\n",
+      "    \n",
+      "    Persist a model to disk with::\n",
+      "    \n",
+      "    >>> model.save(fname)\n",
+      "    >>> model = Word2Vec.load(fname)  # you can continue training with the loaded model!\n",
+      "    \n",
+      "    The model can also be instantiated from an existing file on disk in the word2vec C format::\n",
+      "    \n",
+      "      >>> model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)  # C text format\n",
+      "      >>> model = Word2Vec.load_word2vec_format('/tmp/vectors.bin', binary=True)  # C binary format\n",
+      "    \n",
+      "    You can perform various syntactic/semantic NLP word tasks with the model. Some of them\n",
+      "    are already built-in::\n",
+      "    \n",
+      "      >>> model.most_similar(positive=['woman', 'king'], negative=['man'])\n",
+      "      [('queen', 0.50882536), ...]\n",
+      "    \n",
+      "      >>> model.doesnt_match(\"breakfast cereal dinner lunch\".split())\n",
+      "      'cereal'\n",
+      "    \n",
+      "      >>> model.similarity('woman', 'man')\n",
+      "      0.73723527\n",
+      "    \n",
+      "      >>> model['computer']  # raw numpy vector of a word\n",
+      "      array([-0.00449447, -0.00310097,  0.02421786, ...], dtype=float32)\n",
+      "    \n",
+      "    and so on.\n",
+      "    \n",
+      "    If you're finished training a model (=no more updates, only querying), you can do\n",
+      "    \n",
+      "      >>> model.init_sims(replace=True)\n",
+      "    \n",
+      "    to trim unneeded model memory = use (much) less RAM.\n",
+      "    \n",
+      "    Note that there is a :mod:`gensim.models.phrases` module which lets you automatically\n",
+      "    detect phrases longer than one word. Using phrases, you can learn a word2vec model\n",
+      "    where \"words\" are actually multiword expressions, such as `new_york_times` or `financial_crisis`:\n",
+      "    \n",
+      "    >>> bigram_transformer = gensim.models.Phrases(sentences)\n",
+      "    >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...)\n",
+      "    \n",
+      "    .. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013.\n",
+      "    .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality.\n",
+      "           In Proceedings of NIPS, 2013.\n",
+      "    .. [3] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/\n",
+      "\n",
+      "CLASSES\n",
+      "    builtins.object\n",
+      "        BrownCorpus\n",
+      "        LineSentence\n",
+      "        Text8Corpus\n",
+      "        Vocab\n",
+      "    gensim.utils.SaveLoad(builtins.object)\n",
+      "        Word2Vec\n",
+      "    \n",
+      "    class BrownCorpus(builtins.object)\n",
+      "     |  Iterate over sentences from the Brown corpus (part of NLTK data).\n",
+      "     |  \n",
+      "     |  Methods defined here:\n",
+      "     |  \n",
+      "     |  __init__(self, dirname)\n",
+      "     |      Initialize self.  See help(type(self)) for accurate signature.\n",
+      "     |  \n",
+      "     |  __iter__(self)\n",
+      "     |  \n",
+      "     |  ----------------------------------------------------------------------\n",
+      "     |  Data descriptors defined here:\n",
+      "     |  \n",
+      "     |  __dict__\n",
+      "     |      dictionary for instance variables (if defined)\n",
+      "     |  \n",
+      "     |  __weakref__\n",
+      "     |      list of weak references to the object (if defined)\n",
+      "    \n",
+      "    class LineSentence(builtins.object)\n",
+      "     |  Simple format: one sentence = one line; words already preprocessed and separated by whitespace.\n",
+      "     |  \n",
+      "     |  Methods defined here:\n",
+      "     |  \n",
+      "     |  __init__(self, source, max_sentence_length=10000, limit=None)\n",
+      "     |      `source` can be either a string or a file object. Clip the file to the first\n",
+      "     |      `limit` lines (or no clipped if limit is None, the default).\n",
+      "     |      \n",
+      "     |      Example::\n",
+      "     |      \n",
+      "     |          sentences = LineSentence('myfile.txt')\n",
+      "     |      \n",
+      "     |      Or for compressed files::\n",
+      "     |      \n",
+      "     |          sentences = LineSentence('compressed_text.txt.bz2')\n",
+      "     |          sentences = LineSentence('compressed_text.txt.gz')\n",
+      "     |  \n",
+      "     |  __iter__(self)\n",
+      "     |      Iterate through the lines in the source.\n",
+      "     |  \n",
+      "     |  ----------------------------------------------------------------------\n",
+      "     |  Data descriptors defined here:\n",
+      "     |  \n",
+      "     |  __dict__\n",
+      "     |      dictionary for instance variables (if defined)\n",
+      "     |  \n",
+      "     |  __weakref__\n",
+      "     |      list of weak references to the object (if defined)\n",
+      "    \n",
+      "    class Text8Corpus(builtins.object)\n",
+      "     |  Iterate over sentences from the \"text8\" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .\n",
+      "     |  \n",
+      "     |  Methods defined here:\n",
+      "     |  \n",
+      "     |  __init__(self, fname, max_sentence_length=10000)\n",
+      "     |      Initialize self.  See help(type(self)) for accurate signature.\n",
+      "     |  \n",
+      "     |  __iter__(self)\n",
+      "     |  \n",
+      "     |  ----------------------------------------------------------------------\n",
+      "     |  Data descriptors defined here:\n",
+      "     |  \n",
+      "     |  __dict__\n",
+      "     |      dictionary for instance variables (if defined)\n",
+      "     |  \n",
+      "     |  __weakref__\n",
+      "     |      list of weak references to the object (if defined)\n",
+      "    \n",
+      "    class Vocab(builtins.object)\n",
+      "     |  A single vocabulary item, used internally for collecting per-word frequency/sampling info,\n",
+      "     |  and for constructing binary trees (incl. both word leaves and inner nodes).\n",
+      "     |  \n",
+      "     |  Methods defined here:\n",
+      "     |  \n",
+      "     |  __init__(self, **kwargs)\n",
+      "     |      Initialize self.  See help(type(self)) for accurate signature.\n",
+      "     |  \n",
+      "     |  __lt__(self, other)\n",
+      "     |      Return self<value.\n",
+      "     |  \n",
+      "     |  __str__(self)\n",
+      "     |      Return str(self).\n",
+      "     |  \n",
+      "     |  ----------------------------------------------------------------------\n",
+      "     |  Data descriptors defined here:\n",
+      "     |  \n",
+      "     |  __dict__\n",
+      "     |      dictionary for instance variables (if defined)\n",
+      "     |  \n",
+      "     |  __weakref__\n",
+      "     |      list of weak references to the object (if defined)\n",
+      "    \n",
+      "    class Word2Vec(gensim.utils.SaveLoad)\n",
+      "     |  Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/\n",
+      "     |  \n",
+      "     |  The model can be stored/loaded via its `save()` and `load()` methods, or stored/loaded in a format\n",
+      "     |  compatible with the original word2vec implementation via `save_word2vec_format()` and `load_word2vec_format()`.\n",
+      "     |  \n",
+      "     |  Method resolution order:\n",
+      "     |      Word2Vec\n",
+      "     |      gensim.utils.SaveLoad\n",
+      "     |      builtins.object\n",
+      "     |  \n",
+      "     |  Methods defined here:\n",
+      "     |  \n",
+      "     |  __contains__(self, word)\n",
+      "     |  \n",
+      "     |  __getitem__(self, words)\n",
+      "     |      Accept a single word or a list of words as input.\n",
+      "     |      \n",
+      "     |      If a single word: returns the word's representations in vector space, as\n",
+      "     |      a 1D numpy array.\n",
+      "     |      \n",
+      "     |      Multiple words: return the words' representations in vector space, as a\n",
+      "     |      2d numpy array: #words x #vector_size. Matrix rows are in the same order\n",
+      "     |      as in input.\n",
+      "     |      \n",
+      "     |      Example::\n",
+      "     |      \n",
+      "     |        >>> trained_model['office']\n",
+      "     |        array([ -1.40128313e-02, ...])\n",
+      "     |      \n",
+      "     |        >>> trained_model[['office', 'products']]\n",
+      "     |        array([ -1.40128313e-02, ...]\n",
+      "     |              [ -1.70425311e-03, ...]\n",
+      "     |               ...)\n",
+      "     |  \n",
+      "     |  __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000)\n",
+      "     |      Initialize the model from an iterable of `sentences`. Each sentence is a\n",
+      "     |      list of words (unicode strings) that will be used for training.\n",
+      "     |      \n",
+      "     |      The `sentences` iterable can be simply a list, but for larger corpora,\n",
+      "     |      consider an iterable that streams the sentences directly from disk/network.\n",
+      "     |      See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in\n",
+      "     |      this module for such examples.\n",
+      "     |      \n",
+      "     |      If you don't supply `sentences`, the model is left uninitialized -- use if\n",
+      "     |      you plan to initialize it in some other way.\n",
+      "     |      \n",
+      "     |      `sg` defines the training algorithm. By default (`sg=0`), CBOW is used.\n",
+      "     |      Otherwise (`sg=1`), skip-gram is employed.\n",
+      "     |      \n",
+      "     |      `size` is the dimensionality of the feature vectors.\n",
+      "     |      \n",
+      "     |      `window` is the maximum distance between the current and predicted word within a sentence.\n",
+      "     |      \n",
+      "     |      `alpha` is the initial learning rate (will linearly drop to zero as training progresses).\n",
+      "     |      \n",
+      "     |      `seed` = for the random number generator. Initial vectors for each\n",
+      "     |      word are seeded with a hash of the concatenation of word + str(seed).\n",
+      "     |      \n",
+      "     |      `min_count` = ignore all words with total frequency lower than this.\n",
+      "     |      \n",
+      "     |      `max_vocab_size` = limit RAM during vocabulary building; if there are more unique\n",
+      "     |      words than this, then prune the infrequent ones. Every 10 million word types\n",
+      "     |      need about 1GB of RAM. Set to `None` for no limit (default).\n",
+      "     |      \n",
+      "     |      `sample` = threshold for configuring which higher-frequency words are randomly downsampled;\n",
+      "     |          default is 1e-3, useful range is (0, 1e-5).\n",
+      "     |      \n",
+      "     |      `workers` = use this many worker threads to train the model (=faster training with multicore machines).\n",
+      "     |      \n",
+      "     |      `hs` = if 1, hierarchical softmax will be used for model training.\n",
+      "     |      If set to 0 (default), and `negative` is non-zero, negative sampling will be used.\n",
+      "     |      \n",
+      "     |      `negative` = if > 0, negative sampling will be used, the int for negative\n",
+      "     |      specifies how many \"noise words\" should be drawn (usually between 5-20).\n",
+      "     |      Default is 5. If set to 0, no negative samping is used.\n",
+      "     |      \n",
+      "     |      `cbow_mean` = if 0, use the sum of the context word vectors. If 1 (default), use the mean.\n",
+      "     |      Only applies when cbow is used.\n",
+      "     |      \n",
+      "     |      `hashfxn` = hash function to use to randomly initialize weights, for increased\n",
+      "     |      training reproducibility. Default is Python's rudimentary built in hash function.\n",
+      "     |      \n",
+      "     |      `iter` = number of iterations (epochs) over the corpus.\n",
+      "     |      \n",
+      "     |      `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain\n",
+      "     |       in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count).\n",
+      "     |       Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and\n",
+      "     |       returns either util.RULE_DISCARD, util.RULE_KEEP or util.RULE_DEFAULT.\n",
+      "     |       Note: The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part\n",
+      "     |        of the model.\n",
+      "     |      \n",
+      "     |      `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before\n",
+      "     |      assigning word indexes.\n",
+      "     |      \n",
+      "     |      `batch_words` = target size (in words) for batches of examples passed to worker threads (and\n",
+      "     |      thus cython routines). Default is 10000. (Larger batches can be passed if individual\n",
+      "     |      texts are longer, but the cython code may truncate.)\n",
+      "     |  \n",
+      "     |  __str__(self)\n",
+      "     |      Return str(self).\n",
+      "     |  \n",
+      "     |  accuracy(self, questions, restrict_vocab=30000, most_similar=<function Word2Vec.most_similar at 0x110b26048>)\n",
+      "     |      Compute accuracy of the model. `questions` is a filename where lines are\n",
+      "     |      4-tuples of words, split into sections by \": SECTION NAME\" lines.\n",
+      "     |      See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.\n",
+      "     |      \n",
+      "     |      The accuracy is reported (=printed to log and returned as a list) for each\n",
+      "     |      section separately, plus there's one aggregate summary at the end.\n",
+      "     |      \n",
+      "     |      Use `restrict_vocab` to ignore all questions containing a word whose frequency\n",
+      "     |      is not in the top-N most frequent words (default top 30,000).\n",
+      "     |      \n",
+      "     |      This method corresponds to the `compute-accuracy` script of the original C word2vec.\n",
+      "     |  \n",
+      "     |  build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None)\n",
+      "     |      Build vocabulary from a sequence of sentences (can be a once-only generator stream).\n",
+      "     |      Each sentence must be a list of unicode strings.\n",
+      "     |  \n",
+      "     |  clear_sims(self)\n",
+      "     |  \n",
+      "     |  create_binary_tree(self)\n",
+      "     |      Create a binary Huffman tree using stored vocabulary word counts. Frequent words\n",
+      "     |      will have shorter binary codes. Called internally from `build_vocab()`.\n",
+      "     |  \n",
+      "     |  doesnt_match(self, words)\n",
+      "     |      Which word from the given list doesn't go with the others?\n",
+      "     |      \n",
+      "     |      Example::\n",
+      "     |      \n",
+      "     |        >>> trained_model.doesnt_match(\"breakfast cereal dinner lunch\".split())\n",
+      "     |        'cereal'\n",
+      "     |  \n",
+      "     |  estimate_memory(self, vocab_size=None, report=None)\n",
+      "     |      Estimate required memory for a model using current settings and provided vocabulary size.\n",
+      "     |  \n",
+      "     |  finalize_vocab(self)\n",
+      "     |      Build tables and model weights based on final vocabulary settings.\n",
+      "     |  \n",
+      "     |  init_sims(self, replace=False)\n",
+      "     |      Precompute L2-normalized vectors.\n",
+      "     |      \n",
+      "     |      If `replace` is set, forget the original vectors and only keep the normalized\n",
+      "     |      ones = saves lots of memory!\n",
+      "     |      \n",
+      "     |      Note that you **cannot continue training** after doing a replace. The model becomes\n",
+      "     |      effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`.\n",
+      "     |  \n",
+      "     |  intersect_word2vec_format(self, fname, binary=False, encoding='utf8', unicode_errors='strict')\n",
+      "     |      Merge the input-hidden weight matrix from the original C word2vec-tool format\n",
+      "     |      given, where it intersects with the current vocabulary. (No words are added to the\n",
+      "     |      existing vocabulary, but intersecting words adopt the file's weights, and\n",
+      "     |      non-intersecting words are left alone.)\n",
+      "     |      \n",
+      "     |      `binary` is a boolean indicating whether the data is in binary word2vec format.\n",
+      "     |  \n",
+      "     |  make_cum_table(self, power=0.75, domain=2147483647)\n",
+      "     |      Create a cumulative-distribution table using stored vocabulary word counts for\n",
+      "     |      drawing random words in the negative-sampling training routines.\n",
+      "     |      \n",
+      "     |      To draw a word index, choose a random integer up to the maximum value in the\n",
+      "     |      table (cum_table[-1]), then finding that integer's sorted insertion point\n",
+      "     |      (as if by bisect_left or ndarray.searchsorted()). That insertion point is the\n",
+      "     |      drawn index, coming up in proportion equal to the increment at that slot.\n",
+      "     |      \n",
+      "     |      Called internally from 'build_vocab()'.\n",
+      "     |  \n",
+      "     |  most_similar(self, positive=[], negative=[], topn=10, restrict_vocab=None)\n",
+      "     |      Find the top-N most similar words. Positive words contribute positively towards the\n",
+      "     |      similarity, negative words negatively.\n",
+      "     |      \n",
+      "     |      This method computes cosine similarity between a simple mean of the projection\n",
+      "     |      weight vectors of the given words and the vectors for each word in the model.\n",
+      "     |      The method corresponds to the `word-analogy` and `distance` scripts in the original\n",
+      "     |      word2vec implementation.\n",
+      "     |      \n",
+      "     |      If topn is False, most_similar returns the vector of similarity scores.\n",
+      "     |      \n",
+      "     |      `restrict_vocab` is an optional integer which limits the range of vectors which\n",
+      "     |      are searched for most-similar values. For example, restrict_vocab=10000 would\n",
+      "     |      only check the first 10000 word vectors in the vocabulary order. (This may be\n",
+      "     |      meaningful if you've sorted the vocabulary by descending frequency.)\n",
+      "     |      \n",
+      "     |      Example::\n",
+      "     |      \n",
+      "     |        >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])\n",
+      "     |        [('queen', 0.50882536), ...]\n",
+      "     |  \n",
+      "     |  most_similar_cosmul(self, positive=[], negative=[], topn=10)\n",
+      "     |      Find the top-N most similar words, using the multiplicative combination objective\n",
+      "     |      proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute\n",
+      "     |      positively towards the similarity, negative words negatively, but with less\n",
+      "     |      susceptibility to one large distance dominating the calculation.\n",
+      "     |      \n",
+      "     |      In the common analogy-solving case, of two positive and one negative examples,\n",
+      "     |      this method is equivalent to the \"3CosMul\" objective (equation (4)) of Levy and Goldberg.\n",
+      "     |      \n",
+      "     |      Additional positive or negative examples contribute to the numerator or denominator,\n",
+      "     |      respectively – a potentially sensible but untested extension of the method. (With\n",
+      "     |      a single positive example, rankings will be the same as in the default most_similar.)\n",
+      "     |      \n",
+      "     |      Example::\n",
+      "     |      \n",
+      "     |        >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])\n",
+      "     |        [(u'iraq', 0.8488819003105164), ...]\n",
+      "     |      \n",
+      "     |      .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.\n",
+      "     |  \n",
+      "     |  n_similarity(self, ws1, ws2)\n",
+      "     |      Compute cosine similarity between two sets of words.\n",
+      "     |      \n",
+      "     |      Example::\n",
+      "     |      \n",
+      "     |        >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])\n",
+      "     |        0.61540466561049689\n",
+      "     |      \n",
+      "     |        >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant'])\n",
+      "     |        1.0000000000000004\n",
+      "     |      \n",
+      "     |        >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant')\n",
+      "     |        True\n",
+      "     |  \n",
+      "     |  reset_from(self, other_model)\n",
+      "     |      Borrow shareable pre-built structures (like vocab) from the other_model. Useful\n",
+      "     |      if testing multiple models in parallel on the same corpus.\n",
+      "     |  \n",
+      "     |  reset_weights(self)\n",
+      "     |      Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.\n",
+      "     |  \n",
+      "     |  save(self, *args, **kwargs)\n",
+      "     |      Save the object to file (also see `load`).\n",
+      "     |      \n",
+      "     |      `fname_or_handle` is either a string specifying the file name to\n",
+      "     |      save to, or an open file-like object which can be written to. If\n",
+      "     |      the object is a file handle, no special array handling will be\n",
+      "     |      performed; all attributes will be saved to the same file.\n",
+      "     |      \n",
+      "     |      If `separately` is None, automatically detect large\n",
+      "     |      numpy/scipy.sparse arrays in the object being stored, and store\n",
+      "     |      them into separate files. This avoids pickle memory errors and\n",
+      "     |      allows mmap'ing large arrays back on load efficiently.\n",
+      "     |      \n",
+      "     |      You can also set `separately` manually, in which case it must be\n",
+      "     |      a list of attribute names to be stored in separate files. The\n",
+      "     |      automatic check is not performed in this case.\n",
+      "     |      \n",
+      "     |      `ignore` is a set of attribute names to *not* serialize (file\n",
+      "     |      handles, caches etc). On subsequent load() these attributes will\n",
+      "     |      be set to None.\n",
+      "     |      \n",
+      "     |      `pickle_protocol` defaults to 2 so the pickled object can be imported\n",
+      "     |      in both Python 2 and 3.\n",
+      "     |  \n",
+      "     |  save_word2vec_format(self, fname, fvocab=None, binary=False)\n",
+      "     |      Store the input-hidden weight matrix in the same format used by the original\n",
+      "     |      C word2vec-tool, for compatibility.\n",
+      "     |  \n",
+      "     |  scale_vocab(self, min_count=None, sample=None, dry_run=False, keep_raw_vocab=False, trim_rule=None)\n",
+      "     |      Apply vocabulary settings for `min_count` (discarding less-frequent words)\n",
+      "     |      and `sample` (controlling the downsampling of more-frequent words).\n",
+      "     |      \n",
+      "     |      Calling with `dry_run=True` will only simulate the provided settings and\n",
+      "     |      report the size of the retained vocabulary, effective corpus length, and\n",
+      "     |      estimated memory requirements. Results are both printed via logging and\n",
+      "     |      returned as a dict.\n",
+      "     |      \n",
+      "     |      Delete the raw vocabulary after the scaling is done to free up RAM,\n",
+      "     |      unless `keep_raw_vocab` is set.\n",
+      "     |  \n",
+      "     |  scan_vocab(self, sentences, progress_per=10000, trim_rule=None)\n",
+      "     |      Do an initial scan of all words appearing in sentences.\n",
+      "     |  \n",
+      "     |  score(self, sentences, total_sentences=1000000, chunksize=100, queue_factor=2, report_delay=1)\n",
+      "     |      Score the log probability for a sequence of sentences (can be a once-only generator stream).\n",
+      "     |      Each sentence must be a list of unicode strings.\n",
+      "     |      This does not change the fitted model in any way (see Word2Vec.train() for that)\n",
+      "     |      \n",
+      "     |      Note that you should specify total_sentences; we'll run into problems if you ask to\n",
+      "     |      score more than this number of sentences but it is inefficient to set the value too high.\n",
+      "     |      \n",
+      "     |      See the article by [taddy]_ and the gensim demo at [deepir]_ for examples of how to use such scores in document classification.\n",
+      "     |      \n",
+      "     |      .. [taddy] Taddy, Matt.  Document Classification by Inversion of Distributed Language Representations, in Proceedings of the 2015 Conference of the Association of Computational Linguistics.\n",
+      "     |      .. [deepir] https://github.com/TaddyLab/gensim/blob/deepir/docs/notebooks/deepir.ipynb\n",
+      "     |  \n",
+      "     |  seeded_vector(self, seed_string)\n",
+      "     |      Create one 'random' vector (but deterministic by seed_string)\n",
+      "     |  \n",
+      "     |  similarity(self, w1, w2)\n",
+      "     |      Compute cosine similarity between two words.\n",
+      "     |      \n",
+      "     |      Example::\n",
+      "     |      \n",
+      "     |        >>> trained_model.similarity('woman', 'man')\n",
+      "     |        0.73723527\n",
+      "     |      \n",
+      "     |        >>> trained_model.similarity('woman', 'woman')\n",
+      "     |        1.0\n",
+      "     |  \n",
+      "     |  sort_vocab(self)\n",
+      "     |      Sort the vocabulary so the most frequent words have the lowest indexes.\n",
+      "     |  \n",
+      "     |  train(self, sentences, total_words=None, word_count=0, total_examples=None, queue_factor=2, report_delay=1.0)\n",
+      "     |      Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).\n",
+      "     |      For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)\n",
+      "     |      \n",
+      "     |      To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples\n",
+      "     |      (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the\n",
+      "     |      sentences are the same as those that were used to initially build the vocabulary.\n",
+      "     |  \n",
+      "     |  ----------------------------------------------------------------------\n",
+      "     |  Class methods defined here:\n",
+      "     |  \n",
+      "     |  load(*args, **kwargs) from builtins.type\n",
+      "     |      Load a previously saved object from file (also see `save`).\n",
+      "     |      \n",
+      "     |      If the object was saved with large arrays stored separately, you can load\n",
+      "     |      these arrays via mmap (shared memory) using `mmap='r'`. Default: don't use\n",
+      "     |      mmap, load large arrays as normal objects.\n",
+      "     |      \n",
+      "     |      If the file being loaded is compressed (either '.gz' or '.bz2'), then\n",
+      "     |      `mmap=None` must be set.  Load will raise an `IOError` if this condition\n",
+      "     |      is encountered.\n",
+      "     |  \n",
+      "     |  load_word2vec_format(fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict') from builtins.type\n",
+      "     |      Load the input-hidden weight matrix from the original C word2vec-tool format.\n",
+      "     |      \n",
+      "     |      Note that the information stored in the file is incomplete (the binary tree is missing),\n",
+      "     |      so while you can query for word similarity etc., you cannot continue training\n",
+      "     |      with a model loaded this way.\n",
+      "     |      \n",
+      "     |      `binary` is a boolean indicating whether the data is in binary word2vec format.\n",
+      "     |      `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.\n",
+      "     |      Word counts are read from `fvocab` filename, if set (this is the file generated\n",
+      "     |      by `-save-vocab` flag of the original C tool).\n",
+      "     |      \n",
+      "     |      If you trained the C model using non-utf8 encoding for words, specify that\n",
+      "     |      encoding in `encoding`.\n",
+      "     |  \n",
+      "     |  ----------------------------------------------------------------------\n",
+      "     |  Static methods defined here:\n",
+      "     |  \n",
+      "     |  log_accuracy(section)\n",
+      "     |  \n",
+      "     |  ----------------------------------------------------------------------\n",
+      "     |  Data descriptors inherited from gensim.utils.SaveLoad:\n",
+      "     |  \n",
+      "     |  __dict__\n",
+      "     |      dictionary for instance variables (if defined)\n",
+      "     |  \n",
+      "     |  __weakref__\n",
+      "     |      list of weak references to the object (if defined)\n",
+      "\n",
+      "FUNCTIONS\n",
+      "    array(...)\n",
+      "        array(object, dtype=None, copy=True, order=None, subok=False, ndmin=0)\n",
+      "        \n",
+      "        Create an array.\n",
+      "        \n",
+      "        Parameters\n",
+      "        ----------\n",
+      "        object : array_like\n",
+      "            An array, any object exposing the array interface, an\n",
+      "            object whose __array__ method returns an array, or any\n",
+      "            (nested) sequence.\n",
+      "        dtype : data-type, optional\n",
+      "            The desired data-type for the array.  If not given, then\n",
+      "            the type will be determined as the minimum type required\n",
+      "            to hold the objects in the sequence.  This argument can only\n",
+      "            be used to 'upcast' the array.  For downcasting, use the\n",
+      "            .astype(t) method.\n",
+      "        copy : bool, optional\n",
+      "            If true (default), then the object is copied.  Otherwise, a copy\n",
+      "            will only be made if __array__ returns a copy, if obj is a\n",
+      "            nested sequence, or if a copy is needed to satisfy any of the other\n",
+      "            requirements (`dtype`, `order`, etc.).\n",
+      "        order : {'C', 'F', 'A'}, optional\n",
+      "            Specify the order of the array.  If order is 'C', then the array\n",
+      "            will be in C-contiguous order (last-index varies the fastest).\n",
+      "            If order is 'F', then the returned array will be in\n",
+      "            Fortran-contiguous order (first-index varies the fastest).\n",
+      "            If order is 'A' (default), then the returned array may be\n",
+      "            in any order (either C-, Fortran-contiguous, or even discontiguous),\n",
+      "            unless a copy is required, in which case it will be C-contiguous.\n",
+      "        subok : bool, optional\n",
+      "            If True, then sub-classes will be passed-through, otherwise\n",
+      "            the returned array will be forced to be a base-class array (default).\n",
+      "        ndmin : int, optional\n",
+      "            Specifies the minimum number of dimensions that the resulting\n",
+      "            array should have.  Ones will be pre-pended to the shape as\n",
+      "            needed to meet this requirement.\n",
+      "        \n",
+      "        Returns\n",
+      "        -------\n",
+      "        out : ndarray\n",
+      "            An array object satisfying the specified requirements.\n",
+      "        \n",
+      "        See Also\n",
+      "        --------\n",
+      "        empty, empty_like, zeros, zeros_like, ones, ones_like, fill\n",
+      "        \n",
+      "        Examples\n",
+      "        --------\n",
+      "        >>> np.array([1, 2, 3])\n",
+      "        array([1, 2, 3])\n",
+      "        \n",
+      "        Upcasting:\n",
+      "        \n",
+      "        >>> np.array([1, 2, 3.0])\n",
+      "        array([ 1.,  2.,  3.])\n",
+      "        \n",
+      "        More than one dimension:\n",
+      "        \n",
+      "        >>> np.array([[1, 2], [3, 4]])\n",
+      "        array([[1, 2],\n",
+      "               [3, 4]])\n",
+      "        \n",
+      "        Minimum dimensions 2:\n",
+      "        \n",
+      "        >>> np.array([1, 2, 3], ndmin=2)\n",
+      "        array([[1, 2, 3]])\n",
+      "        \n",
+      "        Type provided:\n",
+      "        \n",
+      "        >>> np.array([1, 2, 3], dtype=complex)\n",
+      "        array([ 1.+0.j,  2.+0.j,  3.+0.j])\n",
+      "        \n",
+      "        Data-type consisting of more than one element:\n",
+      "        \n",
+      "        >>> x = np.array([(1,2),(3,4)],dtype=[('a','<i4'),('b','<i4')])\n",
+      "        >>> x['a']\n",
+      "        array([1, 3])\n",
+      "        \n",
+      "        Creating an array from sub-classes:\n",
+      "        \n",
+      "        >>> np.array(np.mat('1 2; 3 4'))\n",
+      "        array([[1, 2],\n",
+      "               [3, 4]])\n",
+      "        \n",
+      "        >>> np.array(np.mat('1 2; 3 4'), subok=True)\n",
+      "        matrix([[1, 2],\n",
+      "                [3, 4]])\n",
+      "    \n",
+      "    default_timer = perf_counter(...)\n",
+      "        perf_counter() -> float\n",
+      "        \n",
+      "        Performance counter for benchmarking.\n",
+      "    \n",
+      "    dot(...)\n",
+      "        dot(a, b, out=None)\n",
+      "        \n",
+      "        Dot product of two arrays.\n",
+      "        \n",
+      "        For 2-D arrays it is equivalent to matrix multiplication, and for 1-D\n",
+      "        arrays to inner product of vectors (without complex conjugation). For\n",
+      "        N dimensions it is a sum product over the last axis of `a` and\n",
+      "        the second-to-last of `b`::\n",
+      "        \n",
+      "            dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m])\n",
+      "        \n",
+      "        Parameters\n",
+      "        ----------\n",
+      "        a : array_like\n",
+      "            First argument.\n",
+      "        b : array_like\n",
+      "            Second argument.\n",
+      "        out : ndarray, optional\n",
+      "            Output argument. This must have the exact kind that would be returned\n",
+      "            if it was not used. In particular, it must have the right type, must be\n",
+      "            C-contiguous, and its dtype must be the dtype that would be returned\n",
+      "            for `dot(a,b)`. This is a performance feature. Therefore, if these\n",
+      "            conditions are not met, an exception is raised, instead of attempting\n",
+      "            to be flexible.\n",
+      "        \n",
+      "        Returns\n",
+      "        -------\n",
+      "        output : ndarray\n",
+      "            Returns the dot product of `a` and `b`.  If `a` and `b` are both\n",
+      "            scalars or both 1-D arrays then a scalar is returned; otherwise\n",
+      "            an array is returned.\n",
+      "            If `out` is given, then it is returned.\n",
+      "        \n",
+      "        Raises\n",
+      "        ------\n",
+      "        ValueError\n",
+      "            If the last dimension of `a` is not the same size as\n",
+      "            the second-to-last dimension of `b`.\n",
+      "        \n",
+      "        See Also\n",
+      "        --------\n",
+      "        vdot : Complex-conjugating dot product.\n",
+      "        tensordot : Sum products over arbitrary axes.\n",
+      "        einsum : Einstein summation convention.\n",
+      "        matmul : '@' operator as method with out parameter.\n",
+      "        \n",
+      "        Examples\n",
+      "        --------\n",
+      "        >>> np.dot(3, 4)\n",
+      "        12\n",
+      "        \n",
+      "        Neither argument is complex-conjugated:\n",
+      "        \n",
+      "        >>> np.dot([2j, 3j], [2j, 3j])\n",
+      "        (-13+0j)\n",
+      "        \n",
+      "        For 2-D arrays it is the matrix product:\n",
+      "        \n",
+      "        >>> a = [[1, 0], [0, 1]]\n",
+      "        >>> b = [[4, 1], [2, 2]]\n",
+      "        >>> np.dot(a, b)\n",
+      "        array([[4, 1],\n",
+      "               [2, 2]])\n",
+      "        \n",
+      "        >>> a = np.arange(3*4*5*6).reshape((3,4,5,6))\n",
+      "        >>> b = np.arange(3*4*5*6)[::-1].reshape((5,4,6,3))\n",
+      "        >>> np.dot(a, b)[2,3,2,1,2,2]\n",
+      "        499128\n",
+      "        >>> sum(a[2,3,2,:] * b[1,2,:,2])\n",
+      "        499128\n",
+      "    \n",
+      "    empty(...)\n",
+      "        empty(shape, dtype=float, order='C')\n",
+      "        \n",
+      "        Return a new array of given shape and type, without initializing entries.\n",
+      "        \n",
+      "        Parameters\n",
+      "        ----------\n",
+      "        shape : int or tuple of int\n",
+      "            Shape of the empty array\n",
+      "        dtype : data-type, optional\n",
+      "            Desired output data-type.\n",
+      "        order : {'C', 'F'}, optional\n",
+      "            Whether to store multi-dimensional data in row-major\n",
+      "            (C-style) or column-major (Fortran-style) order in\n",
+      "            memory.\n",
+      "        \n",
+      "        Returns\n",
+      "        -------\n",
+      "        out : ndarray\n",
+      "            Array of uninitialized (arbitrary) data with the given\n",
+      "            shape, dtype, and order.\n",
+      "        \n",
+      "        See Also\n",
+      "        --------\n",
+      "        empty_like, zeros, ones\n",
+      "        \n",
+      "        Notes\n",
+      "        -----\n",
+      "        `empty`, unlike `zeros`, does not set the array values to zero,\n",
+      "        and may therefore be marginally faster.  On the other hand, it requires\n",
+      "        the user to manually set all the values in the array, and should be\n",
+      "        used with caution.\n",
+      "        \n",
+      "        Examples\n",
+      "        --------\n",
+      "        >>> np.empty([2, 2])\n",
+      "        array([[ -9.74499359e+001,   6.69583040e-309],\n",
+      "               [  2.13182611e-314,   3.06959433e-309]])         #random\n",
+      "        \n",
+      "        >>> np.empty([2, 2], dtype=int)\n",
+      "        array([[-1073741821, -1067949133],\n",
+      "               [  496041986,    19249760]])                     #random\n",
+      "    \n",
+      "    fromstring(...)\n",
+      "        fromstring(string, dtype=float, count=-1, sep='')\n",
+      "        \n",
+      "        A new 1-D array initialized from raw binary or text data in a string.\n",
+      "        \n",
+      "        Parameters\n",
+      "        ----------\n",
+      "        string : str\n",
+      "            A string containing the data.\n",
+      "        dtype : data-type, optional\n",
+      "            The data type of the array; default: float.  For binary input data,\n",
+      "            the data must be in exactly this format.\n",
+      "        count : int, optional\n",
+      "            Read this number of `dtype` elements from the data.  If this is\n",
+      "            negative (the default), the count will be determined from the\n",
+      "            length of the data.\n",
+      "        sep : str, optional\n",
+      "            If not provided or, equivalently, the empty string, the data will\n",
+      "            be interpreted as binary data; otherwise, as ASCII text with\n",
+      "            decimal numbers.  Also in this latter case, this argument is\n",
+      "            interpreted as the string separating numbers in the data; extra\n",
+      "            whitespace between elements is also ignored.\n",
+      "        \n",
+      "        Returns\n",
+      "        -------\n",
+      "        arr : ndarray\n",
+      "            The constructed array.\n",
+      "        \n",
+      "        Raises\n",
+      "        ------\n",
+      "        ValueError\n",
+      "            If the string is not the correct size to satisfy the requested\n",
+      "            `dtype` and `count`.\n",
+      "        \n",
+      "        See Also\n",
+      "        --------\n",
+      "        frombuffer, fromfile, fromiter\n",
+      "        \n",
+      "        Examples\n",
+      "        --------\n",
+      "        >>> np.fromstring('\\x01\\x02', dtype=np.uint8)\n",
+      "        array([1, 2], dtype=uint8)\n",
+      "        >>> np.fromstring('1 2', dtype=int, sep=' ')\n",
+      "        array([1, 2])\n",
+      "        >>> np.fromstring('1, 2', dtype=int, sep=',')\n",
+      "        array([1, 2])\n",
+      "        >>> np.fromstring('\\x01\\x02\\x03\\x04\\x05', dtype=np.uint8, count=3)\n",
+      "        array([1, 2, 3], dtype=uint8)\n",
+      "    \n",
+      "    score_cbow_pair(model, word, word2_indices, l1)\n",
+      "    \n",
+      "    score_sentence_cbow(...)\n",
+      "    \n",
+      "    score_sentence_sg(...)\n",
+      "    \n",
+      "    score_sg_pair(model, word, word2)\n",
+      "    \n",
+      "    train_batch_cbow(...)\n",
+      "    \n",
+      "    train_batch_sg(...)\n",
+      "    \n",
+      "    train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True)\n",
+      "    \n",
+      "    train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, context_vectors=None, context_locks=None)\n",
+      "    \n",
+      "    zeros(...)\n",
+      "        zeros(shape, dtype=float, order='C')\n",
+      "        \n",
+      "        Return a new array of given shape and type, filled with zeros.\n",
+      "        \n",
+      "        Parameters\n",
+      "        ----------\n",
+      "        shape : int or sequence of ints\n",
+      "            Shape of the new array, e.g., ``(2, 3)`` or ``2``.\n",
+      "        dtype : data-type, optional\n",
+      "            The desired data-type for the array, e.g., `numpy.int8`.  Default is\n",
+      "            `numpy.float64`.\n",
+      "        order : {'C', 'F'}, optional\n",
+      "            Whether to store multidimensional data in C- or Fortran-contiguous\n",
+      "            (row- or column-wise) order in memory.\n",
+      "        \n",
+      "        Returns\n",
+      "        -------\n",
+      "        out : ndarray\n",
+      "            Array of zeros with the given shape, dtype, and order.\n",
+      "        \n",
+      "        See Also\n",
+      "        --------\n",
+      "        zeros_like : Return an array of zeros with shape and type of input.\n",
+      "        ones_like : Return an array of ones with shape and type of input.\n",
+      "        empty_like : Return an empty array with shape and type of input.\n",
+      "        ones : Return a new array setting values to one.\n",
+      "        empty : Return a new uninitialized array.\n",
+      "        \n",
+      "        Examples\n",
+      "        --------\n",
+      "        >>> np.zeros(5)\n",
+      "        array([ 0.,  0.,  0.,  0.,  0.])\n",
+      "        \n",
+      "        >>> np.zeros((5,), dtype=np.int)\n",
+      "        array([0, 0, 0, 0, 0])\n",
+      "        \n",
+      "        >>> np.zeros((2, 1))\n",
+      "        array([[ 0.],\n",
+      "               [ 0.]])\n",
+      "        \n",
+      "        >>> s = (2,2)\n",
+      "        >>> np.zeros(s)\n",
+      "        array([[ 0.,  0.],\n",
+      "               [ 0.,  0.]])\n",
+      "        \n",
+      "        >>> np.zeros((2,), dtype=[('x', 'i4'), ('y', 'i4')]) # custom dtype\n",
+      "        array([(0, 0), (0, 0)],\n",
+      "              dtype=[('x', '<i4'), ('y', '<i4')])\n",
+      "\n",
+      "DATA\n",
+      "    FAST_VERSION = 0\n",
+      "    MAX_WORDS_IN_BATCH = 10000\n",
+      "    division = _Feature((2, 2, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0), 8192...\n",
+      "    exp = <ufunc 'exp'>\n",
+      "    log = <ufunc 'log'>\n",
+      "    logger = <logging.Logger object>\n",
+      "    newaxis = None\n",
+      "    sqrt = <ufunc 'sqrt'>\n",
+      "    string_types = (<class 'str'>,)\n",
+      "\n",
+      "FILE\n",
+      "    /Users/awculott/.local/lib/python3.5/site-packages/gensim/models/word2vec.py\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from gensim.models import word2vec\n",
+    "# Tutorial: http://rare-technologies.com/deep-learning-with-word2vec-and-gensim/\n",
+    "help(word2vec)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2407167"
+      ]
+     },
+     "execution_count": 56,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sum(len(t) for t in tokens)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
+    "model = word2vec.Word2Vec(tokens, size=100, window=5, min_count=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "model.init_sims(replace=True)  # keep only the L2-normalized vectors to save memory; the model cannot be trained further after this."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('bikes', 0.8544794321060181),\n",
+       " ('spots', 0.8441545367240906),\n",
+       " ('decades', 0.8351261615753174),\n",
+       " ('mice', 0.8316686749458313),\n",
+       " ('plants', 0.8287297487258911),\n",
+       " ('seats', 0.822428286075592),\n",
+       " ('masters', 0.8212233781814575),\n",
+       " ('antibiotics', 0.8196142315864563),\n",
+       " ('adults', 0.8149713277816772),\n",
+       " ('buses', 0.812675952911377)]"
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.most_similar(positive=['cars', 'trucks'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('crime', 0.882451593875885),\n",
+       " ('military', 0.8497803211212158),\n",
+       " ('criminal', 0.8445150852203369),\n",
+       " ('safety', 0.7943019866943359),\n",
+       " ('defense', 0.7925150394439697),\n",
+       " ('community', 0.7909201979637146),\n",
+       " ('armed', 0.7859578132629395),\n",
+       " ('drug', 0.780666708946228),\n",
+       " ('violent', 0.777873158454895),\n",
+       " ('self', 0.7714388370513916)]"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.most_similar(positive=['gun'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'engine'"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.doesnt_match(['mouse', 'engine', 'cpu'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.79739832314678638"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.n_similarity(['chip', 'cpu'], ['software', 'algorithm'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.11561933793959725"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.n_similarity(['chip', 'cpu'], ['religion', 'belief'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('items', 0.6851003170013428),\n",
+       " ('vendors', 0.667102038860321),\n",
+       " ('inanimate', 0.6526215672492981),\n",
+       " ('prophesied', 0.6507330536842346),\n",
+       " ('models', 0.6335091590881348),\n",
+       " ('solicited', 0.625587522983551),\n",
+       " ('places', 0.6187167167663574),\n",
+       " ('sizes', 0.6169173717498779),\n",
+       " ('assemblers', 0.6047966480255127),\n",
+       " ('tapes', 0.6001349091529846)]"
+      ]
+     },
+     "execution_count": 62,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.most_similar(positive=['cars', 'guns', 'prices'], negative=['car', 'gun'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}