05_cta_notebook1.html

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
  <meta charset="utf-8">
  <meta name="generator" content="quarto-0.9.165">
  <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
  <title>Topic models</title>
  <style>
    /* Page-level stylesheet embedded in the document. In order: generic text
       helpers, layout of highlighted code blocks, optional line numbering for
       pre.numberSource, and the per-token syntax-highlighting colours. */

    /* Generic helpers: wrapping inline code, small caps, underline,
       half-width inline-block columns, hanging indents, bullet-free task lists. */
    code{white-space: pre-wrap;}
    span.smallcaps{font-variant: small-caps;}
    span.underline{text-decoration: underline;}
    div.column{display: inline-block; vertical-align: top; width: 50%;}
    div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
    ul.task-list{list-style: none;}
    /* Highlighted code blocks: whitespace is kept as written (overriding the
       pre-wrap set on plain code above) and every direct child span of the
       code element becomes an inline-block with its own line height. */
    pre > code.sourceCode { white-space: pre; position: relative; }
    pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
    /* Empty child spans are given an explicit height; presumably so that blank
       source lines still occupy a row (verify against rendered output). */
    pre > code.sourceCode > span:empty { height: 1.2em; }
    .sourceCode { overflow: visible; }
    code.sourceCode > span { color: inherit; text-decoration: inherit; }
    div.sourceCode { margin: 1em 0; }
    pre.sourceCode { margin: 0; }
    /* On screen, the wrapping div uses overflow: auto instead of the
       overflow: visible declared for .sourceCode above. */
    @media screen {
    div.sourceCode { overflow: auto; }
    }
    /* In print, code is allowed to wrap; the negative text-indent paired with
       an equal left padding gives wrapped continuation lines a hanging indent. */
    @media print {
    pre > code.sourceCode { white-space: pre-wrap; }
    pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
    }
    /* Line numbering for pre.numberSource: the code element resets the
       "source-line" counter, each child span increments it, and the ::before
       of the first anchor in each span prints the current value. */
    pre.numberSource code
      { counter-reset: source-line 0; }
    pre.numberSource code > span
      { position: relative; left: -4em; counter-increment: source-line; }
    pre.numberSource code > span > a:first-child::before
      { content: counter(source-line);
        position: relative; left: -1em; text-align: right; vertical-align: baseline;
        border: none; display: inline-block;
        /* user-select: none (plus vendor-prefixed forms) makes the generated
           numbers non-selectable. */
        -webkit-touch-callout: none; -webkit-user-select: none;
        -khtml-user-select: none; -moz-user-select: none;
        -ms-user-select: none; user-select: none;
        padding: 0 4px; width: 4em;
        color: #aaaaaa;
      }
    pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa;  padding-left: 4px; }
    /* Empty rule: declares nothing. */
    div.sourceCode
      {   }
    /* On screen, underline the ::before pseudo-element of each line's first anchor. */
    @media screen {
    pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
    }
    /* Syntax-highlighting token classes. The trailing comment on each rule
       names the token kind; rules with an empty body add no styling of their own. */
    code span.al { color: #ff0000; font-weight: bold; } /* Alert */
    code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
    code span.at { color: #7d9029; } /* Attribute */
    code span.bn { color: #40a070; } /* BaseN */
    code span.bu { } /* BuiltIn */
    code span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
    code span.ch { color: #4070a0; } /* Char */
    code span.cn { color: #880000; } /* Constant */
    code span.co { color: #60a0b0; font-style: italic; } /* Comment */
    code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
    code span.do { color: #ba2121; font-style: italic; } /* Documentation */
    code span.dt { color: #902000; } /* DataType */
    code span.dv { color: #40a070; } /* DecVal */
    code span.er { color: #ff0000; font-weight: bold; } /* Error */
    code span.ex { } /* Extension */
    code span.fl { color: #40a070; } /* Float */
    code span.fu { color: #06287e; } /* Function */
    code span.im { } /* Import */
    code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
    code span.kw { color: #007020; font-weight: bold; } /* Keyword */
    code span.op { color: #666666; } /* Operator */
    code span.ot { color: #007020; } /* Other */
    code span.pp { color: #bc7a00; } /* Preprocessor */
    code span.sc { color: #4070a0; } /* SpecialChar */
    code span.ss { color: #bb6688; } /* SpecialString */
    code span.st { color: #4070a0; } /* String */
    code span.va { color: #19177c; } /* Variable */
    code span.vs { color: #4070a0; } /* VerbatimString */
    code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
  </style>

  <script src="05_cta_notebook1_files/libs/clipboard/clipboard.min.js"></script>
  <script src="05_cta_notebook1_files/libs/quarto-html/quarto.js"></script>
  <script src="05_cta_notebook1_files/libs/quarto-html/popper.min.js"></script>
  <script src="05_cta_notebook1_files/libs/quarto-html/tippy.umd.min.js"></script>
  <script src="05_cta_notebook1_files/libs/quarto-html/anchor.min.js"></script>
  <link href="05_cta_notebook1_files/libs/quarto-html/tippy.css" rel="stylesheet">
  <link id="quarto-text-highlighting-styles" href="05_cta_notebook1_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet">
  <script src="05_cta_notebook1_files/libs/bootstrap/bootstrap.min.js"></script>
  <link href="05_cta_notebook1_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
  <link href="05_cta_notebook1_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet">
  <!--[if lt IE 9]>
    <script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.3/html5shiv-printshiv.min.js"></script>
  <![endif]-->
</head>
<body class="fullcontent">
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">

<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">


<div class="quarto-title"><h1 class="title display-7">Topic models</h1><p class="subtitle lead">SICSS, 2022</p></div></header>
<section id="topic-modelling-notebook" class="level1">
<h1>Topic modelling notebook</h1>
<p>This hands-on exercise focuses on: 1) estimating a topic model; 2) interpreting and visualizing results.</p>
<p>In this tutorial, you will learn how to:</p>
<ul>
<li>Generate document-term-matrices in a format appropriate for topic modelling</li>
<li>Estimate a topic model using the <code>quanteda</code> and <code>topicmodels</code> packages</li>
<li>Visualize results</li>
<li>Reverse engineer a test of model accuracy</li>
<li>Run some validation tests</li>
</ul>
<section id="setup" class="level2">
<h2 class="anchored" data-anchor-id="setup">Setup</h2>
<p>Before proceeding, we’ll load the packages we will need for this tutorial.</p>
<div class="cell">
<div class="sourceCode" id="cb1"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidyverse) <span class="co"># loads dplyr, ggplot2, and others</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(stringr) <span class="co"># to handle text elements</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tidytext) <span class="co"># includes set of functions useful for manipulating text</span></span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(topicmodels) <span class="co"># to estimate topic models</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(gutenbergr) <span class="co"># to get text data</span></span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(scales)</span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(tm)</span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(ggthemes) <span class="co"># to make your plots look nice</span></span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(readr)</span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(quanteda)</span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(quanteda.textmodels)</span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="co">#devtools::install_github("matthewjdenny/preText")</span></span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="fu">library</span>(preText)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>We’ll be using data from Alexis de Tocqueville’s “Democracy in America.” We will download these data, both Volume 1 and Volume 2, and combine them into one data frame. For this, we’ll be using the <code>gutenbergr</code> package, which allows the user to download text data from over 60,000 out-of-copyright books. The ID for each book appears in the URL for the book selected after a search on <a href="https://www.gutenberg.org/ebooks/">https://www.gutenberg.org/ebooks/</a>.</p>
<p>This example is adapted from <a href="https://www.tidytextmining.com/">Text Mining with R: A Tidy Approach</a> by Julia Silge and David Robinson.</p>
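<p><img src="data/gutenberg.gif" class="img-fluid" style="width:100.0%" alt="Searching Project Gutenberg for Democracy in America and reading the book ID from the page URL"></p>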
<p>Here, we see that Volume 1 of Tocqueville’s “Democracy in America” is stored as “815”. A separate search reveals that Volume 2 is stored as “816”.</p>
<div class="cell">
<div class="sourceCode" id="cb2"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>tocq <span class="ot">&lt;-</span> <span class="fu">gutenberg_download</span>(<span class="fu">c</span>(<span class="dv">815</span>, <span class="dv">816</span>), </span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>                            <span class="at">meta_fields =</span> <span class="st">"author"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Or we can load a locally saved copy of the dataset with:</p>
<div class="cell">
<div class="sourceCode" id="cb3"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>tocq <span class="ot">&lt;-</span> <span class="fu">readRDS</span>(<span class="st">"data/tocq.rds"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>If you’re working on this document from your own computer (“locally”) you can download the data in the following way:</p>
<div class="cell">
<div class="sourceCode" id="cb4"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>tocq  <span class="ot">&lt;-</span> <span class="fu">readRDS</span>(<span class="fu">gzcon</span>(<span class="fu">url</span>(<span class="st">"https://github.com/cjbarrie/CTA-ED/blob/main/data/topicmodels/tocq.RDS?raw=true"</span>)))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>Once we have read in these data, we convert them into a different data shape: the document-term-matrix. We also create a new column, which we call “booknumber”, that records whether the term in question is from Volume 1 or Volume 2. To convert from tidy into “DocumentTermMatrix” format we can first use <code>unnest_tokens()</code> as we have done in past exercises, remove stop words, and then use the <code>cast_dtm()</code> function to convert into a “DocumentTermMatrix” object.</p>
<div class="cell">
<div class="sourceCode" id="cb5"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>tocq_words <span class="ot">&lt;-</span> tocq <span class="sc">%&gt;%</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">booknumber =</span> <span class="fu">ifelse</span>(gutenberg_id<span class="sc">==</span><span class="dv">815</span>, <span class="st">"DiA1"</span>, <span class="st">"DiA2"</span>)) <span class="sc">%&gt;%</span></span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">unnest_tokens</span>(word, text) <span class="sc">%&gt;%</span></span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">filter</span>(<span class="sc">!</span><span class="fu">is.na</span>(word)) <span class="sc">%&gt;%</span></span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(booknumber, word, <span class="at">sort =</span> <span class="cn">TRUE</span>) <span class="sc">%&gt;%</span></span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ungroup</span>() <span class="sc">%&gt;%</span></span>
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">anti_join</span>(stop_words)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stderr">
<pre><code>Joining, by = "word"</code></pre>
</div>
<div class="sourceCode" id="cb7"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>tocq_dtm <span class="ot">&lt;-</span> tocq_words <span class="sc">%&gt;%</span></span>
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">cast_dtm</span>(booknumber, word, n)</span>
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a>tm<span class="sc">::</span><span class="fu">inspect</span>(tocq_dtm)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code>&lt;&lt;DocumentTermMatrix (documents: 2, terms: 12092)&gt;&gt;
Non-/sparse entries: 17581/6603
Sparsity           : 27%
Maximal term length: 18
Weighting          : term frequency (tf)
Sample             :
      Terms
Docs   country democratic government laws nations people power society time
  DiA1     357        212        556  397     233    516   543     290  311
  DiA2     167        561        162  133     313    360   263     241  309
      Terms
Docs   united
  DiA1    554
  DiA2    227</code></pre>
</div>
</div>
<p>We see here that the data are now stored as a “DocumentTermMatrix.” In this format, the matrix records the term (as the equivalent of a column) and the document (as the equivalent of a row), and the number of times the term appears in the given document. Many terms will not appear in a given document, so the matrix is stored as “sparse,” meaning there will be a preponderance of zeroes. Here, since we are looking only at two documents that both come from a single volume set, the sparsity is relatively low (only 27%). In most applications, the sparsity will be a lot higher, approaching 99% or more.</p>
<p>Estimating our topic model is then relatively simple. All we need to do is specify how many topics we want to search for, and we can also set our seed, which is needed to reproduce the same results each time (as the model is a generative probabilistic one, meaning different random iterations will produce different results).</p>
<div class="cell">
<div class="sourceCode" id="cb9"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>tocq_lda <span class="ot">&lt;-</span> <span class="fu">LDA</span>(tocq_dtm, <span class="at">k =</span> <span class="dv">10</span>, <span class="at">control =</span> <span class="fu">list</span>(<span class="at">seed =</span> <span class="dv">1234</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>After this we can extract the per-topic-per-word probabilities, called “β” from the model:</p>
<div class="cell">
<div class="sourceCode" id="cb10"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a>tocq_topics <span class="ot">&lt;-</span> <span class="fu">tidy</span>(tocq_lda, <span class="at">matrix =</span> <span class="st">"beta"</span>)</span>
<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a><span class="fu">head</span>(tocq_topics, <span class="at">n =</span> <span class="dv">10</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code># A tibble: 10 × 3
   topic term          beta
   &lt;int&gt; &lt;chr&gt;        &lt;dbl&gt;
 1     1 democratic 0.00855
 2     2 democratic 0.0115 
 3     3 democratic 0.00444
 4     4 democratic 0.0193 
 5     5 democratic 0.00254
 6     6 democratic 0.00866
 7     7 democratic 0.00165
 8     8 democratic 0.0108 
 9     9 democratic 0.00276
10    10 democratic 0.00334</code></pre>
</div>
</div>
<p>We now have data stored as one topic-per-term-per-row. The betas listed here represent the probability that the given term belongs to a given topic. So, here, we see that the term “democratic” is most likely to belong to topic 4. Strictly, this probability represents the probability that the term is generated from the topic in question.</p>
<p>We can then plot the top terms, in terms of beta, for each topic as follows:</p>
<div class="cell">
<div class="sourceCode" id="cb12"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a>tocq_top_terms <span class="ot">&lt;-</span> tocq_topics <span class="sc">%&gt;%</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">group_by</span>(topic) <span class="sc">%&gt;%</span></span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">top_n</span>(<span class="dv">10</span>, beta) <span class="sc">%&gt;%</span></span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ungroup</span>() <span class="sc">%&gt;%</span></span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">arrange</span>(topic, <span class="sc">-</span>beta)</span>
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb12-7"><a href="#cb12-7" aria-hidden="true" tabindex="-1"></a>tocq_top_terms <span class="sc">%&gt;%</span></span>
<span id="cb12-8"><a href="#cb12-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">term =</span> <span class="fu">reorder_within</span>(term, beta, topic)) <span class="sc">%&gt;%</span></span>
<span id="cb12-9"><a href="#cb12-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ggplot</span>(<span class="fu">aes</span>(beta, term, <span class="at">fill =</span> <span class="fu">factor</span>(topic))) <span class="sc">+</span></span>
<span id="cb12-10"><a href="#cb12-10" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_col</span>(<span class="at">show.legend =</span> <span class="cn">FALSE</span>) <span class="sc">+</span></span>
<span id="cb12-11"><a href="#cb12-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">facet_wrap</span>(<span class="sc">~</span> topic, <span class="at">scales =</span> <span class="st">"free"</span>, <span class="at">ncol =</span> <span class="dv">4</span>) <span class="sc">+</span></span>
<span id="cb12-12"><a href="#cb12-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_y_reordered</span>() <span class="sc">+</span></span>
<span id="cb12-13"><a href="#cb12-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_tufte</span>(<span class="at">base_family =</span> <span class="st">"Helvetica"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
<p><img src="05_cta_notebook1_files/figure-html/unnamed-chunk-16-1.png" class="img-fluid" width="672" alt="Faceted bar charts of the top terms by beta for each of the ten topics"></p>
</div>
</div>
<p>But how do we actually evaluate these topics? Here, the topics all seem pretty similar.</p>
</section>
<section id="evaluating-topic-model" class="level2">
<h2 class="anchored" data-anchor-id="evaluating-topic-model">Evaluating topic model</h2>
<p>Well, one way to evaluate the performance of unsupervised forms of classification is by testing our model on an outcome that is already known.</p>
<p>Here, two topics that are most obvious are the ‘topics’ Volume 1 and Volume 2 of Tocqueville’s “Democracy in America.” Volume 1 of Tocqueville’s work deals more obviously with abstract constitutional ideas and questions of race; Volume 2 focuses on more esoteric aspects of American society. Listen to an “In Our Time” episode with Melvyn Bragg discussing Democracy in America <a href="https://www.bbc.co.uk/programmes/b09vyw0x">here</a>.</p>
<p>Given these differences in focus, we might think that a generative model could assign text to the correct topic (i.e., Volume) with some accuracy.</p>
<section id="plot-relative-word-frequencies" class="level3">
<h3 class="anchored" data-anchor-id="plot-relative-word-frequencies">Plot relative word frequencies</h3>
<p>First let’s have a look and see whether there really are words obviously distinguishing the two Volumes.</p>
<div class="cell">
<div class="sourceCode" id="cb13"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>tidy_tocq <span class="ot">&lt;-</span> tocq <span class="sc">%&gt;%</span></span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">unnest_tokens</span>(word, text) <span class="sc">%&gt;%</span></span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">anti_join</span>(stop_words)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stderr">
<pre><code>Joining, by = "word"</code></pre>
</div>
<div class="sourceCode" id="cb15"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="do">## Count most common words in both</span></span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a>tidy_tocq <span class="sc">%&gt;%</span></span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(word, <span class="at">sort =</span> <span class="cn">TRUE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code># A tibble: 12,092 × 2
   word           n
   &lt;chr&gt;      &lt;int&gt;
 1 people       876
 2 power        806
 3 united       781
 4 democratic   773
 5 government   718
 6 time         620
 7 nations      546
 8 society      531
 9 laws         530
10 country      524
# … with 12,082 more rows</code></pre>
</div>
<div class="sourceCode" id="cb17"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a>bookfreq <span class="ot">&lt;-</span> tidy_tocq <span class="sc">%&gt;%</span></span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">booknumber =</span> <span class="fu">ifelse</span>(gutenberg_id<span class="sc">==</span><span class="dv">815</span>, <span class="st">"DiA1"</span>, <span class="st">"DiA2"</span>)) <span class="sc">%&gt;%</span></span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">word =</span> <span class="fu">str_extract</span>(word, <span class="st">"[a-z']+"</span>)) <span class="sc">%&gt;%</span></span>
<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(booknumber, word) <span class="sc">%&gt;%</span></span>
<span id="cb17-5"><a href="#cb17-5" aria-hidden="true" tabindex="-1"></a>  <span class="fu">group_by</span>(booknumber) <span class="sc">%&gt;%</span></span>
<span id="cb17-6"><a href="#cb17-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">proportion =</span> n <span class="sc">/</span> <span class="fu">sum</span>(n)) <span class="sc">%&gt;%</span> </span>
<span id="cb17-7"><a href="#cb17-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">select</span>(<span class="sc">-</span>n) <span class="sc">%&gt;%</span> </span>
<span id="cb17-8"><a href="#cb17-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">spread</span>(booknumber, proportion)</span>
<span id="cb17-9"><a href="#cb17-9" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb17-10"><a href="#cb17-10" aria-hidden="true" tabindex="-1"></a><span class="fu">ggplot</span>(bookfreq, <span class="fu">aes</span>(<span class="at">x =</span> DiA1, <span class="at">y =</span> DiA2, <span class="at">color =</span> <span class="fu">abs</span>(DiA1 <span class="sc">-</span> DiA2))) <span class="sc">+</span></span>
<span id="cb17-11"><a href="#cb17-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_abline</span>(<span class="at">color =</span> <span class="st">"gray40"</span>, <span class="at">lty =</span> <span class="dv">2</span>) <span class="sc">+</span></span>
<span id="cb17-12"><a href="#cb17-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_jitter</span>(<span class="at">alpha =</span> <span class="fl">0.1</span>, <span class="at">size =</span> <span class="fl">2.5</span>, <span class="at">width =</span> <span class="fl">0.3</span>, <span class="at">height =</span> <span class="fl">0.3</span>) <span class="sc">+</span></span>
<span id="cb17-13"><a href="#cb17-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_text</span>(<span class="fu">aes</span>(<span class="at">label =</span> word), <span class="at">check_overlap =</span> <span class="cn">TRUE</span>, <span class="at">vjust =</span> <span class="fl">1.5</span>) <span class="sc">+</span></span>
<span id="cb17-14"><a href="#cb17-14" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_x_log10</span>(<span class="at">labels =</span> <span class="fu">percent_format</span>()) <span class="sc">+</span></span>
<span id="cb17-15"><a href="#cb17-15" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_y_log10</span>(<span class="at">labels =</span> <span class="fu">percent_format</span>()) <span class="sc">+</span></span>
<span id="cb17-16"><a href="#cb17-16" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_color_gradient</span>(<span class="at">limits =</span> <span class="fu">c</span>(<span class="dv">0</span>, <span class="fl">0.001</span>), <span class="at">low =</span> <span class="st">"darkslategray4"</span>, <span class="at">high =</span> <span class="st">"gray75"</span>) <span class="sc">+</span></span>
<span id="cb17-17"><a href="#cb17-17" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_tufte</span>(<span class="at">base_family =</span> <span class="st">"Helvetica"</span>) <span class="sc">+</span></span>
<span id="cb17-18"><a href="#cb17-18" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme</span>(<span class="at">legend.position=</span><span class="st">"none"</span>, </span>
<span id="cb17-19"><a href="#cb17-19" aria-hidden="true" tabindex="-1"></a>        <span class="at">strip.background =</span> <span class="fu">element_blank</span>(), </span>
<span id="cb17-20"><a href="#cb17-20" aria-hidden="true" tabindex="-1"></a>        <span class="at">strip.text.x =</span> <span class="fu">element_blank</span>()) <span class="sc">+</span></span>
<span id="cb17-21"><a href="#cb17-21" aria-hidden="true" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">"Tocqueville DiA 2"</span>, <span class="at">y =</span> <span class="st">"Tocqueville DiA 1"</span>) <span class="sc">+</span></span>
<span id="cb17-22"><a href="#cb17-22" aria-hidden="true" tabindex="-1"></a>  <span class="fu">coord_equal</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stderr">
<pre><code>Warning: Removed 6173 rows containing missing values (geom_point).</code></pre>
</div>
<div class="cell-output-stderr">
<pre><code>Warning: Removed 6174 rows containing missing values (geom_text).</code></pre>
</div>
<div class="cell-output-display">
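<p><img src="05_cta_notebook1_files/figure-html/unnamed-chunk-18-1.png" class="img-fluid" width="672" alt="Scatter plot on log scales comparing word proportions in Volume 1 and Volume 2, with a dashed diagonal reference line"></p>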
</div>
</div>
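<p>We see that there do seem to be some marked distinguishing characteristics. In the plot above, for example, we see that more abstract notions of state systems appear with greater frequency in Volume 1 while Volume 2 seems to contain words specific to America (e.g., “north” and “south”) with greater frequency. The way to read the above plot is that words positioned further away from the diagonal line appear with greater frequency in one volume versus the other. Note that the plotting code maps <code>DiA1</code> to the x-axis and <code>DiA2</code> to the y-axis, while the axis labels passed to <code>labs()</code> name them the other way round, so check which volume each axis actually shows before attributing a word to a volume.</p>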
</section>
<section id="split-into-chapter-documents" class="level3">
<h3 class="anchored" data-anchor-id="split-into-chapter-documents">Split into chapter documents</h3>
<p>In the below, we first separate the volumes into chapters, then we repeat the same procedure as above. The only difference now is that instead of two documents representing the two full volumes of Tocqueville’s work, we now have 132 documents, each representing an individual chapter. Notice now that the sparsity is much increased: around 96%.</p>
<div class="cell">
<div class="sourceCode" id="cb20"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a>tocq <span class="ot">&lt;-</span> tocq <span class="sc">%&gt;%</span></span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">filter</span>(<span class="sc">!</span><span class="fu">is.na</span>(text))</span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Divide into documents, each representing one chapter</span></span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a>tocq_chapter <span class="ot">&lt;-</span> tocq <span class="sc">%&gt;%</span></span>
<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">booknumber =</span> <span class="fu">ifelse</span>(gutenberg_id<span class="sc">==</span><span class="dv">815</span>, <span class="st">"DiA1"</span>, <span class="st">"DiA2"</span>)) <span class="sc">%&gt;%</span></span>
<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">group_by</span>(booknumber) <span class="sc">%&gt;%</span></span>
<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">chapter =</span> <span class="fu">cumsum</span>(<span class="fu">str_detect</span>(text, <span class="fu">regex</span>(<span class="st">"^chapter "</span>, <span class="at">ignore_case =</span> <span class="cn">TRUE</span>)))) <span class="sc">%&gt;%</span></span>
<span id="cb20-9"><a href="#cb20-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ungroup</span>() <span class="sc">%&gt;%</span></span>
<span id="cb20-10"><a href="#cb20-10" aria-hidden="true" tabindex="-1"></a>  <span class="fu">filter</span>(chapter <span class="sc">&gt;</span> <span class="dv">0</span>) <span class="sc">%&gt;%</span></span>
<span id="cb20-11"><a href="#cb20-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">unite</span>(document, booknumber, chapter)</span>
<span id="cb20-12"><a href="#cb20-12" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-13"><a href="#cb20-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Split into words</span></span>
<span id="cb20-14"><a href="#cb20-14" aria-hidden="true" tabindex="-1"></a>tocq_chapter_word <span class="ot">&lt;-</span> tocq_chapter <span class="sc">%&gt;%</span></span>
<span id="cb20-15"><a href="#cb20-15" aria-hidden="true" tabindex="-1"></a>  <span class="fu">unnest_tokens</span>(word, text)</span>
<span id="cb20-16"><a href="#cb20-16" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb20-17"><a href="#cb20-17" aria-hidden="true" tabindex="-1"></a><span class="co"># Find document-word counts</span></span>
<span id="cb20-18"><a href="#cb20-18" aria-hidden="true" tabindex="-1"></a>tocq_word_counts <span class="ot">&lt;-</span> tocq_chapter_word <span class="sc">%&gt;%</span></span>
<span id="cb20-19"><a href="#cb20-19" aria-hidden="true" tabindex="-1"></a>  <span class="fu">anti_join</span>(stop_words) <span class="sc">%&gt;%</span></span>
<span id="cb20-20"><a href="#cb20-20" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(document, word, <span class="at">sort =</span> <span class="cn">TRUE</span>) <span class="sc">%&gt;%</span></span>
<span id="cb20-21"><a href="#cb20-21" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ungroup</span>()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stderr">
<pre><code>Joining, by = "word"</code></pre>
</div>
<div class="sourceCode" id="cb22"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a>tocq_word_counts</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code># A tibble: 69,781 × 3
   document word             n
   &lt;chr&gt;    &lt;chr&gt;        &lt;int&gt;
 1 DiA2_76  united          88
 2 DiA2_60  honor           70
 3 DiA1_52  union           66
 4 DiA2_76  president       60
 5 DiA2_76  law             59
 6 DiA1_42  jury            57
 7 DiA2_76  time            50
 8 DiA1_11  township        49
 9 DiA1_21  federal         48
10 DiA2_76  constitution    48
# … with 69,771 more rows</code></pre>
</div>
<div class="sourceCode" id="cb24"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Cast into DTM format for LDA analysis</span></span>
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a>tocq_chapters_dtm <span class="ot">&lt;-</span> tocq_word_counts <span class="sc">%&gt;%</span></span>
<span id="cb24-4"><a href="#cb24-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">cast_dtm</span>(document, word, n)</span>
<span id="cb24-5"><a href="#cb24-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb24-6"><a href="#cb24-6" aria-hidden="true" tabindex="-1"></a>tm<span class="sc">::</span><span class="fu">inspect</span>(tocq_chapters_dtm)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code>&lt;&lt;DocumentTermMatrix (documents: 132, terms: 11898)&gt;&gt;
Non-/sparse entries: 69781/1500755
Sparsity           : 96%
Maximal term length: 18
Weighting          : term frequency (tf)
Sample             :
         Terms
Docs      country democratic government laws nations people power public time
  DiA1_11      10          0         23   19       7     13    19     15    6
  DiA1_13      13          5         34    9      12     17    37     15    6
  DiA1_20       9          0         25   13       2     14    32     13   10
  DiA1_21       4          0         20   29       6     12    20      5    5
  DiA1_23      10          0         35    9      24     20    13      4    8
  DiA1_31       7         12         10   13       4     30    18     31    6
  DiA1_32      10         14         25    6       9     25    11     43    8
  DiA1_47      12          2          5    3       3      6     8      0    3
  DiA1_56      12          0          3    7      19      3     8      3   22
  DiA2_76      11         10         24   39      12     31    27     27   50
         Terms
Docs      united
  DiA1_11     13
  DiA1_13     19
  DiA1_20     21
  DiA1_21     23
  DiA1_23     15
  DiA1_31     11
  DiA1_32     14
  DiA1_47      8
  DiA1_56     25
  DiA2_76     88</code></pre>
</div>
</div>
<p>We then re-estimate the topic model with this new DocumentTermMatrix object, specifying k equal to 2. This will enable us to evaluate whether a topic model is able to accurately assign each chapter to the volume it came from.</p>
<div class="cell">
<div class="sourceCode" id="cb26"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a>tocq_chapters_lda <span class="ot">&lt;-</span> <span class="fu">LDA</span>(tocq_chapters_dtm, <span class="at">k =</span> <span class="dv">2</span>, <span class="at">control =</span> <span class="fu">list</span>(<span class="at">seed =</span> <span class="dv">1234</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>After this, it is worth looking at another output of the Latent Dirichlet Allocation procedure. The γ probability represents the per-document-per-topic probability or, in other words, the probability that a given document (here: chapter) belongs to a particular topic (and here, we are assuming these topics represent volumes).</p>
<p>The gamma values are therefore the estimated proportion of words within a given chapter allocated to a given volume.</p>
<div class="cell">
<div class="sourceCode" id="cb27"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a>tocq_chapters_gamma <span class="ot">&lt;-</span> <span class="fu">tidy</span>(tocq_chapters_lda, <span class="at">matrix =</span> <span class="st">"gamma"</span>)</span>
<span id="cb27-2"><a href="#cb27-2" aria-hidden="true" tabindex="-1"></a>tocq_chapters_gamma</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code># A tibble: 264 × 3
   document topic     gamma
   &lt;chr&gt;    &lt;int&gt;     &lt;dbl&gt;
 1 DiA2_76      1 0.551    
 2 DiA2_60      1 1.00     
 3 DiA1_52      1 0.0000464
 4 DiA1_42      1 0.0000746
 5 DiA1_11      1 0.0000382
 6 DiA1_21      1 0.0000437
 7 DiA1_20      1 0.0000425
 8 DiA1_28      1 0.249    
 9 DiA1_50      1 0.0000477
10 DiA1_22      1 0.0000466
# … with 254 more rows</code></pre>
</div>
</div>
</section>
<section id="examine-consensus" class="level3">
<h3 class="anchored" data-anchor-id="examine-consensus">Examine consensus</h3>
<p>Now that we have these topic probabilities, we can see how well our unsupervised learning did at distinguishing the two volumes generatively just from the words contained in each chapter.</p>
<div class="cell">
<div class="sourceCode" id="cb29"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb29-1"><a href="#cb29-1" aria-hidden="true" tabindex="-1"></a><span class="co"># First separate the document name into title and chapter</span></span>
<span id="cb29-2"><a href="#cb29-2" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb29-3"><a href="#cb29-3" aria-hidden="true" tabindex="-1"></a>tocq_chapters_gamma <span class="ot">&lt;-</span> tocq_chapters_gamma <span class="sc">%&gt;%</span></span>
<span id="cb29-4"><a href="#cb29-4" aria-hidden="true" tabindex="-1"></a>  <span class="fu">separate</span>(document, <span class="fu">c</span>(<span class="st">"title"</span>, <span class="st">"chapter"</span>), <span class="at">sep =</span> <span class="st">"_"</span>, <span class="at">convert =</span> <span class="cn">TRUE</span>)</span>
<span id="cb29-5"><a href="#cb29-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb29-6"><a href="#cb29-6" aria-hidden="true" tabindex="-1"></a>tocq_chapter_classifications <span class="ot">&lt;-</span> tocq_chapters_gamma <span class="sc">%&gt;%</span></span>
<span id="cb29-7"><a href="#cb29-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">group_by</span>(title, chapter) <span class="sc">%&gt;%</span></span>
<span id="cb29-8"><a href="#cb29-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">top_n</span>(<span class="dv">1</span>, gamma) <span class="sc">%&gt;%</span></span>
<span id="cb29-9"><a href="#cb29-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ungroup</span>()</span>
<span id="cb29-10"><a href="#cb29-10" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb29-11"><a href="#cb29-11" aria-hidden="true" tabindex="-1"></a>tocq_book_topics <span class="ot">&lt;-</span> tocq_chapter_classifications <span class="sc">%&gt;%</span></span>
<span id="cb29-12"><a href="#cb29-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(title, topic) <span class="sc">%&gt;%</span></span>
<span id="cb29-13"><a href="#cb29-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">group_by</span>(title) <span class="sc">%&gt;%</span></span>
<span id="cb29-14"><a href="#cb29-14" aria-hidden="true" tabindex="-1"></a>  <span class="fu">top_n</span>(<span class="dv">1</span>, n) <span class="sc">%&gt;%</span></span>
<span id="cb29-15"><a href="#cb29-15" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ungroup</span>() <span class="sc">%&gt;%</span></span>
<span id="cb29-16"><a href="#cb29-16" aria-hidden="true" tabindex="-1"></a>  <span class="fu">transmute</span>(<span class="at">consensus =</span> title, topic)</span>
<span id="cb29-17"><a href="#cb29-17" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb29-18"><a href="#cb29-18" aria-hidden="true" tabindex="-1"></a>tocq_chapter_classifications <span class="sc">%&gt;%</span></span>
<span id="cb29-19"><a href="#cb29-19" aria-hidden="true" tabindex="-1"></a>  <span class="fu">inner_join</span>(tocq_book_topics, <span class="at">by =</span> <span class="st">"topic"</span>) <span class="sc">%&gt;%</span></span>
<span id="cb29-20"><a href="#cb29-20" aria-hidden="true" tabindex="-1"></a>  <span class="fu">filter</span>(title <span class="sc">!=</span> consensus)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code># A tibble: 15 × 5
   title chapter topic gamma consensus
   &lt;chr&gt;   &lt;int&gt; &lt;int&gt; &lt;dbl&gt; &lt;chr&gt;    
 1 DiA1       45     1 0.762 DiA2     
 2 DiA1        5     1 0.504 DiA2     
 3 DiA1       33     1 0.570 DiA2     
 4 DiA1       34     1 0.626 DiA2     
 5 DiA1       41     1 0.512 DiA2     
 6 DiA1       44     1 0.765 DiA2     
 7 DiA1        8     1 0.791 DiA2     
 8 DiA1        4     1 0.717 DiA2     
 9 DiA1       35     1 0.576 DiA2     
10 DiA1       39     1 0.577 DiA2     
11 DiA1        7     1 0.687 DiA2     
12 DiA1       29     1 0.983 DiA2     
13 DiA1        6     1 0.707 DiA2     
14 DiA2       27     2 0.654 DiA1     
15 DiA2       21     2 0.510 DiA1     </code></pre>
</div>
<div class="sourceCode" id="cb31"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb31-1"><a href="#cb31-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Look at document-word pairs to see which words in each document were assigned</span></span>
<span id="cb31-2"><a href="#cb31-2" aria-hidden="true" tabindex="-1"></a><span class="co"># to a given topic</span></span>
<span id="cb31-3"><a href="#cb31-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb31-4"><a href="#cb31-4" aria-hidden="true" tabindex="-1"></a>assignments <span class="ot">&lt;-</span> <span class="fu">augment</span>(tocq_chapters_lda, <span class="at">data =</span> tocq_chapters_dtm)</span>
<span id="cb31-5"><a href="#cb31-5" aria-hidden="true" tabindex="-1"></a>assignments</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code># A tibble: 69,781 × 4
   document term   count .topic
   &lt;chr&gt;    &lt;chr&gt;  &lt;dbl&gt;  &lt;dbl&gt;
 1 DiA2_76  united    88      2
 2 DiA2_60  united     6      1
 3 DiA1_52  united    11      2
 4 DiA1_42  united     7      2
 5 DiA1_11  united    13      2
 6 DiA1_21  united    23      2
 7 DiA1_20  united    21      2
 8 DiA1_28  united    14      2
 9 DiA1_50  united     5      2
10 DiA1_22  united     8      2
# … with 69,771 more rows</code></pre>
</div>
<div class="sourceCode" id="cb33"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a>assignments <span class="ot">&lt;-</span> assignments <span class="sc">%&gt;%</span></span>
<span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a>  <span class="fu">separate</span>(document, <span class="fu">c</span>(<span class="st">"title"</span>, <span class="st">"chapter"</span>), <span class="at">sep =</span> <span class="st">"_"</span>, <span class="at">convert =</span> <span class="cn">TRUE</span>) <span class="sc">%&gt;%</span></span>
<span id="cb33-3"><a href="#cb33-3" aria-hidden="true" tabindex="-1"></a>  <span class="fu">inner_join</span>(tocq_book_topics, <span class="at">by =</span> <span class="fu">c</span>(<span class="st">".topic"</span> <span class="ot">=</span> <span class="st">"topic"</span>))</span>
<span id="cb33-4"><a href="#cb33-4" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb33-5"><a href="#cb33-5" aria-hidden="true" tabindex="-1"></a>assignments <span class="sc">%&gt;%</span></span>
<span id="cb33-6"><a href="#cb33-6" aria-hidden="true" tabindex="-1"></a>  <span class="fu">count</span>(title, consensus, <span class="at">wt =</span> count) <span class="sc">%&gt;%</span></span>
<span id="cb33-7"><a href="#cb33-7" aria-hidden="true" tabindex="-1"></a>  <span class="fu">group_by</span>(title) <span class="sc">%&gt;%</span></span>
<span id="cb33-8"><a href="#cb33-8" aria-hidden="true" tabindex="-1"></a>  <span class="fu">mutate</span>(<span class="at">percent =</span> n <span class="sc">/</span> <span class="fu">sum</span>(n)) <span class="sc">%&gt;%</span></span>
<span id="cb33-9"><a href="#cb33-9" aria-hidden="true" tabindex="-1"></a>  <span class="fu">ggplot</span>(<span class="fu">aes</span>(consensus, title, <span class="at">fill =</span> percent)) <span class="sc">+</span></span>
<span id="cb33-10"><a href="#cb33-10" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_tile</span>() <span class="sc">+</span></span>
<span id="cb33-11"><a href="#cb33-11" aria-hidden="true" tabindex="-1"></a>  <span class="fu">scale_fill_gradient2</span>(<span class="at">high =</span> <span class="st">"red"</span>, <span class="at">label =</span> <span class="fu">percent_format</span>()) <span class="sc">+</span></span>
<span id="cb33-12"><a href="#cb33-12" aria-hidden="true" tabindex="-1"></a>  <span class="fu">geom_text</span>(<span class="fu">aes</span>(<span class="at">x =</span> consensus, <span class="at">y =</span> title, <span class="at">label =</span> scales<span class="sc">::</span><span class="fu">percent</span>(percent))) <span class="sc">+</span></span>
<span id="cb33-13"><a href="#cb33-13" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme_tufte</span>(<span class="at">base_family =</span> <span class="st">"Helvetica"</span>) <span class="sc">+</span></span>
<span id="cb33-14"><a href="#cb33-14" aria-hidden="true" tabindex="-1"></a>  <span class="fu">theme</span>(<span class="at">axis.text.x =</span> <span class="fu">element_text</span>(<span class="at">angle =</span> <span class="dv">90</span>, <span class="at">hjust =</span> <span class="dv">1</span>),</span>
<span id="cb33-15"><a href="#cb33-15" aria-hidden="true" tabindex="-1"></a>        <span class="at">panel.grid =</span> <span class="fu">element_blank</span>()) <span class="sc">+</span></span>
<span id="cb33-16"><a href="#cb33-16" aria-hidden="true" tabindex="-1"></a>  <span class="fu">labs</span>(<span class="at">x =</span> <span class="st">"Book words assigned to"</span>,</span>
<span id="cb33-17"><a href="#cb33-17" aria-hidden="true" tabindex="-1"></a>       <span class="at">y =</span> <span class="st">"Book words came from"</span>,</span>
<span id="cb33-18"><a href="#cb33-18" aria-hidden="true" tabindex="-1"></a>       <span class="at">fill =</span> <span class="st">"% of assignments"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-display">
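<p><img src="05_cta_notebook1_files/figure-html/unnamed-chunk-26-1.png" class="img-fluid" width="672" alt="Heatmap showing, for the words that came from each volume, the percentage assigned to each volume by the topic model"></p>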
</div>
</div>
<p>Not bad! We see that the model correctly assigned 91% of the words from Volume 2 and 79% of the words from Volume 1.</p>
</section>
</section>
<section id="validation" class="level2">
<h2 class="anchored" data-anchor-id="validation">Validation</h2>
<p>In this section, we’ll be using the <code>preText</code> package mentioned in <span class="citation" data-cites="denny_text_2018">Denny and Spirling (2018)</span> to see the impact of different pre-processing choices on our text. Here, I am adapting from a <a href="http://www.mjdenny.com/getting_started_with_preText.html">tutorial</a> by Matthew Denny.</p>
<p>First we need to reformat our text into a <code>quanteda</code> corpus object.</p>
<div class="cell">
<div class="sourceCode" id="cb34"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb34-1"><a href="#cb34-1" aria-hidden="true" tabindex="-1"></a><span class="co"># convert the Tocqueville text (tocq) into a quanteda corpus</span></span>
<span id="cb34-2"><a href="#cb34-2" aria-hidden="true" tabindex="-1"></a>corp <span class="ot">&lt;-</span> <span class="fu">corpus</span>(tocq, <span class="at">text_field =</span> <span class="st">"text"</span>)</span>
<span id="cb34-3"><a href="#cb34-3" aria-hidden="true" tabindex="-1"></a><span class="co"># take a random sample of 1,000 documents from the first 30,000 for the example</span></span>
<span id="cb34-4"><a href="#cb34-4" aria-hidden="true" tabindex="-1"></a>documents <span class="ot">&lt;-</span> corp[<span class="fu">sample</span>(<span class="dv">1</span><span class="sc">:</span><span class="dv">30000</span>,<span class="dv">1000</span>)]</span>
<span id="cb34-5"><a href="#cb34-5" aria-hidden="true" tabindex="-1"></a><span class="co"># take a look at the document names</span></span>
<span id="cb34-6"><a href="#cb34-6" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">names</span>(documents[<span class="dv">1</span><span class="sc">:</span><span class="dv">10</span>]))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="cell-output-stdout">
<pre><code> [1] "text15825" "text12268" "text17823" "text22967" "text9681"  "text10806"
 [7] "text12644" "text3778"  "text3356"  "text24637"</code></pre>
</div>
</div>
<p>And now we are ready to preprocess in different ways. Here, we are including n-grams so we are preprocessing the text in 128 different ways. This takes about ten minutes to run on a machine with 8GB RAM.</p>
<div class="cell">
<div class="sourceCode" id="cb36"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb36-1"><a href="#cb36-1" aria-hidden="true" tabindex="-1"></a>preprocessed_documents <span class="ot">&lt;-</span> <span class="fu">factorial_preprocessing</span>(</span>
<span id="cb36-2"><a href="#cb36-2" aria-hidden="true" tabindex="-1"></a>    documents,</span>
<span id="cb36-3"><a href="#cb36-3" aria-hidden="true" tabindex="-1"></a>    <span class="at">use_ngrams =</span> <span class="cn">TRUE</span>,</span>
<span id="cb36-4"><a href="#cb36-4" aria-hidden="true" tabindex="-1"></a>    <span class="at">infrequent_term_threshold =</span> <span class="fl">0.2</span>,</span>
<span id="cb36-5"><a href="#cb36-5" aria-hidden="true" tabindex="-1"></a>    <span class="at">verbose =</span> <span class="cn">FALSE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>We can then get the results of our pre-processing, comparing the distance between documents that have been processed in different ways.</p>
<div class="cell">
<div class="sourceCode" id="cb37"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a>preText_results <span class="ot">&lt;-</span> <span class="fu">preText</span>(</span>
<span id="cb37-2"><a href="#cb37-2" aria-hidden="true" tabindex="-1"></a>    preprocessed_documents,</span>
<span id="cb37-3"><a href="#cb37-3" aria-hidden="true" tabindex="-1"></a>    <span class="at">dataset_name =</span> <span class="st">"Tocqueville text"</span>,</span>
<span id="cb37-4"><a href="#cb37-4" aria-hidden="true" tabindex="-1"></a>    <span class="at">distance_method =</span> <span class="st">"cosine"</span>,</span>
<span id="cb37-5"><a href="#cb37-5" aria-hidden="true" tabindex="-1"></a>    <span class="at">num_comparisons =</span> <span class="dv">20</span>,</span>
<span id="cb37-6"><a href="#cb37-6" aria-hidden="true" tabindex="-1"></a>    <span class="at">verbose =</span> <span class="cn">FALSE</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p>And we can plot these accordingly.</p>
<div class="cell">
<div class="sourceCode" id="cb38"><pre class="sourceCode r cell-code code-with-copy"><code class="sourceCode r"><span id="cb38-1"><a href="#cb38-1" aria-hidden="true" tabindex="-1"></a><span class="fu">preText_score_plot</span>(preText_results)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</div>
<p><img src="data/pretext_results.png" class="img-fluid" style="width:100.0%" alt="Plot of preText scores for the Tocqueville text"></p>
</section>
<section id="exercises" class="level2">
<h2 class="anchored" data-anchor-id="exercises">Exercises</h2>
<ol type="1">
<li>Choose another book or set of books from Project Gutenberg</li>
<li>Run your own topic model on these books, changing the number of topics (k) and evaluating accuracy.</li>
<li>Validate different pre-processing techniques using <code>preText</code> on the new book(s) of your choice.</li>
</ol>

</section>
</section>
</main>
<!-- /main column -->
<script type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
  const icon = "";
  // Heading anchors: every element carrying the `anchored` class gets an
  // AnchorJS link placed on its right, drawn with the `icon` string above.
  const anchors = new window.AnchorJS();
  const anchorOptions = { placement: 'right', icon: icon };
  anchors.options = anchorOptions;
  anchors.add('.anchored');
  // Copy buttons: each `.code-copy-button` copies the element that directly
  // precedes it in the markup.
  const checkedClass = 'code-copy-button-checked';
  const clipboard = new window.ClipboardJS('.code-copy-button', {
    target: (trigger) => trigger.previousElementSibling
  });
  clipboard.on('success', (e) => {
    const copyButton = e.trigger;
    // Drop focus so the button does not stay highlighted after the click.
    copyButton.blur();
    // Show the "checked" state for one second, then restore the button.
    copyButton.classList.add(checkedClass);
    setTimeout(() => {
      copyButton.classList.remove(checkedClass);
    }, 1000);
    // Remove the text selection made for the copy.
    e.clearSelection();
  });
  // Attaches a popup to `el` through `window.tippy`, using the 'quarto' theme
  // and 'bottom-start' placement. `contentFn` is handed over unchanged as the
  // `content` option.
  function tippyHover(el, contentFn) {
    // Mount the popup inside the parent of the element it is given.
    const mountInParent = (target) => target.parentElement;
    const popupOptions = {
      content: contentFn,
      allowHTML: true,
      theme: 'quarto',
      placement: 'bottom-start',
      arrow: false,
      maxWidth: 500,
      delay: 100,
      interactive: true,
      interactiveBorder: 10,
      appendTo: mountInParent
    };
    window.tippy(el, popupOptions);
  }
  // Footnote references: hovering shows the markup of the footnote the link
  // points at.
  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
  noterefs.forEach(function(ref) {
    tippyHover(ref, function() {
      // Fall back to "" so a link without an href cannot make `.replace` throw.
      let href = ref.getAttribute('href') || "";
      // An absolute URL is reduced to its fragment; anything `new URL` rejects
      // (such as a bare "#id") is kept as written.
      try { href = new URL(href).hash; } catch {}
      const id = href.replace(/^#\/?/, "");
      const note = window.document.getElementById(id);
      // A dangling reference yields empty content instead of throwing from
      // this callback.
      return note ? note.innerHTML : "";
    });
  });
  // Citation links: hovering shows the bibliography entries the link cites.
  const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
  bibliorefs.forEach(function(ref) {
    // The cite keys are read from `data-cites` on an enclosing element. Search
    // upwards from the parent so a link wrapped in extra inline markup still
    // finds them; when the direct parent carries the attribute this is the
    // same element the lookup used before.
    const citeHolder = ref.parentElement
      ? ref.parentElement.closest('[data-cites]')
      : null;
    if (!citeHolder) {
      // No cite keys available: leave this link without a popup instead of
      // throwing on a null attribute.
      return;
    }
    const cites = citeHolder.getAttribute('data-cites').split(' ');
    tippyHover(ref, function() {
      const popup = window.document.createElement('div');
      cites.forEach(function(cite) {
        const citeDiv = window.document.createElement('div');
        citeDiv.classList.add('hanging-indent');
        citeDiv.classList.add('csl-entry');
        // Entries are looked up by id `ref-<cite key>`; a key with no matching
        // element contributes an empty div.
        const biblioDiv = window.document.getElementById('ref-' + cite);
        if (biblioDiv) {
          citeDiv.innerHTML = biblioDiv.innerHTML;
        }
        popup.appendChild(citeDiv);
      });
      return popup.innerHTML;
    });
  });
});
</script>
</div> <!-- /content -->


</body></html>