-
Notifications
You must be signed in to change notification settings - Fork 0
/
qtalr-manuscript.tex
21929 lines (17827 loc) · 941 KB
/
qtalr-manuscript.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode,linktoc=all}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{space}{xeCJK}
%
\documentclass[
letterpaper,
krantz1]{latex/krantz-mod}
\usepackage{amsmath,amssymb}
\usepackage{iftex}
\ifPDFTeX
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
\ifPDFTeX\else
% xetex/luatex font selection
\setmainfont[]{Palatino}
\setmonofont[Scale=0.75]{Hack Nerd Font Mono}
\ifXeTeX
\usepackage{xeCJK}
\setCJKmainfont[]{Arial Unicode MS}
\fi
\ifLuaTeX
\usepackage[]{luatexja-fontspec}
\setmainjfont[]{Arial Unicode MS}
\fi
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\setlength{\emergencystretch}{3em} % prevent overfull lines
\setcounter{secnumdepth}{2}
% Make \paragraph and \subparagraph free-standing
% For each of the two heading commands, and only when the class defines it:
%   1. the original command is saved under an \old... name;
%   2. the command is redefined to dispatch on the starred/unstarred form
%      via \@ifstar;
%   3. the wrapper calls the saved original and then emits an empty \mbox{}.
% NOTE(review): the \mbox{} is presumably what stops the following text from
% running in on the heading line -- confirm against the class's heading setup.
\makeatletter
\ifx\paragraph\undefined\else
\let\oldparagraph\paragraph
\renewcommand{\paragraph}{
\@ifstar
\xxxParagraphStar
\xxxParagraphNoStar
}
\newcommand{\xxxParagraphStar}[1]{\oldparagraph*{#1}\mbox{}}
\newcommand{\xxxParagraphNoStar}[1]{\oldparagraph{#1}\mbox{}}
\fi
% Same wrapping as above, applied to \subparagraph.
\ifx\subparagraph\undefined\else
\let\oldsubparagraph\subparagraph
\renewcommand{\subparagraph}{
\@ifstar
\xxxSubParagraphStar
\xxxSubParagraphNoStar
}
\newcommand{\xxxSubParagraphStar}[1]{\oldsubparagraph*{#1}\mbox{}}
\newcommand{\xxxSubParagraphNoStar}[1]{\oldsubparagraph{#1}\mbox{}}
\fi
\makeatother
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
\definecolor{shadecolor}{RGB}{255,255,255}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{\textit{#1}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\BuiltInTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{\textit{#1}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{\textit{#1}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ExtensionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\ImportTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{\textit{#1}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\NormalTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\RegionMarkerTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{\textit{#1}}}
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}\usepackage{longtable,booktabs,array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph
\usepackage{etoolbox}
\makeatletter
\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot
\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
% definitions for citeproc citations
% \citeproctext starts out empty; \citeproc{<key>}{<text>} sets it to <text>
% inside a group and then calls \cite{<key>}, so the assignment does not
% leak past the citation.
% NOTE(review): where \citeproctext is consumed is not visible in this part
% of the file -- presumably by the bibliography entries; confirm.
\NewDocumentCommand\citeproctext{}{}
\NewDocumentCommand\citeproc{mm}{%
\begingroup\def\citeproctext{#2}\cite{#1}\endgroup}
\makeatletter
% allow citations to break across lines
\let\@cite@ofmt\@firstofone
% avoid brackets around text for \cite:
% \@biblabel is emptied, and \@cite prints its first argument followed by
% ", <second argument>" only when \if@tempswa is true.
\def\@biblabel#1{}
\def\@cite#1#2{{#1\if@tempswa , #2\fi}}
\makeatother
% Lengths used by the CSL bibliography layout below:
% \cslhangindent -- hanging indent of an entry; \csllabelwidth -- width of
% the label box used by \CSLLeftMargin / \CSLRightInline.
\newlength{\cslhangindent}
\setlength{\cslhangindent}{1.5em}
\newlength{\csllabelwidth}
\setlength{\csllabelwidth}{3em}
% CSLReferences: a list environment with no item indent, left margin or
% parsep by default. If #1 is odd, entries get a hanging indent of
% \cslhangindent; #2 scales \baselineskip to give the space between entries.
\newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry-spacing
{\begin{list}{}{%
\setlength{\itemindent}{0pt}
\setlength{\leftmargin}{0pt}
\setlength{\parsep}{0pt}
% turn on hanging indent if param 1 is 1
\ifodd #1
\setlength{\leftmargin}{\cslhangindent}
\setlength{\itemindent}{-1\cslhangindent}
\fi
% set entry spacing
\setlength{\itemsep}{#2\baselineskip}}}
{\end{list}}
% calc is needed for the "\linewidth - \csllabelwidth" expression below
% (it is already loaded earlier in the preamble; loading it again is harmless).
\usepackage{calc}
% \CSLBlock: break the line, then set #1 in a full-width top-aligned box.
\newcommand{\CSLBlock}[1]{\hfill\break\parbox[t]{\linewidth}{\strut\ignorespaces#1\strut}}
% \CSLLeftMargin: set #1 in a top-aligned box of width \csllabelwidth.
\newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{\strut#1\strut}}
% \CSLRightInline: set #1 in a top-aligned box filling the rest of the line.
\newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{\strut#1\strut}}
% \CSLIndent: indent #1 by \cslhangindent.
\newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1}
% % PREAMBLE
% Wrap code blocks
% Defines the Highlighting environment again (it is first defined earlier in
% the preamble with only commandchars set), this time with the line-breaking
% keys breaklines, breaknonspaceingroup and breakanywhere added. The same
% commandchars setting is kept so the \...Tok highlighting macros still work
% inside the environment.
\usepackage{fvextra}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{
breaklines,
commandchars=\\\{\},
breaknonspaceingroup,
breakanywhere
}
% Setup chapter title page
\usepackage{fancyhdr}
% Store both DOI and license info
% \setDOI{<doi>} globally (re)defines \doi so that it prints, in
% \footnotesize, the given DOI on its own line followed by the fixed
% CC-BY-NC-ND license sentence. \gdef is used so the value survives any
% group the call happens to sit in.
\newcommand{\setDOI}[1]{%
\gdef\doi{%
\footnotesize%
#1\par% Print the DOI
This chapter has been made available under a CC-BY-NC-ND license.% Print the license info
}%
}
% Page style for the first page of a chapter: no header and no header rule,
% the DOI/license text in the left footer, the page number in the right.
\fancypagestyle{chapterfirstpage}{
\fancyhf{} % clear all header and footer fields
% \doi exists only after the first \setDOI call; the \ifdefined guard keeps
% this style from raising "Undefined control sequence" if it is used on a
% page before any DOI has been set (the footer is then simply left empty).
\fancyfoot[L]{\footnotesize\ifdefined\doi\doi\fi} % the DOI in the left of the footer (footnotesize)
\fancyfoot[R]{\thepage} % the page number in the right of the footer
\renewcommand{\headrulewidth}{0pt} % remove the header rule
}
% Avoid widows and orphans (4 lines before and after page break)
\usepackage[defaultlines=4,all]{nowidow}
% Adjust hyphenation
\hyphenation{under-stand under-standing under-graduate researchers patterns under-stood original dist-ributions dist-ribution aggre-gate specify specifies specifying TextEdit DIRECTORY EXISTS idealized white-space europarl textrecipes para-meters relation-ship preposi-tion categories inter-action temporal category visual-ize notably hyper-parameters sampled foundational information hypothesis phenomenon realized otherwise necessary presentations multilingual}
% create index
% `intoc` and `columns` are keys of \makeindex, not package options. Written
% in a trailing [...] after \usepackage{imakeidx} they are read as the
% package's release-date argument and silently ignored, so they are passed
% to \makeindex here instead:
%   intoc               -- add the index to the table of contents
%   columns=2           -- typeset the index in two columns
%   options=-s header.ist -- pass the header.ist style file to the indexer
% \indexsetup{noclearpage} keeps the index from forcing a page break.
\usepackage{imakeidx}
\indexsetup{noclearpage}
\makeindex[intoc,columns=2,options=-s header.ist]
% command to index code
% \cindex{<text>}: replaces every `_` in <text> with `\_` using xstring's
% \StrSubstitute, stores the result in \temp, and passes \temp to \index.
% NOTE(review): \temp is a very generic scratch name -- confirm nothing else
% in the document relies on a macro of the same name.
\usepackage{xstring}
\newcommand{\cindex}[1]{%
\StrSubstitute{#1}{_}{\_}[\temp]%
\index{\temp}%
}
\frontmatter
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\usepackage{tabularray}
\usepackage[normalem]{ulem}
\usepackage{graphicx}
\UseTblrLibrary{booktabs}
\UseTblrLibrary{siunitx}
\NewTableCommand{\tinytableDefineColor}[3]{\definecolor{#1}{#2}{#3}}
\newcommand{\tinytableTabularrayUnderline}[1]{\underline{#1}}
\newcommand{\tinytableTabularrayStrikeout}[1]{\sout{#1}}
\makeatletter
\@ifpackageloaded{tcolorbox}{}{\usepackage[skins,breakable]{tcolorbox}}
\@ifpackageloaded{fontawesome5}{}{\usepackage{fontawesome5}}
\definecolor{quarto-callout-color}{HTML}{909090}
\definecolor{quarto-callout-note-color}{HTML}{0758E5}
\definecolor{quarto-callout-important-color}{HTML}{CC1914}
\definecolor{quarto-callout-warning-color}{HTML}{EB9113}
\definecolor{quarto-callout-tip-color}{HTML}{00A047}
\definecolor{quarto-callout-caution-color}{HTML}{FC5300}
\definecolor{quarto-callout-color-frame}{HTML}{acacac}
\definecolor{quarto-callout-note-color-frame}{HTML}{4582ec}
\definecolor{quarto-callout-important-color-frame}{HTML}{d9534f}
\definecolor{quarto-callout-warning-color-frame}{HTML}{f0ad4e}
\definecolor{quarto-callout-tip-color-frame}{HTML}{02b875}
\definecolor{quarto-callout-caution-color-frame}{HTML}{fd7e14}
\makeatother
\makeatletter
\@ifpackageloaded{bookmark}{}{\usepackage{bookmark}}
\makeatother
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\AtBeginDocument{%
\ifdefined\contentsname
\renewcommand*\contentsname{Table of contents}
\else
\newcommand\contentsname{Table of contents}
\fi
\ifdefined\listfigurename
\renewcommand*\listfigurename{List of Figures}
\else
\newcommand\listfigurename{List of Figures}
\fi
\ifdefined\listtablename
\renewcommand*\listtablename{List of Tables}
\else
\newcommand\listtablename{List of Tables}
\fi
\ifdefined\figurename
\renewcommand*\figurename{Figure}
\else
\newcommand\figurename{Figure}
\fi
\ifdefined\tablename
\renewcommand*\tablename{Table}
\else
\newcommand\tablename{Table}
\fi
}
\@ifpackageloaded{float}{}{\usepackage{float}}
\floatstyle{ruled}
\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]}
\floatname{codelisting}{Listing}
\newcommand*\listoflistings{\listof{codelisting}{List of Listings}}
\usepackage{amsthm}
\theoremstyle{definition}
\newtheorem{definition}{Snippet}[chapter]
\theoremstyle{definition}
\newtheorem{example}{Example}[chapter]
\theoremstyle{remark}
\AtBeginDocument{\renewcommand*{\proofname}{Proof}}
\newtheorem*{remark}{Remark}
\newtheorem*{solution}{Solution}
\newtheorem{refremark}{Remark}[chapter]
\newtheorem{refsolution}{Solution}[chapter]
\makeatother
\makeatletter
\makeatother
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}}
\makeatother
\makeatletter
\@ifpackageloaded{tcolorbox}{}{\usepackage[skins,breakable]{tcolorbox}}
\makeatother
\makeatletter
\@ifundefined{shadecolor}{\definecolor{shadecolor}{rgb}{.97, .97, .97}}{}
\makeatother
\makeatletter
\@ifundefined{codebgcolor}{\definecolor{codebgcolor}{HTML}{f8f8f8}}{}
\makeatother
\makeatletter
\ifdefined\Shaded\renewenvironment{Shaded}{\begin{tcolorbox}[colback={codebgcolor}, sharp corners, enhanced, boxrule=0pt, frame hidden, breakable]}{\end{tcolorbox}}\fi
\makeatother
\makeatletter
\@ifpackageloaded{fontawesome5}{}{\usepackage{fontawesome5}}
\makeatother
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage{bookmark}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same} % disable monospaced font for URLs
% Make links footnotes instead of hotlinks:
\DeclareRobustCommand{\href}[2]{#2\footnote{\url{#1}}}
\hypersetup{
pdftitle={An Introduction to Quantitative Text Analysis for Linguistics},
pdfauthor={Jerid Francom},
hidelinks,
pdfcreator={LaTeX via pandoc}}
\title{An Introduction to Quantitative Text Analysis for Linguistics}
% \subtitle{<text>}: defined only if the class has not already provided it.
% The fallback uses etoolbox's \apptocmd to append <text>, set in \large as
% its own paragraph, to \@title, so it is printed wherever \@title is used.
\usepackage{etoolbox}
\makeatletter
\providecommand{\subtitle}[1]{% add subtitle to \maketitle
\apptocmd{\@title}{\par {\large #1 \par}}{}{}
}
\makeatother
\subtitle{Reproducible Research Using R}
\author{Jerid Francom 0000-0001-5972-6330}% hardcoded author ORCID
\date{July 30, 2024}
\begin{document}
\setcounter{page}{1} % Manually set the page number
\halftitle{An Introduction to Quantitative Text Analysis for
Linguistics}{}
\textit{An Introduction to Quantitative Text Analysis for Linguistics: Reproducible Research Using R} is a pragmatic textbook that equips students and researchers with the essential concepts and practical programming skills needed to conduct quantitative text analysis in a reproducible manner. Designed for undergraduate students and those new to the field, this book assumes no prior experience with statistics or programming, making it an accessible resource for anyone embarking on their journey into quantitative text analysis.
Through a pedagogical approach which emphasizes intuitive understanding over technical details, readers will gain data literacy by learning to identify, interpret, and evaluate data analysis procedures and results. They will also develop research skills, enabling them to design, implement, and communicate quantitative text analysis projects effectively. The book places a strong emphasis on programming skills, guiding readers through interactive lessons, tutorials, and lab activities using the R programming language and real-world datasets.
This practical textbook is enriched with features that facilitate learning, including thought and practical exercises, a companion website which includes programming demonstrations to develop and augment readers' recognition of how programming strategies are implemented, and a GitHub repository which contains both a set of interactive R programming lessons and lab exercises, which guide readers through practical hands-on programming applications. This textbook is an essential companion to any linguist looking to learn how to incorporate quantitative data analysis into their work.
\textbf{Jerid Francom} is Associate Professor of Spanish and Linguistics at Wake Forest University. His research focuses on the use of language corpora from a variety of sources (news, social media, and other internet sources) to better understand the linguistic and cultural similarities and differences between language varieties for both scholarly and pedagogical projects. He has published on topics including the development, annotation, and evaluation of linguistic corpora and analyzed corpora through corpus, psycholinguistic, and computational methodologies. He also has experience working with and teaching statistical programming with R.
\maketitle
\setcounter{page}{4} % Manually set the page number (this will affect the subsequent pages)
% BEFORE BODY
% Publishing info
\thispagestyle{empty}
% Wraps code output text
% https://github.com/quarto-dev/quarto-cli/discussions/4121#discussioncomment-5823761
% Redefines the plain `verbatim` environment on top of `Verbatim` with:
% no visible-space or visible-tab markers, line breaking switched on, an
% empty symbol at the left of continuation lines, and \small as the
% formatting command applied to the contents.
\RecustomVerbatimEnvironment{verbatim}{Verbatim}{
showspaces = false,
showtabs = false,
breaksymbolleft={},
breaklines,
formatcom=\small
% Note: setting commandchars=\\\{\} here will cause an error
}
% Reduce callout and table font size
% change font for tables
% Each hook runs at the start of every instance of the named environment:
% tcolorbox contents switch to \small; longtable, table and tabular switch
% to \small sans-serif.
\AtBeginEnvironment{tcolorbox}{\small}
\AtBeginEnvironment{longtable}{\small\sffamily}
\AtBeginEnvironment{table}{\small\sffamily}
\AtBeginEnvironment{tabular}{\small\sffamily}
Designed cover image: Getty Images | Man As Thep
First published 2025\\
by Routledge\\
4 Park Square, Milton Park, Abingdon, Oxon OX14 4RN
and by Routledge\\
605 Third Avenue, New York, NY 10158
\textit{Routledge is an imprint of the Taylor \& Francis Group, an informa business}
© 2025 Jerid Francom
The right of Jerid Francom to be identified as author of this work has been asserted in accordance with sections 77 and 78 of the Copyright, Designs and Patents Act 1988.
The Open Access version of this book, available at \url{www.taylorfrancis.com}, has been made available under a Creative Commons Attribution-Non Commercial-No Derivatives (CC-BY-NC-ND) 4.0 license.
Any third party material in this book is not included in the OA Creative Commons license, unless indicated otherwise in a credit line to the material. Please direct any permissions enquiries to the original rightsholder.
\textit{Trademark notice}: Product or corporate names may be trademarks or registered trademarks, and are used only for identification and explanation without intent to infringe.
\textit{British Library Cataloguing-in-Publication Data}\\
A catalogue record for this book is available from the British Library
%\textit{Library of Congress Cataloging-in-Publication Data}\\
%A catalog record has been requested for this book
% holding lines
\vspace{1.25cm}
ISBN: 978-1-032-49427-2 (hbk)\\
ISBN: 978-1-032-49426-5 (pbk)\\
ISBN: 978-1-003-39376-4 (ebk)\\
DOI: 10.4324/9781003393764
Typeset in Palatino/ Hack\\
by Jerid Francom\\
\vspace{.25cm}
Publisher's Note\\
This book has been prepared from camera-ready copy provided by the author.
\vspace{.5cm}
Visit the web and companion sites at \url{qtalr.com} and \url{qtalr.com/resources/}.
\renewcommand*\contentsname{Contents}
{
\setcounter{tocdepth}{2}
\tableofcontents
}
\bookmarksetup{startatroot}
\chapter*{Acknowledgments}\label{sec-acknowledgments-pdf}
\markboth{Acknowledgments}{Acknowledgments}
The journey of creating this textbook has been both challenging and
rewarding, and it would not have been possible without the inspiration,
support, and invaluable feedback from many individuals. First and
foremost, I extend my deepest gratitude to my students at Wake Forest
University. Your enthusiasm and curiosity have been a constant source of
inspiration, pushing me to address my blind spots and meet your needs
more effectively.
I am particularly grateful for the generous feedback from the following
individuals, whose insights and suggestions have significantly shaped
the development of this book: Laura Aull, Andrea Bowling, Caroline
Brady, Declan Golsen, Logan Jacobs, Abby Komiske, Asya Little, Elaine
Lu, Jack Nelson, and Sicheng Wang. Your contributions have been
instrumental in refining the content and making it more accessible and
engaging for future readers.
A special thanks to my colleague and spouse, Dr.~Claudia Valdez, for her
unwavering support, encouragement, and patience throughout this project.
Your feedback and guidance have been invaluable, and I am grateful for
your willingness to engage in countless discussions about the content,
structure, and pedagogical approach of this book. Most importantly,
thank you for your love and understanding, which have sustained me
through the ups and downs of this journey.
Finally, I would like to express my appreciation to the R community,
especially the developers and contributors of the \{tidyverse\} and
\{tidymodels\} packages. Your dedication to creating user-friendly and
powerful tools for data analysis has revolutionized the field of
quantitative text analysis and made it accessible to a broader audience.
\bookmarksetup{startatroot}
\chapter*{Preface}\label{sec-preface}
\markboth{Preface}{Preface}
\setDOI{10.4324/9781003393764.0}
\thispagestyle{chapterfirstpage}
\begin{tcolorbox}[enhanced jigsaw, arc=.35mm, colback=white, bottomrule=.15mm, leftrule=.75mm, colframe=quarto-callout-color-frame, toprule=.15mm, opacityback=0, breakable, rightrule=.15mm, left=2mm]
\textbf{\faIcon{list-alt} Outcomes}
\begin{itemize}
\tightlist
\item
Comprehend the book's rationale, learning goals, and pedagogical
approach.
\item
Navigate and engage with the book's structure and content effectively.
\item
Set up the computing environment and utilize textbook and support
resources for an optimal learning experience.
\end{itemize}
\end{tcolorbox}
The purpose of this preface is to present the rationale behind this
textbook, outline the key learning objectives, describe the pedagogical
approach, and identify the intended audience. Additionally, this chapter
will provide readers with a guide to the book's structure and the scope
of its content, as well as a summary of supporting learning and
instructor resources. Finally, this chapter will provide readers with
information on setting up their computing environment and where to seek
support.
\section*{Rationale}\label{sec-preface-rationale}
\markright{Rationale}
\index{data science}\textbf{Data science}, an interdisciplinary field
that combines knowledge and skills from statistics, computer science,
and domain-specific expertise to extract meaningful insight from
structured and unstructured data, has emerged as an exciting and rapidly
growing field in recent years, driven in large part by the increase in
computing power available to the average individual and the abundance of
electronic data now available through the internet. These advances have
become an integral part of the modern scientific landscape, with
data-driven insights now being used to inform decision-making in a wide
variety of academic fields, including linguistics and language-related
disciplines.
This textbook seeks to meet this growing demand by providing an
introduction to the fundamental concepts and practical programming
skills from data science applied to the task of quantitative text
analysis. It is intended primarily for undergraduate students, but may
also be useful for graduates and researchers seeking to expand their
methodological toolbox. The textbook takes a pedagogical approach which
assumes no prior experience with statistics or programming, making it an
accessible resource for novices beginning their exploration of
quantitative text analysis methods.
\section*{Aims}\label{sec-preface-aims}
\markright{Aims}
The overarching goal of this textbook is to provide readers with
foundational knowledge and practical skills to conduct and evaluate
quantitative text analysis using the R programming language and other
open source tools and technologies. The specific aims are to develop the
reader's proficiency in three main areas:
\begin{itemize}
\item
\index{data literacy}\textbf{Data literacy}: Identify, interpret and
evaluate data analysis procedures and results.\\
Throughout this textbook we will explore topics which will help you
understand how data analysis methods derive insight from data. In this
process you will be encouraged to critically evaluate connections
across linguistic and language-related disciplines using data analysis
knowledge and skills. Data literacy is an invaluable skillset for
academics and professionals but also is indispensable for 21st-century
citizens to navigate and actively participate in the ``Information
Age'' in which we live (\citeproc{ref-Carmi2020}{Carmi, Yates,
Lockley, \& Pawluczuk, 2020}).
\item
\index{research skills}\textbf{Research skills}: Design, implement,
and communicate quantitative text analysis research.\\
This aim does not differ significantly, in spirit, from common
learning outcomes in a research methods course. However, working with
text will incur a series of key steps in the selection, collection,
and preparation of the data that are unique to text analysis projects.
In addition, I will stress the importance of research documentation
and creating reproducible research as an integral part of modern
scientific inquiry (\citeproc{ref-Buckheit1995}{Buckheit \& Donoho,
1995}).
\item
\index{programming skills}\textbf{Programming skills}: Develop and
apply programming skills to text analysis tasks in a reproducible
manner.\\
Modern data analysis, and by extension text analysis, is conducted
using programming. There are various key reasons for this: a
programming approach (1) affords researchers unlimited research
freedom---if you can envision it, you can program it, (2) underlies
well-documented and reproducible research
(\citeproc{ref-Gandrud2015}{Gandrud, 2015})\index{Gandrud}, and (3)
invites researchers to engage more intimately with the data and the
methods for analysis.
\end{itemize}
These aims are important for linguistics students because they provide a
foundation in the concepts and skills required to succeed in the
rapidly evolving landscape of 21st-century research. These abilities
enable researchers to evaluate and conduct high-quality empirical
investigation across linguistic fields on a wide variety of topics.
Moreover, these skills go beyond linguistics research; they are widely
applicable across many disciplines where quantitative data analysis and
programming are becoming increasingly important. Thus, this textbook
provides students with a comprehensive introduction to quantitative text
analysis that is relevant to linguistics research and that equips them
with valuable skills for their future careers.
\section*{Approach}\label{sec-preface-approach}
\markright{Approach}
The approach taken in this textbook is designed to accommodate
linguistics students and researchers with little to no prior experience
with programming or quantitative methods. With this in mind, the
objective is to connect conceptual understanding with practical
application. Real-world data and research tasks relevant to linguistics
are used throughout the book to provide context and to motivate the
learning process\footnote{Research data and questions are primarily
based on English for wide accessibility as it is the \emph{de facto}
language of academics and research. However, the methods and
techniques presented in this textbook are applicable to many other
languages.}. Furthermore, as an introduction to the field, the
textbook focuses on the most common and fundamental methods and
techniques for quantitative text analysis and prioritizes breadth over
depth and intuitive understanding over technical explanations. On the
programming side, the \index{Tidyverse}\textbf{Tidyverse} approach to
programming in \index{R}R will be adopted
(\citeproc{ref-Wickham2014}{Wickham, 2014b})\index{Wickham}. This
approach provides a consistent syntax across different packages and is
known for its legibility, making it easier for readers to understand and
write code. Together, these strategies form an approach that is intended
to provide readers with an accessible resource to gain a foothold in the
field and to equip them with the knowledge and skills to apply
quantitative text analysis in their own research.
\section*{Structure}\label{sec-preface-structure}
\markright{Structure}
The aims and approach described above are reflected in the overall
structure of the book and each chapter.
\subsection*{Book level}\label{sec-preface-structure-book}
At the book level, there are five interdependent parts:
Part I ``Orientation'' provides the necessary background knowledge to
situate quantitative text analysis in the wider context of data analysis
and linguistic research and to provide a clearer picture of what text
analysis entails and its range of applications.
The subsequent parts are directly aligned with the data analysis
process. The building blocks of this process are reflected in the
\index{Data to Insight Hierarchy (DIKI)}\textbf{`Data to Insight
Hierarchy (DIKI)'} visualized in Figure 0.1.
\begin{figure}
\centering
\includegraphics[width=0.75\linewidth]{part_0/figures/preface-diki.drawio.png}
\captionsetup{labelformat=empty,labelsep=none}
\caption[Figure 0.1: Data to Insight Hierarchy (DIKI)]{Figure 0.1: Data to Insight Hierarchy (DIKI)\footnotemark{}}
\end{figure}%
\footnotetext{Adapted from Ackoff (\citeproc{ref-Ackoff1989}{1989}) and Rowley (\citeproc{ref-Rowley2007}{2007}).}
The DIKI Hierarchy highlights the stages and intermediate steps required
to derive insight from data. Part II ``Foundations'' provides a
conceptual introduction to the DIKI Hierarchy and establishes
foundational knowledge about data, information, knowledge, and insight
which is fundamental to developing a viable research plan.
Parts III ``Preparation'' and IV ``Analysis'' focus on the
implementation process. Part III covers the steps involved in preparing
data for analysis, including data acquisition, curation, and
transformation. Part IV covers the steps involved in conducting
analysis, including exploratory, predictive, and inferential data
analysis.
The final part, Part V ``Communication'', covers the final stage of the
data analysis process, which is to communicate the results of the
analysis. This includes the structure and content of research reports as
well as the process of publishing, sharing, and collaborating on
research.
\subsection*{Chapter level}\label{sec-preface-structure-chapter}
At the chapter level, both conceptual and programming skills are
developed in stages\footnote{These stages attempt to capture the general
progression of learning reflected in Bloom's Taxonomy. See Krathwohl
(\citeproc{ref-Krathwohl2002}{2002}) for a description and revised
version.}. The chapter-level structure is consistent across chapters
and can be seen in Table~\ref{tbl-structure-approach}.
\begin{longtable}[]{@{}
>{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1500}}
>{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.5500}}
>{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1500}}
>{\raggedright\arraybackslash}p{(\columnwidth - 6\tabcolsep) * \real{0.1500}}@{}}
\caption{The general structure and learning progression of a
chapter}\label{tbl-structure-approach}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
Component
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Purpose
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Resource
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Stage
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
Component
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Purpose
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Resource
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Stage
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
Outcomes & Identify the learning objectives for the chapter & Textbook &
Indicate \\
Overview & Provide a brief introduction to the chapter topic & Textbook
& Outline \\
Coding Lessons & Teach programming techniques with hands-on interactive
exercises & GitHub & Interact \\
Content & Combine conceptual discussions and programming skills,
incorporating thought-provoking questions, relevant studies, and
advanced topic references & Textbook & Explore \\
Recipes & Offer step-by-step programming examples related to the chapter
and relevant for the upcoming lab & Resources Kit website & Examine \\
Labs & Allow readers to apply chapter-specific concepts and techniques
to practical tasks & GitHub & Apply \\
Summary & Review the key concepts and skills covered in the chapter &
Textbook & Review \\
\end{longtable}
Each chapter will begin with a list of key learning outcomes followed by
a brief introduction to the chapter's content. The goal is to orient the
reader to the chapter. Next there will be a prompt to complete the
interactive coding lesson(s) to introduce readers to key programming
concepts related to the chapter through hands-on experience, and then the
main content of the chapter will follow. The content will be a
combination of conceptual discussions and programming skills,
incorporating thought-provoking questions (`\faIcon{lightbulb} Consider
this'), relevant studies (`\faIcon{file-alt} Case study'), and advanced
topic references (`\faIcon{medal} Dive deeper'). Together these
components form the skills and knowledge phase.
The next phase is the application phase. This phase will include
step-by-step programming demonstrations related to the chapter (Recipes)
and lab exercises that allow readers to apply their knowledge and skills
to chapter-related tasks. Finally, the chapters conclude with a summary
of the key concepts and skills covered in the chapter and in the
associated activities.
\section*{Resources}\label{sec-preface-resources}
\markright{Resources}
The description and location of the available resources to support the
aims and approach of this textbook appear in Table~\ref{tbl-resources}.
\begin{longtable}[]{@{}
>{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.1500}}
>{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.7000}}
>{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.1500}}@{}}
\caption{Resources available to support the aims and approach of this
textbook}\label{tbl-resources}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
Resource
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Description
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Location
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
Resource
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Description
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Location
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
Textbook & Prose discussion, figures/ tables, R code, case studies, and
thought and practical exercises & Physical/
\href{https://qtalr.com}{Web} \\
\{qtkit\} & R package with functions for accessing data and datasets, as
well as various useful functions developed specifically for this
textbook & CRAN/ GitHub \\
Resources Kit & Includes Recipes, programming tutorials to enhance the
reader's recognition of how programming strategies are implemented, and
other supplementary materials, including setup Guides and Instructor
materials & GitHub \\
Lessons & A set of interactive R programming lessons associated with
each chapter & GitHub \\
Labs & A set of lab exercises designed to direct the reader through
practical hands-on programming applications & GitHub \\
\end{longtable}
All resources are freely available and accessible to readers and are
found on the GitHub organization \url{https://github.com/qtalr/}. For
the textbook and Resources Kit, the code and a link to the website are
provided in each respective repository. The development version of the
\{qtkit\} package is available on GitHub and the stable version is
available on the
\index{Comprehensive R Archive Network (CRAN)}Comprehensive R Archive
Network (CRAN) (\citeproc{ref-R-qtkit}{Francom, 2024}). The interactive
programming lessons and lab exercises are also available on GitHub.
Errata should be reported in the respective repository's issue tracker
on GitHub.
\section*{Getting started}\label{sec-preface-getting-started}
\markright{Getting started}
Before jumping into this and subsequent chapters' textbook activities,
it is important to prepare your \index{computing environment}computing
environment and understand how to take advantage of the resources
available, both those directly and indirectly associated with the
textbook.
\subsection*{R environment}\label{sec-preface-r-environment}
Programming is the backbone for modern quantitative research. Among the
many programming languages available, \index{R}R is a popular
\index{open source}open-source language and software environment for
statistical computing. R is popular with statisticians and has been
adopted as the \emph{de facto} language by many other fields in natural
and social sciences, including linguistics. It is freely downloadable
from The R Project for Statistical Computing website
(\citeproc{ref-TheRFoundation2024}{The R Foundation, 2024}) and is
available for macOS, Linux, and Windows operating
systems\index{operating system}.
Successfully installing R is rarely the last step in setting up your
R-enabled computing environment. The majority of R users also install an
\index{integrated development environment (IDE)}\textbf{integrated
development environment} (IDE). An IDE, such as \index{RStudio}RStudio
(\citeproc{ref-Posit2024}{Posit, 2024}), or a \index{text editor}text
editor, such as \index{Visual Studio Code}Visual Studio Code
(\citeproc{ref-Microsoft2024}{Microsoft, 2024}), provides a
\index{graphical user interface (GUI)}\textbf{graphical user interface}
(GUI) for working with R\footnote{For those who prefer a terminal-based
text editor, \index{Neovim}Neovim is a popular choice. Neovim is a
text editor that is designed to be extensible and customizable. It is
a modern version of the classic Vim text editor.}. In effect, these
interfaces provide a dashboard for working with R and are designed to
make it easier to write and execute R code. IDEs also provide a number
of other useful features such as syntax highlighting, code completion,
and debugging. IDEs are not required to work with R but they are
\emph{highly} recommended.
Choosing to install R and an IDE directly on your personal computer,
which is known as your \index{local environment}\textbf{local
environment}, is not the only option to work with R. Other options
include working with R in a \index{remote environment}\textbf{remote
environment} or a \index{virtual environment}\textbf{virtual
environment}.
\begin{tcolorbox}[enhanced jigsaw, arc=.35mm, colback=white, bottomrule=.15mm, leftrule=.75mm, colframe=quarto-callout-color-frame, toprule=.15mm, opacityback=0, breakable, rightrule=.15mm, left=2mm]
\textbf{\faIcon{file-code} Guides} For more information and instructions
on setting up an R environment for using this book, consult the
Resources Kit ``Setting up an R environment'' guide.
\end{tcolorbox}
There are trade-offs in terms of cost, convenience, and flexibility when
choosing to work with R in a local, remote, or virtual environment. The
choice is yours and you can always change your mind later. The important
thing is to get started and begin learning R. Furthermore, any of the
approaches described here will be compatible with this textbook.
\subsection*{R packages}\label{sec-preface-r-packages}
As you progress in your R programming experience, you'll find yourself
leveraging code from other R users, which is typically provided as
packages. \index{R packages}Packages are sets of functions and/or
datasets that are freely accessible for download, designed to perform a
specific set of interrelated tasks. They enhance the capabilities of R.
Official R packages can be found in repositories like
\index{Comprehensive R Archive Network (CRAN)}CRAN
(\citeproc{ref-RCommunity2024}{R Community, 2024}) or
\index{R Universe}R-universe (\citeproc{ref-ROpenSci2024}{ROpenSci,
2024}), while other packages can be obtained from code-sharing platforms
such as \index{GitHub}GitHub (\citeproc{ref-Github2024}{GitHub, 2024}).
\begin{tcolorbox}[enhanced jigsaw, arc=.35mm, colback=white, bottomrule=.15mm, leftrule=.75mm, colframe=quarto-callout-color-frame, toprule=.15mm, opacityback=0, breakable, rightrule=.15mm, left=2mm]
\textbf{\faIcon{lightbulb} Consider this}
\index{Comprehensive R Archive Network (CRAN)}CRAN includes groupings of
popular packages related to a given applied programming task called Task
Views \url{https://cran.r-project.org/web/views/}. Explore the available
CRAN Task Views listings. Note the variety of areas (tasks) that are
covered in this listing. Now explore in more detail one of the following
task views which are directly related to topics covered in this textbook
noting the associated packages and their descriptions: (1)
\index{clustering}Cluster, (2) \index{machine learning}MachineLearning,
(3) \index{Natural Language Processing (NLP)}NaturalLanguageProcessing,
or (4) \index{reproducible research}ReproducibleResearch.
\end{tcolorbox}
You will download a number of packages at different stages of this
textbook, but there is a set of packages that will be key to have from
the get-go. Once you have access to a working \index{R}R environment,
you can proceed to install the following packages.
\begin{tcolorbox}[enhanced jigsaw, arc=.35mm, colback=white, bottomrule=.15mm, leftrule=.75mm, colframe=quarto-callout-color-frame, toprule=.15mm, opacityback=0, breakable, rightrule=.15mm, left=2mm]
\textbf{\faIcon{file-code} Guides}
For instructions on how to install the \index{R packages!qtkit}\{qtkit\}
package from CRAN\index{Comprehensive R Archive Network (CRAN)} or
GitHub\index{GitHub} and download and use the interactive R programming
lessons for this textbook, see the Resources Kit ``Getting started''
guide.
\end{tcolorbox}
Install the following packages from CRAN.
\begin{itemize}
\tightlist
\item
\index{R packages!tidyverse}\{tidyverse\}
(\citeproc{ref-R-tidyverse}{Wickham, 2023c})
\item
\index{R packages!tinytex}\{tinytex\} (\citeproc{ref-R-tinytex}{Xie,
2024})
\item
\index{R packages!swirl}\{swirl\} (\citeproc{ref-R-swirl}{Kross,
Carchedi, Bauer, \& Grdina, 2020})
\item
\index{R packages!qtkit}\{qtkit\} (\citeproc{ref-R-qtkit}{Francom,
2024})
\end{itemize}
You can do this by running Example~\ref{exm-install-packages} in an R
console:
\begin{example}[]\protect\hypertarget{exm-install-packages}{}\label{exm-install-packages}
~
\begin{Shaded}
\begin{Highlighting}[numbers=left,,]
\CommentTok{\# install key packages from CRAN}
\FunctionTok{install.packages}\NormalTok{(}\FunctionTok{c}\NormalTok{(}\StringTok{"tidyverse"}\NormalTok{, }\StringTok{"tinytex"}\NormalTok{, }\StringTok{"swirl"}\NormalTok{, }\StringTok{"qtkit"}\NormalTok{))}
\end{Highlighting}
\end{Shaded}
\end{example}
\cindex{install.packages()}
\cindex{c()}
\subsection*{Git and GitHub}\label{sec-preface-git-github}
\index{GitHub}GitHub is a code-sharing website. Modern computing is
highly collaborative and GitHub is a very popular platform for sharing
and collaborating on coding projects. The lab exercises for this
textbook are shared on GitHub. To access and complete these exercises
you will need to sign up for a (free) account and then set up the
version control software \index{Git}Git on your computing environment.
\begin{tcolorbox}[enhanced jigsaw, arc=.35mm, colback=white, bottomrule=.15mm, leftrule=.75mm, colframe=quarto-callout-color-frame, toprule=.15mm, opacityback=0, breakable, rightrule=.15mm, left=2mm]
\textbf{\faIcon{file-code} Guides}