@comment{textbook.bib -- BibTeX bibliography database. Text outside @entries is ignored by BibTeX.}
@incollection{Abney1996,
title = {Statistical Methods and Linguistics},
booktitle = {The Balancing Act: Combining Symbolic and Statistical Approaches to Language},
author = {Abney, Steven},
editor = {Klavans, Judith and Resnik, Philip},
year = {1996},
pages = {1--23},
publisher = {MIT Press},
address = {Cambridge, Massachusetts},
internal-note = {Was @article with the host book title in `journal'; this is a chapter in an edited volume, so @incollection + booktitle is the correct form.},
abstract = {is unusual as a mass noun, but can in fact be used as one, as for example in the article consisted of three pages of abstract and only two pages of actual text. One might object that the NP headed by might is bad because of the multiple postmodifiers, but in fact there is no absolute constraint against stacking nominal postmodifiers, and good examples can be constructed with the same structure: marlinespikes, business end up, sprinkled with tabasco sauce, can be a powerful deterrent against pigeons. Even the commas are not absolutely required. The strength of preference for them depends on how heavy the modifiers are: cf. strength judicially applied increases the effectiveness of diplomacy, a cup of peanuts unshelled in the stock adds character. 3 In short, the structure (3) seems to be best characterized as grammatical, though it violates any number of parsing preferences and is completely absurd. One might think that one could eliminate ambiguities by turning some of the dispref...},
file = {/Users/francojc/Zotero/storage/QT9V3PEU/Abney - 1996 - Statistical methods and linguistics.pdf}
}
@book{Abney2008,
title = {Semisupervised Learning for Computational Linguistics},
author = {Abney, Steven},
year = {2008},
publisher = {CRC Press},
address = {Boca Raton, FL},
isbn = {978-1-58488-559-7},
internal-note = {Removed journal/volume/issn/doi (10.1111/j.1467-985X.2009.00595_2.x): those fields identified a JRSS-A book review of this work, not the book itself.},
abstract = {The rapid advancement in the theoretical understanding of statistical and machine learning methods for semisupervised learning has made it difficult for nonspecialists to keep up to date in the field. Providing a broad, accessible treatment of the theory as well as linguistic applications, Semisupervised Learning for Computational Linguistics offers self-contained coverage of semisupervised methods that includes background material on supervised and unsupervised learning. The book presents a brief history of semisupervised learning and its place in the spectrum of learning methods before moving on to discuss well-known natural language processing methods, such as self-training and co-training. It then centers on machine learning techniques, including the boundary-oriented methods of perceptrons, boosting, support vector machines (SVMs), and the null-category noise model. In addition, the book covers clustering, the expectation-maximization (EM) algorithm, related generative methods, and agreement methods. It concludes with the graph-based method of label propagation as well as a detailed discussion of spectral methods. Taking an intuitive approach to the material, this lucid book facilitates the application of semisupervised learning methods to natural language processing and provides the framework and motivation for a more systematic study of machine learning.}
}
@article{Ackoff1989,
  author  = {Ackoff, Russell L.},
  title   = {From Data to Wisdom},
  journal = {Journal of Applied Systems Analysis},
  volume  = {16},
  number  = {1},
  pages   = {3--9},
  year    = {1989},
  file    = {/Users/francojc/Zotero/storage/AD8VE6XD/Ackoff - 1989 - From data to wisdom.pdf}
}
@incollection{Adel2020,
  author    = {{\"A}del, Annelie},
  editor    = {Paquot, Magali and Gries, Stefan Th.},
  title     = {Corpus Compilation},
  booktitle = {A {Practical Handbook} of {Corpus Linguistics}},
  publisher = {Springer},
  address   = {Switzerland},
  year      = {2020},
  pages     = {3--24}
}
@book{Aggarwal2012,
title = {Mining Text Data},
editor = {Aggarwal, Charu C. and Zhai, ChengXiang},
year = {2012},
month = feb,
publisher = {Springer Science \& Business Media},
internal-note = {Changed author -> editor: the abstract itself describes the book as "an edited volume contributed by leading international researchers"; Aggarwal and Zhai are its editors.},
abstract = {Text mining applications have experienced tremendous advances because of web 2.0 and social networking applications. Recent advances in hardware and software technology have lead to a number of unique scenarios where text mining algorithms are learned. Mining Text Data introduces an important niche in the text analytics field, and is an edited volume contributed by leading international researchers and practitioners focused on social networks \& data mining. This book contains a wide swath in topics across social networks \& data mining. Each chapter contains a comprehensive survey including the key research content on the topic, and the future directions of research in the field. There is a special focus on Text Embedded with Heterogeneous and Multimedia Data which makes the mining process much more challenging. A number of methods have been designed such as transfer learning and cross-lingual mining for such cases. Mining Text Data simplifies the content, so that advanced-level students, practitioners and researchers in computer science can benefit from this book. Academic and corporate libraries, as well as ACM, IEEE, and Management Science focused on information security, electronic commerce, databases, data mining, machine learning, and statistics are the primary buyers for this reference book.},
googlebooks = {vFHOx8wfSU0C},
isbn = {978-1-4614-3223-4},
langid = {english}
}
@article{Agnieszka2014,
title = {The Acquisition of Formulaic Language by {EFL} Learners},
author = {Le{\'n}ko-Szyma{\'n}ska, Agnieszka},
year = {2014},
journal = {International Journal of Corpus Linguistics},
volume = {19},
number = {2},
pages = {225--251},
langid = {english},
internal-note = {Name order corrected: Agnieszka is the given name, Le{\'n}ko-Szyma{\'n}ska the surname; the original had them swapped. Citation key kept unchanged so existing \cite commands still resolve.},
file = {/Users/francojc/Zotero/storage/L3Y8Z2DD/Le - e acquisition of formulaic language by EFL learner.pdf}
}
@article{Ahmed2021,
title = {The Role of Forensic Linguistics in Crime Investigation: {Uses} in Legal Proceedings},
author = {Ahmed, Hazhar Ramadhan},
year = {2021},
month = mar,
journal = {Journal of the Association-Institute for English Language and American Studies},
series = {{ANGLISTICUM}},
volume = {10},
number = {2},
pages = {23--31},
publisher = {Zenodo},
doi = {10.5281/ZENODO.4609333},
urldate = {2021-06-01},
abstract = {This paper considers the extent to which forensic linguistics can be considered a science, and outlines some ways in which it is useful in legal proceedings, including voice identification, the interpretation of police-suspect interaction, verification of police reports, and cross-cultural insights into speech patterns in a courtroom context. On the basis of the analysis, the paper concludes that Forensic linguistics can prove beneficial for the investigation of crimes, analysis of the judicial procedures, and particularly disputes in law. It can also be used for the analysis of courtroom discourse and interpret and translate the legal documents for their readability and comprehensibility. Moreover, the police cautions issued to the suspects can also be analyzed for their comprehensibility and the authorship attribution can be established for written or spoken texts. It, therefore, works as the interface between language, crime, and the law.},
copyright = {Creative Commons Attribution 4.0 International, Open Access},
langid = {english},
file = {/Users/francojc/Zotero/storage/UACLCNUA/Hazhar Ramadhan Ahmed - 2021 - THE ROLE OF FORENSIC LINGUISTICS IN CRIME INVESTIG.pdf}
}
@article{Akbary2018,
  author     = {Akbary, Maryam and Shahriari, Hesamoddin and Hosseini Fatemi, Azar},
  title      = {The Value of Song Lyrics for Teaching and Learning {English} Phrasal Verbs: A Corpus Investigation of Four Music Genres},
  shorttitle = {The Value of Song Lyrics for Teaching and Learning English Phrasal Verbs},
  journal    = {Innovation in Language Learning and Teaching},
  volume     = {12},
  number     = {4},
  pages      = {344--356},
  year       = {2018},
  month      = oct,
  issn       = {1750-1229, 1750-1237},
  doi        = {10.1080/17501229.2016.1216121},
  urldate    = {2021-07-28},
  langid     = {english},
  abstract   = {Phrasal verbs are a notoriously difficult feature of English for most second language and foreign language learners to master. Different sources, such as movies, music, games and books, can provide learners with exposure to the most common phrasal verbs in English. This study aims to investigate the degree to which music can play a role in exposing learners to phrasal verbs through analyzing their frequency in song lyrics from different genres (i.e., Pop, Rock, Hip-hop and Metal). For this purpose, a corpus of 400 song lyrics by different artists from these four genres was searched for all existing phrasal verbs. The resulting list of phrasal verbs was compared to Garnier and Schmitt's (2014) Phrasal Verb Pedagogical List in order to determine their value for learners. Further comparisons were subsequently drawn to determine which genre could be of greater use to language learning and instruction. The results revealed that song lyrics can potentially be a beneficial source for learning these constructions. Differences in the type and token frequency of phrasal verb among the four genres can also be used to determine the usefulness of each genre to students from various levels of proficiency.},
  file       = {/Users/francojc/Zotero/storage/9JQ99CUA/Akbary et al. - 2018 - The value of song lyrics for teaching and learning.pdf}
}
@article{Alanazi2022,
title = {Corpus-Based Analysis of Near-Synonymous Verbs},
author = {Alanazi, Zaha},
year = {2022},
month = aug,
journal = {Asian-Pacific Journal of Second and Foreign Language Education},
volume = {7},
number = {1},
pages = {15},
issn = {2363-5169},
doi = {10.1186/s40862-022-00138-5},
urldate = {2022-08-12},
abstract = {Despite having different semantic profiles, near synonyms are usually presented in dictionaries as being contextually interchangeable, which may lead EFL learners to assume their contextual interchangeability. Nevertheless, there is a scarcity of studies on how near synonyms are similar or different in their semantic and grammatical preferences. To enrich the literature on near synonyms' semantic and grammatical profiles, this study explores the collocational behaviors and the semantic preferences of the near-synonymous verbs (affect vs. impact). Sketch Engine was used to examine lexical collocates, the colligational profile and the semantic prosody of the two verbs. The findings revealed fine-grained contextual differences in their collocational, grammatical, and semantic preferences. Applications of the findings for English language teaching will be discussed a long with recommendations for future research.},
keywords = {/unread,Colligation,Collocation,collocations,Corpus,corpus linguistics,second language acquisition,Semantic,semantics,Sketch engine,Synonyms},
file = {/Users/francojc/Zotero/storage/8HXKD4M7/Alanazi_2022_Corpus-based analysis of near-synonymous verbs.pdf}
}
@misc{Albert2015,
  author     = {Albert, Saul and {de Ruiter}, Laura E. and {de Ruiter}, J. P.},
  title      = {{CABNC}: {The Jeffersonian} Transcription of the Spoken {British National Corpus}},
  shorttitle = {{CABNC}},
  publisher  = {TalkBank},
  year       = {2015},
  url        = {https://saulalbert.github.io/CABNC/},
  copyright  = {CC BY 3.0},
  keywords   = {/unread,bnc,cabnc,english,spoken,talkbank}
}
@inproceedings{Alegria2014,
title = {{Wikipedia} and Machine Translation: Killing Two Birds with One Stone},
booktitle = {Workshop on '{{Free}}/Open-Source Language Resources for the Machine Translation of Less-Resourced Languages' at {{LREC}} 2014},
author = {Alegria, I{\~n}aki and Cabezon, Unai and {Fernandez de Beto{\~n}o}, Unai and Labaka, Gorka and Mayor, Aingeru and Sarasola, Kepa and Zubiaga, Arkaitz},
year = {2014},
urldate = {2014-05-05},
internal-note = {Title normalized to this file's Title Case convention and `Wikipedia' brace-protected against style recasing.},
file = {/Users/francojc/Zotero/storage/PFENS7EZ/Alegria et al. - 2014 - Wikipedia and Machine Translation killing two birds with one stone.pdf}
}
@misc{Almeida2011,
title = {{SMS} Spam Collection},
author = {Almeida, Tiago A. and G{\'o}mez Hidalgo, Jos{\'e} Mar{\'i}a},
year = {2011},
howpublished = {SMS Spam Collection v. 1},
url = {https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/},
urldate = {2021-07-08},
internal-note = {Fixed surname typo Hildago -> Hidalgo; moved the invalid `journal' field on this @misc to `howpublished'; dropped `shorttitle' that exactly duplicated `title'.},
abstract = {The SMS Spam Collection v.1 is a public set of SMS labeled messages that have been collected for mobile phone spam research. It has one collection composed by 5,574 English, real and non-encoded messages, tagged according being legitimate (ham) or spam.},
file = {/Users/francojc/Zotero/storage/DLNV7GK8/smsspamcollection.html}
}
@inproceedings{Almeida2011a,
title = {Contributions to the Study of {SMS} Spam Filtering: {New} Collection and Results},
booktitle = {Proceedings of the 2011 {{ACM Symposium}} on {{Document Engineering}} ({{DOCENG}}'11)},
author = {Almeida, Tiago A. and G{\'o}mez Hidalgo, Jos{\'e} Mar{\'i}a and Yamakami, Akebo},
year = {2011},
pages = {4},
address = {Mountain View, CA},
url = {https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/},
internal-note = {Fixed surname typo Hildago -> Hidalgo and added the missing period to the initial in `Tiago A.' for consistency with the companion entry. NOTE(review): pages = {4} looks like a page count, not a page range -- verify against the DocEng'11 proceedings.},
abstract = {The growth of mobile phone users has lead to a dramatic increasing of SMS spam messages. In practice, fighting mobile phone spam is difficult by several factors, including the lower rate of SMS that has allowed many users and service providers to ignore the issue, and the limited availability of mobile phone spam-filtering software. On the other hand, in academic settings, a major handicap is the scarcity of public SMS spam datasets, that are sorely needed for validation and comparison of different classifiers. Moreover, as SMS messages are fairly short, content-based spam filters may have their performance degraded. In this paper, we offer a new real, public and non-encoded SMS spam collection that is the largest one as far as we know. Moreover, we compare the performance achieved by several established machine learning methods. The results indicate that Support Vector Machine outperforms other evaluated classifiers and, hence, it can be used as a good baseline for further comparison.},
langid = {english},
file = {/Users/francojc/Zotero/storage/4NRMQS4L/Almeida et al. - Contributions to the Study of SMS Spam Filtering .pdf}
}
@book{Alpaydin2004,
title = {Introduction to Machine Learning},
author = {Alpaydin, Ethem},
year = {2004},
edition = {Second},
publisher = {MIT Press},
address = {Cambridge, MA},
url = {http://books.google.com/books?hl=en&lr=&id=1k0_-WroiqEC&oi=fnd&pg=PR13&dq=Introduction+to+Machine+Learning&ots=p94DZSgFwO&sig=vDDNGh5k63XWekU_kVjRFLMZNSE},
urldate = {2014-02-16},
isbn = {978-0-262-01243-0},
internal-note = {Repaired truncated edition value `Second Edi'; expanded author initial to full name; added publisher. NOTE(review): the second edition appeared in 2010 while this entry says 2004 -- confirm which edition/year is intended.},
file = {/Users/francojc/Zotero/storage/P3XAB7KE/Alpaydin - 2004 - Introduction to Machine Learning.pdf}
}
@article{Anderwald2010,
title = {Are Non-Standard Dialects More `Natural' than the Standard? {{A}} Test Case from {{English}} Verb Morphology},
author = {Anderwald, Lieselotte},
year = {2010},
month = jul,
journal = {Journal of Linguistics},
volume = {47},
number = {2},
pages = {251--274},
issn = {0022-2267},
doi = {10.1017/S0022226710000241},
urldate = {2013-10-30},
internal-note = {Removed bogus isbn = {0022226710} (the journal's ISSN without hyphen, misfiled as an ISBN) and un-zero-padded the issue number.}
}
@article{Argamon2019,
  author   = {Argamon, Shlomo},
  title    = {Register in Computational Language Research},
  journal  = {Register Studies},
  volume   = {1},
  number   = {1},
  pages    = {100--135},
  year     = {2019},
  issn     = {2542-9477},
  doi      = {10.1075/rs.18015.arg},
  abstract = {Shlomo Argamon is Professor of Computer Science and Director of the Master of Data Science Program at the Illinois Institute of Technology (USA). In this article, he reflects on the current and potential relationship between register and the field of computational linguistics. He applies his expertise in computational linguistics and machine learning to a variety of problems in natural language processing. These include stylistic variation, forensic linguistics, authorship attribution, and biomedical informatics. He is particularly interested in the linguistic structures used by speakers and writers, including linguistic choices that are influenced by social variables such as age, gender, and register, as well as linguistic choices that are unique or distinctive to the style of individual authors. Argamon has been a pioneer in computational linguistics and NLP research in his efforts to account for and explore register variation. His computational linguistic research on register draws inspiration from Systemic Functional Linguistics, Biber's multi-dimensional approach to register variation, as well as his own extensive experience accounting for variation within and across text types and authors. Argamon has applied computational methods to text classification and description across registers~-- including blogs, academic disciplines, and news writing~-- as well as the interaction between register and other social variables, such as age and gender. His cutting-edge research in these areas is certain to have a lasting impact on the future of computational linguistics and NLP.},
  file     = {/Users/francojc/Zotero/storage/TVMFBYDG/Argamon - 2019 - Register in computational language research.pdf}
}
@article{Arnold2017,
  author        = {Arnold, Taylor},
  title         = {A Tidy Data Model for Natural Language Processing Using {{cleanNLP}}},
  journal       = {The R Journal},
  pages         = {1--20},
  year          = {2017},
  eprint        = {1703.09570},
  archiveprefix = {arXiv},
  abstract      = {The package cleanNLP provides a set of fast tools for converting a textual corpus into a set of normalized tables. The underlying natural language processing pipeline utilizes Stanford's CoreNLP library, exposing a number of annotation tasks for text written in English, French, German, and Spanish. Annotators include tokenization, part of speech tagging, named entity recognition, entity linking, sentiment analysis, dependency parsing, coreference resolution, and information extraction.},
  file          = {/Users/francojc/Zotero/storage/2XA5HMVH/Arnold - 2017 - A Tidy Data Model for Natural Language Processing .pdf}
}
@article{Arnon2010,
  author    = {Arnon, Inbal and Snider, Neal},
  title     = {More than Words: {Frequency} Effects for Multi-Word Phrases},
  journal   = {Journal of Memory and Language},
  volume    = {62},
  number    = {1},
  pages     = {67--82},
  year      = {2010},
  publisher = {Elsevier},
  urldate   = {2012-01-16},
  abstract  = {There is mounting evidence that language users are sensitive to distributional information at many grain-sizes. Much of this research has focused on the distributional properties of words, the units they consist of (morphemes, phonemes), and the syntactic structures they appear in (verb-categorization frames, syntactic constructions). In a series of studies we show that comprehenders are also sensitive to the frequencies of compositional four-word phrases (e.g. don't have to worry): more frequent phrases are processed faster. The effect is not reducible to the frequency of the individual words or substrings and is observed across the entire frequency range (for low, mid and high frequency phrases). Comprehenders seem to learn and store frequency information about multi-word phrases. These findings call for processing models that can capture and predict phrase-frequency effects and support accounts where linguistic knowledge consists of patterns of varying sizes and levels of abstraction. {\copyright} 2009 Elsevier Inc. All rights reserved.},
  file      = {/Users/francojc/Zotero/storage/7MHBNQVM/Arnon and Snider - 2010 - More than words Frequency effects for multi-word .pdf}
}
@article{Asao2002,
  author   = {Asao, Kojiro},
  title    = {Communication Strategies of {EFL} Learners: A Corpus-Based Approach},
  journal  = {Language and Computers},
  volume   = {12},
  pages    = {291--302},
  year     = {2002},
  url      = {http://www.ingentaconnect.com/content/rodopi/lang/2002/00000038/00000001/art00020},
  urldate  = {2011-10-25},
  abstract = {This paper examines a number of communication strategies that are deployed by EFL learners when writing English. Students learning a foreign language do not, of course, have a complete repertory of vocabulary and grammar; therefore, when faced with the need for a word that they do not know, they will try to resolve any ambiguity by deploying various communication strategies. These strategies include using approximations of meaning or circumlocutions, or simply switching to their mother tongue. Most studies in this field have been in the form of episodic reference. This study, however, focuses on the types of communication strategy that are most frequently used by certain EFL learners in a given situation, and on why those strategies are preferred. This study offers an analysis of a task-based corpus that was created specifically for this purpose.},
  file     = {/Users/francojc/Zotero/storage/7Y7D5QEN/Asao - 2002 - Communication Strategies of EFL Learners A Corpus-based Approach.pdf}
}
@article{Atkins1992,
title = {Corpus Design Criteria},
author = {Atkins, Sue and Clear, Jeremy and Ostler, Nicholas},
year = {1992},
journal = {Literary and Linguistic Computing},
volume = {7},
number = {1},
pages = {1--16},
internal-note = {Expanded bare initials `Clear, J' and `Ostler, N' to the full names used in the published article, matching this file's full-name convention.},
file = {/Users/francojc/Zotero/storage/HXI8INCF/Atkins, Clear, Ostler - 1992 - Corpus Design Criteria.pdf}
}
@article{Baayen1991,
title = {Productivity and {English} Derivation: A Corpus-Based Study},
shorttitle = {Productivity and English Derivation},
author = {Baayen, R. Harald and Lieber, Rochelle},
year = {1991},
journal = {Linguistics},
volume = {29},
number = {5},
pages = {801--843},
publisher = {Walter de Gruyter},
internal-note = {Entry lacked journal/volume/pages entirely; added the published venue (Linguistics 29(5), 801--843) and repaired the duplicated publisher string `Walter de Gruyter, Berlin/New York Berlin, New York'.},
keywords = {/unread,corpus,distributions,morphology,productivity},
file = {/Users/francojc/Zotero/storage/JGT7ZBPT/Baayen_Lieber_1991_Productivity and English derivation.pdf}
}
@misc{Baayen1993,
title = {The {CELEX} Lexical Database ({CD-ROM})},
author = {Baayen, R. Harald and Piepenbrock, R. and {van Rijn}, H.},
year = {1993},
howpublished = {Linguistic Data Consortium, University of Pennsylvania, Philadelphia, PA},
internal-note = {Was @article with the distributor in `journal'; a CD-ROM database is not a journal article, so @misc + howpublished is the correct form. Citation key unchanged.}
}
@article{Baayen2004,
title = {Statistics in Psycholinguistics: A Critique of Some Current Gold Standards},
author = {Baayen, R. Harald},
year = {2004},
journal = {Mental Lexicon Working Papers},
volume = {1},
number = {1},
pages = {1--47},
abstract = {This paper presents a detailed critique of some current gold standards for the statistical analysis of experimental data in psycholinguistics. A series of examples illustrates (1) the disadvantages of reducing numerical variables to factors and the importance of including available covariates in the model, (2) the advantages of using multilevel models instead of the traditional by-subject and by-item procedures and the quasi-F test, and (3) the relevance of logistic models for binary data such as the error measure in decision tasks.},
internal-note = {Repaired PDF-extraction hyphenation artifacts in the abstract (`stat- istical', `il- lustrates', `us- ing', `by-subjectand', `modelsfor').},
file = {/Users/francojc/Zotero/storage/H5AXH88P/Baayen - 2004 - Statistics in Psycholinguistics a critique of some current gold standards.pdf}
}
@article{Baayen2006,
title = {Morphological Influences on the Recognition of Monosyllabic Monomorphemic Words},
author = {Baayen, R. Harald and Feldman, Laurie and Schreuder, Robert},
year = {2006},
journal = {Journal of Memory and Language},
volume = {55},
pages = {290--313},
issn = {0749-596X},
doi = {10.1016/j.jml.2006.03.008},
abstract = {Balota et al. [Balota, D., Cortese, M., Sergent-Marshall, S., Spieler, D., \& Yap, M. (2004). Visual word recognition for single-syllable words. Journal of Experimental Psychology: General, 133, 283-316] studied lexical processing in word naming and lexical decision using hierarchical multiple regression techniques for a large data set of monosyllabic, morphologically simple words. The present study supplements their work by making use of more flexible regression techniques that are better suited for dealing with collinearity and non-linearity, and by documenting the contributions of several variables that they did not take into account. In particular, we included measures of morphological connectivity, as well as a new frequency count, the frequency of a word in speech rather than in writing. The morphological measures emerged as strong predictors in visual lexical decision, but not in naming, providing evidence for the importance of morphological connectivity even for the recognition of morphologically simple words. Spoken frequency was predictive not only for naming but also for visual lexical decision. In addition, it co-determined subjective frequency estimates and norms for age of acquisition. Finally, we show that frequency predominantly reflects conceptual familiarity rather than familiarity with a word's form. {\copyright} 2006.},
internal-note = {Hyphenated the ISSN (0749596X -> 0749-596X), removed the bogus isbn field that duplicated the ISSN, and restored the garbled copyright mark `?? 2006' as {\copyright} 2006 (matching the Arnon2010 entry).},
keywords = {BNC corpus,CELEX database,corpus,entropy,frequency effects,inflectional family size,lexical access,morphological family size,morphology},
file = {/Users/francojc/Zotero/storage/XPKPYIF3/Baayen et al. - 2006 - Morphological influences on the recognition of monosyllabic monomorphemic words.pdf}
}
@book{Baayen2008a,
  author    = {Baayen, R. Harald},
  title     = {Analyzing Linguistic Data: A Practical Introduction to Statistics Using {R}},
  publisher = {Cambridge University Press},
  year      = {2008},
  urldate   = {2012-01-09},
  file      = {/Users/francojc/Zotero/storage/EQGX7MQU/Baayen - 2008 - Analyzing linguistic data A practical introductio.pdf}
}
@article{Baayen2010,
title = {A Real Experiment Is a Factorial Experiment?},
author = {Baayen, R. Harald},
year = {2010},
month = jun,
journal = {The Mental Lexicon},
volume = {5},
number = {1},
pages = {149--157},
issn = {1871-1340}
}
@article{Baayen2011,
  author    = {Baayen, R. Harald},
  title     = {Corpus Linguistics and Naive Discriminative Learning},
  journal   = {Revista Brasileira de Lingu{\'i}stica Aplicada},
  volume    = {11},
  number    = {2},
  pages     = {295--328},
  year      = {2011},
  publisher = {SciELO Brasil},
  keywords  = {corpus linguistics,dative alternation,datives,discriminative learning classifier,machine learning,memory based learning,support vector machines,switchboard},
  file      = {/Users/francojc/Zotero/storage/GC52JFVV/Baayen - 2011 - Corpus linguistics and naive discriminative learning.pdf}
}
@manual{Baayen2019,
  author = {Baayen, R. Harald and {Shafaei-Bajestan}, Elnaz},
  title  = {{{languageR}}: {Analyzing} Linguistic Data: A Practical Introduction to Statistics},
  year   = {2019},
  type   = {Manual},
  url    = {https://CRAN.R-project.org/package=languageR}
}
@article{Baker2004,
title = {A Corpus-Based View of Similarity and Difference in Translation},
author = {Baker, Mona},
year = {2004},
month = jan,
journal = {International Journal of Corpus Linguistics},
volume = {9},
number = {2},
pages = {167--193},
issn = {1384-6655},
doi = {10.1075/ijcl.9.2.02bak}
}
@article{Baker2016,
  author    = {Baker, Monya},
  title     = {1,500 Scientists Lift the Lid on Reproducibility},
  journal   = {Nature},
  volume    = {533},
  number    = {7604},
  pages     = {452--454},
  year      = {2016},
  month     = may,
  publisher = {Nature Publishing Group},
  issn      = {1476-4687},
  doi       = {10.1038/533452a},
  urldate   = {2024-01-19},
  abstract  = {Survey sheds light on the `crisis' rocking research.},
  copyright = {2016 Springer Nature Limited},
  langid    = {english},
  keywords  = {crisis,Peer review,Publishing,reproducibility,research,Research management,survey},
  file      = {/Users/francojc/Zotero/storage/BNP9YHMI/Baker - 2016 - 1,500 scientists lift the lid on reproducibility.pdf}
}
@inproceedings{Bamman2014,
title = {A {Bayesian} Mixed Effects Model of Literary Character},
author = {Bamman, David and Underwood, Ted and Smith, Noah A.},
year = {2014},
booktitle = {Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics ({ACL} 2014)},
pages = {370--379},
internal-note = {Was @article with the proceedings title in `journal'; a conference paper belongs in @inproceedings with `booktitle'. Citation key unchanged.},
abstract = {We consider the problem of automatically inferring latent character types in a collection of 15,099 English novels published between 1700 and 1899. Unlike prior work in which character types are assumed responsible for probabilistically generating all text associated with a character, we introduce a model that employs multiple effects to account for the influence of extra-linguistic information (such as author). In an empirical evaluation, we find that this method leads to improved agreement with the preregistered judgments of a literary scholar, complementing the results of alternative models.},
isbn = {9781937284725},
file = {/Users/francojc/Zotero/storage/GTFI9KLP/Bamman et al. - 2014 - A Bayesian Mixed Effects Model of Literary Charact.pdf}
}
@inproceedings{Bamman2014a,
title = {Learning Latent Personas of Film Characters},
booktitle = {Proceedings of the 51st {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{ACL}} 2013)},
author = {Bamman, David and O'Connor, Brendan and Smith, Noah A.},
year = {2013},
pages = {352--361},
abstract = {We present two latent variable models for learning character types, or personas, in film, in which a persona is defined as a set of mixtures over latent lexical classes. These lexical classes capture the stereotypical actions of which a character is the agent and patient, as well as attributes by which they are described. As the first attempt to solve this problem explicitly, we also present a new dataset for the text-driven analysis of film, along with a benchmark testbed to help drive future work in this area.},
isbn = {978-1-937284-50-3},
file = {/Users/francojc/Zotero/storage/3U24LGP8/Bamman, O'Connor, Smith - 2014 - Learning Latent Personas of Film Characters.pdf}
}
@article{Bamman2014b,
title = {Gender Identity and Lexical Variation in Social Media},
author = {Bamman, David and Eisenstein, Jacob and Schnoebelen, Tyler},
year = {2014},
journal = {Journal of Sociolinguistics},
volume = {18},
number = {2},
pages = {135--160},
url = {http://onlinelibrary.wiley.com/doi/10.1111/josl.12080/full},
urldate = {2014-04-26},
file = {/Users/francojc/Zotero/storage/VYASHJT2/Bamman, Eisenstein, Schnoebelen - 2014 - Gender identity and lexical variation in social media.pdf}
}
@article{Bao2019,
title = {Integration of Unsupervised and Supervised Machine Learning Algorithms for Credit Risk Assessment},
author = {Bao, Wang and Lianju, Ning and Yue, Kong},
year = {2019},
month = aug,
journal = {Expert Systems with Applications},
volume = {128},
pages = {301--315},
issn = {0957-4174},
doi = {10.1016/j.eswa.2019.02.033},
urldate = {2021-08-29},
abstract = {For the sake of credit risk assessment, credit scoring has become a critical tool to discriminate ``bad'' applicants from ``good'' applicants for financial institutions. Accordingly, a wide range of supervised machine learning algorithms have been successfully applied to credit scoring; however, integration of unsupervised learning with supervised learning in this field has drawn little consideration. In this work, we propose a combination strategy of integrating unsupervised learning with supervised learning for credit risk assessment. The difference between our work and other previous work on unsupervised integration is that we apply unsupervised learning techniques at two different stages: the consensus stage and dataset clustering stage. Comparisons of model performance are performed based on three credit datasets in four groups: individual models, individual models\,+\,consensus model, clustering\,+\,individual models, clustering\,+\,individual models\,+\,consensus model. As a result, integration at either the consensus stage or dataset clustering stage is effective on improving the performance of credit scoring models. Moreover, the combination of the two stages achieves the best performance, thereby confirming the superiority of the proposed integration of unsupervised and supervised machine learning algorithms, which boost our confidence that this strategy can be extended to many other credit datasets from financial institutions.},
langid = {english},
file = {/Users/francojc/Zotero/storage/VBRHQE3F/Bao et al. - 2019 - Integration of unsupervised and supervised machine.pdf;/Users/francojc/Zotero/storage/V7ZXT5UL/S0957417419301472.html}
}
@incollection{Baroni2008,
title = {Statistical Methods for Corpus Exploitation},
booktitle = {Corpus {{Linguistics}}. {{An International Handbook}}},
author = {Baroni, Marco and Evert, Stefan},
year = {2008},
pages = {777--803},
publisher = {Mouton de Gruyter},
file = {/Users/francojc/Zotero/storage/N6U5MXYX/Baroni, Evert - 2008 - Statistical methods for corpus exploitation.pdf}
}
@book{Baumer2017,
title = {Modern Data Science with {{R}}},
author = {Baumer, Benjamin S. and Kaplan, Daniel T. and Horton, Nicholas J.},
year = {2017},
month = mar,
publisher = {CRC Press},
abstract = {Modern Data Science with R is a comprehensive data science textbook for undergraduates that incorporates statistical and computational thinking to solve real-world problems with data. Rather than focus exclusively on case studies or programming syntax, this book illustrates how statistical programming in the state-of-the-art R/RStudio computing environment can be leveraged to extract meaningful information from a variety of data in the service of addressing compelling statistical questions. Contemporary data science requires a tight integration of knowledge from statistics, computer science, mathematics, and a domain of application. This book will help readers with some background in statistics and modest prior experience with coding develop and practice the appropriate skills to tackle complex data science projects. The book features a number of exercises and has a flexible organization conducive to teaching a variety of semester courses.},
googlebooks = {NrddDgAAQBAJ},
isbn = {978-1-4987-2449-4},
langid = {english}
}
@article{Becher2011,
title = {Explicitation and Implicitation in Translation: A Corpus-Based Study of {{English-German}} and {{German-English}} Translations of Business Texts},
author = {Becher, Viktor},
year = {2011},
url = {http://d-nb.info/102042673X/34},
urldate = {2013-09-10}
}
@book{Beckerman2017,
title = {Getting Started with {{R}}: {{An}} Introduction for Biologists},
author = {Beckerman, Andrew P. and Childs, Dylan Z. and Petchey, Owen L.},
year = {2017},
edition = {Second},
publisher = {Oxford University Press},
abstract = {R is rapidly becoming the standard software for statistical analyses, graphical presentation of data, and programming in the natural, physical, social, and engineering sciences. Getting Started with R is now the go-to introductory guide for biologists wanting to learn how to use R in their research. It teaches readers how to import, explore, graph, and analyse data, while keeping them focused on their ultimate goals: clearly communicating their data in oral presentations, posters, papers, and reports. It provides a consistent workflow for using R that is simple, efficient, reliable, and reproducible. This second edition has been updated and expanded while retaining the concise and engaging nature of its predecessor, offering an accessible and fun introduction to the packages dplyr and ggplot2 for data manipulation and graphing. It expands the set of basic statistics considered in the first edition to include new examples of a simple regression, a one-way and a two-way ANOVA. Finally, it introduces a new chapter on the generalised linear model. Getting Started with R is suitable for undergraduates, graduate students, professional researchers, and practitioners in the biological sciences.},
isbn = {978-0-19-878784-6}
}
@misc{Bellinger2004,
title = {Data, Information, Knowledge and Wisdom},
author = {Bellinger, Gene and Castro, Durval and Mills, Anthony},
year = {2004},
journal = {Systems Thinking},
url = {http://www.systems-thinking.org/dikw/dikw.htm},
urldate = {2019-06-18},
file = {/Users/francojc/Zotero/storage/QX4T637U/Bellinger, Castro, Mills - 2004 - Data, information, knowledge and wisdom.pdf}
}
@manual{Benoit2020,
type = {Manual},
title = {Quanteda.Corpora: A Collection of Corpora for Quanteda},
author = {Benoit, Kenneth},
year = {2020},
url = {http://github.com/quanteda/quanteda.corpora}
}
@article{Bentz2014,
title = {Zipf's Law and the Grammar of Languages: A Quantitative Study of Old and Modern {{English}} Parallel Texts},
author = {Bentz, Christian and Kiela, Douwe and Hill, Felix and Buttery, Paula},
year = {2014},
journal = {Corpus Linguistics and Linguistic Theory},
volume = {10},
number = {2},
pages = {175--211},
issn = {16137035},
doi = {10.1515/cllt-2014-0009},
abstract = {This paper reports a quantitative analysis of the relationship between word frequency distributions and morphological features in languages. We analyze a commonly-observed process of historical language change: The loss of inflected forms in favour of `analytic' periphrastic constructions. These tendencies are observed in parallel translations of the Book of Genesis in Old English and Modern English. We show that there are significant differences in the frequency distributions of the two texts, and that parts of these differences are independent of total number of words, style of translation, orthography or contents. We argue that they derive instead from the trade-off between synthetic inflectional marking in Old English and analytic constructions in Modern English. By exploiting the earliest ideas of Zipf, we show that the syntheticity of the language in these texts can be captured mathematically, a property we tentatively call their grammatical fingerprint. Our findings suggest implications for both the specific historical process of inflection loss and more generally for the characterization of languages based on statistical properties.},
isbn = {1613-7027}
}
@misc{Berez-Kroeker2017,
title = {A Survey of Current Reproducibility Practices in Linguistics Journals, 2003-2012},
author = {{Berez-Kroeker}, Andrea L. and Gawne, Lauren and Kelly, Barbara F. and Heston, Tyler},
year = {2017},
url = {https://sites.google.com/a/hawaii.edu/data-citation/survey}
}
@article{Berez-Kroeker2018,
title = {Reproducible Research in Linguistics: A Position Statement on Data Citation and Attribution in Our Field},
author = {{Berez-Kroeker}, Andrea L. and Gawne, Lauren and Kung, Susan Smythe and Kelly, Barbara F. and Heston, Tyler and Holton, Gary and Pulsifer, Peter and Beaver, David I. and Chelliah, Shobhana and Dubinsky, Stanley and Meier, Richard P. and Thieberger, Nick and Rice, Keren and Woodbury, Anthony C.},
year = {2018},
journal = {Linguistics},
volume = {56},
number = {1},
pages = {1--18},
issn = {00243949},
doi = {10.1515/ling-2017-0032},
abstract = {This paper is a position statement on reproducible research in linguistics, including data citation and attribution, that represents the collective views of some 41 colleagues. Reproducibility can play a key role in increasing verification and accountability in linguistic research, and is a hallmark of social science research that is currently under-represented in our field. We believe that we need to take time as a discipline to clearly articulate our expectations for how linguistic data are managed, cited, and maintained for long-Term access.},
file = {/Users/francojc/Zotero/storage/KAEMYE3Q/Berez-Kroeker et al. - 2018 - Reproducible research in linguistics A position s.pdf}
}
@book{Bernard2016,
title = {Analyzing Qualitative Data: {{Systematic}} Approaches},
shorttitle = {Analyzing Qualitative Data},
author = {Bernard, H. Russell and Wutich, Amber and Ryan, Gery W.},
year = {2016},
month = jun,
publisher = {SAGE Publications},
abstract = {The fully updated Second Edition of Analyzing Qualitative Data: Systematic Approaches by H. Russell Bernard, Amber Wutich, and Gery W. Ryan presents systematic methods for analyzing qualitative data with clear and easy-to-understand steps. The first half is an overview of the basics, from choosing a topic to collecting data, and coding to finding themes, while the second half covers different methods of analysis, including grounded theory, content analysis, analytic induction, semantic network analysis, ethnographic decision modeling, and more. Real examples drawn from social science and health literature along with carefully crafted, hands-on exercises at the end of each chapter allow readers to master key techniques and apply them to their own disciplines.},
googlebooks = {yAi1DAAAQBAJ},
isbn = {978-1-4833-4711-0},
langid = {english}
}
@misc{Bialik2013,
title = {Data Crunchers Now the Cool Kids on Campus},
author = {Bialik, Carl},
year = {2013},
journal = {The Wall Street Journal},
url = {https://www.wsj.com/articles/SB10001424127887323478304578332850293360468},
urldate = {2019-06-17}
}
@article{Biber1987,
title = {A Textual Comparison of British and American Writing},
author = {Biber, Douglas},
year = {1987},
journal = {American Speech},
volume = {62},
number = {2},
eprint = {455273},
eprinttype = {jstor},
pages = {99--119},
issn = {00031283},
doi = {10.2307/455273},
urldate = {2024-04-09},
file = {/Users/francojc/Zotero/storage/PSBN6SR6/Biber - 1987 - A Textual Comparison of British and American Writing.pdf}
}
@article{Biber1993,
title = {Using Register-Diversified Corpora for General Language Studies},
author = {Biber, Douglas},
year = {1993},
journal = {Computational Linguistics},
volume = {19},
number = {2},
pages = {219--241},
urldate = {2014-06-09},
keywords = {academic writing,composition,corpus evaluation,corpus linguistics,pedagogy},
file = {/Users/francojc/Zotero/storage/NVSNATTH/Biber - 1993 - Using register-diversified corpora for general language studies.pdf}
}
@article{Biber1993a,
title = {Representativeness in Corpus Design},
author = {Biber, Douglas},
year = {1993},
journal = {Literary and Linguistic Computing},
volume = {8},
number = {4},
pages = {243--257},
file = {/Users/francojc/Zotero/storage/L5HGSN9J/Biber - 1993 - Representativeness in corpus design.pdf}
}
@article{Biber2004,
title = {If You Look at: {{Lexical}} Bundles in University Teaching and Textbooks},
author = {Biber, Douglas and Conrad, Susan and Cortes, Viviana},
year = {2004},
journal = {Applied Linguistics},
volume = {25},
number = {3},
pages = {371--405},
urldate = {2013-01-18},
file = {/Users/francojc/Zotero/storage/PUINMIRD/Biber et al. - 2004 - If you look at Lexical bundles in university teac.pdf}
}
@article{Biber2005,
title = {Merging Corpus Linguistic and Discourse Analytic Research Goals: {{Discourse}} Units in Biology Research Articles},
author = {Biber, Douglas and Jones, James K.},
year = {2005},
month = jan,
journal = {Corpus Linguistics and Linguistic Theory},
volume = {1},
number = {2},
pages = {151--182},
issn = {1613-7027},
doi = {10.1515/cllt.2005.1.2.151}
}
@book{Biber2006,
title = {University Language: A Corpus-Based Study of Spoken and Written Registers},
author = {Biber, Douglas},
year = {2006},
publisher = {John Benjamins},
url = {http://books.google.com/books?hl=en&lr=&id=-2zqpWi19h4C&oi=fnd&pg=PA1&dq=biber+2006+university+language+a+corpus&ots=PmhOEi57XY&sig=Bz0JysWYnN08AElZMC_BG04ytxQ},
urldate = {2014-11-06}
}
@article{Biber2006a,
title = {Stance in Spoken and Written University Registers},
author = {Biber, Douglas},
year = {2006},
journal = {Journal of English for Academic Purposes},
url = {http://www.sciencedirect.com/science/article/pii/S1475158506000075},
urldate = {2014-11-06}
}
@article{Biber2007,
title = {Lexical Bundles in University Spoken and Written Registers},
author = {Biber, Douglas and Barbieri, Federica},
year = {2007},
journal = {English for Specific Purposes},
url = {http://www.sciencedirect.com/science/article/pii/S0889490606000366},
urldate = {2014-11-06}
}
@article{Biber2009,
title = {A Corpus-Driven Approach to Formulaic Language in {{English}}: {{Multi-word}} Patterns in Speech and Writing},
author = {Biber, Douglas},
year = {2009},
journal = {International Journal of Corpus Linguistics},
url = {http://www.ingentaconnect.com/content/jbp/ijcl/2009/00000014/00000003/art00002},
urldate = {2014-11-06},
keywords = {collocations,corpus,corpus linguistics,formulaic language}
}
@article{Biber2010,
title = {Challenging Stereotypes about Academic Writing: {{Complexity}}, Elaboration, Explicitness},
author = {Biber, Douglas and Gray, Bethany},
year = {2010},
journal = {Journal of English for Academic Purposes},
url = {http://www.sciencedirect.com/science/article/pii/S1475158510000020},
urldate = {2014-11-06}
}
@misc{Blagotic2021,
title = {{{ProjectTemplate}}: {{Automates}} the Creation of New Statistical Analysis Projects},
shorttitle = {{{ProjectTemplate}}},
author = {Blagotic, Aleksandar and {Valle-Jones}, Diego and Breen, Jeffrey and Lundborg, Joakim and White, John Myles and Bode, Josh and White, Kenton and Mueller, Kirill and Redaelli, Matteo and Lorang, Noah and Schalk, Patrick and Schneider, Dominik and Hepp, Gerold and Jamile, Zunaira},
year = {2021},
month = feb,
url = {https://CRAN.R-project.org/package=ProjectTemplate},
urldate = {2021-07-20},
abstract = {Provides functions to automatically build a directory structure for a new R project. Using this structure, 'ProjectTemplate' automates data loading, preprocessing, library importing and unit testing.},
copyright = {GPL-3 {\textbar} file LICENSE}
}
@article{Blischak2019,
title = {Creating and Sharing Reproducible Research Code the Workflowr Way},
author = {Blischak, John D. and Carbonetto, Peter and Stephens, Matthew},
year = {2019},
journal = {F1000Research},
volume = {8},
publisher = {Faculty of 1000 Ltd},
file = {/Users/francojc/Zotero/storage/FY64FLHZ/PMC6833990.html}
}
@article{Bloomfield1926,
title = {A Set of Postulates for the Science of Language},
author = {Bloomfield, Leonard},
year = {1926},
journal = {Language},
volume = {2},
number = {3},
pages = {153--164},
issn = {00978507},
doi = {10.2307/408741},
abstract = {The method of postulates (that is, assumptions or axioms) and defini-tionst is fully adequate to mathematics; as for other sciences, the more complex their subject-matter, the less amenable are they to this method, since, under it, every descriptive or historical fact becomes the subject of a new postulate. Nevertheless, the postulational method can further the study of language, because it forces us to state explicitly whatever we assume, to define our terms, and to decide what things may exist independently and what things are interdependent.2 Certain errors can be avoided or corrected by examining and formu-lating our (at present tacit) assumptions and defining our (often unde-fined) terms.3 Also, the postulational method saves discussion, because it limits our statements to a defined terminology; in particular, it cuts us off from psychological dispute.4 Discussion of the fundamentals of our science 1 For a clear exposition of this method, see J. W. Young, Lectures on the Fundamental Concepts of Algebra and Geometry, New York 1911. 1 Cf. A. P. Weiss's set of postulates for psychology, Psychological Review. 32. 83. 3 Examples are many. Bopp took for granted that the formative elements of Indo-European were once independent words; this is a needless and unwarranted assumption. The last descendant of his error is the assumption that IE compound words are historically derived from phrases (Jacobi, Compositum und Nebensatz, Bonn 1897; this even in Brug-mann, Grundrisz I)I, 1, pp. 37. 78; cf. TAPA 45. 73 ff.). The notion is gaining ground that some forms have less meaning than others and are therefore more subject to phonetic change (Horn, Sprachkiirper und Sprachfunktion, Palaestra 135, Berlin 1921); I, for one, can discover no workable definition of the terms 'meaning' and 'phonetic change' under which this notion can be upheld. 
The whole dispute, perhaps today as unstilled as fifty years ago, about the regularity of phonetic change, is at bottom a question of terminology. *Recall the difficulties and obscurities in the writings of Humboldt and Steinthal, and the psychological dispute of Paul, Wundt, Delbrueck. From our point of view, the last-named was wrong in denying the value of descriptive data, but right in saying that it is indifferent what system of psychology a linguist believes in (Grundfragen der Sprach-forschung, Strassburg 1901). The trouble over the nature of the sentence is largely non-linguistic; contrast the simplicity and usefulness of Meillet's definition (adopted below), 153},
isbn = {00978507},
file = {/Users/francojc/Zotero/storage/4M54XRCC/Bloomfield - 1926 - A Set of Postulates for the Science of Language.pdf}
}
@misc{Bobbitt2021,
title = {Left Skewed vs. Right Skewed Distributions},
author = {Bobbitt, Zach},
year = {2021},
month = jan,
journal = {Statology},
url = {https://www.statology.org/left-skewed-vs-right-skewed/},
urldate = {2021-06-30},
abstract = {This tutorial explains the difference between left skewed and right skewed distributions, including several examples.},
langid = {american},
file = {/Users/francojc/Zotero/storage/KL3A8YQU/left-skewed-vs-right-skewed.html}
}
@article{Boettiger2017,
title = {An Introduction to Rocker: {{Docker}} Containers for {{R}}},
shorttitle = {An Introduction to Rocker},
author = {Boettiger, Carl and Eddelbuettel, Dirk},
year = {2017},
journal = {The R Journal},
volume = {9},
number = {2},
pages = {527},
issn = {2073-4859},
doi = {10.32614/RJ-2017-065},
urldate = {2023-06-22},
abstract = {We describe the Rocker project, which provides a widely-used suite of Docker images with customized R environments for particular tasks. We discuss how this suite is organized, and how these tools can increase portability, scaling, reproducibility, and convenience of R users and developers.},
langid = {english},
keywords = {/unread,containers,docker,images,r,reproducible research,rocker},
file = {/Users/francojc/Zotero/storage/YRNEPITC/Boettiger and Eddelbuettel - 2017 - An Introduction to Rocker Docker Containers for R.pdf}
}
@incollection{Bohmann2023,
title = {Contrastive Usage Profiling: A Word Vector Perspective on World {{Englishes}}},
shorttitle = {Contrastive Usage Profiling},
booktitle = {Language and {{Linguistics}} in a {{Complex World}}},
author = {Bohmann, Axel},
editor = {Busse, Beatrix and Warnke, Ingo H.},
year = {2023},
volume = {32},
pages = {11--30},
publisher = {De Gruyter},
file = {/Users/francojc/Zotero/storage/DCVMAHBR/Bohmann - 2023 - Contrastive Usage Profiling A Word Vector Perspec.pdf}
}
@article{Bolibaugh2021,
title = {Towards a Credibility Revolution in Bilingualism Research: {{Open}} Data and Materials as Stepping Stones to More Reproducible and Replicable Research},
shorttitle = {Towards a Credibility Revolution in Bilingualism Research},
author = {Bolibaugh, Cylcia and Vanek, Norbert and Marsden, Emma J.},
year = {2021},
month = nov,
journal = {Bilingualism: Language and Cognition},
volume = {24},
number = {5},
pages = {801--806},
publisher = {Cambridge University Press},
issn = {1366-7289, 1469-1841},
doi = {10.1017/S1366728921000535},
urldate = {2021-12-16},
abstract = {The extent to which findings in bilingualism research are contingent on specific analytic choices, experimental designs, or operationalisations, is currently unknown. Poor availability of data, analysis code, and materials has hindered the development of cumulative lines of research. In this review, we survey current practices and advocate a credibility revolution in bilingualism research through the adoption of minimum standards of transparency. Full disclosure of data and code is necessary not only to assess the reproducibility of original findings, but also to test the robustness of these findings to different analytic specifications. Similarly, full provision of experimental materials and protocols underpins assessment of both the replicability of original findings, as well as their generalisability to different contexts and samples. We illustrate the review with examples where good practice has advanced the agenda in bilingualism research and highlight resources to help researchers get started.},
langid = {english},
file = {/Users/francojc/Zotero/storage/V9MNKAR4/Bolibaugh et al. - 2021 - Towards a credibility revolution in bilingualism r.pdf;/Users/francojc/Zotero/storage/AP8EMQ5X/C4FC0550EE4537D8603942419B288C6E.html}
}
@article{Bolukbasi2016,
title = {Man Is to Computer Programmer as Woman Is to Homemaker? {{Debiasing}} Word Embeddings},
author = {Bolukbasi, Tolga and Chang, Kai-Wei and Zou, James and Saligrama, Venkatesh and Kalai, Adam},
year = {2016},
journal = {arXiv},
eprint = {1607.06520},
issn = {10495258},
url = {http://arxiv.org/abs/1607.06520},
abstract = {The blind application of machine learning runs the risk of amplifying biases present in data. Such a danger is facing us with word embedding, a popular framework to represent text data as vectors which has been used in many machine learning and natural language processing tasks. We show that even word embeddings trained on Google News articles exhibit female/male gender stereotypes to a disturbing extent. This raises concerns because their widespread use, as we describe, often tends to amplify these biases. Geometrically, gender bias is first shown to be captured by a direction in the word embedding. Second, gender neutral words are shown to be linearly separable from gender definition words in the word embedding. Using these properties, we provide a methodology for modifying an embedding to remove gender stereotypes, such as the association between between the words receptionist and female, while maintaining desired associations such as between the words queen and female. We define metrics to quantify both direct and indirect gender biases in embeddings, and develop algorithms to "debias" the embedding. Using crowd-worker evaluation as well as standard benchmarks, we empirically demonstrate that our algorithms significantly reduce gender bias in embeddings while preserving the its useful properties such as the ability to cluster related concepts and to solve analogy tasks. The resulting embeddings can be used in applications without amplifying gender bias.},
archiveprefix = {arXiv},
file = {/Users/francojc/Zotero/storage/LRHQ2CDD/Bolukbasi et al. - 2016 - Man is to Computer Programmer as Woman is to Homem.pdf}
}
@article{Bosch1998,
title = {Separating Hyperplanes and the Authorship of the Disputed Federalist Papers},
author = {Bosch, Robert A. and Smith, Jason A.},
year = {1998},
journal = {American Mathematical Monthly},
volume = {105},
number = {7},
pages = {601--608},
issn = {00029890},
doi = {10.2307/2589242},
urldate = {2016-08-26},
isbn = {0002-9890},
file = {/Users/francojc/Zotero/storage/IM2DKRV4/Bosch, Smith - 1998 - Separating hyperplanes and the authorship of the disputed federalist papers.pdf}
}
@misc{Bowman2020,
title = {{{OSF}} Prereg Template},
author = {Bowman, Sara and DeHaven, Alexander C. and Errington, Timothy M. and Hardwicke, Tom E. and Mellor, David Thomas and Nosek, Brian A. and Soderberg, Courtney K.},
year = {2020},
month = jan,
publisher = {MetaArXiv},
doi = {10.31222/osf.io/epgjd},
urldate = {2023-07-18},
abstract = {Preregistration is the act of submitting a study plan, ideally also with analytical plan, to a registry prior to conducting the work. Preregistration increases the discoverability of research even if it does not get published further. Adding specific analysis plans can clarify the distinction between planned, confirmatory tests and unplanned, exploratory research. This preprint contains a template for the ``OSF Prereg'' form available from the OSF Registry. An earlier version was originally developed for the Preregistration Challenge, an education campaign designed to initiate preregistration as a habit prior to data collection in basic research, funded by the Laura and John Arnold Foundation (now Arnold Ventures) and conducted by the Center for Open Science. More information is available at https://cos.io/prereg, and other templates are available at: https://osf.io/zab38/},
langid = {american},
keywords = {/unread,Bioethics and Medical Ethics,harking,Medicine and Health Sciences,open science,OSF,Other Social and Behavioral Sciences,p-hacking,Physical Sciences and Mathematics,preregistration,qrp,registration,research design,Social and Behavioral Sciences,Statistics and Probability,template},
file = {/Users/francojc/Zotero/storage/DS2KXP4S/Bowman et al. - 2020 - OSF Prereg Template.pdf}
}
@article{Bransford1972,
title = {Sentence Memory: A Constructive versus Interpretive Approach},
author = {Bransford, John D. and Barclay, J. Richard and Franks, Jeffery J.},
year = {1972},
journal = {Cognitive Psychology},
volume = {3},
pages = {193--209},
url = {http://www.sciencedirect.com/science/article/pii/0010028572900035},
urldate = {2013-10-31},
file = {/Users/francojc/Zotero/storage/KMVH97PM/Bransford, Barclay, Franks - 1972 - Sentence Memory A Constructive Versus Interpretive Approach.pdf}
}
@article{Breeze2013,
title = {Lexical Bundles across Four Legal Genres},
author = {Breeze, Ruth},
year = {2013},
month = jan,
journal = {International Journal of Corpus Linguistics},
volume = {18},
number = {2},
pages = {229--253},
issn = {13846655},
doi = {10.1075/ijcl.18.2.03bre},
urldate = {2013-10-30},
file = {/Users/francojc/Zotero/storage/IZANG9XU/Breeze - 2013 - Lexical bundles across four legal genres.pdf}
}
@inproceedings{Bresnan2007,
title = {Predicting the Dative Alternation},
booktitle = {Cognitive {{Foundations}} of {{Interpretation}}},
author = {Bresnan, Joan and Cueni, Anna and Nikitina, Tatiana and Baayen, R. Harald},
editor = {Bouma, G. and Kraemer, I. and Zwart, Jan-Wouter C},
year = {2007},
pages = {1--33},
publisher = {KNAW},
address = {Amsterdam},
file = {/Users/francojc/Zotero/storage/HCIQA3VA/Bresnan et al. - 2007 - Predicting the Dative Alternation.pdf}
}
@article{Bresnan2007a,
title = {A Few Lessons from Typology},
author = {Bresnan, Joan},
year = {2007},
journal = {Linguistic Typology},
volume = {11},
number = {1},
pages = {297--306},
abstract = {Typology has a low profile in much of American linguistics, especially outside of phonology (Nichols 2007, Hyman 2007, Van Valin 2007). Yet, as I will suggest, the study of the results and methods of modern typology has important lessons for us as the field of linguistics undergoes a paradigm shift. Typologists study a wide range of language types, but I will show that even when one does theoretical work on a single, well-studied standardized national language like English, one can (and should) benefit from an awareness of typological findings. [ABSTRACT FROM AUTHOR]},
file = {/Users/francojc/Zotero/storage/VENLH3VL/Bresnan - 2007 - A few lessons from typology.pdf}
}
@article{Briand2009,
title = {A Similarity Measure to Assess the Stability of Classification Trees},
author = {Briand, B{\'e}n{\'e}dicte and Ducharme, Gilles R. and Parache, Vanessa and {Mercat-Rommens}, Catherine},
year = {2009},
journal = {Computational Statistics and Data Analysis},
volume = {53},
number = {4},
pages = {1208--1217},
issn = {01679473},
doi = {10.1016/j.csda.2008.10.033},
abstract = {It has been recognized that Classification trees (CART) are unstable; a small perturbation in the input variables or a fresh sample can lead to a very different classification tree. Some approaches exist that try to correct this instability. However, their benefits can, at present, be appreciated only qualitatively. A similarity measure between two classification trees is introduced that can measure their closeness. Its usefulness is illustrated with synthetic data on the impact of radioactivity deposit through the environment. In this context, a modified node level stabilizing technique, referred to as the NLS-REP method, is introduced and shown to be more stable than the classical CART method. {\copyright} 2008 Elsevier B.V. All rights reserved.},
isbn = {1532-4435},
pmid = {10204200}
}
@article{Broman2018,
title = {Data Organization in Spreadsheets},
author = {Broman, Karl W. and Woo, Kara H.},
year = {2018},
month = jan,
journal = {The American Statistician},
volume = {72},
number = {1},
pages = {2--10},
publisher = {Taylor \& Francis},
issn = {0003-1305},
doi = {10.1080/00031305.2017.1375989},
urldate = {2021-04-21},
abstract = {Spreadsheets are widely used software tools for data entry, storage, analysis, and visualization. Focusing on the data entry and storage aspects, this article offers practical recommendations for organizing spreadsheet data to reduce errors and ease later analyses. The basic principles are: be consistent, write dates like YYYY-MM-DD, do not leave any cells empty, put just one thing in a cell, organize the data as a single rectangle (with subjects as rows and variables as columns, and with a single header row), create a data dictionary, do not include calculations in the raw data files, do not use font color or highlighting as data, choose good names for things, make backups, use data validation to avoid data entry errors, and save the data in plain text files.},
file = {/Users/francojc/Zotero/storage/7ZA9YH76/Broman and Woo - 2018 - Data Organization in Spreadsheets.pdf;/Users/francojc/Zotero/storage/VLE2BK9E/00031305.2017.html}
}
@book{Brown2005,
title = {Encyclopedia of Language and Linguistics},
editor = {Brown, Keith},
year = {2005},
volume = {1},
publisher = {Elsevier},
internal-note = {NOTE(review): Brown is editor-in-chief of this edited reference work, so moved from author to editor; confirm against publisher record},
file = {/Users/francojc/Zotero/storage/KLU9E7UK/cxYGQfiD_1oC.html}
}
@article{Brown2018,
title = {Ten Quick Tips for Teaching Programming},
author = {Brown, Neil C. C. and Wilson, Greg},
year = {2018},
journal = {PLOS Computational Biology},
volume = {14},
number = {4},
pages = {e1006023},
issn = {1553-7358},
doi = {10.1371/journal.pcbi.1006023},
abstract = {Research from educational psychology suggests that teaching and learning are subject-specific activities [1]: learning programming has a different set of challenges and techniques than learning physics or learning to read and write. Computing is a younger discipline than mathematics, physics, or biology, and while there have been correspondingly fewer studies of how best to teach it, there is a growing body of evidence about what works and what doesn't. This paper presents 10 quick tips that should be the foundation of any teaching of programming, whether formal or informal.},
pmid = {29621229},
file = {/Users/francojc/Zotero/storage/6H69NUMY/Brown, Wilson - 2018 - Ten quick tips for teaching programming.pdf}
}
@article{Bryan2017,
title = {Excuse Me, Do You Have a Moment to Talk about Version Control?},
author = {Bryan, Jennifer},
year = {2017},
journal = {PeerJ Preprints},
volume = {5},
pages = {1--23},
issn = {2167-9843},
doi = {10.7287/peerj.preprints.3159v2},
abstract = {Data analysis, statistical research, and teaching statistics have at least one thing in common: these activities all produce many files! There are data files, source code, figures, tables, prepared reports, and much more. Most of these files evolve over the course of a project and often need to be shared with others, for reading or edits, as a project unfolds. Without explicit and structured management, project organization can easily descend into chaos, taking time away from the primary work and reducing the quality of the final product. This unhappy result can be avoided by repurposing tools and workflows from the software development world, namely, distributed version control. This article describes the use of the version control system Git and the hosting site GitHub for statistical and data scientific workflows. Special attention is given to projects that use the statistical language R and, optionally, R Markdown documents. Supplementary materials include an annotated set of links to step-by-step tutorials, real world examples, and other useful learning resources.},
file = {/Users/francojc/Zotero/storage/X3FQY98H/Bryan - 2017 - Excuse me, do you have a moment to talk about version control.pdf}
}
@book{Bryan2020,
  author   = {Bryan, Jennifer and Hester, Jim},
  title    = {Happy Git and {{GitHub}} for the {{useR}}},
  year     = {2020},
  url      = {https://happygitwithr.com/},
  urldate  = {2021-01-06},
  abstract = {Using Git and GitHub with R, Rstudio, and R Markdown},
  file     = {/Users/francojc/Zotero/storage/U778V6U4/happygitwithr.com.html},
}
@article{Brysbaert2011,
title = {Do the Effects of Subjective Frequency and Age of Acquisition Survive Better Word Frequency Norms?},
author = {Brysbaert, Marc and Cortese, Michael J.},
year = {2011},
month = mar,
journal = {Quarterly Journal of Experimental Psychology (2006)},
volume = {64},
number = {3},
eprint = {20700859},
eprinttype = {pubmed},
pages = {545--559},
issn = {1747-0226},
doi = {10.1080/17470218.2010.503374},
urldate = {2011-03-15},
abstract = {Megastudies with processing efficiency measures for thousands of words allow researchers to assess the quality of the word features they are using. In this article, we analyse reading aloud and lexical decision reaction times and accuracy rates for 2,336 words to assess the influence of subjective frequency and age of acquisition on performance. Specifically, we compare newly presented word frequency measures with the existing frequency norms of Kucera and Francis (1967), HAL (Burgess \& Livesay, 1998), Brysbaert and New (2009), and Zeno, Ivens, Millard, and Duvvuri (1995). We show that the use of the Kucera and Francis word frequency measure accounts for much less variance than the other word frequencies, which leaves more variance to be "explained" by familiarity ratings and age-of-acquisition ratings. We argue that subjective frequency ratings are no longer needed if researchers have good objective word frequency counts. The effect of age of acquisition remains significant and has an effect size that is of practical relevance, although it is substantially smaller than that of the first phoneme in naming and the objective word frequency in lexical decision. Thus, our results suggest that models of word processing need to utilize these recently developed frequency estimates during training or setting baseline activation levels in the lexicon.},
pmid = {20700859},
file = {/Users/francojc/Zotero/storage/Z5WJ4QH4/Brysbaert and Cortese - 2011 - Do the effects of subjective frequency and age of .pdf}
}
@incollection{Buckheit1995,
title = {{WaveLab} and Reproducible Research},
booktitle = {Wavelets and Statistics},
author = {Buckheit, Jonathan B. and Donoho, David L.},
editor = {Antoniadis, Anastasios and Oppenheim, Georges},
year = {1995},
series = {Lecture Notes in Statistics},
volume = {103},
pages = {55--81},
publisher = {Springer},
address = {New York},
internal-note = {NOTE(review): editor/series/volume/address added from the Springer volume record -- verify; title casing of WaveLab restored and brace-protected},
file = {/Users/francojc/Zotero/storage/Y5GFJYQD/Buckheit and Donoho - 1995 - Wavelab and reproducible research.pdf;/Users/francojc/Zotero/storage/HAIDPE44/978-1-4612-2544-7_5.html}
}
@techreport{Bukhari2020,
type = {{{SSRN Scholarly Paper}}},
title = {Data Science Curriculum: {{Current}} Scenario},
shorttitle = {Data Science Curriculum},
author = {Bukhari, Duaa},
year = {2020},
number = {3616600},
address = {Rochester, NY},
institution = {Social Science Research Network},
doi = {10.2139/ssrn.3616600},
internal-note = {NOTE(review): DOI added following the SSRN pattern 10.2139/ssrn.<abstract id> -- verify it resolves},
url = {https://papers.ssrn.com/abstract=3616600},
urldate = {2022-05-09},
abstract = {Companies desires for making productive discoveries from big data have motivated academic institutions offering variety of different data science (DS) programs, in order to increases their graduates' ability to be data scientists who are capable to face the challenges of the new age. These data science programs represent a combination of subject areas from several disciplines. There are few studies have examined data science programs within a particular discipline, such as Business (e.g. Chen et al.). However, there are very few empirical studies that investigate DS programs and explore its curriculum structure across disciplines. Therefore, this study examines data science programs offered by American universities. The study aims to depict the current state of data science education in the U.S. to explore what discipline DS programs covers at the graduate level. The current study conducted an exploratory content analysis of 30 DS programs in the United States from a variety of disciplines. The analysis was conducted on course titles and course descriptions level. The study results indicate that DS programs required varying numbers of credit hours, including practicum and capstone. Management schools seem to take the lead and the initiative in lunching and hosting DS programs. In addition, all DS programs requires the basic knowledge of database design, representation, extraction and management. Furthermore, DS programs delivered information skills through their core courses. Moreover, the study results show that almost 40 percent of required courses in DS programs is involved information representations, retrieval and programming. Additionally, DS programs required courses also addressed communication visualization and mathematics skills.},
langid = {english},
file = {/Users/francojc/Zotero/storage/5WWYDGNT/Bukhari - 2020 - Data Science Curriculum Current Scenario.pdf;/Users/francojc/Zotero/storage/47P9MGBH/papers.html}
}
@article{Bullock2021,
title = {Exploring a Loan Translation and Its Consequences in an Oral Bilingual Corpus},
author = {Bullock, Barbara E. and Serigos, Jacqueline and Toribio, Almeida Jacqueline},
year = {2021},
month = jul,
journal = {Journal of Language Contact},
volume = {13},
number = {3},
pages = {612--635},
publisher = {Brill},
issn = {1877-4091, 1955-2629},
doi = {10.1163/19552629-bja10027},
urldate = {2021-08-17},
abstract = {This work applies computational tools that have been used to model loanwords in newspaper corpora to an analysis of a loan translation in an oral bilingual corpus. The explicit goal of the contribution is to argue that a specific collocation found in a corpus of Spanish spoken in Texas, agarrar+NP (e.g., agarrar ayuda), is a loan translation that is calqued on English get+np support verb constructions (e.g., get help). We base our argument on the frequency and the linguistic distribution of the nonconventional usage within and between corpora and on the factors that favor its use. Our findings show that the overall frequency of agarrar is the same in Spanish in Texas as it is in the benchmark monolingual corpus of Mexican Spanish but that it is used differently in the two varieties, a difference that has grammatical, as well as semantic, ramifications.},
langid = {english},
file = {/Users/francojc/Zotero/storage/K2QLUXJT/Bullock et al. - 2021 - Exploring a Loan Translation and Its Consequences .pdf}
}
@inproceedings{Bunt2006,
title = {Dimensions in Dialogue Act Annotation},
booktitle = {Language {{Resources}} and {{Evaluation Conference}} ({LREC})},
author = {Bunt, Harry},
year = {2006},
pages = {919--924},
internal-note = {NOTE(review): conference name corrected from "Resource" to "Resources" (LREC); full proceedings title is "Proceedings of the Fifth International Conference on Language Resources and Evaluation" -- verify},
abstract = {This paper is concerned with the fundamentals of multidimensional dialogue act annotation, i.e. with what it means to annotate dialogues with information about the communicative acts that are performed with the utterances, taking various `dimensions' into account. Two ideas seem to be prevalent in the literature concerning the notion of dimension: (1) dimensions correspond to different types of information; and (2) a dimension is formed by a set of mutually exclusive tags. In DAMSL, for instance, the terms `dimension' and `layer' are used sometimes in the sense of (1) and sometimes in that of (2). We argue that being mutually exclusive is not a good criterion for a set of dialogue act types to constitute a dimension, even though the description of an object in a multidimensional space should never assign more than one value per dimension. We define a dimension of dialogue act annotation as an aspect of participating in a dialogue that can be addressed independently by means of dialogue acts. We show that DAMSL dimensions such as Info-request, Statement, and Answer do not qualify as proper dimensions, and that the communicative functions in these categories do not fall in any specific dimension, but should be considered as `general-purpose' in the sense that they can be used in any dimension. We argue that using the notion of dimension that we propose, a multidimensional taxonomy of dialogue acts emerges that optimally supports multidimensional dialogue act annotation.},
langid = {english},
file = {/Users/francojc/Zotero/storage/LDPEXTKN/Bunt - Dimensions in Dialogue Act Annotation.pdf}
}