@comment{textbook.bib -- BibTeX bibliography database. Text outside @entries is ignored by BibTeX.}
@incollection{Abney1996,
title = {Statistical Methods and Linguistics},
booktitle = {The Balancing Act: Combining Symbolic and Statistical Approaches to Language},
author = {Abney, Steven},
editor = {Klavans, Judith and Resnik, Philip},
year = {1996},
pages = {1--23},
publisher = {MIT Press},
address = {Cambridge, Massachusetts},
internal-note = {Was @article with the host book title in `journal'; this is a chapter in an edited volume, so @incollection + booktitle is the correct form.},
abstract = {is unusual as a mass noun, but can in fact be used as one, as for example in the article consisted of three pages of abstract and only two pages of actual text. One might object that the NP headed by might is bad because of the multiple postmodifiers, but in fact there is no absolute constraint against stacking nominal postmodifiers, and good examples can be constructed with the same structure: marlinespikes, business end up, sprinkled with tabasco sauce, can be a powerful deterrent against pigeons. Even the commas are not absolutely required. The strength of preference for them depends on how heavy the modifiers are: cf. strength judicially applied increases the effectiveness of diplomacy, a cup of peanuts unshelled in the stock adds character. 3 In short, the structure (3) seems to be best characterized as grammatical, though it violates any number of parsing preferences and is completely absurd. One might think that one could eliminate ambiguities by turning some of the dispref...},
file = {/Users/francojc/Zotero/storage/QT9V3PEU/Abney - 1996 - Statistical methods and linguistics.pdf}
}
@book{Abney2008,
title = {Semisupervised Learning for Computational Linguistics},
author = {Abney, Steven},
year = {2008},
publisher = {CRC Press},
address = {Boca Raton, FL},
isbn = {978-1-58488-559-7},
internal-note = {Removed journal/volume/issn/doi (10.1111/j.1467-985X.2009.00595_2.x): those fields identified a JRSS-A book review of this work, not the book itself.},
abstract = {The rapid advancement in the theoretical understanding of statistical and machine learning methods for semisupervised learning has made it difficult for nonspecialists to keep up to date in the field. Providing a broad, accessible treatment of the theory as well as linguistic applications, Semisupervised Learning for Computational Linguistics offers self-contained coverage of semisupervised methods that includes background material on supervised and unsupervised learning. The book presents a brief history of semisupervised learning and its place in the spectrum of learning methods before moving on to discuss well-known natural language processing methods, such as self-training and co-training. It then centers on machine learning techniques, including the boundary-oriented methods of perceptrons, boosting, support vector machines (SVMs), and the null-category noise model. In addition, the book covers clustering, the expectation-maximization (EM) algorithm, related generative methods, and agreement methods. It concludes with the graph-based method of label propagation as well as a detailed discussion of spectral methods. Taking an intuitive approach to the material, this lucid book facilitates the application of semisupervised learning methods to natural language processing and provides the framework and motivation for a more systematic study of machine learning.}
}
@article{Ackoff1989,
  author  = {Ackoff, Russell L.},
  title   = {From Data to Wisdom},
  journal = {Journal of Applied Systems Analysis},
  volume  = {16},
  number  = {1},
  pages   = {3--9},
  year    = {1989},
  file    = {/Users/francojc/Zotero/storage/AD8VE6XD/Ackoff - 1989 - From data to wisdom.pdf}
}
@incollection{Adel2020,
  author    = {{\"A}del, Annelie},
  editor    = {Paquot, Magali and Gries, Stefan Th.},
  title     = {Corpus Compilation},
  booktitle = {A {Practical Handbook} of {Corpus Linguistics}},
  publisher = {Springer},
  address   = {Switzerland},
  year      = {2020},
  pages     = {3--24}
}
@book{Aggarwal2012,
title = {Mining Text Data},
editor = {Aggarwal, Charu C. and Zhai, ChengXiang},
year = {2012},
month = feb,
publisher = {Springer Science \& Business Media},
internal-note = {Changed author -> editor: the abstract itself describes the book as "an edited volume contributed by leading international researchers"; Aggarwal and Zhai are its editors.},
abstract = {Text mining applications have experienced tremendous advances because of web 2.0 and social networking applications. Recent advances in hardware and software technology have lead to a number of unique scenarios where text mining algorithms are learned. Mining Text Data introduces an important niche in the text analytics field, and is an edited volume contributed by leading international researchers and practitioners focused on social networks \& data mining. This book contains a wide swath in topics across social networks \& data mining. Each chapter contains a comprehensive survey including the key research content on the topic, and the future directions of research in the field. There is a special focus on Text Embedded with Heterogeneous and Multimedia Data which makes the mining process much more challenging. A number of methods have been designed such as transfer learning and cross-lingual mining for such cases. Mining Text Data simplifies the content, so that advanced-level students, practitioners and researchers in computer science can benefit from this book. Academic and corporate libraries, as well as ACM, IEEE, and Management Science focused on information security, electronic commerce, databases, data mining, machine learning, and statistics are the primary buyers for this reference book.},
googlebooks = {vFHOx8wfSU0C},
isbn = {978-1-4614-3223-4},
langid = {english}
}
@article{Agnieszka2014,
title = {The Acquisition of Formulaic Language by {EFL} Learners},
author = {Le{\'n}ko-Szyma{\'n}ska, Agnieszka},
year = {2014},
journal = {International Journal of Corpus Linguistics},
volume = {19},
number = {2},
pages = {225--251},
langid = {english},
internal-note = {Name order corrected: Agnieszka is the given name, Le{\'n}ko-Szyma{\'n}ska the surname; the original had them swapped. Citation key kept unchanged so existing \cite commands still resolve.},
file = {/Users/francojc/Zotero/storage/L3Y8Z2DD/Le - e acquisition of formulaic language by EFL learner.pdf}
}
@article{Ahmed2021,
title = {The Role of Forensic Linguistics in Crime Investigation: {Uses} in Legal Proceedings},
author = {Ahmed, Hazhar Ramadhan},
year = {2021},
month = mar,
journal = {Journal of the Association-Institute for English Language and American Studies},
series = {{ANGLISTICUM}},
volume = {10},
number = {2},
pages = {23--31},
publisher = {Zenodo},
doi = {10.5281/ZENODO.4609333},
urldate = {2021-06-01},
abstract = {This paper considers the extent to which forensic linguistics can be considered a science, and outlines some ways in which it is useful in legal proceedings, including voice identification, the interpretation of police-suspect interaction, verification of police reports, and cross-cultural insights into speech patterns in a courtroom context. On the basis of the analysis, the paper concludes that Forensic linguistics can prove beneficial for the investigation of crimes, analysis of the judicial procedures, and particularly disputes in law. It can also be used for the analysis of courtroom discourse and interpret and translate the legal documents for their readability and comprehensibility. Moreover, the police cautions issued to the suspects can also be analyzed for their comprehensibility and the authorship attribution can be established for written or spoken texts. It, therefore, works as the interface between language, crime, and the law.},
copyright = {Creative Commons Attribution 4.0 International, Open Access},
langid = {english},
file = {/Users/francojc/Zotero/storage/UACLCNUA/Hazhar Ramadhan Ahmed - 2021 - THE ROLE OF FORENSIC LINGUISTICS IN CRIME INVESTIG.pdf}
}
@article{Akbary2018,
  author     = {Akbary, Maryam and Shahriari, Hesamoddin and Hosseini Fatemi, Azar},
  title      = {The Value of Song Lyrics for Teaching and Learning {English} Phrasal Verbs: A Corpus Investigation of Four Music Genres},
  shorttitle = {The Value of Song Lyrics for Teaching and Learning English Phrasal Verbs},
  journal    = {Innovation in Language Learning and Teaching},
  volume     = {12},
  number     = {4},
  pages      = {344--356},
  year       = {2018},
  month      = oct,
  issn       = {1750-1229, 1750-1237},
  doi        = {10.1080/17501229.2016.1216121},
  urldate    = {2021-07-28},
  langid     = {english},
  abstract   = {Phrasal verbs are a notoriously difficult feature of English for most second language and foreign language learners to master. Different sources, such as movies, music, games and books, can provide learners with exposure to the most common phrasal verbs in English. This study aims to investigate the degree to which music can play a role in exposing learners to phrasal verbs through analyzing their frequency in song lyrics from different genres (i.e., Pop, Rock, Hip-hop and Metal). For this purpose, a corpus of 400 song lyrics by different artists from these four genres was searched for all existing phrasal verbs. The resulting list of phrasal verbs was compared to Garnier and Schmitt's (2014) Phrasal Verb Pedagogical List in order to determine their value for learners. Further comparisons were subsequently drawn to determine which genre could be of greater use to language learning and instruction. The results revealed that song lyrics can potentially be a beneficial source for learning these constructions. Differences in the type and token frequency of phrasal verb among the four genres can also be used to determine the usefulness of each genre to students from various levels of proficiency.},
  file       = {/Users/francojc/Zotero/storage/9JQ99CUA/Akbary et al. - 2018 - The value of song lyrics for teaching and learning.pdf}
}
@article{Alanazi2022,
title = {Corpus-Based Analysis of Near-Synonymous Verbs},
author = {Alanazi, Zaha},
year = {2022},
month = aug,
journal = {Asian-Pacific Journal of Second and Foreign Language Education},
volume = {7},
number = {1},
pages = {15},
issn = {2363-5169},
doi = {10.1186/s40862-022-00138-5},
urldate = {2022-08-12},
abstract = {Despite having different semantic profiles, near synonyms are usually presented in dictionaries as being contextually interchangeable, which may lead EFL learners to assume their contextual interchangeability. Nevertheless, there is a scarcity of studies on how near synonyms are similar or different in their semantic and grammatical preferences. To enrich the literature on near synonyms' semantic and grammatical profiles, this study explores the collocational behaviors and the semantic preferences of the near-synonymous verbs (affect vs. impact). Sketch Engine was used to examine lexical collocates, the colligational profile and the semantic prosody of the two verbs. The findings revealed fine-grained contextual differences in their collocational, grammatical, and semantic preferences. Applications of the findings for English language teaching will be discussed a long with recommendations for future research.},
keywords = {/unread,Colligation,Collocation,collocations,Corpus,corpus linguistics,second language acquisition,Semantic,semantics,Sketch engine,Synonyms},
file = {/Users/francojc/Zotero/storage/8HXKD4M7/Alanazi_2022_Corpus-based analysis of near-synonymous verbs.pdf}
}
@misc{Albert2015,
  author     = {Albert, Saul and {de Ruiter}, Laura E. and {de Ruiter}, J. P.},
  title      = {{CABNC}: {The Jeffersonian} Transcription of the Spoken {British National Corpus}},
  shorttitle = {{CABNC}},
  publisher  = {TalkBank},
  year       = {2015},
  url        = {https://saulalbert.github.io/CABNC/},
  copyright  = {CC BY 3.0},
  keywords   = {/unread,bnc,cabnc,english,spoken,talkbank}
}
@inproceedings{Alegria2014,
title = {{Wikipedia} and Machine Translation: Killing Two Birds with One Stone},
booktitle = {Workshop on '{{Free}}/Open-Source Language Resources for the Machine Translation of Less-Resourced Languages' at {{LREC}} 2014},
author = {Alegria, I{\~n}aki and Cabezon, Unai and {Fernandez de Beto{\~n}o}, Unai and Labaka, Gorka and Mayor, Aingeru and Sarasola, Kepa and Zubiaga, Arkaitz},
year = {2014},
urldate = {2014-05-05},
internal-note = {Title normalized to this file's Title Case convention and `Wikipedia' brace-protected against style recasing.},
file = {/Users/francojc/Zotero/storage/PFENS7EZ/Alegria et al. - 2014 - Wikipedia and Machine Translation killing two birds with one stone.pdf}
}
@misc{Almeida2011,
title = {{SMS} Spam Collection},
author = {Almeida, Tiago A. and G{\'o}mez Hidalgo, Jos{\'e} Mar{\'i}a},
year = {2011},
howpublished = {SMS Spam Collection v. 1},
url = {https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/},
urldate = {2021-07-08},
internal-note = {Fixed surname typo Hildago -> Hidalgo; moved the invalid `journal' field on this @misc to `howpublished'; dropped `shorttitle' that exactly duplicated `title'.},
abstract = {The SMS Spam Collection v.1 is a public set of SMS labeled messages that have been collected for mobile phone spam research. It has one collection composed by 5,574 English, real and non-encoded messages, tagged according being legitimate (ham) or spam.},
file = {/Users/francojc/Zotero/storage/DLNV7GK8/smsspamcollection.html}
}
@inproceedings{Almeida2011a,
title = {Contributions to the Study of {SMS} Spam Filtering: {New} Collection and Results},
booktitle = {Proceedings of the 2011 {{ACM Symposium}} on {{Document Engineering}} ({{DOCENG}}'11)},
author = {Almeida, Tiago A. and G{\'o}mez Hidalgo, Jos{\'e} Mar{\'i}a and Yamakami, Akebo},
year = {2011},
pages = {4},
address = {Mountain View, CA},
url = {https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/},
internal-note = {Fixed surname typo Hildago -> Hidalgo and added the missing period to the initial in `Tiago A.' for consistency with the companion entry. NOTE(review): pages = {4} looks like a page count, not a page range -- verify against the DocEng'11 proceedings.},
abstract = {The growth of mobile phone users has lead to a dramatic increasing of SMS spam messages. In practice, fighting mobile phone spam is difficult by several factors, including the lower rate of SMS that has allowed many users and service providers to ignore the issue, and the limited availability of mobile phone spam-filtering software. On the other hand, in academic settings, a major handicap is the scarcity of public SMS spam datasets, that are sorely needed for validation and comparison of different classifiers. Moreover, as SMS messages are fairly short, content-based spam filters may have their performance degraded. In this paper, we offer a new real, public and non-encoded SMS spam collection that is the largest one as far as we know. Moreover, we compare the performance achieved by several established machine learning methods. The results indicate that Support Vector Machine outperforms other evaluated classifiers and, hence, it can be used as a good baseline for further comparison.},
langid = {english},
file = {/Users/francojc/Zotero/storage/4NRMQS4L/Almeida et al. - Contributions to the Study of SMS Spam Filtering .pdf}
}
@book{Alpaydin2004,
title = {Introduction to Machine Learning},
author = {Alpaydin, Ethem},
year = {2004},
edition = {Second},
publisher = {MIT Press},
address = {Cambridge, MA},
url = {http://books.google.com/books?hl=en&lr=&id=1k0_-WroiqEC&oi=fnd&pg=PR13&dq=Introduction+to+Machine+Learning&ots=p94DZSgFwO&sig=vDDNGh5k63XWekU_kVjRFLMZNSE},
urldate = {2014-02-16},
isbn = {978-0-262-01243-0},
internal-note = {Repaired truncated edition value `Second Edi'; expanded author initial to full name; added publisher. NOTE(review): the second edition appeared in 2010 while this entry says 2004 -- confirm which edition/year is intended.},
file = {/Users/francojc/Zotero/storage/P3XAB7KE/Alpaydin - 2004 - Introduction to Machine Learning.pdf}
}
@article{Anderwald2010,
title = {Are Non-Standard Dialects More `Natural' than the Standard? {{A}} Test Case from {{English}} Verb Morphology},
author = {Anderwald, Lieselotte},
year = {2010},
month = jul,
journal = {Journal of Linguistics},
volume = {47},
number = {2},
pages = {251--274},
issn = {0022-2267},
doi = {10.1017/S0022226710000241},
urldate = {2013-10-30},
internal-note = {Removed bogus isbn = {0022226710} (the journal's ISSN without hyphen, misfiled as an ISBN) and un-zero-padded the issue number.}
}
@article{Argamon2019,
  author   = {Argamon, Shlomo},
  title    = {Register in Computational Language Research},
  journal  = {Register Studies},
  volume   = {1},
  number   = {1},
  pages    = {100--135},
  year     = {2019},
  issn     = {2542-9477},
  doi      = {10.1075/rs.18015.arg},
  abstract = {Shlomo Argamon is Professor of Computer Science and Director of the Master of Data Science Program at the Illinois Institute of Technology (USA). In this article, he reflects on the current and potential relationship between register and the field of computational linguistics. He applies his expertise in computational linguistics and machine learning to a variety of problems in natural language processing. These include stylistic variation, forensic linguistics, authorship attribution, and biomedical informatics. He is particularly interested in the linguistic structures used by speakers and writers, including linguistic choices that are influenced by social variables such as age, gender, and register, as well as linguistic choices that are unique or distinctive to the style of individual authors. Argamon has been a pioneer in computational linguistics and NLP research in his efforts to account for and explore register variation. His computational linguistic research on register draws inspiration from Systemic Functional Linguistics, Biber's multi-dimensional approach to register variation, as well as his own extensive experience accounting for variation within and across text types and authors. Argamon has applied computational methods to text classification and description across registers~-- including blogs, academic disciplines, and news writing~-- as well as the interaction between register and other social variables, such as age and gender. His cutting-edge research in these areas is certain to have a lasting impact on the future of computational linguistics and NLP.},
  file     = {/Users/francojc/Zotero/storage/TVMFBYDG/Argamon - 2019 - Register in computational language research.pdf}
}
@article{Arnold2017,
  author        = {Arnold, Taylor},
  title         = {A Tidy Data Model for Natural Language Processing Using {{cleanNLP}}},
  journal       = {The R Journal},
  pages         = {1--20},
  year          = {2017},
  eprint        = {1703.09570},
  archiveprefix = {arXiv},
  abstract      = {The package cleanNLP provides a set of fast tools for converting a textual corpus into a set of normalized tables. The underlying natural language processing pipeline utilizes Stanford's CoreNLP library, exposing a number of annotation tasks for text written in English, French, German, and Spanish. Annotators include tokenization, part of speech tagging, named entity recognition, entity linking, sentiment analysis, dependency parsing, coreference resolution, and information extraction.},
  file          = {/Users/francojc/Zotero/storage/2XA5HMVH/Arnold - 2017 - A Tidy Data Model for Natural Language Processing .pdf}
}
@article{Arnon2010,
  author    = {Arnon, Inbal and Snider, Neal},
  title     = {More than Words: {Frequency} Effects for Multi-Word Phrases},
  journal   = {Journal of Memory and Language},
  volume    = {62},
  number    = {1},
  pages     = {67--82},
  year      = {2010},
  publisher = {Elsevier},
  urldate   = {2012-01-16},
  abstract  = {There is mounting evidence that language users are sensitive to distributional information at many grain-sizes. Much of this research has focused on the distributional properties of words, the units they consist of (morphemes, phonemes), and the syntactic structures they appear in (verb-categorization frames, syntactic constructions). In a series of studies we show that comprehenders are also sensitive to the frequencies of compositional four-word phrases (e.g. don't have to worry): more frequent phrases are processed faster. The effect is not reducible to the frequency of the individual words or substrings and is observed across the entire frequency range (for low, mid and high frequency phrases). Comprehenders seem to learn and store frequency information about multi-word phrases. These findings call for processing models that can capture and predict phrase-frequency effects and support accounts where linguistic knowledge consists of patterns of varying sizes and levels of abstraction. {\copyright} 2009 Elsevier Inc. All rights reserved.},
  file      = {/Users/francojc/Zotero/storage/7MHBNQVM/Arnon and Snider - 2010 - More than words Frequency effects for multi-word .pdf}
}
@article{Asao2002,
  author   = {Asao, Kojiro},
  title    = {Communication Strategies of {EFL} Learners: A Corpus-Based Approach},
  journal  = {Language and Computers},
  volume   = {12},
  pages    = {291--302},
  year     = {2002},
  url      = {http://www.ingentaconnect.com/content/rodopi/lang/2002/00000038/00000001/art00020},
  urldate  = {2011-10-25},
  abstract = {This paper examines a number of communication strategies that are deployed by EFL learners when writing English. Students learning a foreign language do not, of course, have a complete repertory of vocabulary and grammar; therefore, when faced with the need for a word that they do not know, they will try to resolve any ambiguity by deploying various communication strategies. These strategies include using approximations of meaning or circumlocutions, or simply switching to their mother tongue. Most studies in this field have been in the form of episodic reference. This study, however, focuses on the types of communication strategy that are most frequently used by certain EFL learners in a given situation, and on why those strategies are preferred. This study offers an analysis of a task-based corpus that was created specifically for this purpose.},
  file     = {/Users/francojc/Zotero/storage/7Y7D5QEN/Asao - 2002 - Communication Strategies of EFL Learners A Corpus-based Approach.pdf}
}
@article{Atkins1992,
title = {Corpus Design Criteria},
author = {Atkins, Sue and Clear, Jeremy and Ostler, Nicholas},
year = {1992},
journal = {Literary and Linguistic Computing},
volume = {7},
number = {1},
pages = {1--16},
internal-note = {Expanded bare initials `Clear, J' and `Ostler, N' to the full names used in the published article, matching this file's full-name convention.},
file = {/Users/francojc/Zotero/storage/HXI8INCF/Atkins, Clear, Ostler - 1992 - Corpus Design Criteria.pdf}
}
@article{Baayen1991,
title = {Productivity and {English} Derivation: A Corpus-Based Study},
shorttitle = {Productivity and English Derivation},
author = {Baayen, R. Harald and Lieber, Rochelle},
year = {1991},
journal = {Linguistics},
volume = {29},
number = {5},
pages = {801--843},
publisher = {Walter de Gruyter},
internal-note = {Entry lacked journal/volume/pages entirely; added the published venue (Linguistics 29(5), 801--843) and repaired the duplicated publisher string `Walter de Gruyter, Berlin/New York Berlin, New York'.},
keywords = {/unread,corpus,distributions,morphology,productivity},
file = {/Users/francojc/Zotero/storage/JGT7ZBPT/Baayen_Lieber_1991_Productivity and English derivation.pdf}
}
@misc{Baayen1993,
title = {The {CELEX} Lexical Database ({CD-ROM})},
author = {Baayen, R. Harald and Piepenbrock, R. and {van Rijn}, H.},
year = {1993},
howpublished = {Linguistic Data Consortium, University of Pennsylvania, Philadelphia, PA},
internal-note = {Was @article with the distributor in `journal'; a CD-ROM database is not a journal article, so @misc + howpublished is the correct form. Citation key unchanged.}
}
@article{Baayen2004,
title = {Statistics in Psycholinguistics: A Critique of Some Current Gold Standards},
author = {Baayen, R. Harald},
year = {2004},
journal = {Mental Lexicon Working Papers},
volume = {1},
number = {1},
pages = {1--47},
abstract = {This paper presents a detailed critique of some current gold standards for the statistical analysis of experimental data in psycholinguistics. A series of examples illustrates (1) the disadvantages of reducing numerical variables to factors and the importance of including available covariates in the model, (2) the advantages of using multilevel models instead of the traditional by-subject and by-item procedures and the quasi-F test, and (3) the relevance of logistic models for binary data such as the error measure in decision tasks.},
internal-note = {Repaired PDF-extraction hyphenation artifacts in the abstract (`stat- istical', `il- lustrates', `us- ing', `by-subjectand', `modelsfor').},
file = {/Users/francojc/Zotero/storage/H5AXH88P/Baayen - 2004 - Statistics in Psycholinguistics a critique of some current gold standards.pdf}
}
@article{Baayen2006,
title = {Morphological Influences on the Recognition of Monosyllabic Monomorphemic Words},
author = {Baayen, R. Harald and Feldman, Laurie and Schreuder, Robert},
year = {2006},
journal = {Journal of Memory and Language},
volume = {55},
pages = {290--313},
issn = {0749-596X},
doi = {10.1016/j.jml.2006.03.008},
abstract = {Balota et al. [Balota, D., Cortese, M., Sergent-Marshall, S., Spieler, D., \& Yap, M. (2004). Visual word recognition for single-syllable words. Journal of Experimental Psychology: General, 133, 283-316] studied lexical processing in word naming and lexical decision using hierarchical multiple regression techniques for a large data set of monosyllabic, morphologically simple words. The present study supplements their work by making use of more flexible regression techniques that are better suited for dealing with collinearity and non-linearity, and by documenting the contributions of several variables that they did not take into account. In particular, we included measures of morphological connectivity, as well as a new frequency count, the frequency of a word in speech rather than in writing. The morphological measures emerged as strong predictors in visual lexical decision, but not in naming, providing evidence for the importance of morphological connectivity even for the recognition of morphologically simple words. Spoken frequency was predictive not only for naming but also for visual lexical decision. In addition, it co-determined subjective frequency estimates and norms for age of acquisition. Finally, we show that frequency predominantly reflects conceptual familiarity rather than familiarity with a word's form. {\copyright} 2006.},
internal-note = {Hyphenated the ISSN (0749596X -> 0749-596X), removed the bogus isbn field that duplicated the ISSN, and restored the garbled copyright mark `?? 2006' as {\copyright} 2006 (matching the Arnon2010 entry).},
keywords = {BNC corpus,CELEX database,corpus,entropy,frequency effects,inflectional family size,lexical access,morphological family size,morphology},
file = {/Users/francojc/Zotero/storage/XPKPYIF3/Baayen et al. - 2006 - Morphological influences on the recognition of monosyllabic monomorphemic words.pdf}
}
@book{Baayen2008a,
  author    = {Baayen, R. Harald},
  title     = {Analyzing Linguistic Data: A Practical Introduction to Statistics Using {R}},
  publisher = {Cambridge University Press},
  year      = {2008},
  urldate   = {2012-01-09},
  file      = {/Users/francojc/Zotero/storage/EQGX7MQU/Baayen - 2008 - Analyzing linguistic data A practical introductio.pdf}
}
@article{Baayen2010,
title = {A Real Experiment Is a Factorial Experiment?},
author = {Baayen, R. Harald},
year = {2010},
month = jun,
journal = {The Mental Lexicon},
volume = {5},
number = {1},
pages = {149--157},
issn = {1871-1340}
}
@article{Baayen2011,
  author    = {Baayen, R. Harald},
  title     = {Corpus Linguistics and Naive Discriminative Learning},
  journal   = {Revista Brasileira de Lingu{\'i}stica Aplicada},
  volume    = {11},
  number    = {2},
  pages     = {295--328},
  year      = {2011},
  publisher = {SciELO Brasil},
  keywords  = {corpus linguistics,dative alternation,datives,discriminative learning classifier,machine learning,memory based learning,support vector machines,switchboard},
  file      = {/Users/francojc/Zotero/storage/GC52JFVV/Baayen - 2011 - Corpus linguistics and naive discriminative learning.pdf}
}
@manual{Baayen2019,
  author = {Baayen, R. Harald and {Shafaei-Bajestan}, Elnaz},
  title  = {{{languageR}}: {Analyzing} Linguistic Data: A Practical Introduction to Statistics},
  year   = {2019},
  type   = {Manual},
  url    = {https://CRAN.R-project.org/package=languageR}
}
@article{Baker2004,
title = {A Corpus-Based View of Similarity and Difference in Translation},
author = {Baker, Mona},
year = {2004},
month = jan,
journal = {International Journal of Corpus Linguistics},
volume = {9},
number = {2},
pages = {167--193},
issn = {1384-6655},
doi = {10.1075/ijcl.9.2.02bak}
}
@article{Baker2016,
  author    = {Baker, Monya},
  title     = {1,500 Scientists Lift the Lid on Reproducibility},
  journal   = {Nature},
  volume    = {533},
  number    = {7604},
  pages     = {452--454},
  year      = {2016},
  month     = may,
  publisher = {Nature Publishing Group},
  issn      = {1476-4687},
  doi       = {10.1038/533452a},
  urldate   = {2024-01-19},
  abstract  = {Survey sheds light on the `crisis' rocking research.},
  copyright = {2016 Springer Nature Limited},
  langid    = {english},
  keywords  = {crisis,Peer review,Publishing,reproducibility,research,Research management,survey},
  file      = {/Users/francojc/Zotero/storage/BNP9YHMI/Baker - 2016 - 1,500 scientists lift the lid on reproducibility.pdf}
}
@inproceedings{Bamman2014,
title = {A {Bayesian} Mixed Effects Model of Literary Character},
author = {Bamman, David and Underwood, Ted and Smith, Noah A.},
year = {2014},
booktitle = {Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics ({ACL} 2014)},
pages = {370--379},
internal-note = {Was @article with the proceedings title in `journal'; a conference paper belongs in @inproceedings with `booktitle'. Citation key unchanged.},
abstract = {We consider the problem of automatically inferring latent character types in a collection of 15,099 English novels published between 1700 and 1899. Unlike prior work in which character types are assumed responsible for probabilistically generating all text associated with a character, we introduce a model that employs multiple effects to account for the influence of extra-linguistic information (such as author). In an empirical evaluation, we find that this method leads to improved agreement with the preregistered judgments of a literary scholar, complementing the results of alternative models.},
isbn = {9781937284725},
file = {/Users/francojc/Zotero/storage/GTFI9KLP/Bamman et al. - 2014 - A Bayesian Mixed Effects Model of Literary Charact.pdf}
}
@inproceedings{Bamman2014a,
title = {Learning Latent Personas of Film Characters},
booktitle = {Proceedings of the 51st {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{ACL}} 2013)},
author = {Bamman, David and O'Connor, Brendan and Smith, Noah A.},
year = {2013},
pages = {352--361},
abstract = {We present two latent variable models for learning character types, or personas, in film, in which a persona is defined as a set of mixtures over latent lexical classes. These lexical classes capture the stereotypical actions of which a character is the agent and patient, as well as attributes by which they are described. As the first attempt to solve this problem explicitly, we also present a new dataset for the text-driven analysis of film, along with a benchmark testbed to help drive future work in this area.},
isbn = {978-1-937284-50-3},
file = {/Users/francojc/Zotero/storage/3U24LGP8/Bamman, O'Connor, Smith - 2014 - Learning Latent Personas of Film Characters.pdf}
}
@article{Bamman2014b,
title = {Gender Identity and Lexical Variation in Social Media},
author = {Bamman, David and Eisenstein, Jacob and Schnoebelen, Tyler},
year = {2014},
journal = {Journal of Sociolinguistics},
volume = {18},
number = {2},
pages = {135--160},
url = {http://onlinelibrary.wiley.com/doi/10.1111/josl.12080/full},
urldate = {2014-04-26},
file = {/Users/francojc/Zotero/storage/VYASHJT2/Bamman, Eisenstein, Schnoebelen - 2014 - Gender identity and lexical variation in social media.pdf}
}
@article{Bao2019,
title = {Integration of Unsupervised and Supervised Machine Learning Algorithms for Credit Risk Assessment},
author = {Bao, Wang and Lianju, Ning and Yue, Kong},
year = {2019},
month = aug,
journal = {Expert Systems with Applications},
volume = {128},
pages = {301--315},
issn = {0957-4174},
doi = {10.1016/j.eswa.2019.02.033},
urldate = {2021-08-29},
abstract = {For the sake of credit risk assessment, credit scoring has become a critical tool to discriminate ``bad'' applicants from ``good'' applicants for financial institutions. Accordingly, a wide range of supervised machine learning algorithms have been successfully applied to credit scoring; however, integration of unsupervised learning with supervised learning in this field has drawn little consideration. In this work, we propose a combination strategy of integrating unsupervised learning with supervised learning for credit risk assessment. The difference between our work and other previous work on unsupervised integration is that we apply unsupervised learning techniques at two different stages: the consensus stage and dataset clustering stage. Comparisons of model performance are performed based on three credit datasets in four groups: individual models, individual models\,+\,consensus model, clustering\,+\,individual models, clustering\,+\,individual models\,+\,consensus model. As a result, integration at either the consensus stage or dataset clustering stage is effective on improving the performance of credit scoring models. Moreover, the combination of the two stages achieves the best performance, thereby confirming the superiority of the proposed integration of unsupervised and supervised machine learning algorithms, which boost our confidence that this strategy can be extended to many other credit datasets from financial institutions.},
langid = {english},
file = {/Users/francojc/Zotero/storage/VBRHQE3F/Bao et al. - 2019 - Integration of unsupervised and supervised machine.pdf;/Users/francojc/Zotero/storage/V7ZXT5UL/S0957417419301472.html}
}
@incollection{Baroni2008,
title = {Statistical Methods for Corpus Exploitation},
booktitle = {Corpus {{Linguistics}}. {{An International Handbook}}},
author = {Baroni, Marco and Evert, Stefan},
year = {2008},
pages = {777--803},
publisher = {Mouton de Gruyter},
file = {/Users/francojc/Zotero/storage/N6U5MXYX/Baroni, Evert - 2008 - Statistical methods for corpus exploitation.pdf}
}
@book{Baumer2017,
title = {Modern Data Science with {{R}}},
author = {Baumer, Benjamin S. and Kaplan, Daniel T. and Horton, Nicholas J.},
year = {2017},
month = mar,
publisher = {CRC Press},
abstract = {Modern Data Science with R is a comprehensive data science textbook for undergraduates that incorporates statistical and computational thinking to solve real-world problems with data. Rather than focus exclusively on case studies or programming syntax, this book illustrates how statistical programming in the state-of-the-art R/RStudio computing environment can be leveraged to extract meaningful information from a variety of data in the service of addressing compelling statistical questions. Contemporary data science requires a tight integration of knowledge from statistics, computer science, mathematics, and a domain of application. This book will help readers with some background in statistics and modest prior experience with coding develop and practice the appropriate skills to tackle complex data science projects. The book features a number of exercises and has a flexible organization conducive to teaching a variety of semester courses.},
googlebooks = {NrddDgAAQBAJ},
isbn = {978-1-4987-2449-4},
langid = {english}
}
@article{Becher2011,
title = {Explicitation and Implicitation in Translation: A Corpus-Based Study of {{English-German}} and {{German-English}} Translations of Business Texts},
author = {Becher, Viktor},
year = {2011},
url = {http://d-nb.info/102042673X/34},
urldate = {2013-09-10}
}
@book{Beckerman2017,
title = {Getting Started with {{R}}: {{An}} Introduction for Biologists},
author = {Beckerman, Andrew P. and Childs, Dylan Z. and Petchey, Owen L.},
year = {2017},
edition = {Second},
publisher = {Oxford University Press},
abstract = {R is rapidly becoming the standard software for statistical analyses, graphical presentation of data, and programming in the natural, physical, social, and engineering sciences. Getting Started with R is now the go-to introductory guide for biologists wanting to learn how to use R in their research. It teaches readers how to import, explore, graph, and analyse data, while keeping them focused on their ultimate goals: clearly communicating their data in oral presentations, posters, papers, and reports. It provides a consistent workflow for using R that is simple, efficient, reliable, and reproducible. This second edition has been updated and expanded while retaining the concise and engaging nature of its predecessor, offering an accessible and fun introduction to the packages dplyr and ggplot2 for data manipulation and graphing. It expands the set of basic statistics considered in the first edition to include new examples of a simple regression, a one-way and a two-way ANOVA. Finally, it introduces a new chapter on the generalised linear model. Getting Started with R is suitable for undergraduates, graduate students, professional researchers, and practitioners in the biological sciences.},
isbn = {978-0-19-878784-6}
}
@misc{Bellinger2004,
title = {Data, Information, Knowledge and Wisdom},
author = {Bellinger, Gene and Castro, Durval and Mills, Anthony},
year = {2004},
journal = {Systems Thinking},
url = {http://www.systems-thinking.org/dikw/dikw.htm},
urldate = {2019-06-18},
file = {/Users/francojc/Zotero/storage/QX4T637U/Bellinger, Castro, Mills - 2004 - Data, information, knowledge and wisdom.pdf}
}
@manual{Benoit2020,
type = {Manual},
title = {Quanteda.Corpora: A Collection of Corpora for Quanteda},
author = {Benoit, Kenneth},
year = {2020},
url = {http://github.com/quanteda/quanteda.corpora}
}
@article{Bentz2014,
title = {Zipf's Law and the Grammar of Languages: A Quantitative Study of Old and Modern {{English}} Parallel Texts},
author = {Bentz, Christian and Kiela, Douwe and Hill, Felix and Buttery, Paula},
year = {2014},
journal = {Corpus Linguistics and Linguistic Theory},
volume = {10},
number = {2},
pages = {175--211},
issn = {16137035},
doi = {10.1515/cllt-2014-0009},
abstract = {This paper reports a quantitative analysis of the relationship between word frequency distributions and morphological features in languages. We analyze a commonly-observed process of historical language change: The loss of inflected forms in favour of `analytic' periphrastic constructions. These tendencies are observed in parallel translations of the Book of Genesis in Old English and Modern English. We show that there are significant differences in the frequency distributions of the two texts, and that parts of these differences are independent of total number of words, style of translation, orthography or contents. We argue that they derive instead from the trade-off between synthetic inflectional marking in Old English and analytic constructions in Modern English. By exploiting the earliest ideas of Zipf, we show that the syntheticity of the language in these texts can be captured mathematically, a property we tentatively call their grammatical fingerprint. Our findings suggest implications for both the specific historical process of inflection loss and more generally for the characterization of languages based on statistical properties.},
isbn = {1613-7027}
}
@misc{Berez-Kroeker2017,
title = {A Survey of Current Reproducibility Practices in Linguistics Journals, 2003-2012},
author = {{Berez-Kroeker}, Andrea L. and Gawne, Lauren and Kelly, Barbara F. and Heston, Tyler},
year = {2017},
url = {https://sites.google.com/a/hawaii.edu/data-citation/survey}
}
@article{Berez-Kroeker2018,
title = {Reproducible Research in Linguistics: A Position Statement on Data Citation and Attribution in Our Field},
author = {{Berez-Kroeker}, Andrea L. and Gawne, Lauren and Kung, Susan Smythe and Kelly, Barbara F. and Heston, Tyler and Holton, Gary and Pulsifer, Peter and Beaver, David I. and Chelliah, Shobhana and Dubinsky, Stanley and Meier, Richard P. and Thieberger, Nick and Rice, Keren and Woodbury, Anthony C.},
year = {2018},
journal = {Linguistics},
volume = {56},
number = {1},
pages = {1--18},
issn = {00243949},
doi = {10.1515/ling-2017-0032},
abstract = {This paper is a position statement on reproducible research in linguistics, including data citation and attribution, that represents the collective views of some 41 colleagues. Reproducibility can play a key role in increasing verification and accountability in linguistic research, and is a hallmark of social science research that is currently under-represented in our field. We believe that we need to take time as a discipline to clearly articulate our expectations for how linguistic data are managed, cited, and maintained for long-Term access.},
file = {/Users/francojc/Zotero/storage/KAEMYE3Q/Berez-Kroeker et al. - 2018 - Reproducible research in linguistics A position s.pdf}
}
@book{Bernard2016,
title = {Analyzing Qualitative Data: {{Systematic}} Approaches},
shorttitle = {Analyzing Qualitative Data},
author = {Bernard, H. Russell and Wutich, Amber and Ryan, Gery W.},
year = {2016},
month = jun,
publisher = {SAGE Publications},
abstract = {The fully updated Second Edition of Analyzing Qualitative Data: Systematic Approaches by H. Russell Bernard, Amber Wutich, and Gery W. Ryan presents systematic methods for analyzing qualitative data with clear and easy-to-understand steps. The first half is an overview of the basics, from choosing a topic to collecting data, and coding to finding themes, while the second half covers different methods of analysis, including grounded theory, content analysis, analytic induction, semantic network analysis, ethnographic decision modeling, and more. Real examples drawn from social science and health literature along with carefully crafted, hands-on exercises at the end of each chapter allow readers to master key techniques and apply them to their own disciplines.},
googlebooks = {yAi1DAAAQBAJ},
isbn = {978-1-4833-4711-0},
langid = {english}
}
@misc{Bialik2013,
title = {Data Crunchers Now the Cool Kids on Campus},
author = {Bialik, Carl},
year = {2013},
journal = {The Wall Street Journal},
url = {https://www.wsj.com/articles/SB10001424127887323478304578332850293360468},
urldate = {2019-06-17}
}
@article{Biber1987,
title = {A Textual Comparison of British and American Writing},
author = {Biber, Douglas},
year = {1987},
journal = {American Speech},
volume = {62},
number = {2},
eprint = {455273},
eprinttype = {jstor},
pages = {99--119},
issn = {00031283},
doi = {10.2307/455273},
urldate = {2024-04-09},
file = {/Users/francojc/Zotero/storage/PSBN6SR6/Biber - 1987 - A Textual Comparison of British and American Writing.pdf}
}
@article{Biber1993,
title = {Using Register-Diversified Corpora for General Language Studies},
author = {Biber, Douglas},
year = {1993},
journal = {Computational Linguistics},
volume = {19},
number = {2},
pages = {219--241},
urldate = {2014-06-09},
keywords = {academic writing,composition,corpus evaluation,corpus linguistics,pedagogy},
file = {/Users/francojc/Zotero/storage/NVSNATTH/Biber - 1993 - Using register-diversified corpora for general language studies.pdf}
}
@article{Biber1993a,
title = {Representativeness in Corpus Design},
author = {Biber, Douglas},
year = {1993},
journal = {Literary and Linguistic Computing},
volume = {8},
number = {4},
pages = {243--257},
file = {/Users/francojc/Zotero/storage/L5HGSN9J/Biber - 1993 - Representativeness in corpus design.pdf}
}
@article{Biber2004,
title = {If You Look at: {{Lexical}} Bundles in University Teaching and Textbooks},
author = {Biber, Douglas and Conrad, Susan and Cortes, Viviana},
year = {2004},
journal = {Applied Linguistics},
volume = {25},
number = {3},
pages = {371--405},
urldate = {2013-01-18},
file = {/Users/francojc/Zotero/storage/PUINMIRD/Biber et al. - 2004 - If you look at Lexical bundles in university teac.pdf}
}
@article{Biber2005,
title = {Merging Corpus Linguistic and Discourse Analytic Research Goals: {{Discourse}} Units in Biology Research Articles},
author = {Biber, Douglas and Jones, James K.},
year = {2005},
month = jan,
journal = {Corpus Linguistics and Linguistic Theory},
volume = {1},
number = {2},
pages = {151--182},
issn = {1613-7027},
doi = {10.1515/cllt.2005.1.2.151}
}
@book{Biber2006,
title = {University Language: A Corpus-Based Study of Spoken and Written Registers},
author = {Biber, Douglas},
year = {2006},
publisher = {John Benjamins},
url = {http://books.google.com/books?hl=en&lr=&id=-2zqpWi19h4C&oi=fnd&pg=PA1&dq=biber+2006+university+language+a+corpus&ots=PmhOEi57XY&sig=Bz0JysWYnN08AElZMC_BG04ytxQ},
urldate = {2014-11-06}
}
@article{Biber2006a,
title = {Stance in Spoken and Written University Registers},
author = {Biber, Douglas},
year = {2006},
journal = {Journal of English for Academic Purposes},
url = {http://www.sciencedirect.com/science/article/pii/S1475158506000075},
urldate = {2014-11-06}
}
@article{Biber2007,
title = {Lexical Bundles in University Spoken and Written Registers},
author = {Biber, Douglas and Barbieri, Federica},
year = {2007},
journal = {English for Specific Purposes},
url = {http://www.sciencedirect.com/science/article/pii/S0889490606000366},
urldate = {2014-11-06}
}
@article{Biber2009,
title = {A Corpus-Driven Approach to Formulaic Language in {{English}}: {{Multi-word}} Patterns in Speech and Writing},
author = {Biber, Douglas},
year = {2009},
journal = {International Journal of Corpus Linguistics},
url = {http://www.ingentaconnect.com/content/jbp/ijcl/2009/00000014/00000003/art00002},
urldate = {2014-11-06},
keywords = {collocations,corpus,corpus linguistics,formulaic language}
}
@article{Biber2010,
title = {Challenging Stereotypes about Academic Writing: {{Complexity}}, Elaboration, Explicitness},
author = {Biber, Douglas and Gray, Bethany},
year = {2010},
journal = {Journal of English for Academic Purposes},
url = {http://www.sciencedirect.com/science/article/pii/S1475158510000020},
urldate = {2014-11-06}
}
@misc{Blagotic2021,
title = {{{ProjectTemplate}}: {{Automates}} the Creation of New Statistical Analysis Projects},
shorttitle = {{{ProjectTemplate}}},
author = {Blagotic, Aleksandar and {Valle-Jones}, Diego and Breen, Jeffrey and Lundborg, Joakim and White, John Myles and Bode, Josh and White, Kenton and Mueller, Kirill and Redaelli, Matteo and Lorang, Noah and Schalk, Patrick and Schneider, Dominik and Hepp, Gerold and Jamile, Zunaira},
year = {2021},
month = feb,
url = {https://CRAN.R-project.org/package=ProjectTemplate},
urldate = {2021-07-20},
abstract = {Provides functions to automatically build a directory structure for a new R project. Using this structure, 'ProjectTemplate' automates data loading, preprocessing, library importing and unit testing.},
copyright = {GPL-3 {\textbar} file LICENSE}
}
@article{Blischak2019,
title = {Creating and Sharing Reproducible Research Code the Workflowr Way},
author = {Blischak, John D. and Carbonetto, Peter and Stephens, Matthew},
year = {2019},
journal = {F1000Research},
volume = {8},
publisher = {Faculty of 1000 Ltd},
file = {/Users/francojc/Zotero/storage/FY64FLHZ/PMC6833990.html}
}
@article{Bloomfield1926,
title = {A Set of Postulates for the Science of Language},
author = {Bloomfield, Leonard},
year = {1926},
journal = {Language},
volume = {2},
number = {3},
pages = {153--164},
issn = {00978507},
doi = {10.2307/408741},
abstract = {The method of postulates (that is, assumptions or axioms) and defini-tionst is fully adequate to mathematics; as for other sciences, the more complex their subject-matter, the less amenable are they to this method, since, under it, every descriptive or historical fact becomes the subject of a new postulate. Nevertheless, the postulational method can further the study of language, because it forces us to state explicitly whatever we assume, to define our terms, and to decide what things may exist independently and what things are interdependent.2 Certain errors can be avoided or corrected by examining and formu-lating our (at present tacit) assumptions and defining our (often unde-fined) terms.3 Also, the postulational method saves discussion, because it limits our statements to a defined terminology; in particular, it cuts us off from psychological dispute.4 Discussion of the fundamentals of our science 1 For a clear exposition of this method, see J. W. Young, Lectures on the Fundamental Concepts of Algebra and Geometry, New York 1911. 1 Cf. A. P. Weiss's set of postulates for psychology, Psychological Review. 32. 83. 3 Examples are many. Bopp took for granted that the formative elements of Indo-European were once independent words; this is a needless and unwarranted assumption. The last descendant of his error is the assumption that IE compound words are historically derived from phrases (Jacobi, Compositum und Nebensatz, Bonn 1897; this even in Brug-mann, Grundrisz I)I, 1, pp. 37. 78; cf. TAPA 45. 73 ff.). The notion is gaining ground that some forms have less meaning than others and are therefore more subject to phonetic change (Horn, Sprachkiirper und Sprachfunktion, Palaestra 135, Berlin 1921); I, for one, can discover no workable definition of the terms 'meaning' and 'phonetic change' under which this notion can be upheld. 
The whole dispute, perhaps today as unstilled as fifty years ago, about the regularity of phonetic change, is at bottom a question of terminology. *Recall the difficulties and obscurities in the writings of Humboldt and Steinthal, and the psychological dispute of Paul, Wundt, Delbrueck. From our point of view, the last-named was wrong in denying the value of descriptive data, but right in saying that it is indifferent what system of psychology a linguist believes in (Grundfragen der Sprach-forschung, Strassburg 1901). The trouble over the nature of the sentence is largely non-linguistic; contrast the simplicity and usefulness of Meillet's definition (adopted below), 153},
isbn = {00978507},
file = {/Users/francojc/Zotero/storage/4M54XRCC/Bloomfield - 1926 - A Set of Postulates for the Science of Language.pdf}
}
@misc{Bobbitt2021,
title = {Left Skewed vs. Right Skewed Distributions},
author = {Bobbitt, Zach},
year = {2021},
month = jan,
journal = {Statology},
url = {https://www.statology.org/left-skewed-vs-right-skewed/},
urldate = {2021-06-30},
abstract = {This tutorial explains the difference between left skewed and right skewed distributions, including several examples.},
langid = {american},
file = {/Users/francojc/Zotero/storage/KL3A8YQU/left-skewed-vs-right-skewed.html}
}
@article{Boettiger2017,
title = {An Introduction to Rocker: {{Docker}} Containers for {{R}}},
shorttitle = {An Introduction to Rocker},
author = {Boettiger, Carl and Eddelbuettel, Dirk},
year = {2017},
journal = {The R Journal},
volume = {9},
number = {2},
pages = {527},
issn = {2073-4859},
doi = {10.32614/RJ-2017-065},
urldate = {2023-06-22},
abstract = {We describe the Rocker project, which provides a widely-used suite of Docker images with customized R environments for particular tasks. We discuss how this suite is organized, and how these tools can increase portability, scaling, reproducibility, and convenience of R users and developers.},
langid = {english},
keywords = {/unread,containers,docker,images,r,reproducible research,rocker},
file = {/Users/francojc/Zotero/storage/YRNEPITC/Boettiger and Eddelbuettel - 2017 - An Introduction to Rocker Docker Containers for R.pdf}
}
@incollection{Bohmann2023,
title = {Contrastive Usage Profiling: A Word Vector Perspective on World {{Englishes}}},
shorttitle = {Contrastive Usage Profiling},
booktitle = {Language and {{Linguistics}} in a {{Complex World}}},
author = {Bohmann, Axel},
editor = {Busse, Beatrix and Warnke, Ingo H.},
year = {2023},
volume = {32},
pages = {11--30},
publisher = {De Gruyter},
file = {/Users/francojc/Zotero/storage/DCVMAHBR/Bohmann - 2023 - Contrastive Usage Profiling A Word Vector Perspec.pdf}
}
@article{Bolibaugh2021,
title = {Towards a Credibility Revolution in Bilingualism Research: {{Open}} Data and Materials as Stepping Stones to More Reproducible and Replicable Research},
shorttitle = {Towards a Credibility Revolution in Bilingualism Research},
author = {Bolibaugh, Cylcia and Vanek, Norbert and Marsden, Emma J.},
year = {2021},
month = nov,
journal = {Bilingualism: Language and Cognition},
volume = {24},
number = {5},
pages = {801--806},
publisher = {Cambridge University Press},
issn = {1366-7289, 1469-1841},
doi = {10.1017/S1366728921000535},
urldate = {2021-12-16},
abstract = {The extent to which findings in bilingualism research are contingent on specific analytic choices, experimental designs, or operationalisations, is currently unknown. Poor availability of data, analysis code, and materials has hindered the development of cumulative lines of research. In this review, we survey current practices and advocate a credibility revolution in bilingualism research through the adoption of minimum standards of transparency. Full disclosure of data and code is necessary not only to assess the reproducibility of original findings, but also to test the robustness of these findings to different analytic specifications. Similarly, full provision of experimental materials and protocols underpins assessment of both the replicability of original findings, as well as their generalisability to different contexts and samples. We illustrate the review with examples where good practice has advanced the agenda in bilingualism research and highlight resources to help researchers get started.},
langid = {english},
file = {/Users/francojc/Zotero/storage/V9MNKAR4/Bolibaugh et al. - 2021 - Towards a credibility revolution in bilingualism r.pdf;/Users/francojc/Zotero/storage/AP8EMQ5X/C4FC0550EE4537D8603942419B288C6E.html}
}
@article{Bolukbasi2016,
title = {Man Is to Computer Programmer as Woman Is to Homemaker? {{Debiasing}} Word Embeddings},
author = {Bolukbasi, Tolga and Chang, Kai-Wei and Zou, James and Saligrama, Venkatesh and Kalai, Adam},
year = {2016},
journal = {arXiv},
eprint = {1607.06520},
issn = {10495258},
url = {http://arxiv.org/abs/1607.06520},
abstract = {The blind application of machine learning runs the risk of amplifying biases present in data. Such a danger is facing us with word embedding, a popular framework to represent text data as vectors which has been used in many machine learning and natural language processing tasks. We show that even word embeddings trained on Google News articles exhibit female/male gender stereotypes to a disturbing extent. This raises concerns because their widespread use, as we describe, often tends to amplify these biases. Geometrically, gender bias is first shown to be captured by a direction in the word embedding. Second, gender neutral words are shown to be linearly separable from gender definition words in the word embedding. Using these properties, we provide a methodology for modifying an embedding to remove gender stereotypes, such as the association between between the words receptionist and female, while maintaining desired associations such as between the words queen and female. We define metrics to quantify both direct and indirect gender biases in embeddings, and develop algorithms to "debias" the embedding. Using crowd-worker evaluation as well as standard benchmarks, we empirically demonstrate that our algorithms significantly reduce gender bias in embeddings while preserving the its useful properties such as the ability to cluster related concepts and to solve analogy tasks. The resulting embeddings can be used in applications without amplifying gender bias.},
archiveprefix = {arXiv},
file = {/Users/francojc/Zotero/storage/LRHQ2CDD/Bolukbasi et al. - 2016 - Man is to Computer Programmer as Woman is to Homem.pdf}
}
@article{Bosch1998,
title = {Separating Hyperplanes and the Authorship of the Disputed Federalist Papers},
author = {Bosch, Robert A. and Smith, Jason A.},
year = {1998},
journal = {American Mathematical Monthly},
volume = {105},
number = {7},
pages = {601--608},
issn = {00029890},
doi = {10.2307/2589242},
urldate = {2016-08-26},
isbn = {0002-9890},
file = {/Users/francojc/Zotero/storage/IM2DKRV4/Bosch, Smith - 1998 - Separating hyperplanes and the authorship of the disputed federalist papers.pdf}
}
@misc{Bowman2020,
title = {{{OSF}} Prereg Template},
author = {Bowman, Sara and DeHaven, Alexander C. and Errington, Timothy M. and Hardwicke, Tom E. and Mellor, David Thomas and Nosek, Brian A. and Soderberg, Courtney K.},
year = {2020},
month = jan,
publisher = {MetaArXiv},
doi = {10.31222/osf.io/epgjd},
urldate = {2023-07-18},
abstract = {Preregistration is the act of submitting a study plan, ideally also with analytical plan, to a registry prior to conducting the work. Preregistration increases the discoverability of research even if it does not get published further. Adding specific analysis plans can clarify the distinction between planned, confirmatory tests and unplanned, exploratory research. This preprint contains a template for the ``OSF Prereg'' form available from the OSF Registry. An earlier version was originally developed for the Preregistration Challenge, an education campaign designed to initiate preregistration as a habit prior to data collection in basic research, funded by the Laura and John Arnold Foundation (now Arnold Ventures) and conducted by the Center for Open Science. More information is available at https://cos.io/prereg, and other templates are available at: https://osf.io/zab38/},
langid = {american},
keywords = {/unread,Bioethics and Medical Ethics,harking,Medicine and Health Sciences,open science,OSF,Other Social and Behavioral Sciences,p-hacking,Physical Sciences and Mathematics,preregistration,qrp,registration,research design,Social and Behavioral Sciences,Statistics and Probability,template},
file = {/Users/francojc/Zotero/storage/DS2KXP4S/Bowman et al. - 2020 - OSF Prereg Template.pdf}
}
@article{Bransford1972,
title = {Sentence Memory: A Constructive versus Interpretive Approach},
author = {Bransford, John D. and Barclay, J. Richard and Franks, Jeffery J.},
year = {1972},
journal = {Cognitive Psychology},
volume = {3},
pages = {193--209},
url = {http://www.sciencedirect.com/science/article/pii/0010028572900035},
urldate = {2013-10-31},
file = {/Users/francojc/Zotero/storage/KMVH97PM/Bransford, Barclay, Franks - 1972 - Sentence Memory A Constructive Versus Interpretive Approach.pdf}
}
@article{Breeze2013,
title = {Lexical Bundles across Four Legal Genres},
author = {Breeze, Ruth},
year = {2013},
month = jan,
journal = {International Journal of Corpus Linguistics},
volume = {18},
number = {2},
pages = {229--253},
issn = {13846655},
doi = {10.1075/ijcl.18.2.03bre},
urldate = {2013-10-30},
file = {/Users/francojc/Zotero/storage/IZANG9XU/Breeze - 2013 - Lexical bundles across four legal genres.pdf}
}
@inproceedings{Bresnan2007,
title = {Predicting the Dative Alternation},
booktitle = {Cognitive {{Foundations}} of {{Interpretation}}},
author = {Bresnan, Joan and Cueni, Anna and Nikitina, Tatiana and Baayen, R. Harald},
editor = {Bouma, G. and Kraemer, I. and Zwart, Jan-Wouter C},
year = {2007},
pages = {1--33},
publisher = {KNAW},
address = {Amsterdam},
file = {/Users/francojc/Zotero/storage/HCIQA3VA/Bresnan et al. - 2007 - Predicting the Dative Alternation.pdf}
}
@article{Bresnan2007a,
title = {A Few Lessons from Typology},
author = {Bresnan, Joan},
year = {2007},
journal = {Linguistic Typology},
volume = {11},
number = {1},
pages = {297--306},
abstract = {Typology has a low profile in much of American linguistics, especially outside of phonology (Nichols 2007, Hyman 2007, Van Valin 2007). Yet, as I will suggest, the study of the results and methods of modern typology has important lessons for us as the field of linguistics undergoes a paradigm shift. Typologists study a wide range of language types, but I will show that even when one does theoretical work on a single, well-studied standardized national language like English, one can (and should) benefit from an awareness of typological findings. [ABSTRACT FROM AUTHOR]},
file = {/Users/francojc/Zotero/storage/VENLH3VL/Bresnan - 2007 - A few lessons from typology.pdf}
}
@article{Briand2009,
title = {A Similarity Measure to Assess the Stability of Classification Trees},
author = {Briand, B{\'e}n{\'e}dicte and Ducharme, Gilles R. and Parache, Vanessa and {Mercat-Rommens}, Catherine},
year = {2009},
journal = {Computational Statistics and Data Analysis},
volume = {53},
number = {4},
pages = {1208--1217},
issn = {01679473},
doi = {10.1016/j.csda.2008.10.033},
abstract = {It has been recognized that Classification trees (CART) are unstable; a small perturbation in the input variables or a fresh sample can lead to a very different classification tree. Some approaches exist that try to correct this instability. However, their benefits can, at present, be appreciated only qualitatively. A similarity measure between two classification trees is introduced that can measure their closeness. Its usefulness is illustrated with synthetic data on the impact of radioactivity deposit through the environment. In this context, a modified node level stabilizing technique, referred to as the NLS-REP method, is introduced and shown to be more stable than the classical CART method. {\copyright} 2008 Elsevier B.V. All rights reserved.},
isbn = {1532-4435},
pmid = {10204200}
}
@article{Broman2018,
title = {Data Organization in Spreadsheets},
author = {Broman, Karl W. and Woo, Kara H.},
year = {2018},
month = jan,
journal = {The American Statistician},
volume = {72},
number = {1},
pages = {2--10},
publisher = {Taylor \& Francis},
issn = {0003-1305},
doi = {10.1080/00031305.2017.1375989},
urldate = {2021-04-21},
abstract = {Spreadsheets are widely used software tools for data entry, storage, analysis, and visualization. Focusing on the data entry and storage aspects, this article offers practical recommendations for organizing spreadsheet data to reduce errors and ease later analyses. The basic principles are: be consistent, write dates like YYYY-MM-DD, do not leave any cells empty, put just one thing in a cell, organize the data as a single rectangle (with subjects as rows and variables as columns, and with a single header row), create a data dictionary, do not include calculations in the raw data files, do not use font color or highlighting as data, choose good names for things, make backups, use data validation to avoid data entry errors, and save the data in plain text files.},
file = {/Users/francojc/Zotero/storage/7ZA9YH76/Broman and Woo - 2018 - Data Organization in Spreadsheets.pdf;/Users/francojc/Zotero/storage/VLE2BK9E/00031305.2017.html}
}
@book{Brown2005,
title = {Encyclopedia of Language and Linguistics},
editor = {Brown, Keith},
year = {2005},
volume = {1},
publisher = {Elsevier},
internal-note = {NOTE(review): Brown is editor-in-chief of this edited reference work, so moved from author to editor; confirm against publisher record},
file = {/Users/francojc/Zotero/storage/KLU9E7UK/cxYGQfiD_1oC.html}
}
@article{Brown2018,
title = {Ten Quick Tips for Teaching Programming},
author = {Brown, Neil C. C. and Wilson, Greg},
year = {2018},
journal = {PLOS Computational Biology},
volume = {14},
number = {4},
pages = {e1006023},
issn = {1553-7358},
doi = {10.1371/journal.pcbi.1006023},
abstract = {Research from educational psychology suggests that teaching and learning are subject-specific activities [1]: learning programming has a different set of challenges and techniques than learning physics or learning to read and write. Computing is a younger discipline than mathematics, physics, or biology, and while there have been correspondingly fewer studies of how best to teach it, there is a growing body of evidence about what works and what doesn't. This paper presents 10 quick tips that should be the foundation of any teaching of programming, whether formal or informal.},
pmid = {29621229},
file = {/Users/francojc/Zotero/storage/6H69NUMY/Brown, Wilson - 2018 - Ten quick tips for teaching programming.pdf}
}
@article{Bryan2017,
title = {Excuse Me, Do You Have a Moment to Talk about Version Control?},
author = {Bryan, Jennifer},
year = {2017},
journal = {PeerJ Preprints},
volume = {5},
pages = {1--23},
issn = {2167-9843},
doi = {10.7287/peerj.preprints.3159v2},
abstract = {Data analysis, statistical research, and teaching statistics have at least one thing in common: these activities all produce many files! There are data files, source code, figures, tables, prepared reports, and much more. Most of these files evolve over the course of a project and often need to be shared with others, for reading or edits, as a project unfolds. Without explicit and structured management, project organization can easily descend into chaos, taking time away from the primary work and reducing the quality of the final product. This unhappy result can be avoided by repurposing tools and workflows from the software development world, namely, distributed version control. This article describes the use of the version control system Git and the hosting site GitHub for statistical and data scientific workflows. Special attention is given to projects that use the statistical language R and, optionally, R Markdown documents. Supplementary materials include an annotated set of links to step-by-step tutorials, real world examples, and other useful learning resources.},
file = {/Users/francojc/Zotero/storage/X3FQY98H/Bryan - 2017 - Excuse me, do you have a moment to talk about version control.pdf}
}
@book{Bryan2020,
  author   = {Bryan, Jennifer and Hester, Jim},
  title    = {Happy Git and {{GitHub}} for the {{useR}}},
  year     = {2020},
  url      = {https://happygitwithr.com/},
  urldate  = {2021-01-06},
  abstract = {Using Git and GitHub with R, Rstudio, and R Markdown},
  file     = {/Users/francojc/Zotero/storage/U778V6U4/happygitwithr.com.html},
}
@article{Brysbaert2011,
title = {Do the Effects of Subjective Frequency and Age of Acquisition Survive Better Word Frequency Norms?},
author = {Brysbaert, Marc and Cortese, Michael J.},
year = {2011},
month = mar,
journal = {Quarterly Journal of Experimental Psychology (2006)},
volume = {64},
number = {3},
eprint = {20700859},
eprinttype = {pubmed},
pages = {545--559},
issn = {1747-0226},
doi = {10.1080/17470218.2010.503374},
urldate = {2011-03-15},
abstract = {Megastudies with processing efficiency measures for thousands of words allow researchers to assess the quality of the word features they are using. In this article, we analyse reading aloud and lexical decision reaction times and accuracy rates for 2,336 words to assess the influence of subjective frequency and age of acquisition on performance. Specifically, we compare newly presented word frequency measures with the existing frequency norms of Kucera and Francis (1967), HAL (Burgess \& Livesay, 1998), Brysbaert and New (2009), and Zeno, Ivens, Millard, and Duvvuri (1995). We show that the use of the Kucera and Francis word frequency measure accounts for much less variance than the other word frequencies, which leaves more variance to be "explained" by familiarity ratings and age-of-acquisition ratings. We argue that subjective frequency ratings are no longer needed if researchers have good objective word frequency counts. The effect of age of acquisition remains significant and has an effect size that is of practical relevance, although it is substantially smaller than that of the first phoneme in naming and the objective word frequency in lexical decision. Thus, our results suggest that models of word processing need to utilize these recently developed frequency estimates during training or setting baseline activation levels in the lexicon.},
pmid = {20700859},
file = {/Users/francojc/Zotero/storage/Z5WJ4QH4/Brysbaert and Cortese - 2011 - Do the effects of subjective frequency and age of .pdf}
}
@incollection{Buckheit1995,
title = {{WaveLab} and Reproducible Research},
booktitle = {Wavelets and Statistics},
author = {Buckheit, Jonathan B. and Donoho, David L.},
editor = {Antoniadis, Anastasios and Oppenheim, Georges},
year = {1995},
series = {Lecture Notes in Statistics},
volume = {103},
pages = {55--81},
publisher = {Springer},
address = {New York},
internal-note = {NOTE(review): editor/series/volume/address added from the Springer volume record -- verify; title casing of WaveLab restored and brace-protected},
file = {/Users/francojc/Zotero/storage/Y5GFJYQD/Buckheit and Donoho - 1995 - Wavelab and reproducible research.pdf;/Users/francojc/Zotero/storage/HAIDPE44/978-1-4612-2544-7_5.html}
}
@techreport{Bukhari2020,
type = {{{SSRN Scholarly Paper}}},
title = {Data Science Curriculum: {{Current}} Scenario},
shorttitle = {Data Science Curriculum},
author = {Bukhari, Duaa},
year = {2020},
number = {3616600},
address = {Rochester, NY},
institution = {Social Science Research Network},
doi = {10.2139/ssrn.3616600},
internal-note = {NOTE(review): DOI added following the SSRN pattern 10.2139/ssrn.<abstract id> -- verify it resolves},
url = {https://papers.ssrn.com/abstract=3616600},
urldate = {2022-05-09},
abstract = {Companies desires for making productive discoveries from big data have motivated academic institutions offering variety of different data science (DS) programs, in order to increases their graduates' ability to be data scientists who are capable to face the challenges of the new age. These data science programs represent a combination of subject areas from several disciplines. There are few studies have examined data science programs within a particular discipline, such as Business (e.g. Chen et al.). However, there are very few empirical studies that investigate DS programs and explore its curriculum structure across disciplines. Therefore, this study examines data science programs offered by American universities. The study aims to depict the current state of data science education in the U.S. to explore what discipline DS programs covers at the graduate level. The current study conducted an exploratory content analysis of 30 DS programs in the United States from a variety of disciplines. The analysis was conducted on course titles and course descriptions level. The study results indicate that DS programs required varying numbers of credit hours, including practicum and capstone. Management schools seem to take the lead and the initiative in lunching and hosting DS programs. In addition, all DS programs requires the basic knowledge of database design, representation, extraction and management. Furthermore, DS programs delivered information skills through their core courses. Moreover, the study results show that almost 40 percent of required courses in DS programs is involved information representations, retrieval and programming. Additionally, DS programs required courses also addressed communication visualization and mathematics skills.},
langid = {english},
file = {/Users/francojc/Zotero/storage/5WWYDGNT/Bukhari - 2020 - Data Science Curriculum Current Scenario.pdf;/Users/francojc/Zotero/storage/47P9MGBH/papers.html}
}
@article{Bullock2021,
title = {Exploring a Loan Translation and Its Consequences in an Oral Bilingual Corpus},
author = {Bullock, Barbara E. and Serigos, Jacqueline and Toribio, Almeida Jacqueline},
year = {2021},
month = jul,
journal = {Journal of Language Contact},
volume = {13},
number = {3},
pages = {612--635},
publisher = {Brill},
issn = {1877-4091, 1955-2629},
doi = {10.1163/19552629-bja10027},
urldate = {2021-08-17},
abstract = {This work applies computational tools that have been used to model loanwords in newspaper corpora to an analysis of a loan translation in an oral bilingual corpus. The explicit goal of the contribution is to argue that a specific collocation found in a corpus of Spanish spoken in Texas, agarrar+NP (e.g., agarrar ayuda), is a loan translation that is calqued on English get+np support verb constructions (e.g., get help). We base our argument on the frequency and the linguistic distribution of the nonconventional usage within and between corpora and on the factors that favor its use. Our findings show that the overall frequency of agarrar is the same in Spanish in Texas as it is in the benchmark monolingual corpus of Mexican Spanish but that it is used differently in the two varieties, a difference that has grammatical, as well as semantic, ramifications.},
langid = {english},
file = {/Users/francojc/Zotero/storage/K2QLUXJT/Bullock et al. - 2021 - Exploring a Loan Translation and Its Consequences .pdf}
}
@inproceedings{Bunt2006,
title = {Dimensions in Dialogue Act Annotation},
booktitle = {Language {{Resources}} and {{Evaluation Conference}} ({LREC})},
author = {Bunt, Harry},
year = {2006},
pages = {919--924},
internal-note = {NOTE(review): conference name corrected from "Resource" to "Resources" (LREC); full proceedings title is "Proceedings of the Fifth International Conference on Language Resources and Evaluation" -- verify},
abstract = {This paper is concerned with the fundamentals of multidimensional dialogue act annotation, i.e. with what it means to annotate dialogues with information about the communicative acts that are performed with the utterances, taking various `dimensions' into account. Two ideas seem to be prevalent in the literature concerning the notion of dimension: (1) dimensions correspond to different types of information; and (2) a dimension is formed by a set of mutually exclusive tags. In DAMSL, for instance, the terms `dimension' and `layer' are used sometimes in the sense of (1) and sometimes in that of (2). We argue that being mutually exclusive is not a good criterion for a set of dialogue act types to constitute a dimension, even though the description of an object in a multidimensional space should never assign more than one value per dimension. We define a dimension of dialogue act annotation as an aspect of participating in a dialogue that can be addressed independently by means of dialogue acts. We show that DAMSL dimensions such as Info-request, Statement, and Answer do not qualify as proper dimensions, and that the communicative functions in these categories do not fall in any specific dimension, but should be considered as `general-purpose' in the sense that they can be used in any dimension. We argue that using the notion of dimension that we propose, a multidimensional taxonomy of dialogue acts emerges that optimally supports multidimensional dialogue act annotation.},
langid = {english},
file = {/Users/francojc/Zotero/storage/LDPEXTKN/Bunt - Dimensions in Dialogue Act Annotation.pdf}
}