update publications page #14

Merged (2 commits, Dec 12, 2023)
66 changes: 42 additions & 24 deletions _bibliography/papers.bib
@@ -1,45 +1,60 @@
---
---

@inproceedings{plu-aacl-arda23,
author = {Uzunoğlu, Arda and
Şahin, Gözde Gül},
title = {Benchmarking Procedural Language Understanding for Low-Resource Languages: A Case Study on Turkish},
booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = november,
year = "2023",
address = "Bali, Indonesia",
publisher = "Association for Computational Linguistics",
abstract = "Understanding procedural natural language (e.g., step-by-step instructions) is a crucial step to execution and planning. However, while there are ample corpora and downstream tasks available in English, the field lacks such resources for most languages. To address this gap, we conduct a case study on Turkish procedural texts. We first expand the number of tutorials in Turkish wikiHow from 2,000 to 52,000 using automated translation tools, where the translation quality and loyalty to the original meaning are validated by a team of experts on a random set. Then, we generate several downstream tasks on the corpus, such as linking actions, goal inference, and summarization. To tackle these tasks, we implement strong baseline models via fine-tuning large language-specific models such as TR-BART and BERTurk, as well as multilingual models such as mBART, mT5, and XLM. We find that language-specific models consistently outperform their multilingual models by a significant margin across most procedural language understanding~(PLU) tasks.",

@inproceedings{uzunoglu-ahin:2023:ijcnlp,
abbr = {IJCNLP-AACL},
bibtex_show = {true},
pdf = {2309.06698.pdf},
author = {Uzunoglu, Arda and Şahin, Gözde},
title = {Benchmarking Procedural Language Understanding for Low-Resource Languages: A Case Study on Turkish},
booktitle = {Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
month = {November},
year = {2023},
address = {Nusa Dua, Bali},
publisher = {Association for Computational Linguistics},
pages = {804--819},
url = {https://aclanthology.org/2023.ijcnlp-long.52},
abstract = "Understanding procedural natural language (e.g., step-by-step instructions) is a crucial step to execution and planning. However, while there are ample corpora and downstream tasks available in English, the field lacks such resources for most languages. To address this gap, we conduct a case study on Turkish procedural texts. We first expand the number of tutorials in Turkish wikiHow from 2,000 to 52,000 using automated translation tools, where the translation quality and loyalty to the original meaning are validated by a team of experts on a random set. Then, we generate several downstream tasks on the corpus, such as linking actions, goal inference, and summarization. To tackle these tasks, we implement strong baseline models via fine-tuning large language-specific models such as TR-BART and BERTurk, as well as multilingual models such as mBART, mT5, and XLM. We find that language-specific models consistently outperform their multilingual models by a significant margin across most procedural language understanding~(PLU) tasks."
}

@inproceedings{gecturk-aacl23,
author = {Kara, Atakan and
Safian, Farrin and
Bond, Andrew and
Şahin, Gözde Gül},
title = {GECTurk: Grammatical Error Correction and Detection Dataset for Turkish},
booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Findings)",
month = november,
year = "2023",
address = "Bali, Indonesia",
publisher = "Association for Computational Linguistics",
abstract = "Grammatical Error Detection and Correction (GEC) tools have proven useful for native speakers and second language learners. Developing such tools requires a large amount of parallel, annotated data, which is unavailable for most languages. Synthetic data generation is a common practice to overcome the scarcity of such data. However, it is not straightforward for morphologically rich languages like Turkish due to complex writing rules that require phonological, morphological, and syntactic information. In this work, we present a flexible and extensible synthetic data generation pipeline for Turkish covering more than 20 expert-curated grammar and spelling rules (a.k.a., writing rules) implemented through complex transformation functions. Using the pipeline, we derive 130,000 high-quality parallel sentences from professionally edited articles. Additionally, we create a more realistic test set by manually annotating a set of movie reviews. We implement three baselines formulating the task as i) neural machine translation, ii) sequence tagging, and iii) few-shot learning with prefix tuning, achieving strong results. Then we perform a zero-shot evaluation of our pretrained models on the coarse-grained BOUN -de/-da and fine-grained expert annotated dataset. Our results suggest that our corpus, GECTurk, is high-quality and allows knowledge transfer for the out-of-domain setting. To encourage further research on Turkish GEC, we release our dataset, baseline models, and synthetic data generation pipeline with https://anonymous.4open.science/r/tr-gec-17D6/.",
@inproceedings{kara-EtAl:2023:findings,
abbr = {IJCNLP-AACL},
bibtex_show = {true},
pdf = {2309.11346.pdf},
author = {Kara, Atakan and Marouf Sofian, Farrin and Bond, Andrew and Şahin, Gözde},
title = {GECTurk: Grammatical Error Correction and Detection Dataset for Turkish},
booktitle = {Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
month = {November},
year = {2023},
address = {Nusa Dua, Bali},
publisher = {Association for Computational Linguistics},
pages = {278--290},
url = {https://aclanthology.org/2023.findings-ijcnlp.26},
abstract = "Grammatical Error Detection and Correction (GEC) tools have proven useful for native speakers and second language learners. Developing such tools requires a large amount of parallel, annotated data, which is unavailable for most languages. Synthetic data generation is a common practice to overcome the scarcity of such data. However, it is not straightforward for morphologically rich languages like Turkish due to complex writing rules that require phonological, morphological, and syntactic information. In this work, we present a flexible and extensible synthetic data generation pipeline for Turkish covering more than 20 expert-curated grammar and spelling rules (a.k.a., writing rules) implemented through complex transformation functions. Using the pipeline, we derive 130,000 high-quality parallel sentences from professionally edited articles. Additionally, we create a more realistic test set by manually annotating a set of movie reviews. We implement three baselines formulating the task as i) neural machine translation, ii) sequence tagging, and iii) few-shot learning with prefix tuning, achieving strong results. Then we perform a zero-shot evaluation of our pretrained models on the coarse-grained “BOUN -de/-da” and fine-grained expert annotated dataset. Our results suggest that our corpus, GECTurk, is high-quality and allows knowledge transfer for the out-of-domain setting. To encourage further research on Turkish GEC, we release our dataset, baseline models, and synthetic data generation pipeline with https://anonymous.4open.science/r/tr-gec-17D6/."
}


@inproceedings{mbl-subha23,
abbr = {INLG},
bibtex_show = {true},
pdf = {2023.inlg-main.18.pdf},
author = {Vadlamannati, Subha and
Şahin, Gözde Gül},
title = {Metric-Based In-context Learning: {A} Case Study in Text Simplification},
booktitle = "Proceedings of the 16th International Natural Language Generation Conference",
month = september,
month = {September},
year = "2023",
address = "Prague, Czech Republic",
publisher = "Association for Computational Linguistics",
url = {https://aclanthology.org/2023.inlg-main.18.pdf},
abstract = "In-context learning (ICL) for large language models has proven to be a powerful approach for many natural language processing tasks. However, determining the best method to select examples for ICL is nontrivial as the results can vary greatly depending on the quality, quantity, and order of examples used. In this paper, we conduct a case study on text simplification (TS) to investigate how to select the best and most robust examples for ICL. We propose Metric-Based in-context Learning (MBL) method that utilizes commonly used TS metrics such as SARI, compression ratio, and BERT-Precision for selection. Through an extensive set of experiments with various-sized GPT models on standard TS benchmarks such as TurkCorpus and ASSET, we show that examples selected by the top SARI scores perform the best on larger models such as GPT-175B, while the compression ratio generally performs better on smaller models such as GPT-13B and GPT-6.7B. Furthermore, we demonstrate that MBL is generally robust to example orderings and out-of-domain test sets, and outperforms strong baselines and state-of-the-art finetuned language models. Finally, we show that the behaviour of large GPT models can be implicitly controlled by the chosen metric. Our research provides a new framework for selecting examples in ICL, and demonstrates its effectiveness in text simplification tasks, breaking new ground for more accurate and efficient NLG systems."
}

@inproceedings{klie-etal-2023-lessons,
abbr = {EACL},
bibtex_show = {true},
pdf = {2023.eacl-main.261.pdf},
title = "Lessons Learned from a Citizen Science Project for Natural Language Processing",
author = {Klie, Jan-Christoph and
Lee, Ji-Ung and
@@ -51,7 +66,7 @@ @inproceedings{klie-etal-2023-lessons
Eckart De Castilho, Richard and
Gurevych, Iryna},
booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
month = may,
month = {May},
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
Expand All @@ -63,6 +78,9 @@ @inproceedings{klie-etal-2023-lessons


@inproceedings{DBLP:conf/eacl/PuertoSG23,
abbr = {EACL},
bibtex_show = {true},
pdf = {2023.eacl-main.259.pdf},
author = {Haritz Puerto and
G{\"{o}}zde G{\"{u}}l Sahin and
Iryna Gurevych},
16 changes: 11 additions & 5 deletions _data/venues.yml
@@ -1,6 +1,12 @@
"AJP":
url: https://aapt.scitation.org/journal/ajp
color: "#00369f"
"EACL":
url: https://aclanthology.org/venues/eacl/
color: "#ff3200"

"PhysRev":
url: https://journals.aps.org/

"IJCNLP-AACL":
url: https://aclanthology.org/venues/ijcnlp/
color: "#c27ba0"

"INLG":
url: https://aclanthology.org/venues/inlg/
color: "#e6ba8d"
3 changes: 3 additions & 0 deletions _pages/publications.md
@@ -16,3 +16,6 @@ nav_rank: 4
{% endfor %}

</div>



Binary file added assets/pdf/2023.eacl-main.259.pdf
Binary file not shown.
Binary file added assets/pdf/2023.eacl-main.261.pdf
Binary file not shown.
Binary file added assets/pdf/2023.inlg-main.18.pdf
Binary file not shown.
Binary file added assets/pdf/2309.06698.pdf
Binary file not shown.
Binary file added assets/pdf/2309.11346.pdf
Binary file not shown.
Binary file removed assets/pdf/papers/2023_Benchmark_PLU.pdf
Binary file not shown.
Binary file removed assets/pdf/papers/2023_GECTurk.pdf
Binary file not shown.