@comment{
  Conference paper in the LREC 2022 proceedings.
  The citation key looks machine-generated; it is kept unchanged so that
  existing citations of this entry continue to resolve.
  Funding, copyright and event details live in "annote" rather than "note":
  standard styles print "note" in the reference list but not "annote".
}
@inproceedings{6c560bbc8b284ca9abebcebdb8056bd8,
  author    = {Habash, Nizar and Palfreyman, David},
  title     = {{ZAEBUC}: An Annotated {Arabic-English} Bilingual Writer Corpus},
  booktitle = {2022 Language Resources and Evaluation Conference, {LREC} 2022},
  editor    = {Calzolari, Nicoletta and B{\'e}chet, Fr{\'e}d{\'e}ric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Odijk, Jan and Piperidis, Stelios},
  publisher = {European Language Resources Association (ELRA)},
  year      = {2022},
  pages     = {79--88},
  eventdate = {2022-06-20/2022-06-25},
  language  = {English (US)},
  keywords  = {Annotated Corpus, Arabic, CEFR, English, Learner Corpus},
  abstract  = {We present ZAEBUC, an annotated Arabic-English bilingual writer corpus comprising short essays by first-year university students at Zayed University in the United Arab Emirates. We describe and discuss the various guidelines and pipeline processes we followed to create the annotations and quality check them. The annotations include spelling and grammar correction, morphological tokenization, Part-of-Speech tagging, lemmatization, and Common European Framework of Reference (CEFR) ratings. All of the annotations are done on Arabic and English texts using consistent guidelines as much as possible, with tracked alignments among the different annotations, and to the original raw texts. For morphological tokenization, POS tagging, and lemmatization, we use existing automatic annotation tools followed by manual correction. We also present various measurements and correlations with preliminary insights drawn from the data and annotations. The publicly available ZAEBUC corpus and its annotations are intended to be the stepping stones for additional annotations.},
  annote    = {Funding Information: The ZAEBUC project was funded by a Zayed University Research Incentive Fund (RIF \#R19068). We would like to thank Ramy Eskander for helpful discussions and his work as part of Ramitechs. Publisher Copyright: {\textcopyright} European Language Resources Association (ELRA), licensed under CC-BY-NC-4.0.; 13th International Conference on Language Resources and Evaluation Conference, LREC 2022 ; Conference date: 20-06-2022 Through 25-06-2022},
}