% Conference paper in the ACL 2014 proceedings (Long Papers), pp. 1349--1359.
% The proceedings title is stored in booktitle; the event name and dates are
% kept in eventtitle/eventdate (read by biblatex, ignored by classic styles).
@inproceedings{7263c2cb0e92404abc9dfa3c3dc34512,
  author     = {Rasooli, Mohammad Sadegh and Lippincott, Thomas and Habash, Nizar and Rambow, Owen},
  title      = {Unsupervised Morphology-Based Vocabulary Expansion},
  booktitle  = {Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics ({ACL} 2014), Long Papers},
  eventtitle = {52nd Annual Meeting of the Association for Computational Linguistics, {ACL} 2014},
  eventdate  = {2014-06-22/2014-06-27},
  publisher  = {Association for Computational Linguistics (ACL)},
  pages      = {1349--1359},
  year       = {2014},
  month      = jun,
  doi        = {10.3115/v1/p14-1127},
  isbn       = {9781937284725},
  language   = {English (US)},
  abstract   = {We present a novel way of generating unseen words, which is useful for certain applications such as automatic speech recognition or optical character recognition in low-resource languages. We test our vocabulary generator on seven low-resource languages by measuring the decrease in out-of-vocabulary word rate on a held-out test set. The languages we study have very different morphological properties; we show how our results differ depending on the morphological complexity of the language. In our best result (on Assamese), our approach can predict 29\% of the token-based out-of-vocabulary with a small amount of unlabeled training data.},
}