% Al-Sabbagh and Girju (2012): conference paper in the LREC 2012 proceedings.
% The proceedings name is recorded once, in booktitle; it is deliberately not
% repeated in a series field, because styles that print series would output
% the same text twice. Braces inside title/booktitle protect the acronyms and
% the proper noun from being lowercased by sentence-casing styles.
@inproceedings{1e014a29a3994412aa65ca8c17af817a,
  author    = {Al-Sabbagh, Rania and Girju, Roxana},
  title     = {{YADAC}: Yet another dialectal {Arabic} corpus},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation, {LREC} 2012},
  editor    = {Dogan, {Mehmet Ugur} and Mariani, Joseph and Moreno, Asuncion and Goggi, Sara and Choukri, Khalid and Calzolari, Nicoletta and Odijk, Jan and Declerck, Thierry and Maegaard, Bente and Piperidis, Stelios and Mazo, Helene and Hamon, Olivier},
  publisher = {European Language Resources Association (ELRA)},
  year      = {2012},
  pages     = {2882--2889},
  language  = {English (US)},
  keywords  = {Dialect Identification, Dialectal Arabic, POS tagging},
  abstract  = {This paper presents the first phase of building YADAC - a multi-genre Dialectal Arabic (DA) corpus - that is compiled using Web data from microblogs (i.e. Twitter), blogs/forums and online knowledge market services in which both questions and answers are user-generated. In addition to introducing two new genres to the current efforts of building DA corpora (i.e. microblogs and question-answer pairs extracted from online knowledge market services), the paper highlights and tackles several new issues related to building DA corpora that have not been handled in previous studies: function-based Web harvesting and dialect identification, vowel-based spelling variation, linguistic hypercorrection and its effect on spelling variation, unsupervised Part-of-Speech (POS) tagging and base phrase chunking for DA. Although the algorithms for both POS tagging and base-phrase chunking are still under development, the results are promising.},
  note      = {8th International Conference on Language Resources and Evaluation, LREC 2012 ; Conference date: 21-05-2012 Through 27-05-2012},
}