@inproceedings{aca12e8b39d94ee48fcc2128e736216f,
  title     = {Improving Consistency of Crowdsourced Multimedia Similarity for Evaluation},
  abstract  = {Building evaluation datasets for information retrieval is a time-consuming and exhausting activity. To evaluate research over novel corpora, researchers are increasingly turning to crowdsourcing to efficiently distribute the evaluation dataset creation among many workers. However, there has been little investigation into the effect of instrument design on data quality in crowdsourced evaluation datasets. We pursue this question through a case study, music similarity judgments in a music digital library evaluation, where we find that even with trusted graders song pairs are not consistently rated the same. We find that much of this low intra-coder consistency can be attributed to the task design and judge effects, concluding with recommendations for achieving reliable evaluation judgments for music similarity and other normative judgment tasks.},
  keywords  = {crowdsourcing, music retrieval, similarity judgments},
  author    = {Organisciak, Peter and Downie, J. Stephen},
  note      = {Publisher Copyright: {\textcopyright} 2015 ACM.; 15th ACM/IEEE-CS Joint Conference on Digital Libraries, JCDL 2015 ; Conference date: 21-06-2015 Through 25-06-2015},
  year      = {2015},
  month     = jun,
  day       = {21},
  doi       = {10.1145/2756406.2756942},
  language  = {English (US)},
  series    = {Proceedings of the ACM/IEEE Joint Conference on Digital Libraries},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {115--118},
  booktitle = {JCDL 2015 - Proceedings of the 15th ACM/IEEE-CS Joint Conference on Digital Libraries},
  address   = {United States},
}