% Conference paper: Yoon, Lee, Zhang, Han (SIGIR 2023), proposing the USTORY
% framework for unsupervised online story discovery.
% Maintenance notes:
%  - No series field: the exported value only repeated booktitle, which makes
%    classic styles print the proceedings title twice.
%  - address is the publisher location (not the conference venue).
%  - Event name/dates and the copyright statement live in eventtitle,
%    eventdate and copyright rather than note, so styles do not print them
%    as free-form trailing text (classic BibTeX styles ignore these fields).
@inproceedings{9d7cc04a6dda44a1b5d1f73c125fad63,
  author     = {Yoon, Susik and Lee, Dongha and Zhang, Yunyi and Han, Jiawei},
  title      = {Unsupervised Story Discovery from Continuous News Streams via Scalable Thematic Embedding},
  booktitle  = {{SIGIR} 2023 - Proceedings of the 46th International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval},
  eventtitle = {46th International {ACM} {SIGIR} Conference on Research and Development in Information Retrieval, {SIGIR} 2023},
  eventdate  = {2023-07-23/2023-07-27},
  pages      = {802--811},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  year       = {2023},
  month      = jul,
  day        = {19},
  doi        = {10.1145/3539618.3591782},
  language   = {English (US)},
  keywords   = {Document Embedding, News Story Discovery, News Stream Mining},
  copyright  = {Publisher Copyright: {\textcopyright} 2023 Copyright held by the owner/author(s). Publication rights licensed to ACM.},
  abstract   = {Unsupervised discovery of stories with correlated news articles in real-time helps people digest massive news streams without expensive human annotations. A common approach of the existing studies for unsupervised online story discovery is to represent news articles with symbolic- or graph-based embedding and incrementally cluster them into stories. Recent large language models are expected to improve the embedding further, but a straightforward adoption of the models by indiscriminately encoding all information in articles is ineffective to deal with text-rich and evolving news streams. In this work, we propose a novel thematic embedding with an off-the-shelf pretrained sentence encoder to dynamically represent articles and stories by considering their shared temporal themes. To realize the idea for unsupervised online story discovery, a scalable framework USTORY is introduced with two main techniques, theme- and time-aware dynamic embedding and novelty-aware adaptive clustering, fueled by lightweight story summaries. A thorough evaluation with real news data sets demonstrates that USTORY achieves higher story discovery performances than baselines while being robust and scalable to various streaming settings.},
}