% Conference paper (ICDM 2020). Funding and copyright statements are kept in
% the non-standard fields `funding` and `copyright` so that bibliography styles
% do not print them; the conference dates are stored in `eventdate`.
@inproceedings{a443216887a845f4aac55e25fda43386,
  author    = {Fouch{\'e}, Edouard and Meng, Yu and Guo, Fang and Zhuang, Honglei and B{\"o}hm, Klemens and Han, Jiawei},
  title     = {Mining Text Outliers in Document Directories},
  booktitle = {Proceedings - 20th {IEEE} International Conference on Data Mining, {ICDM} 2020},
  editor    = {Plant, Claudia and Wang, Haixun and Cuzzocrea, Alfredo and Zaniolo, Carlo and Wu, Xindong},
  series    = {Proceedings - {IEEE} International Conference on Data Mining, {ICDM}},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {152--161},
  year      = {2020},
  month     = nov,
  eventdate = {2020-11-17/2020-11-20},
  doi       = {10.1109/ICDM50108.2020.00024},
  language  = {English (US)},
  keywords  = {Anomaly Detection, Data Cleaning, Document Filtering, Nearest-Neighbour Search, Text Mining},
  abstract  = {Nowadays, it is common to classify collections of documents into (human-generated, domain-specific) directory structures, such as email or document folders. But documents may be classified wrongly, for a multitude of reasons. Then they are outlying w.r.t. the folder they end up in. Orthogonally to this, and more specifically, two kinds of errors can occur: (O) Out-of-distribution: the document does not belong to any existing folder in the directory; and (M) Misclassification: the document belongs to another folder. It is this specific combination of issues that we address in this article, i.e., we mine text outliers from massive document directories, considering both error types. We propose a new proximity-based algorithm, which we dub kj-Nearest Neighbours (kj-NN). Our algorithm detects text outliers by exploiting semantic similarities and introduces a self-supervision mechanism that estimates the relevance of the original labels. Our approach is efficient and robust to large proportions of outliers. kj-NN also promotes the interpretability of the results by proposing alternative label names and by finding the most similar documents for each outlier. Our real-world experiments demonstrate that our approach outperforms the competitors by a large margin.},
  funding   = {This work was supported by the DFG Research Training Group 2153: {\textquoteleft}Energy Status Data -- Informatics Methods for its Collection, Analysis and Exploitation{\textquoteright}, the German Federal Ministry of Education and Research (BMBF) via Software Campus (01IS17042) and sponsored in part by US DARPA KAIROS Program No. FA8750-19-2-1004 and SocialSim Program No. W911NF-17-C-0099, National Science Foundation IIS 16-18481, IIS 17-04532, and IIS-17-41317, and DTRA HDTRA11810026.},
  copyright = {{\textcopyright} 2020 IEEE},
}