% Workshop paper (SPW 2023). The exported record was cleaned up: the paper-body
% excerpt that the export had placed in "note" now lives in "annote" (not printed
% by standard styles), and the "series" field that duplicated "booktitle" was dropped.
@inproceedings{42c830af7dd64e6b86b5da5764986767,
  author    = {Chen, Zhi and Zhang, Zhenning and Kan, Zeliang and Yang, Limin and Cortellazzi, Jacopo and Pendlebury, Feargus and Pierazzi, Fabio and Cavallaro, Lorenzo and Wang, Gang},
  title     = {Is It Overkill? {Analyzing} Feature-Space Concept Drift in Malware Detectors},
  booktitle = {Proceedings - 44th {IEEE} Symposium on Security and Privacy Workshops, {SPW} 2023},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {21--28},
  year      = {2023},
  doi       = {10.1109/SPW59333.2023.00007},
  language  = {English (US)},
  keywords  = {concept-drift, machine-learning, malware-classifier},
  abstract  = {Concept drift is a major challenge faced by machine learning-based malware detectors when deployed in practice. While existing works have investigated methods to detect concept drift, it is not yet well understood regarding the main causes behind the drift. In this paper, we design experiments to empirically analyze the impact of feature-space drift (new features introduced by new samples) and compare it with data-space drift (data distribution shift over existing features). Surprisingly, we find that data-space drift is the dominating contributor to the model degradation over time while feature-space drift has little to no impact. This is consistently observed over both Android and PE malware detectors, with different feature types and feature engineering methods, across different settings. We further validate this observation with recent online learning based malware detectors that incrementally update the feature space. Our result indicates the possibility of handling concept drift without frequent feature updating, and we further discuss the open questions for future research.},
  note      = {44th {IEEE} Symposium on Security and Privacy Workshops, {SPW} 2023; Conference date: 22-05-2023 Through 25-05-2023},
  annote    = {On the flip side, what{\textquoteright}s the benefit of not updating features frequently? First, during model re-training, if a brand new feature set is used, it means the model will need to be either re-trained from scratch on the new feature space (costly) or use simpler linear models (less accurate) for incremental updating like DroidEvolver. In comparison, with a fixed feature set, there is more flexibility in model choices and incremental updating methods. Second, frequent feature updating may take on features that are only temporally useful (i.e., due to short-term drift), but hurt the models{\textquoteright} long-term performance (e.g., Fig. 2). We believe more work is needed to understand the impact of feature-space updating. Acknowledgment. This work was supported in part by NSF grants CNS-2055233 and CNS-1955719, C3.AI Research, IBM-Illinois Discovery Accelerator Institute, and a gift from AVAST.},
}