% TwHIN-BERT conference paper (KDD 2023, ACM).
% Acronyms and proper nouns in title/booktitle/series are brace-protected so
% that sentence-casing bibliography styles keep their capitalisation.
% The address field holds the publisher location, not the conference venue.
@inproceedings{7d9b4c0070ee4e98973df6f52808a903,
  author    = {Zhang, Xinyang and Malkov, Yury and Florez, Omar and Park, Serim and McWilliams, Brian and Han, Jiawei and El-Kishky, Ahmed},
  title     = {{TwHIN-BERT}: A Socially-Enriched Pre-trained Language Model for Multilingual Tweet Representations at {Twitter}},
  booktitle = {{KDD} 2023 -- Proceedings of the 29th {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining},
  series    = {Proceedings of the {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  pages     = {5597--5607},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2023},
  month     = aug,
  day       = {6},
  doi       = {10.1145/3580305.3599921},
  language  = {English (US)},
  keywords  = {language models, social engagement, social media},
  note      = {{\textcopyright} 2023 ACM. 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD 2023), 6--10 August 2023},
  abstract  = {Pre-trained language models (PLMs) are fundamental for natural language processing applications. Most existing PLMs are not tailored to the noisy user-generated text on social media, and the pre-training does not factor in the valuable social engagement logs available in a social network. We present TwHIN-BERT, a multilingual language model productionized at Twitter, trained on in-domain data from the popular social network. TwHIN-BERT differs from prior pre-trained language models as it is trained with not only text-based self-supervision but also with a social objective based on the rich social engagements within a Twitter heterogeneous information network (TwHIN). Our model is trained on 7 billion tweets covering over 100 distinct languages, providing a valuable representation to model short, noisy, user-generated text. We evaluate our model on various multilingual social recommendation and semantic understanding tasks and demonstrate significant metric improvement over established pre-trained language models. We open-source TwHIN-BERT and our curated hashtag prediction and social engagement benchmark datasets to the research community.},
}