@article{aa04efe921b64008a62d571463d7cdce,
title = "Optimizing sparse tensor times matrix on GPUs",
abstract = "This work optimizes tensor-times-dense matrix multiply (Ttm) for general sparse and semi-sparse tensors on CPU and NVIDIA GPU platforms. Ttm is a computational kernel in tensor methods-based data analytics and data mining applications, such as the popular Tucker decomposition. We first design an in-place sequential SpTtm to avoid explicit data reorganizing between a tensor and a matrix in its conventional approach. We further optimize SpTtm on NVIDIA GPU platforms. Five approaches including employing fine thread granularity, arranging coalesced memory access, rank blocking, and using fast GPU shared memory are developed for GPU-SpTtm. We also optimize semi-sparse tensor-times-dense matrix multiply (SspTtm) to take advantage of the inside dense sub-structures. The optimized SpTtm and SspTtm are applied to Tucker decomposition to improve its overall performance. Our sequential SpTtm is 3–120× faster than the SpTtm from Tensor Toolbox library. GPU-SpTtm obtains 6–19× speedup on NVIDIA K40c and 23–67× speedup on NVIDIA P100 over CPU-SpTtm respectively. Our GPU-SpTtm is 3.9× faster than the state-of-the-art GPU implementation. Our SspTtm implementations outperform SpTtms by up to 4.5×, which handles the input semi-sparse tensor in a general way. Tucker decomposition achieves up to 3.2× speedup after applying the optimized Ttms. The code will be publicly released in ParTI! library: https://github.com/hpcgarage/ParTI.",
keywords = "GPU, Irregular algorithms, Sparse tensors, Tensor decomposition",
author = "Yuchen Ma and Jiajia Li and Xiaolong Wu and Chenggang Yan and Jimeng Sun and Richard Vuduc",
note = "Funding Information: This material is based upon work supported by the U.S. National Science Foundation (NSF) Award Number 1533768 , Zhejiang Province Nature Science Foundation of China LR17F030006 , National Nature Science Foundation of China ( 61671196 , 61327902 ), IBM Ph.D. Fellowship Award, and the Laboratory Directed Research and Development program at Sandia National Laboratories, a multi-mission laboratory managed and operated by National Technology and Engineering Solutions of Sandia, LLC, a wholly owned subsidiary of Honeywell International, Inc., for the U.S. Department of Energy{\textquoteright}s National Nuclear Security Administration under contract DE-NA0003525 . Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of NSF, NSFC, IBM, or Sandia National Laboratories. Funding Information: This material is based upon work supported by the U.S. National Science Foundation (NSF) Award Number 1533768, Zhejiang Province Nature Science Foundation of China LR17F030006, National Nature Science Foundation of China (61671196, 61327902), IBM Ph.D. Fellowship Award, and the Laboratory Directed Research and Development program at Sandia National Laboratories, a multi-mission laboratory managed and operated by National Technology and Engineering Solutions of Sandia, LLC, a wholly owned subsidiary of Honeywell International, Inc. for the U.S. Department of Energy's National Nuclear Security Administration under contract DE-NA0003525. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of NSF, NSFC, IBM, or Sandia National Laboratories. Publisher Copyright: {\textcopyright} 2018 Elsevier Inc.",
year = "2019",
month = jul,
doi = "10.1016/j.jpdc.2018.07.018",
language = "English (US)",
volume = "129",
pages = "99--109",
journal = "Journal of Parallel and Distributed Computing",
issn = "0743-7315",
publisher = "Academic Press Inc.",
}