% Conference paper on the InTensLi framework for dense tensor-times-matrix
% multiply (SC 2015). Citation key is the exporter-generated identifier and is
% kept unchanged so existing \cite commands continue to resolve.
% NOTE(review): publisher is recorded as IEEE Computer Society, but the DOI
% prefix (10.1145) and the copyright line in `note` both point to ACM -- confirm
% against the proceedings front matter before changing.
% NOTE(review): `address` holds a country rather than the publisher's city --
% replace once the publisher question above is settled.
@inproceedings{44e11cdc35e34b73a76aaa41ecde9913,
  author    = {Li, Jiajia and Battaglino, Casey and Perros, Ioakeim and Sun, Jimeng and Vuduc, Richard},
  title     = {An input-adaptive and in-place approach to dense tensor-times-matrix multiply},
  booktitle = {Proceedings of {SC} 2015},
  series    = {International Conference for High Performance Computing, Networking, Storage and Analysis, {SC}},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2015},
  month     = nov,
  day       = {15},
  doi       = {10.1145/2807591.2807671},
  language  = {English (US)},
  keywords  = {code generation, multilinear algebra, offline autotuning, tensor operation},
  abstract  = {This paper describes a novel framework, called InTensLi (``intensely''), for producing fast single-node implementations of dense tensor-times-matrix multiply (Ttm) of arbitrary dimension. Whereas conventional implementations of Ttm rely on explicitly converting the input tensor operand into a matrix---in order to be able to use any available and fast general matrix-matrix multiply (Gemm) implementation---our framework's strategy is to carry out the Ttm in-place, avoiding this copy. As the resulting implementations expose tuning parameters, this paper also describes a heuristic empirical model for selecting an optimal configuration based on the Ttm's inputs. When compared to widely used single-node Ttm implementations that are available in the Tensor Toolbox and Cyclops Tensor Framework (Ctf), InTensLi's in-place and input-adaptive Ttm implementations achieve $4\times$ and $13\times$ speedups, showing Gemm-like performance on a variety of input sizes.},
  note      = {Publisher Copyright: {\textcopyright} 2015 ACM. International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015; Conference date: 15-11-2015 through 20-11-2015},
}