% Sun et al. (2020), Int. J. High Performance Computing Applications 34(6):
% cross-element vectorization of matrix-free finite element operators in Firedrake.
% The funding/acknowledgement text is kept once in `note` (the export had duplicated it).
@article{b8bf61f3131a4e3c8ffc78adc29da59e,
  author    = {Sun, Tianjiao and Mitchell, Lawrence and Kulkarni, Kaushik and Kl{\"o}ckner, Andreas and Ham, David A. and Kelly, Paul H. J.},
  title     = {A study of vectorization for matrix-free finite element methods},
  journal   = {International Journal of High Performance Computing Applications},
  year      = {2020},
  month     = nov,
  day       = {1},
  volume    = {34},
  number    = {6},
  pages     = {629--644},
  doi       = {10.1177/1094342020945005},
  issn      = {1094-3420},
  publisher = {SAGE Publications Inc.},
  language  = {English (US)},
  keywords  = {Finite element method, code generation, global assembly, vectorization},
  abstract  = {Vectorization is increasingly important to achieve high performance on modern hardware with SIMD instructions. Assembly of matrices and vectors in the finite element method, which is characterized by iterating a local assembly kernel over unstructured meshes, poses difficulties to effective vectorization. Maintaining a user-friendly high-level interface with a suitable degree of abstraction while generating efficient, vectorized code for the finite element method is a challenge for numerical software systems and libraries. In this work, we study cross-element vectorization in the finite element framework Firedrake via code transformation and demonstrate the efficacy of such an approach by evaluating a wide range of matrix-free operators spanning different polynomial degrees and discretizations on two recent CPUs using three mainstream compilers. Our experiments show that our approaches for cross-element vectorization achieve 30\% of theoretical peak performance for many examples of practical significance, and exceed 50\% for cases with high arithmetic intensities, with consistent speed-up over (intra-element) vectorization restricted to the local assembly kernels.},
  note      = {Funding Information: The authors would like to thank Tobias Grosser, Richard Veras, J. Ramanujam and P. Sadayappan for their valuable insights during our discussions which started at Dagstuhl Seminar 18111 on Loop Optimization. The authors are grateful to James Cownie and Andrew Mollinson at Intel Corp. as well as Koki Sagiyama at Imperial College London for providing access to the Skylake platform. The author(s) disclosed receipt of the following financial support for the research, authorship, and/or publication of this article: This work was supported by the Engineering and Physical Sciences Research Council [grant numbers EP/L016796/1, EP/R029423/1], and the Natural Environment Research Council [grant number NE/K008951/1]. It was further funded by the US Navy Office of Naval Research under grant number N00014-14-1-0117 and the US National Science Foundation under grant number CCF-1524433. AK gratefully acknowledges a hardware gift from Nvidia Corporation.},
}