% Conference paper introducing the `SWiC' GPU stencil technique (HPCC/SmartCity/DSS 2019).
% The redundant `series' field (a verbatim copy of `booktitle') was dropped so that
% standard styles do not print the proceedings title twice.
% NOTE(review): `address' holds a country rather than the publisher's city -- confirm and refine.
@inproceedings{07bfbc54ea064d70b193bbe9ad028abc,
  author    = {Anjum, Omer and {Garcia De Gonzalo}, Simon and Hidayetoglu, Mert and Hwu, {Wen Mei}},
  title     = {An efficient {GPU} implementation technique for higher-order {3D} stencils},
  booktitle = {Proceedings - 21st IEEE International Conference on High Performance Computing and Communications, 17th IEEE International Conference on Smart City and 5th IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2019},
  editor    = {Xiao, Zheng and Yang, {Laurence T.} and Balaji, Pavan and Li, Tao and Li, Keqin and Zomaya, Albert},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {552--561},
  year      = {2019},
  month     = aug,
  doi       = {10.1109/HPCC/SmartCity/DSS.2019.00086},
  language  = {English (US)},
  keywords  = {3D stencil, CUDA, GPU, High order stencil, MHD, Stencil},
  abstract  = {Stencils are a family of widely used computational patterns that play a critical role in various scientific and engineering applications. Stencil computations are known to be memory-bandwidth bound, thus a number of different techniques and algorithms that optimizes memory bandwidth usage have been proposed. However, existing techniques fall short in addressing the needs of large stencils, particularly more advanced stencil patterns involving non-axis aligned grid points. To handle non-axis aligned grid points, existing methods either use 3D caching or 2D caching schemes with more than one pass over the stencil per iteration, which suffers from the high intensity of memory accesses. The large number of memory accesses in these methods hinder the available performance. In this work, we present a new GPU-based implementation technique called 'SWiC' that focuses on using 2D caching to efficiently implement advanced 3D stencil patterns, involving non-axis aligned grid points, and reducing global memory transactions by increased data reuse while only requiring a single pass per iteration. In contrast to the current approaches that maintain input register queues, the proposed approach maintains and updates the output register queue instead. The analysis shows that SWiC achieves a significant reduction in memory transactions which translates to a significant application speedup, 1.6x to 5.76x, when compared to the current state-of-the-art GPU stencil implementation. 'SWiC' was evaluated across the latest three Nvidia GPU architectures as of the writing of this paper, as well as various stencil patterns and sizes. We also show that 'SWiC' does not suffer from performance penalties when applied to simpler 3D stencils without non-axis aligned grid points, covering a wide application range. When running on a multi-node setting, we study the scaling efficiency of SWiC and show that it is able to achieve a weak scaling efficiency of about 96\%.},
  note      = {Publisher Copyright: {\textcopyright} 2019 IEEE.; 21st IEEE International Conference on High Performance Computing and Communications, 17th IEEE International Conference on Smart City and 5th IEEE International Conference on Data Science and Systems, HPCC/SmartCity/DSS 2019 ; Conference date: 10-08-2019 Through 12-08-2019},
}