@conference {, title = {Scaling Point Set Registration in 3D Across Thread Counts on Multicore and Hardware Accelerator Platforms through Autotuning for Large Scale Analysis of Scientific Point Clouds}, booktitle = {IEEE International Workshop on Benchmarking, Performance Tuning and Optimization for Big Data Applications (BPOD 2017)}, year = {2017}, month = {2017-12}, publisher = {IEEE}, organization = {IEEE}, address = {Boston, MA}, abstract = {In this article, we present an autotuning approach applied to systematic performance engineering of the EM-ICP (Expectation-Maximization Iterative Closest Point) algorithm for the point set registration problem. We show how we were able to exceed the performance achieved by the reference code through multiple dependence transformations and automated procedure of generating and evaluating numerous implementation variants. Furthermore, we also managed to exploit code transformations that are not that common during manual optimization but yielded better performance in our tests for the EM-ICP algorithm. Finally, we maintained high levels of performance rate in a portable fashion across a wide range of HPC hardware platforms including multicore, many-core, and GPU-based accelerators. More importantly, the results indicate consistently high performance level and ability to move the task of data analysis through point-set registration to any modern compute platform without the concern of inferior asymptotic efficiency.}, doi = {https://doi.org/10.1109/BigData.2017.8258258}, author = {Piotr Luszczek and Jakub Kurzak and Ichitaro Yamazaki and David Keffer and Jack Dongarra} }