%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.04", %%% date = "13 December 2012", %%% time = "17:12:03 MST", %%% filename = "supercomputing2012.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "14292 4061 23619 229572", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "BibTeX; bibliography; SC'12; SC'2012; %%% Supercomputing '2012", %%% license = "public domain", %%% supported = "yes", %%% abstract = "", %%% docstring = "This is a complete bibliography of papers %%% published in the proceedings of %%% Supercomputing '2012. %%% %%% The conference World-Wide Web site is %%% %%% http://sc12.supercomputing.org/ %%% %%% The organizers of this conference series %%% maintain a World-Wide Web site at %%% %%% http://www.supercomp.org/ %%% %%% where pointers to Web pages for the %%% conferences from 1988 to date may be found. %%% %%% At version 1.04, the year coverage looked %%% like this: %%% %%% 2012 ( 106) %%% %%% InProceedings: 105 %%% Proceedings: 1 %%% %%% Total entries: 106 %%% %%% In this bibliography, entries are sorted in %%% order of PDF file numbers. %%% %%% The on-line electronic proceedings do not %%% contain sequential page numbers, although %%% there is an ISBN assigned for the %%% proceedings. A pagecount field is given with %%% each entry, extracted from the PDF file: some %%% of the articles lack page numbers altogether, %%% others number pages 1, 2, 3, ... %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility.", %%% } %%% ==================================================================== @Preamble{ "\ifx \undefined \circled \def \circled #1{(#1)}\fi" # "\ifx \undefined \reg \def \reg {\circled{R}}\fi" # "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Publishers and their addresses: @String{pub-ACM = "ACM Press"} @String{pub-ACM:adr = "New York, NY 10036, USA"} @String{pub-IEEE = "IEEE Computer Society Press"} @String{pub-IEEE:adr = "1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA"} %%% ==================================================================== %%% Bibliography entries from main SC'12 Proceedings @InProceedings{Chhugani:2012:BPS, author = "Jatin Chhugani and Changkyu Kim and Hemant Shukla and Jongsoo Park and Pradeep Dubey and John Shalf and Horst D. 
Simon", title = "Billion-particle {SIMD}-friendly two-point correlation on large-scale {HPC} cluster systems", crossref = "Hollingsworth:2012:SPI", pages = "1:1--1:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a002.pdf", abstract = "Two-point Correlation Function (TPCF) is widely used in astronomy to characterize the distribution of matter/energy in the Universe, and help derive the physics that can trace back to the creation of the universe. However, it is prohibitively slow for current sized datasets, and would continue to be a critical bottleneck with the trend of increasing dataset sizes to billions of particles and more, which makes TPCF a compelling benchmark application for future exa-scale architectures. State-of-the-art TPCF implementations do not map well to the underlying SIMD hardware, and also suffer from load-imbalance for large core counts. In this paper, we present a novel SIMD-friendly histogram update algorithm that exploits the spatial locality of histogram updates to achieve near-linear SIMD scaling. We also present a load-balancing scheme that combines domain-specific initial static division of work and dynamic task migration across nodes to effectively balance computation across nodes. Using Zin supercomputer at Lawrence Livermore National Laboratory (25,600 cores of Intel\reg{} Xeon\reg{} E5-2670, each with 256-bit SIMD), we achieve 90\% parallel efficiency and 96\% SIMD efficiency, and perform TPCF computation on a 1.7 billion particle dataset in 5.3 hours (at least 35 x faster than previous approaches). In terms of cost per performance (measured in flops/\$), we achieve at least an order-of-magnitude (11.1 x) higher flops/\$ as compared to the best known results [1]. Consequently, we now have line-of-sight to achieving the processing power for correlation computation to process billion+ particles telescopic data.", acknowledgement = ack-nhfb, articleno = "1", } @InProceedings{Mirin:2012:TRT, author = "Arthur A. Mirin and David F. Richards and James N. Glosli and Erik W. Draeger and Bor Chan and Jean-luc Fattebert and William D. Krauss and Tomas Oppelstrup and John Jeremy Rice and John A. Gunnels and Viatcheslav Gurev and Changhoan Kim and John Magerlein and Matthias Reumann and Hui-Fang Wen", title = "Toward real-time modeling of human heart ventricles at cellular resolution: simulation of drug-induced arrhythmias", crossref = "Hollingsworth:2012:SPI", pages = "2:1--2:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a001.pdf", abstract = "We have developed a highly efficient and scalable cardiac electrophysiology simulation capability that supports groundbreaking resolution and detail to elucidate the mechanisms of sudden cardiac death from arrhythmia. We can simulate thousands of heartbeats at a resolution of 0.1 mm, comparable to the size of cardiac cells, thereby enabling scientific inquiry not previously possible. Based on scaling results from the partially deployed Sequoia IBM Blue Gene/Q machine at Lawrence Livermore National Laboratory and planned optimizations, we estimate that by SC12 we will simulate 8--10 heartbeats per minute --- a time-to-solution 400--500 times faster than the state-of-the-art. 
Performance between 8 and 11 PFlop/s on the full 1,572,864 cores is anticipated, representing 40--55 percent of peak. The power of the model is demonstrated by illuminating the subtle arrhythmogenic mechanisms of anti-arrhythmic drugs that paradoxically increase arrhythmias in some patient populations.", acknowledgement = ack-nhfb, articleno = "2", } @InProceedings{Bui-Thanh:2012:ESU, author = "Tan Bui-Thanh and Carsten Burstedde and Omar Ghattas and James Martin and Georg Stadler and Lucas C. Wilcox", title = "Extreme-scale {UQ} for {Bayesian} inverse problems governed by {PDEs}", crossref = "Hollingsworth:2012:SPI", pages = "3:1--3:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a003.pdf", abstract = "Quantifying uncertainties in large-scale simulations has emerged as the central challenge facing CS{\&}E. When the simulations require supercomputers, and uncertain parameter dimensions are large, conventional UQ methods fail. Here we address uncertainty quantification for large-scale inverse problems in a Bayesian inference framework: given data and model uncertainties, find the pdf describing parameter uncertainties. To overcome the curse of dimensionality of conventional methods, we exploit the fact that the data are typically informative about low-dimensional manifolds of parameter space to construct low-rank approximations of the covariance matrix of the posterior pdf via a matrix-free randomized method. We obtain a method that scales independently of the forward problem dimension, the uncertain parameter dimension, the data dimension, and the number of cores. We apply the method to the Bayesian solution of an inverse problem in 3D global seismic wave propagation with over one million uncertain earth model parameters, 630 million wave propagation unknowns, on up to 262K cores, for which we obtain a factor of over 2000 reduction in problem dimension. This makes UQ tractable for the inverse problem.", acknowledgement = ack-nhfb, articleno = "3", } @InProceedings{Habib:2012:UES, author = "Salman Habib and Vitali Morozov and Hal Finkel and Adrian Pope and Katrin Heitmann and Kalyan Kumaran and Tom Peterka and Joe Insley and David Daniel and Patricia Fasel and Nicholas Frontiere and Zarija Luki{\'c}", title = "The universe at extreme scale: multi-petaflop sky simulation on the {BG/Q}", crossref = "Hollingsworth:2012:SPI", pages = "4:1--4:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a004.pdf", abstract = "Remarkable observational advances have established a compelling cross-validated model of the Universe. Yet, two key pillars of this model --- dark matter and dark energy --- remain mysterious. Next-generation sky surveys will map billions of galaxies to explore the physics of the ``Dark Universe''. Science requirements for these surveys demand simulations at extreme scales; these will be delivered by the HACC (Hybrid/Hardware Accelerated Cosmology Code) framework. HACC's novel algorithmic structure allows tuning across diverse architectures, including accelerated and multi-core systems.
On the IBM BG/Q, HACC attains unprecedented scalable performance --- currently 6.23 PFlops at 62\% of peak and 92\% parallel efficiency on 786,432 cores (48 racks) --- at extreme problem sizes with up to almost two trillion particles, larger than any cosmological simulation yet performed. HACC simulations at these scales will for the first time enable tracking individual galaxies over the entire volume of a cosmological survey.", acknowledgement = ack-nhfb, articleno = "4", } @InProceedings{Ishiyama:2012:PAB, author = "Tomoaki Ishiyama and Keigo Nitadori and Junichiro Makino", title = "4.45 Pflops astrophysical {$N$}-body simulation on {K} computer: the gravitational trillion-body problem", crossref = "Hollingsworth:2012:SPI", pages = "5:1--5:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a005.pdf", abstract = "As an entry for the 2012 Gordon-Bell performance prize, we report performance results of astrophysical $N$-body simulations of one trillion particles performed on the full system of the K computer. This is the first gravitational trillion-body simulation in the world. We describe the scientific motivation, the numerical algorithm, the parallelization strategy, and the performance analysis. Unlike many previous Gordon-Bell prize winners that used the tree algorithm for astrophysical $N$-body simulations, we used the hybrid TreePM method, which offers a similar level of accuracy: the short-range force is calculated by the tree algorithm, and the long-range force is solved by the particle-mesh algorithm. We developed a highly-tuned gravity kernel for short-range forces, and a novel communication algorithm for long-range forces. The average performance on 24,576 and 82,944 nodes of the K computer is 1.53 and 4.45 Pflops, respectively, corresponding to 49\% and 42\% of the peak speed.", acknowledgement = ack-nhfb, articleno = "5", } @InProceedings{Henschel:2012:DLW, author = "Robert Henschel and Stephen Simms and David Hancock and Scott Michael and Tom Johnson and Nathan Heald and Thomas William and Donald Berry and Matt Allen and Richard Knepper and Matthew Davy and Matthew Link and Craig A. Stewart", title = "Demonstrating {Lustre} over a {100Gbps} wide area network of 3,500km", crossref = "Hollingsworth:2012:SPI", pages = "6:1--6:8", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a006.pdf", abstract = "As part of the SCinet Research Sandbox at the Supercomputing 2011 conference, Indiana University (IU) demonstrated the use of the Lustre high-performance parallel file system over a dedicated 100 Gbps wide area network (WAN) spanning more than 3,500 km (2,175 mi). This demonstration functioned as a proof of concept and provided an opportunity to study Lustre's performance over a 100 Gbps WAN. To characterize the performance of the network and file system, low-level iperf network tests, file system tests with the IOR benchmark, and a suite of real-world applications reading and writing to the file system were run over a latency of 50.5 ms.
In this article we describe the configuration and constraints of the demonstration and outline key findings.", acknowledgement = ack-nhfb, articleno = "6", } @InProceedings{Meister:2012:SDD, author = "Dirk Meister and J{\"u}rgen Kaiser and Andre Brinkmann and Toni Cortes and Michael Kuhn and Julian Kunkel", title = "A study on data deduplication in {HPC} storage systems", crossref = "Hollingsworth:2012:SPI", pages = "7:1--7:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a008.pdf", abstract = "Deduplication is a storage saving technique that is highly successful in enterprise backup environments. On a file system, a single data block might be stored multiple times across different files, for example, multiple versions of a file might exist that are mostly identical. With deduplication, this data replication is localized and redundancy is removed --- by storing data just once, all files that use identical regions refer to the same unique data. The most common approach splits file data into chunks and calculates a cryptographic fingerprint for each chunk. By checking whether the fingerprint has already been stored, a chunk is classified as redundant or unique. Only unique chunks are stored. This paper presents the first study on the potential of data deduplication in HPC centers, which are among the most demanding storage producers. We have quantitatively assessed this potential for capacity reduction for 4 data centers (BSC, DKRZ, RENCI, RWTH). In contrast to previous deduplication studies focusing mostly on backup data, we have analyzed over one PB (1212 TB) of online file system data. The evaluation shows that typically 20\% to 30\% of this online data can be removed by applying data deduplication techniques, peaking at up to 70\% for some data sets. This reduction can only be achieved by a subfile deduplication approach, while approaches based on whole-file comparisons only lead to small capacity savings.", acknowledgement = ack-nhfb, articleno = "7", }
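%%% ====================================================================
%%% Aside: a minimal Python sketch of the chunk-and-fingerprint scheme
%%% the Meister et al. abstract above describes --- split file data into
%%% chunks, fingerprint each chunk cryptographically, and count only
%%% chunks whose fingerprints are new. The fixed 8 KB chunk size and
%%% the SHA-1 fingerprint are assumptions of this sketch, not
%%% parameters taken from the study.
%%%
%%%     import hashlib
%%%
%%%     def dedup_savings(path, chunk_size=8192):
%%%         """Estimate the fraction of a file removable by subfile
%%%         deduplication: fingerprint fixed-size chunks and count
%%%         only first occurrences as unique bytes."""
%%%         seen, total, unique = set(), 0, 0
%%%         with open(path, "rb") as f:
%%%             while True:
%%%                 chunk = f.read(chunk_size)
%%%                 if not chunk:
%%%                     break
%%%                 total += len(chunk)
%%%                 fp = hashlib.sha1(chunk).digest()
%%%                 if fp not in seen:
%%%                     seen.add(fp)
%%%                     unique += len(chunk)
%%%         return 1.0 - unique / total if total else 0.0
%%%
%%% Production deduplicators usually prefer content-defined
%%% (variable-size) chunk boundaries so an insertion does not shift
%%% every later fingerprint; fixed-size chunking is the simplest
%%% variant of the approach.
%%% ====================================================================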
@InProceedings{Xie:2012:COB, author = "Bing Xie and Jeffrey Chase and David Dillow and Oleg Drokin and Scott Klasky and Sarp Oral and Norbert Podhorszki", title = "Characterizing output bottlenecks in a supercomputer", crossref = "Hollingsworth:2012:SPI", pages = "8:1--8:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a007.pdf", abstract = "Supercomputer I/O loads are often dominated by writes. HPC (High Performance Computing) file systems are designed to absorb these bursty outputs at high bandwidth through massive parallelism. However, the delivered write bandwidth often falls well below the peak. This paper characterizes the data absorption behavior of a center-wide shared Lustre parallel file system on the Jaguar supercomputer. We use a statistical methodology to address the challenges of accurately measuring a shared machine under production load and to obtain the distribution of bandwidth across samples of compute nodes, storage targets, and time intervals. We observe and quantify limitations from competing traffic, contention on storage servers and I/O routers, concurrency limitations in the client compute node operating systems, and the impact of variance (stragglers) on coupled output such as striping. We then examine the implications of our results for application performance and the design of I/O middleware systems on shared supercomputers.", acknowledgement = ack-nhfb, articleno = "8", } @InProceedings{Mustafa:2012:PSL, author = "Dheya Mustafa and Rudolf Eigenmann", title = "Portable section-level tuning of compiler parallelized applications", crossref = "Hollingsworth:2012:SPI", pages = "9:1--9:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a012.pdf", abstract = "Automatic parallelization of sequential programs combined with tuning is an alternative to manual parallelization. This method has the potential to substantially increase productivity and is thus of critical importance for exploiting the increased computational power of today's multicores. A key difficulty is that parallelizing compilers are generally unable to estimate the performance impact of an optimization on a whole program or a program section at compile time; hence, the ultimate performance decision today rests with the developer. Building an autotuning system to remedy this situation is not a trivial task. This work presents a portable empirical autotuning system that operates at program-section granularity and partitions the compiler options into groups that can be tuned independently. To our knowledge, this is the first approach delivering an autoparallelization system that ensures performance improvements for nearly all programs, eliminating the users' need to experiment with such tools to strive for the highest application performance.", acknowledgement = ack-nhfb, articleno = "9", } @InProceedings{Jordan:2012:MOA, author = "Herbert Jordan and Peter Thoman and Juan J. Durillo and Simone Pellegrini and Philipp Gschwandtner and Thomas Fahringer and Hans Moritsch", title = "A multi-objective auto-tuning framework for parallel codes", crossref = "Hollingsworth:2012:SPI", pages = "10:1--10:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a013.pdf", abstract = "In this paper we introduce a multi-objective auto-tuning framework comprising compiler and runtime components. Focusing on individual code regions, our compiler uses a novel search technique to compute a set of optimal solutions, which are encoded into a multi-versioned executable. This enables the runtime system to choose specifically tuned code versions when dynamically adjusting to changing circumstances. We demonstrate our method by tuning loop tiling in cache-sensitive parallel programs, optimizing for both runtime and efficiency. Our static optimizer finds solutions matching or surpassing those determined by exhaustively sampling the search space on a regular grid, while using less than 4\% of the computational effort on average.
Additionally, we show that parallelism-aware multi-versioning approaches like our own gain a performance improvement of up to 70\% over solutions tuned for only one specific number of threads.", acknowledgement = ack-nhfb, articleno = "10", } @InProceedings{Christen:2012:PCH, author = "Matthias Christen and Olaf Schenk and Yifeng Cui", title = "{Patus} for convenient high-performance stencils: evaluation in earthquake simulations", crossref = "Hollingsworth:2012:SPI", pages = "11:1--11:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a014.pdf", abstract = "Patus is a code generation and auto-tuning framework for stencil computations targeting modern multi- and many-core processors. The goals of the framework are productivity and portability for achieving high performance on the target platform. Its stencil specification language allows the programmer to express the computation in a concise way independently of hardware architecture-specific details. Thus, it increases programmer productivity by removing the need for manual low-level tuning. We illustrate the impact of the stencil code generation in seismic applications, for which both weak and strong scaling are important. We evaluate the performance by focusing on a scalable discretization of the wave equation and by testing complex simulation types of the AWP-ODC code, aiming at excellent parallel efficiency in preparation for petascale 3-D earthquake calculations.", acknowledgement = ack-nhfb, articleno = "11", } @InProceedings{Beamer:2012:DOB, author = "Scott Beamer and Krste Asanovi{\'c} and David Patterson", title = "Direction-optimizing breadth-first search", crossref = "Hollingsworth:2012:SPI", pages = "12:1--12:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a019.pdf", abstract = "Breadth-First Search is an important kernel used by many graph-processing applications. In many of these emerging applications of BFS, such as analyzing social networks, the input graphs are low-diameter and scale-free. We propose a hybrid approach that is advantageous for low-diameter graphs, which combines a conventional top-down algorithm with a novel bottom-up algorithm. The bottom-up algorithm can dramatically reduce the number of edges examined, which in turn accelerates the search as a whole. On a multi-socket server, our hybrid approach demonstrates speedups of 3.3--7.8 on a range of standard synthetic graphs and speedups of 2.4--4.6 on graphs from real social networks when compared to a strong baseline.
We also typically double the performance of prior leading shared-memory (multicore and GPU) implementations.", acknowledgement = ack-nhfb, articleno = "12", } @InProceedings{Checconi:2012:BSS, author = "Fabio Checconi and Fabrizio Petrini and Jeremiah Willcock and Andrew Lumsdaine and Anamitra Roy Choudhury and Yogish Sabharwal", title = "Breaking the speed and scalability barriers for graph exploration on distributed-memory machines", crossref = "Hollingsworth:2012:SPI", pages = "13:1--13:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a020.pdf", abstract = "In this paper, we describe the challenges involved in designing a family of highly-efficient Breadth-First Search (BFS) algorithms and in optimizing these algorithms on the latest two generations of Blue Gene machines, Blue Gene/P and Blue Gene/Q. With our recent winning Graph 500 submissions in November 2010, June 2011, and November 2011, we have achieved unprecedented scalability results in both space and size. On Blue Gene/P, we have been able to parallelize a scale 38 problem with $2^{38}$ vertices and $2^{42}$ edges on 131,072 processing cores. Using only four racks of an experimental configuration of Blue Gene/Q, we have achieved a processing rate of 254 billion edges per second on 65,536 processing cores. This paper describes the algorithmic design and the main classes of optimizations that we have used to achieve these results.", acknowledgement = ack-nhfb, articleno = "13", }
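%%% ====================================================================
%%% Aside: the Beamer et al. and Checconi et al. entries above both
%%% center on fast breadth-first search; Beamer et al.'s contribution
%%% is a hybrid that runs top-down while the frontier is small and
%%% switches to bottom-up once the frontier's outgoing edges dominate.
%%% A Python sketch follows, assuming an undirected adjacency-list
%%% graph (so adj[v] doubles as the in-edge list) and a simplified
%%% single-threshold version of the paper's alpha/beta switching
%%% heuristic.
%%%
%%%     def hybrid_bfs(adj, source, alpha=14):
%%%         n = len(adj)
%%%         parent = [-1] * n
%%%         parent[source] = source
%%%         frontier = {source}
%%%         m = sum(len(nbrs) for nbrs in adj)  # total edge endpoints
%%%         while frontier:
%%%             out_edges = sum(len(adj[u]) for u in frontier)
%%%             nxt = set()
%%%             if out_edges * alpha > m:        # bottom-up step:
%%%                 for v in range(n):           # unvisited vertices
%%%                     if parent[v] == -1:      # look for a parent
%%%                         for u in adj[v]:     # in the frontier
%%%                             if u in frontier:
%%%                                 parent[v] = u
%%%                                 nxt.add(v)
%%%                                 break
%%%             else:                            # top-down step
%%%                 for u in frontier:
%%%                     for v in adj[u]:
%%%                         if parent[v] == -1:
%%%                             parent[v] = u
%%%                             nxt.add(v)
%%%             frontier = nxt
%%%         return parent
%%%
%%% The bottom-up step is what reduces the number of edges examined:
%%% an unvisited vertex stops probing as soon as it finds any parent,
%%% instead of every frontier vertex scanning all of its neighbors.
%%% ====================================================================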
@InProceedings{Satish:2012:LSE, author = "Nadathur Satish and Changkyu Kim and Jatin Chhugani and Pradeep Dubey", title = "Large-scale energy-efficient graph traversal: a path to efficient data-intensive supercomputing", crossref = "Hollingsworth:2012:SPI", pages = "14:1--14:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a018.pdf", abstract = "Graph traversal is a widely used algorithm in a variety of fields, including social networks, business analytics, and high-performance computing. There has been a push for HPC machines to be rated not just in Petaflops, but also in ``GigaTEPS'' (billions of traversed edges per second), and the Graph500 benchmark has been established for this purpose. Graph traversal on single nodes has been well studied and optimized on modern CPU architectures. However, current cluster implementations suffer from high-latency data communication with large volumes of transfers across nodes, leading to inefficiency in performance and energy consumption. In this work, we show that we can overcome these constraints using a combination of efficient low-overhead data compression techniques to reduce transfer volumes along with latency-hiding techniques. Using an optimized single node graph traversal algorithm [1], our novel cluster optimizations result in over 6.6X performance improvements over state-of-the-art data transfer techniques, and almost an order of magnitude in energy savings. Our resulting implementation of the Graph500 benchmark achieves 115 GigaTEPS on a 320-node/5120 core Intel\reg{} Endeavor cluster with E5-2700 Sandybridge nodes, which matches the second-ranked result in the most recent November 2011 Graph500 list [2] with about 5.6X fewer nodes. Our cluster optimizations incur only a 1.8X overhead in overall performance relative to the optimized single-node implementation, and allow for near-linear scaling with the number of nodes. On 1,024 nodes of Intel\reg{} Xeon\reg{} X5670 Westmere processors (with lower per-node performance), our algorithm attained 195 GigaTEPS on a large multi-terabyte graph, demonstrating its high scalability. Our per-node performance is the highest in the top 10 of the Nov 2011 Graph500 list.", acknowledgement = ack-nhfb, articleno = "14", } @InProceedings{Levesque:2012:HEA, author = "John M. Levesque and Ramanan Sankaran and Ray Grout", title = "Hybridizing {S3D} into an exascale application using {OpenACC}: an approach for moving to multi-petaflops and beyond", crossref = "Hollingsworth:2012:SPI", pages = "15:1--15:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a040.pdf", abstract = "Hybridization is the process of converting an application with a single level of parallelism to an application with multiple levels of parallelism. Over the past 15 years a majority of the applications that run on High Performance Computing systems have employed MPI for all of the parallelism within the application. In the Peta-Exascale computing regime, effective utilization of the hardware requires multiple levels of parallelism matched to the macro architecture of the system to achieve good performance. A hybridized code base is performance portable when sufficient parallelism is expressed in an architecture-agnostic form to achieve good performance on a range of available systems. The hybridized S3D code is performance portable across today's leading many-core and GPU-accelerated systems. The OpenACC framework allows a unified code base to be deployed on either manycore CPU or manycore CPU+GPU systems, while permitting architecture-specific optimizations that expose new dimensions of parallelism.", acknowledgement = ack-nhfb, articleno = "15", } @InProceedings{Hejazialhosseini:2012:HTS, author = "Babak Hejazialhosseini and Diego Rossinelli and Christian Conti and Petros Koumoutsakos", title = "High throughput software for direct numerical simulations of compressible two-phase flows", crossref = "Hollingsworth:2012:SPI", pages = "16:1--16:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a039.pdf", abstract = "We present open-source, object-oriented software for high-throughput Direct Numerical Simulations of compressible, two-phase flows. The Navier--Stokes equations are discretized on uniform grids using high order finite volume methods. The software exploits recent CPU micro-architectures by explicit vectorization and adopts NUMA-aware techniques as well as data and computation reordering. We report a compressible flow solver with unprecedented fractions of peak performance: 45\% of the peak for a single node (nominal performance of 840 GFLOP/s) and 30\% for a cluster of 47'000 cores (nominal performance of 0.8 PFLOP/s). We suggest that the present work may serve as a performance upper bound, regarding achievable GFLOP/s, for two-phase flow solvers using adaptive mesh refinement.
The software enables 3D simulations of shock-bubble interaction including, for the first time, the effects of diffusion and surface tension, by efficiently employing two hundred billion computational elements.", acknowledgement = ack-nhfb, articleno = "16", } @InProceedings{Islam:2012:MSC, author = "Tanzima Zerin Islam and Kathryn Mohror and Saurabh Bagchi and Adam Moody and Bronis R. de Supinski and Rudolf Eigenmann", title = "{McrEngine}: a scalable checkpointing system using data-aware aggregation and compression", crossref = "Hollingsworth:2012:SPI", pages = "17:1--17:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a023.pdf", abstract = "High performance computing (HPC) systems use checkpoint-restart to tolerate failures. Typically, applications store their states in checkpoints on a parallel file system (PFS). As applications scale up, checkpoint-restart incurs high overheads due to contention for PFS resources. The high overheads force large-scale applications to reduce checkpoint frequency, which means more compute time is lost in the event of failure. We alleviate this problem through a scalable checkpoint-restart system, mcrEngine. mcrEngine aggregates checkpoints from multiple application processes with knowledge of the data semantics available through widely-used I/O libraries, e.g., HDF5 and netCDF, and compresses them. Our novel scheme improves compressibility of checkpoints by up to 115\% over simple concatenation and compression. Our evaluation with large-scale application checkpoints shows that mcrEngine reduces checkpointing overhead by up to 87\% and restart overhead by up to 62\% over a baseline with no aggregation or compression.", acknowledgement = ack-nhfb, articleno = "17", }
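%%% ====================================================================
%%% Aside: mcrEngine's gain comes from aggregating semantically
%%% matching variables across process checkpoints before compressing,
%%% so similar data sits adjacently in the compressor's window. A toy
%%% Python sketch, under the assumption that each checkpoint is a dict
%%% mapping variable names to raw bytes (mcrEngine itself obtains the
%%% semantics through HDF5/netCDF, which this sketch does not model):
%%%
%%%     import zlib
%%%
%%%     def aggregate_then_compress(checkpoints):
%%%         """Concatenate variable-by-variable across all process
%%%         checkpoints, then compress once."""
%%%         names = sorted({n for c in checkpoints for n in c})
%%%         blob = b"".join(c.get(n, b"")
%%%                         for n in names for c in checkpoints)
%%%         return zlib.compress(blob)
%%%
%%%     def concat_then_compress(checkpoints):
%%%         """Baseline the paper compares against: simple
%%%         concatenation of whole checkpoints."""
%%%         return zlib.compress(b"".join(
%%%             b"".join(c.values()) for c in checkpoints))
%%%
%%% Comparing len(aggregate_then_compress(cks)) against the baseline
%%% on like-structured checkpoints illustrates the compressibility
%%% gap the abstract quantifies at up to 115%.
%%% ====================================================================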
@InProceedings{Riesen:2012:ASI, author = "Rolf Riesen and Kurt Ferreira and Dilma {Da Silva} and Pierre Lemarinier and Dorian Arnold and Patrick G. Bridges", title = "Alleviating scalability issues of checkpointing protocols", crossref = "Hollingsworth:2012:SPI", pages = "18:1--18:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a021.pdf", abstract = "Current fault tolerance protocols are not sufficiently scalable for the exascale era. The most widely used method, coordinated checkpointing, places enormous demands on the I/O subsystem and imposes frequent synchronizations. Uncoordinated protocols use message logging, which introduces message-rate limitations or undesired memory and storage requirements to hold payload and event logs. In this paper we propose a combination of several techniques, namely coordinated checkpointing, optimistic message logging, and a protocol that glues them together. This combination eliminates some of the drawbacks of each individual approach and proves to be an alternative for many types of exascale applications. We evaluate performance and scaling characteristics of this combination using simulation and a partial implementation. While not a universal solution, the combined protocol is suitable for a large range of existing and future applications that use coordinated checkpointing and enhances their scalability.", acknowledgement = ack-nhfb, articleno = "18", } @InProceedings{Sato:2012:DMN, author = "Kento Sato and Naoya Maruyama and Kathryn Mohror and Adam Moody and Todd Gamblin and Bronis R. de Supinski and Satoshi Matsuoka", title = "Design and modeling of a non-blocking checkpointing system", crossref = "Hollingsworth:2012:SPI", pages = "19:1--19:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a022.pdf", abstract = "As the capability and component count of systems increase, the MTBF decreases. Typically, applications tolerate failures with checkpoint/restart to a parallel file system (PFS). While simple, this approach can suffer from contention for PFS resources. Multi-level checkpointing is a promising solution. However, while multi-level checkpointing is successful on today's machines, it is not expected to be sufficient for exascale-class machines, which are predicted to have orders of magnitude larger memory sizes and failure rates. Our solution combines the benefits of non-blocking and multi-level checkpointing. In this paper, we present the design of our system and model its performance. Our experiments show that our system can improve efficiency by 1.1 to 2.0x on future machines. Additionally, applications using our checkpointing system can achieve high efficiency even when using a PFS with lower bandwidth.", acknowledgement = ack-nhfb, articleno = "19", } @InProceedings{Papaioannou:2012:SAS, author = "Thanasis G. Papaioannou and Nicolas Bonvin and Karl Aberer", title = "{Scalia}: an adaptive scheme for efficient multi-cloud storage", crossref = "Hollingsworth:2012:SPI", pages = "20:1--20:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a026.pdf", abstract = "A growing amount of data is produced daily, resulting in a growing demand for storage solutions. While cloud storage providers offer a virtually infinite storage capacity, data owners seek geographical and provider diversity in data placement, in order to avoid vendor lock-in and to increase availability and durability. Moreover, depending on the customer data access pattern, a certain cloud provider may be cheaper than another. In this paper, we introduce Scalia, a cloud storage brokerage solution that continuously adapts the placement of data based on its access pattern and subject to optimization objectives, such as storage costs. Scalia efficiently considers repositioning of only selected objects that may significantly lower the storage cost.
Through extensive simulation experiments, we demonstrate the cost-effectiveness of Scalia against static placements and its proximity to the ideal data placement across various scenarios of data access patterns, available cloud storage solutions, and failures.", acknowledgement = ack-nhfb, articleno = "20", } @InProceedings{Di:2012:HLP, author = "Sheng Di and Derrick Kondo and Walfredo Cirne", title = "Host load prediction in a {Google} compute cloud with a {Bayesian} model", crossref = "Hollingsworth:2012:SPI", pages = "21:1--21:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a025.pdf", abstract = "Prediction of host load in Cloud systems is critical for achieving service-level agreements. However, accurate prediction of host load in Clouds is extremely challenging because it fluctuates drastically at small timescales. We design a prediction method based on a Bayes model to predict the mean load over a long-term time interval, as well as the mean load in consecutive future time intervals. We identify novel predictive features of host load that capture the expectation, predictability, trends and patterns of host load. We also determine the most effective combinations of these features for prediction. We evaluate our method using a detailed one-month trace of a Google data center with thousands of machines. Experiments show that the Bayes method achieves high accuracy with a mean squared error of 0.0014. Moreover, the Bayes method improves the load prediction accuracy by 5.6--50\% compared to other state-of-the-art methods based on moving averages, auto-regression, and/or noise filters.", acknowledgement = ack-nhfb, articleno = "21", } @InProceedings{Malawski:2012:CDC, author = "Maciej Malawski and Gideon Juve and Ewa Deelman and Jarek Nabrzyski", title = "Cost- and deadline-constrained provisioning for scientific workflow ensembles in {IaaS} clouds", crossref = "Hollingsworth:2012:SPI", pages = "22:1--22:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a024.pdf", abstract = "Large-scale applications expressed as scientific workflows are often grouped into ensembles of inter-related workflows. In this paper, we address a new and important problem concerning the efficient management of such ensembles under budget and deadline constraints on Infrastructure-as-a-Service (IaaS) clouds. We discuss, develop, and assess algorithms based on static and dynamic strategies for both task scheduling and resource provisioning. We perform the evaluation via simulation using a set of scientific workflow ensembles with a broad range of budget and deadline parameters, taking into account uncertainties in task runtime estimations, provisioning delays, and failures. We find that the key factor determining the performance of an algorithm is its ability to decide which workflows in an ensemble to admit or reject for execution. Our results show that an admission procedure based on workflow structure and estimates of task runtimes can significantly improve the quality of solutions.", acknowledgement = ack-nhfb, articleno = "22", }
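%%% ====================================================================
%%% Aside: Di et al.'s predictor above applies Bayes' rule over
%%% discretized load states. A toy single-feature version in Python,
%%% assuming load values in [0, 1] and using only the current load
%%% bin as evidence (the paper engineers several richer features and
%%% combinations of them):
%%%
%%%     from collections import Counter, defaultdict
%%%
%%%     class BayesLoadPredictor:
%%%         def __init__(self, bins=10):
%%%             self.bins = bins
%%%             self.prior = Counter()            # counts of next-bin
%%%             self.like = defaultdict(Counter)  # cur-bin | next-bin
%%%
%%%         def _bin(self, load):
%%%             return min(int(load * self.bins), self.bins - 1)
%%%
%%%         def fit(self, series):
%%%             for cur, nxt in zip(series, series[1:]):
%%%                 self.prior[self._bin(nxt)] += 1
%%%                 self.like[self._bin(nxt)][self._bin(cur)] += 1
%%%
%%%         def predict_mean_load(self, cur):
%%%             """Posterior mean of the next-interval load, with
%%%             add-one smoothing over unseen bins."""
%%%             b = self._bin(cur)
%%%             post = {k: (self.prior[k] + 1) * (self.like[k][b] + 1)
%%%                     for k in range(self.bins)}
%%%             z = sum(post.values())
%%%             return sum((k + 0.5) / self.bins * w
%%%                        for k, w in post.items()) / z
%%%
%%% Usage: fit on a load trace sampled at fixed intervals, then call
%%% predict_mean_load with the most recent sample.
%%% ====================================================================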
Vetter", title = "Early evaluation of directive-based {GPU} programming models for productive exascale computing", crossref = "Hollingsworth:2012:SPI", pages = "23:1--23:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a051.pdf", abstract = "Graphics Processing Unit (GPU)-based parallel computer architectures have shown increased popularity as a building block for high performance computing, and possibly for future Exascale computing. However, their programming complexity remains as a major hurdle for their widespread adoption. To provide better abstractions for programming GPU architectures, researchers and vendors have proposed several directive-based GPU programming models. These directive-based models provide different levels of abstraction, and required different levels of programming effort to port and optimize applications. Understanding these differences among these new models provides valuable insights on their applicability and performance potential. In this paper, we evaluate existing directive-based models by porting thirteen application kernels from various scientific domains to use CUDA GPUs, which, in turn, allows us to identify important issues in the functionality, scalability, tunability, and debuggability of the existing models. Our evaluation shows that directive-based models can achieve reasonable performance, compared to hand-written GPU codes.", acknowledgement = ack-nhfb, articleno = "23", } @InProceedings{Pienaar:2012:AGS, author = "Jacques A. Pienaar and Srimat Chakradhar and Anand Raghunathan", title = "Automatic generation of software pipelines for heterogeneous parallel systems", crossref = "Hollingsworth:2012:SPI", pages = "24:1--24:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a049.pdf", abstract = "Pipelining is a well-known approach to increasing parallelism and performance. We address the problem of software pipelining for heterogeneous parallel platforms that consist of different multi-core and many-core processing units. In this context, pipelining involves two key steps---partitioning an application into stages and mapping and scheduling the stages onto the processing units of the heterogeneous platform. We show that the inter-dependency between these steps is a critical challenge that must be addressed in order to achieve high performance. We propose an Automatic Heterogeneous Pipelining framework (ahp) that generates an optimized pipelined implementation of a program from an annotated unpipelined specification. 
Across three complex applications (image classification, object detection, and document retrieval) and two heterogeneous platforms (Intel Xeon multi-core CPUs with Intel MIC and NVIDIA GPGPU accelerators), ahp achieves a throughput improvement of up to 1.53x (1.37x on average) over a heterogeneous baseline that exploits data and task parallelism.", acknowledgement = ack-nhfb, articleno = "24", } @InProceedings{Chen:2012:AMC, author = "Linchuan Chen and Xin Huo and Gagan Agrawal", title = "Accelerating {MapReduce} on a coupled {CPU--GPU} architecture", crossref = "Hollingsworth:2012:SPI", pages = "25:1--25:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a050.pdf", abstract = "The work presented here is driven by two observations. First, heterogeneous architectures that integrate a CPU and a GPU on the same chip are emerging, and hold much promise for supporting power-efficient and scalable high performance computing. Second, MapReduce has emerged as a suitable framework for simplified parallel application development for many classes of applications, including data mining and machine learning applications that benefit from accelerators. This paper focuses on the challenge of scaling a MapReduce application using the CPU and GPU together in an integrated architecture. We develop two methods for dividing the work: the map-dividing scheme, where map tasks are divided between both devices, and the pipelining scheme, which pipelines the map and the reduce stages on different devices. We develop dynamic work distribution schemes for both approaches. To achieve high load balance while keeping scheduling costs low, we use a runtime tuning method to adjust task block sizes for the map-dividing scheme. Our implementation of MapReduce is based on a continuous reduction method, which avoids the memory overheads of storing key-value pairs. We have evaluated the different design decisions using 5 popular MapReduce applications. For 4 of the applications, our system achieves speedups of 1.21 to 2.1 over the better of the CPU-only and GPU-only versions. The speedups over a single CPU core execution range from 3.25 to 28.68. The runtime tuning method we have developed achieves very low load imbalance, while keeping scheduling overheads low. Though our current work is specific to MapReduce, many underlying ideas are also applicable to intra-node acceleration of other applications on integrated CPU-GPU nodes.", acknowledgement = ack-nhfb, articleno = "25", } @InProceedings{Igual:2012:UHP, author = "Francisco D. Igual and Murtaza Ali and Arnon Friedmann and Eric Stotzer and Timothy Wentz and Robert A. van de Geijn", title = "Unleashing the high performance and low power of multi-core {DSPs} for general-purpose {HPC}", crossref = "Hollingsworth:2012:SPI", pages = "26:1--26:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a070.pdf", abstract = "Take a multicore Digital Signal Processor (DSP) chip designed for cellular base stations and radio network controllers, add floating-point capabilities to support 4G networks, and out of thin air an HPC engine is born.
The potential for HPC is clear: it promises 128 GFLOPS (single precision) for 10 Watts; it is used in millions of network-related devices and hence benefits from economies of scale; and it should be simpler to program than a GPU. Simply put, it is fast, green, and cheap. But is it easy to use? In this paper, we show how this potential can be applied to general-purpose high performance computing, more specifically to dense matrix computations, without major changes in existing codes and methodologies, and with excellent performance and power consumption numbers.", acknowledgement = ack-nhfb, articleno = "26", } @InProceedings{Chang:2012:SNS, author = "Li-Wen Chang and John A. Stratton and Hee-Seok Kim and Wen-Mei W. Hwu", title = "A scalable, numerically stable, high-performance tridiagonal solver using {GPUs}", crossref = "Hollingsworth:2012:SPI", pages = "27:1--27:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a071.pdf", abstract = "In this paper, we present a scalable, numerically stable, high-performance tridiagonal solver. The solver is based on the SPIKE algorithm for partitioning a large matrix into small independent matrices, which can be solved in parallel. For each small matrix, our solver applies a general 1-by-1 or 2-by-2 diagonal pivoting algorithm, which is also known to be numerically stable. Our paper makes two major contributions. First, our solver is the first numerically stable tridiagonal solver for GPUs. Our solver provides stable solutions of quality comparable to Intel MKL and Matlab, at speeds comparable to the GPU tridiagonal solvers in existing packages like CUSPARSE. It is also scalable to multiple GPUs and CPUs. Second, we present and analyze two key optimization strategies for our solver: a high-throughput data layout transformation for memory efficiency, and a dynamic tiling approach for reducing the memory access footprint caused by branch divergence.", acknowledgement = ack-nhfb, articleno = "27", }
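%%% ====================================================================
%%% Aside: Chang et al.'s solver above partitions the tridiagonal
%%% matrix SPIKE-style into independent blocks plus a small reduced
%%% coupling system, with 1-by-1/2-by-2 diagonal pivoting inside the
%%% blocks for stability. For reference, here is a Python sketch of
%%% the serial Thomas algorithm whose solution any such partitioned
%%% scheme must reproduce; this unpivoted baseline is not the paper's
%%% method, and the SPIKE reduced system is omitted.
%%%
%%%     def thomas(a, b, c, d):
%%%         """Solve a tridiagonal system. a: sub-diagonal (a[0]
%%%         unused), b: main diagonal, c: super-diagonal (c[-1]
%%%         unused), d: right-hand side. No pivoting, so it needs a
%%%         well-conditioned (e.g., diagonally dominant) matrix ---
%%%         exactly the limitation the paper's pivoting addresses."""
%%%         n = len(d)
%%%         cp, dp = [0.0] * n, [0.0] * n
%%%         cp[0] = c[0] / b[0]
%%%         dp[0] = d[0] / b[0]
%%%         for i in range(1, n):            # forward elimination
%%%             m = b[i] - a[i] * cp[i - 1]
%%%             cp[i] = c[i] / m if i < n - 1 else 0.0
%%%             dp[i] = (d[i] - a[i] * dp[i - 1]) / m
%%%         x = [0.0] * n
%%%         x[-1] = dp[-1]
%%%         for i in range(n - 2, -1, -1):   # back substitution
%%%             x[i] = dp[i] - cp[i] * x[i + 1]
%%%         return x
%%% ====================================================================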
@InProceedings{Park:2012:EBB, author = "Jongsoo Park and Ping Tak Peter Tang and Mikhail Smelyanskiy and Daehyun Kim and Thomas Benson", title = "Efficient backprojection-based synthetic aperture radar computation with many-core processors", crossref = "Hollingsworth:2012:SPI", pages = "28:1--28:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a072.pdf", abstract = "Tackling computationally challenging problems with high efficiency often requires the combination of algorithmic innovation, advanced architecture, and thorough exploitation of parallelism. We demonstrate this synergy through synthetic aperture radar (SAR) via backprojection, an image reconstruction method that can require hundreds of TFLOPS. Computation cost is significantly reduced by our new algorithm of approximate strength reduction; data movement cost is economized by software locality optimizations facilitated by advanced architecture support; parallelism is fully harnessed in various patterns and granularities. We deliver over 35 billion backprojections per second throughput per compute node on an Intel\reg{} Xeon\reg{} E5-2670-based cluster, equipped with Intel\reg{} Xeon Phi\TM{} coprocessors. This corresponds to processing a 3K x 3K image within a second using a single node. Our study can be extended to other settings: backprojection is applicable elsewhere including medical imaging, approximate strength reduction is a general code transformation technique, and many-core processors are emerging as a solution to energy-efficient computing.", acknowledgement = ack-nhfb, articleno = "28", } @InProceedings{Li:2012:PFA, author = "Peng Li and Guodong Li and Ganesh Gopalakrishnan", title = "Parametric flows: automated behavior equivalencing for symbolic analysis of races in {CUDA} programs", crossref = "Hollingsworth:2012:SPI", pages = "29:1--29:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a009.pdf", abstract = "The growing scale of concurrency requires automated abstraction techniques to cut down the effort in concurrent system analysis. In this paper, we show that the high degree of behavioral symmetry present in GPU programs allows CUDA race detection to be dramatically simplified through abstraction. Our abstraction technique automatically creates parametric flows --- control-flow equivalence classes of threads that diverge in the same manner --- and checks for data races only across a pair of threads per parametric flow. We have implemented this approach as an extension of our recently proposed GKLEE symbolic analysis framework and show that all our previous results are dramatically improved in that (i) the parametric flow-based analysis takes far less time, and (ii) because of the much higher scalability of the analysis, we can detect even more data race situations that were previously missed by GKLEE because it was forced to downscale examples to limit analysis complexity. Moreover, the parametric flow-based analysis is applicable to other programs based on SPMD models.", acknowledgement = ack-nhfb, articleno = "29", } @InProceedings{Hilbrich:2012:MRE, author = "Tobias Hilbrich and Joachim Protze and Martin Schulz and Bronis R. de Supinski and Matthias S. M{\"u}ller", title = "{MPI} runtime error detection with {MUST}: advances in deadlock detection", crossref = "Hollingsworth:2012:SPI", pages = "30:1--30:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a010.pdf", abstract = "The widely used Message Passing Interface (MPI) is complex and rich. As a result, application developers require automated tools to avoid and to detect MPI programming errors. We present the Marmot Umpire Scalable Tool (MUST) that detects such errors with significantly increased scalability. We present improvements to our graph-based deadlock detection approach for MPI, which cover future MPI extensions. Our enhancements also check complex MPI constructs that no previous graph-based detection approach handled correctly. Finally, we present optimizations for the processing of MPI operations that reduce runtime deadlock detection overheads. Existing approaches often require $O(p)$ analysis time per MPI operation, for $p$ processes.
We empirically observe that our improvements lead to sub-linear or better analysis time per operation for a wide range of real-world applications.", acknowledgement = ack-nhfb, articleno = "30", } @InProceedings{Bhatele:2012:NVP, author = "Abhinav Bhatele and Todd Gamblin and Katherine E. Isaacs and Brian T. N. Gunney and Martin Schulz and Peer-Timo Bremer and Bernd Hamann", title = "Novel views of performance data to analyze large-scale adaptive applications", crossref = "Hollingsworth:2012:SPI", pages = "31:1--31:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a011.pdf", abstract = "Performance analysis of parallel scientific codes is becoming increasingly difficult due to the rapidly growing complexity of applications and architectures. Existing tools fall short in providing intuitive views that facilitate the process of performance debugging and tuning. In this paper, we extend recent ideas of projecting and visualizing performance data for faster, more intuitive analysis of applications. We collect detailed per-level and per-phase measurements for a dynamically load-balanced, structured AMR library and project per-core data collected in the hardware domain onto the application's communication topology. We show how our projections and visualizations lead to a rapid diagnosis of and mitigation strategy for a previously elusive scaling bottleneck in the library that is hard to detect using conventional tools. Our new insights have resulted in a 22\% performance improvement for a 65,536-core run of the AMR library on an IBM Blue Gene/P system.", acknowledgement = ack-nhfb, articleno = "31", } @InProceedings{Wu:2012:RRA, author = "Donghong Wu and Bingsheng He and Xueyan Tang and Jianliang Xu and Minyi Guo", title = "{RAMZzz}: rank-aware {DRAM} power management with dynamic migrations and demotions", crossref = "Hollingsworth:2012:SPI", pages = "32:1--32:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a042.pdf", abstract = "Main memory is a significant energy consumer which may account for over 40\% of the total system power, and it will become even more significant for server machines with more main memory. In this paper, we propose a novel memory system design named RAMZzz with rank-aware energy saving optimizations. Specifically, we rely on a memory controller to monitor the memory access locality, and group the pages with similar access locality into the same rank. We further develop dynamic page migrations to adapt to data access patterns, and a prediction model to estimate the demotion time for accurate control on power state transitions. We experimentally compare our algorithm with other energy-saving policies using cycle-accurate simulation. Experiments with benchmark workloads show that RAMZzz achieves significant improvement on energy-delay product and energy consumption over other power-saving techniques.", acknowledgement = ack-nhfb, articleno = "32", } @InProceedings{Li:2012:MAG, author = "Sheng Li and Doe Hyun Yoon and Ke Chen and Jishen Zhao and Jung Ho Ahn and Jay B. Brockman and Yuan Xie and Norman P.
Jouppi", title = "{MAGE}: adaptive granularity and {ECC} for resilient and power efficient memory systems", crossref = "Hollingsworth:2012:SPI", pages = "33:1--33:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a041.pdf", abstract = "Resiliency is one of the toughest challenges in high-performance computing, and memory accounts for a significant fraction of errors. Providing strong error tolerance in memory usually requires a wide memory channel that incurs a large access granularity (hence, a large cache line). Unfortunately, applications with limited spatial locality waste memory power and bandwidth on systems with a large access granularity. Thus, careful design considerations must be made to balance memory system performance, power efficiency, and resiliency. In this paper, we propose MAGE, a Memory system with Adaptive Granularity and ECC, to achieve high performance, power efficiency, and resiliency. MAGE can adapt memory access granularities and ECC schemes to applications with different memory behaviors. Our experiments show that MAGE achieves more than a 28\% energy-delay product improvement, compared to the best existing systems with static granularity and ECC.", acknowledgement = ack-nhfb, articleno = "33", } @InProceedings{Ren:2012:PWA, author = "Yufei Ren and Tan Li and Dantong Yu and Shudong Jin and Thomas Robertazzi and Brian L. Tierney and Eric Pouyoul", title = "Protocols for wide-area data-intensive applications: design and performance issues", crossref = "Hollingsworth:2012:SPI", pages = "34:1--34:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a059.pdf", abstract = "Providing high-speed data transfer is vital to various data-intensive applications. While there have been remarkable technology advances to provide ultra-high-speed network bandwidth, existing protocols and applications may not be able to fully utilize the bare-metal bandwidth due to their inefficient design. We identify the same problem remains in the field of Remote Direct Memory Access (RDMA) networks. RDMA offloads TCP/IP protocols to hardware devices. However, its benefits have not been fully exploited due to the lack of efficient software and application protocols, in particular in wide-area networks. In this paper, we address the design choices to develop such protocols. We describe a protocol implemented as part of a communication middleware. The protocol has its flow control, connection management, and task synchronization. It maximizes the parallelism of RDMA operations. We demonstrate its performance benefit on various local and wide-area testbeds, including the DOE ANI testbed with RoCE links and InfiniBand links.", acknowledgement = ack-nhfb, articleno = "34", } @InProceedings{Islam:2012:HPR, author = "N. S. Islam and M. W. Rahman and J. Jose and R. Rajachandrasekar and H. Wang and H. Subramoni and C. Murthy and D. K. 
Panda", title = "High performance {RDMA}-based design of {HDFS} over {InfiniBand}", crossref = "Hollingsworth:2012:SPI", pages = "35:1--35:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a058.pdf", abstract = "Hadoop Distributed File System (HDFS) acts as the primary storage of Hadoop and has been adopted by reputable organizations (Facebook, Yahoo!, etc.) due to its portability and fault-tolerance. The existing implementation of HDFS uses a Java-socket interface for communication, which delivers suboptimal performance in terms of latency and throughput. For data-intensive applications, network performance becomes a key component as the amount of data being stored and replicated to HDFS increases. In this paper, we present a novel design of HDFS using Remote Direct Memory Access (RDMA) over InfiniBand via JNI interfaces. Experimental results show that, for 5GB HDFS file writes, the new design reduces the communication time by 87\% and 30\% over 1Gigabit Ethernet (1GigE) and IP-over-InfiniBand (IPoIB), respectively, on a QDR platform (32Gbps). For HBase, the Put operation performance is improved by 26\% with our design. To the best of our knowledge, this is the first design of HDFS over InfiniBand networks.", acknowledgement = ack-nhfb, articleno = "35", } @InProceedings{Dichev:2012:ERN, author = "Kiril Dichev and Fergal Reid and Alexey Lastovetsky", title = "Efficient and reliable network tomography in heterogeneous networks using {BitTorrent} broadcasts and clustering algorithms", crossref = "Hollingsworth:2012:SPI", pages = "36:1--36:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a060.pdf", abstract = "In the area of network performance and discovery, network tomography focuses on reconstructing network properties using only end-to-end measurements at the application layer. One challenging problem in network tomography is reconstructing available bandwidth along all links during multiple source/multiple destination transmissions. The traditional measurement procedures used for bandwidth tomography are extremely time-consuming. We propose a novel solution to this problem. Our method counts the fragments exchanged during a BitTorrent broadcast. While this measurement has a high level of randomness, it can be obtained very efficiently, and aggregated into a reliable metric. This data is then analyzed with state-of-the-art algorithms, which correctly reconstruct logical clusters of nodes interconnected by high bandwidth, as well as bottlenecks between these logical clusters. Our experiments demonstrate that the proposed two-phase approach efficiently solves the presented problem for a number of settings on a complex grid infrastructure.", acknowledgement = ack-nhfb, articleno = "36", } @InProceedings{Malakar:2012:DCS, author = "Preeti Malakar and Thomas George and Sameer Kumar and Rashmi Mittal and Vijay Natarajan and Yogish Sabharwal and Vaibhav Saxena and Sathish S.
Vadhiyar", title = "A divide and conquer strategy for scaling weather simulations with multiple regions of interest", crossref = "Hollingsworth:2012:SPI", pages = "37:1--37:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a105.pdf", abstract = "Accurate and timely prediction of weather phenomena, such as hurricanes and flash floods, requires high-fidelity, compute-intensive simulations of multiple finer regions of interest within a coarse simulation domain. Current weather applications execute these nested simulations sequentially using all the available processors, which is sub-optimal due to their sub-linear scalability. In this work, we present a strategy for parallel execution of multiple nested domain simulations based on partitioning the 2-D processor grid into disjoint rectangular regions associated with each domain. We propose a novel combination of performance prediction, processor allocation methods, and topology-aware mapping of the regions on torus interconnects. Experiments on IBM Blue Gene systems using WRF show that the proposed strategies result in performance improvement of up to 33\% with topology-oblivious mapping and up to an additional 7\% with topology-aware mapping over the default sequential strategy.", acknowledgement = ack-nhfb, articleno = "37", } @InProceedings{Rietmann:2012:FAS, author = "Max Rietmann and Peter Messmer and Tarje Nissen-Meyer and Daniel Peter and Piero Basini and Dimitri Komatitsch and Olaf Schenk and Jeroen Tromp and Lapo Boschi and Domenico Giardini", title = "Forward and adjoint simulations of seismic wave propagation on emerging large-scale {GPU} architectures", crossref = "Hollingsworth:2012:SPI", pages = "38:1--38:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a104.pdf", abstract = "Computational seismology is an area of wide sociological and economic impact, ranging from earthquake risk assessment to subsurface imaging and oil and gas exploration. At the core of these simulations is the modeling of wave propagation in a complex medium. Here we report on the extension of the high-order finite-element seismic wave simulation package SPECFEM3D to support the largest scale hybrid and homogeneous supercomputers. Starting from an existing highly tuned MPI code, we migrated to a CUDA version. In order to be of immediate impact to the science mission of computational seismologists, we had to port the entire production package, rather than just individual kernels. One of the challenges in parallelizing finite element codes is the potential for race conditions during the assembly phase. We therefore investigated different methods such as mesh coloring or atomic updates on the GPU. In order to achieve strong scaling, we needed to ensure good overlap of data motion at all levels, including internode and host-accelerator transfers. Finally we carefully tuned the GPU implementation. The new MPI/CUDA solver exhibits excellent scalability and achieves speedup on a node-to-node basis over the carefully tuned equivalent multi-core MPI solver.
To demonstrate the performance of both the forward and adjoint functionality, we present two case studies run on the Cray XE6 CPU and Cray XK6 GPU architectures up to 896 nodes: (1) focusing on the most commonly used forward simulations, we simulate seismic wave propagation generated by earthquakes in Turkey, and (2) testing the most complex seismic inversion type of the package, we use ambient seismic noise to image 3-D crust and mantle structure beneath western Europe.", acknowledgement = ack-nhfb, articleno = "38", } @InProceedings{Nguyen:2012:BTM, author = "Tan Nguyen and Pietro Cicotti and Eric Bylaska and Dan Quinlan and Scott B. Baden", title = "{Bamboo}: translating {MPI} applications to a latency-tolerant, data-driven form", crossref = "Hollingsworth:2012:SPI", pages = "39:1--39:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a032.pdf", abstract = "We present Bamboo, a custom source-to-source translator that transforms MPI C source into a data-driven form that automatically overlaps communication with available computation. Running on up to 98304 processors of NERSC's Hopper system, we observe that Bamboo's overlap capability speeds up MPI implementations of a 3D Jacobi iterative solver and Cannon's matrix multiplication. Bamboo's generated code meets or exceeds the performance of hand-optimized MPI, which includes split-phase coding, the method classically employed to hide communication. We achieved our results with only modest amounts of programmer annotation and no intrusive reprogramming of the original application source.", acknowledgement = ack-nhfb, articleno = "39", } @InProceedings{Bandishti:2012:TSC, author = "Vinayaka Bandishti and Irshad Pananilath and Uday Bondhugula", title = "Tiling stencil computations to maximize parallelism", crossref = "Hollingsworth:2012:SPI", pages = "40:1--40:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a031.pdf", abstract = "Most stencil computations allow tile-wise concurrent start, i.e., there always exists a face of the iteration space and a set of tiling hyperplanes such that all tiles along that face can be started concurrently. This provides load balance and maximizes parallelism. However, existing automatic tiling frameworks often choose hyperplanes that lead to pipelined start-up and load imbalance. We address this issue with a new tiling technique that ensures concurrent start-up as well as perfect load balance whenever possible. We first provide necessary and sufficient conditions on tiling hyperplanes to enable concurrent start for programs with affine data accesses. We then provide an approach to find such hyperplanes.
Experimental evaluation on a 12-core Intel Westmere shows that our code is able to outperform a tuned domain-specific stencil code generator by 4\% to 27\%, and previous compiler techniques by a factor of 2x to 10.14x.", acknowledgement = ack-nhfb, articleno = "40", } @InProceedings{Ding:2012:CDF, author = "Wei Ding and Yuanrui Zhang and Mahmut Kandemir and Seung Woo Son", title = "Compiler-directed file layout optimization for hierarchical storage systems", crossref = "Hollingsworth:2012:SPI", pages = "41:1--41:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a030.pdf", abstract = "File layout of array data is a critical factor that affects the behavior of storage caches, and has so far received little attention in the context of hierarchical storage systems. The main contribution of this paper is a compiler-driven file layout optimization scheme for hierarchical storage caches. This approach, fully automated within an optimizing compiler, analyzes a multi-threaded application code and determines a file layout for each disk-resident array referenced by the code, such that the performance of the target storage cache hierarchy is maximized. We tested our approach using 16 I/O-intensive application programs and compared its performance against two previously proposed approaches under different cache space management schemes. Our experimental results show that the proposed approach improves the execution time of these parallel applications by 23.7\% on average.", acknowledgement = ack-nhfb, articleno = "41", } @InProceedings{Tang:2012:FLC, author = "Ping Tak Peter Tang and Jongsoo Park and Daehyun Kim and Vladimir Petrov", title = "A framework for low-communication {$1$-D} {FFT}", crossref = "Hollingsworth:2012:SPI", pages = "42:1--42:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a043.pdf", abstract = "In high-performance computing on distributed-memory systems, communication often represents a significant part of the overall execution time. The relative cost of communication will certainly continue to rise as compute-density growth follows the current technology and industry trends. Design of lower-communication alternatives to fundamental computational algorithms has become an important field of research. For distributed 1-D FFT, communication cost has hitherto remained high as all industry-standard implementations perform three all-to-all internode data exchanges (also called global transposes). These communication steps indeed dominate execution time. In this paper, we present a mathematical framework from which many single-all-to-all and easy-to-implement 1-D FFT algorithms can be derived. For large-scale problems, our implementation can be twice as fast as leading FFT libraries on state-of-the-art computer clusters.
Moreover, our framework allows a tradeoff between accuracy and performance, further boosting performance if reduced accuracy is acceptable.", acknowledgement = ack-nhfb, articleno = "42", } @InProceedings{Sundar:2012:PGA, author = "Hari Sundar and George Biros and Carsten Burstedde and Johann Rudi and Omar Ghattas and Georg Stadler", title = "Parallel geometric-algebraic multigrid on unstructured forests of octrees", crossref = "Hollingsworth:2012:SPI", pages = "43:1--43:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a045.pdf", abstract = "We present a parallel multigrid method for solving variable-coefficient elliptic partial differential equations on arbitrary geometries using highly adapted meshes. Our method is designed for meshes that are built from an unstructured hexahedral macro mesh, in which each macro element is adaptively refined as an octree. This forest-of-octrees approach enables us to generate meshes for complex geometries with arbitrary levels of local refinement. We use geometric multigrid (GMG) for each of the octrees and algebraic multigrid (AMG) as the coarse grid solver. We designed our GMG sweeps to entirely avoid collectives, thus minimizing communication cost. We present weak and strong scaling results for the 3D variable-coefficient Poisson problem that demonstrate high parallel scalability. As a highlight, the largest problem we solve is on a non-uniform mesh with 100 billion unknowns on 262,144 cores of NCCS's Cray XK6 ``Jaguar''; in this solve we sustain 272 TFlops/s.", acknowledgement = ack-nhfb, articleno = "43", } @InProceedings{Nukada:2012:SMG, author = "Akira Nukada and Kento Sato and Satoshi Matsuoka", title = "Scalable multi-{GPU} {$3$-D} {FFT} for {TSUBAME 2.0} supercomputer", crossref = "Hollingsworth:2012:SPI", pages = "44:1--44:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a044.pdf", abstract = "For scalable 3-D FFT computation using multiple GPUs, efficient all-to-all communication between GPUs is the most important factor in good performance. Implementations with point-to-point MPI library functions and CUDA memory copy APIs typically exhibit very large overheads, especially for small message sizes in all-to-all communications between many nodes. We propose several schemes to minimize the overheads, including the employment of a lower-level InfiniBand API to effectively overlap intra- and inter-node communication, as well as auto-tuning strategies to control scheduling and determine rail assignments.
As a result, we achieve very good strong scalability as well as good performance, up to 4.8 TFLOPS using 256 nodes of the TSUBAME 2.0 Supercomputer (768 GPUs) in double precision.", acknowledgement = ack-nhfb, articleno = "44", } @InProceedings{Doi:2012:PSL, author = "Jun Doi", title = "Peta-scale lattice quantum chromodynamics on a {Blue Gene/Q} supercomputer", crossref = "Hollingsworth:2012:SPI", pages = "45:1--45:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a068.pdf", abstract = "Lattice Quantum Chromodynamics (QCD) is one of the most challenging applications running on massively parallel supercomputers. To reproduce these physical phenomena on a supercomputer, a precise simulation is demanded, requiring well-optimized and scalable code. We have optimized lattice QCD programs on Blue Gene family supercomputers and shown their strength in lattice QCD simulation. Here we optimized for the third-generation Blue Gene/Q supercomputer: (i) by changing the data layout, (ii) by exploiting new SIMD instruction sets, and (iii) by pipelining boundary data exchange to overlap communication and calculation. The optimized lattice QCD program shows excellent weak scalability on the large-scale Blue Gene/Q system, and with 16 racks we sustained 1.08 Pflop/s, 32.1\% of the theoretical peak performance, including the conjugate gradient solver routines.", acknowledgement = ack-nhfb, articleno = "45", } @InProceedings{Sarje:2012:MPX, author = "Abhinav Sarje and Xiaoye S. Li and Slim Chourou and Elaine R. Chan and Alexander Hexemer", title = "Massively parallel {X-ray} scattering simulations", crossref = "Hollingsworth:2012:SPI", pages = "46:1--46:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a067.pdf", abstract = "Although present X-ray scattering techniques can provide tremendous information on the nano-structural properties of materials that are valuable in the design and fabrication of energy-relevant nano-devices, a primary challenge remains in the analyses of such data. In this paper we describe a high-performance, flexible, and scalable Grazing Incidence Small Angle X-ray Scattering simulation algorithm and codes that we have developed on multi-core/CPU and many-core/GPU clusters. We discuss in detail our implementation, optimization, and performance on these platforms. Our results show speedups of ~125x on a Fermi-GPU and ~20x on a Cray-XE6 24-core node, compared to a sequential CPU code, with near linear scaling on multi-node clusters. To our knowledge, this is the first GISAXS simulation code that is flexible enough to compute scattered light intensities in all spatial directions, allowing full reconstruction of GISAXS patterns for any complex structures and with high resolution, while reducing simulation times from months to minutes.", acknowledgement = ack-nhfb, articleno = "46", } @InProceedings{Baker:2012:HPR, author = "C. Baker and G. Davidson and T. M. Evans and S. Hamilton and J. Jarrell and W.
Joubert", title = "High performance radiation transport simulations: preparing for {Titan}", crossref = "Hollingsworth:2012:SPI", pages = "47:1--47:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a069.pdf", abstract = "In this paper we describe the Denovo code system. Denovo solves the six-dimensional, steady-state, linear Boltzmann transport equation, of central importance to nuclear technology applications such as reactor core analysis (neutronics), radiation shielding, nuclear forensics, and radiation detection. The code features multiple spatial differencing schemes, state-of-the-art linear solvers, the Koch-Baker-Alcouffe (KBA) parallel-wavefront sweep algorithm for inverting the transport operator, a new multilevel energy decomposition method scaling to hundreds of thousands of processing cores, and a modern, novel code architecture that supports straightforward integration of new features. In this paper we discuss the performance of Denovo on the 20+ petaflop ORNL GPU-based system, Titan. We describe algorithms and techniques used to exploit the capabilities of Titan's heterogeneous compute node architecture and the challenges of obtaining good parallel performance for this sparse hyperbolic PDE solver containing inherently sequential computations. Numerical results demonstrating Denovo performance on early Titan hardware are presented.", acknowledgement = ack-nhfb, articleno = "47", } @InProceedings{Jenkins:2012:BPL, author = "John Jenkins and Eric R. Schendel and Sriram Lakshminarasimhan and David A. {Boyuka II} and Terry Rogers and Stephane Ethier and Robert Ross and Scott Klasky and Nagiza F. Samatova", title = "Byte-precision level of detail processing for variable precision analytics", crossref = "Hollingsworth:2012:SPI", pages = "48:1--48:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a088.pdf", abstract = "I/O bottlenecks in HPC applications are becoming a more pressing problem as compute capabilities continue to outpace I/O capabilities. While double-precision simulation data often must be stored losslessly, the loss of some of the fractional component may introduce acceptably small errors to many types of scientific analyses. Given this observation, we develop a precision level of detail (APLOD) library, which partitions double-precision datasets along user-defined byte boundaries. APLOD parameterizes the analysis accuracy-I/O performance tradeoff, bounds maximum relative error, maintains I/O access patterns compared to full precision, and operates with low overhead. Using ADIOS as an I/O use-case, we show a reduction in disk access time proportional to the degree of precision. Finally, we show the effects of partial-precision analysis on accuracy for operations such as $k$-means and Fourier analysis, finding strong applicability for the use of varying degrees of precision to reduce the cost of analyzing extreme-scale data.", acknowledgement = ack-nhfb, articleno = "48", } @InProceedings{Bennett:2012:CST, author = "Janine C.
Bennett and Hasan Abbasi and Peer-Timo Bremer and Ray Grout and Attila Gyulassy and Tong Jin and Scott Klasky and Hemanth Kolla and Manish Parashar and Valerio Pascucci and Philippe Pebay and David Thompson and Hongfeng Yu and Fan Zhang and Jacqueline Chen", title = "Combining in-situ and in-transit processing to enable extreme-scale scientific analysis", crossref = "Hollingsworth:2012:SPI", pages = "49:1--49:9", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a089.pdf", abstract = "With the onset of extreme-scale computing, I/O constraints make it increasingly difficult for scientists to save a sufficient amount of raw simulation data to persistent storage. One potential solution is to change the data analysis pipeline from a post-process-centric to a concurrent approach based on either in-situ or in-transit processing. In this context, computations are considered in-situ if they utilize the primary compute resources, while in-transit processing refers to offloading computations to a set of secondary resources using asynchronous data transfers. In this paper we explore the design and implementation of three common analysis techniques typically performed on large-scale scientific simulations: topological analysis, descriptive statistics, and visualization. We summarize algorithmic developments, describe a resource scheduling system to coordinate the execution of various analysis workflows, and discuss our implementation using the DataSpaces and ADIOS frameworks that support efficient data movement between in-situ and in-transit computations. We demonstrate the efficiency of our lightweight, flexible framework by deploying it on the Jaguar XK6 to analyze data generated by S3D, a massively parallel turbulent combustion code. Our framework allows scientists dealing with the data deluge at extreme scale to perform analyses at increased temporal resolutions, mitigate I/O costs, and significantly improve the time to insight.", acknowledgement = ack-nhfb, articleno = "49", } @InProceedings{Kumar:2012:EDR, author = "Sidharth Kumar and Venkatram Vishwanath and Philip Carns and Joshua A. Levine and Robert Latham and Giorgio Scorzelli and Hemanth Kolla and Ray Grout and Robert Ross and Michael E. Papka and Jacqueline Chen and Valerio Pascucci", title = "Efficient data restructuring and aggregation for {I/O} acceleration in {PIDX}", crossref = "Hollingsworth:2012:SPI", pages = "50:1--50:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a090.pdf", abstract = "Hierarchical, multiresolution data representations enable interactive analysis and visualization of large-scale simulations. One promising application of these techniques is to store high performance computing simulation output in a hierarchical Z (HZ) ordering that translates data from a Cartesian coordinate scheme to a one-dimensional array ordered by locality at different resolution levels. However, when the dimensions of the simulation data are not an even power of 2, parallel HZ ordering produces sparse memory and network access patterns that inhibit I/O performance.
This work presents a new technique for parallel HZ ordering of simulation datasets that restructures simulation data into large (power of 2) blocks to facilitate efficient I/O aggregation. We perform both weak and strong scaling experiments using the S3D combustion application on both Cray-XE6 (65,536 cores) and IBM Blue Gene/P (131,072 cores) platforms. We demonstrate that data can be written in hierarchical, multiresolution format with performance competitive to that of native data-ordering methods.", acknowledgement = ack-nhfb, articleno = "50", } @InProceedings{Kambadur:2012:MIB, author = "Melanie Kambadur and Tipp Moseley and Rick Hank and Martha A. Kim", title = "Measuring interference between live datacenter applications", crossref = "Hollingsworth:2012:SPI", pages = "51:1--51:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a036.pdf", abstract = "Application interference is prevalent in datacenters due to contention over shared hardware resources. Unfortunately, understanding interference in live datacenters is more difficult than in controlled environments or on simpler architectures. Most approaches to mitigating interference rely on data that cannot be collected efficiently in a production environment. This work exposes eight specific complexities of live datacenters that constrain measurement of interference. It then introduces new, generic measurement techniques for analyzing interference in the face of these challenges and restrictions. We use the measurement techniques to conduct the first large-scale study of application interference in live production datacenter workloads. Data is measured across 1000 12-core Google servers observed to be running 1102 unique applications. Finally, our work identifies several opportunities to improve performance that use only the available data; these opportunities are applicable to any datacenter.", acknowledgement = ack-nhfb, articleno = "51", } @InProceedings{Kaushik:2012:DCC, author = "Rini T. Kaushik and Klara Nahrstedt", title = "{T*}: a data-centric cooling energy costs reduction approach for big data analytics cloud", crossref = "Hollingsworth:2012:SPI", pages = "52:1--52:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a037.pdf", abstract = "Explosion in Big Data has led to a surge in extremely large-scale Big Data analytics platforms, resulting in burgeoning energy costs. The Big Data compute model mandates strong data-locality for computational performance, and moves computations to data. State-of-the-art cooling energy management techniques rely on thermal-aware computational job placement/migration and are inherently data-placement-agnostic in nature. T* takes a novel, data-centric approach to reduce cooling energy costs and to ensure thermal-reliability of the servers. T* is cognizant of the uneven thermal-profile and differences in thermal-reliability-driven load thresholds of the servers, and the differences in the computational jobs' arrival rate, size, and evolution life spans of the Big Data placed in the cluster.
Based on this knowledge, and coupled with its predictive file models and insights, T* performs proactive, thermal-aware file placement, which implicitly results in thermal-aware job placement in the Big Data analytics compute model. Evaluation results with one-month-long real-world Big Data analytics production traces from Yahoo! show up to a 42\% reduction in cooling energy costs with T*, courtesy of its lower and more uniform thermal-profile, and 9x better performance than the state-of-the-art data-agnostic cooling techniques.", acknowledgement = ack-nhfb, articleno = "52", } @InProceedings{Ravi:2012:VVB, author = "Vignesh T. Ravi and Michela Becchi and Gagan Agrawal and Srimat Chakradhar", title = "{ValuePack}: value-based scheduling framework for {CPU--GPU} clusters", crossref = "Hollingsworth:2012:SPI", pages = "53:1--53:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a038.pdf", abstract = "Heterogeneous computing nodes are becoming commonplace today, and recent trends strongly indicate that clusters, supercomputers, and cloud environments will increasingly host more heterogeneous resources, with some being massively parallel (e.g., GPU). With such heterogeneous environments becoming common, it is important to revisit scheduling problems for clusters and cloud environments. In this paper, we formulate and address the problem of value-driven scheduling of independent jobs on heterogeneous clusters, which captures both the urgency and relative priority of jobs. Our overall scheduling goal is to maximize the aggregate value or yield of all jobs. Exploiting the portability available from the underlying programming model, we propose four novel scheduling schemes that can automatically and dynamically map jobs onto heterogeneous resources. Additionally, to improve the utilization of massively parallel resources, we also propose heuristics to automatically decide when and which jobs can share a single resource.", acknowledgement = ack-nhfb, articleno = "53", } @InProceedings{Preissl:2012:CSS, author = "Robert Preissl and Theodore M. Wong and Pallab Datta and Myron Flickner and Raghavendra Singh and Steven K. Esser and William P. Risk and Horst D. Simon and Dharmendra S. Modha", title = "{Compass}: a scalable simulator for an architecture for cognitive computing", crossref = "Hollingsworth:2012:SPI", pages = "54:1--54:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a085.pdf", abstract = "Inspired by the function, power, and volume of the organic brain, we are developing TrueNorth, a novel modular, non-von Neumann, ultra-low power, compact architecture. TrueNorth consists of a scalable network of neurosynaptic cores, with each core containing neurons, dendrites, synapses, and axons. To set sail for TrueNorth, we developed Compass, a multi-threaded, massively parallel functional simulator and a parallel compiler that maps a network of long-distance pathways in the macaque monkey brain to TrueNorth.
We demonstrate near-perfect weak scaling on a 16-rack IBM\reg{} Blue Gene\reg{}/Q (262144 CPUs, 256 TB memory), achieving an unprecedented scale of 256 million neurosynaptic cores containing 65 billion neurons and 16 trillion synapses, running only 388X slower than real time with an average spiking rate of 8.1 Hz. By using emerging PGAS communication primitives, we also demonstrate 2X better real-time performance over MPI primitives on a 4-rack Blue Gene/P (16384 CPUs, 16 TB memory).", acknowledgement = ack-nhfb, articleno = "54", } @InProceedings{Sun:2012:OFG, author = "Yanhua Sun and Gengbin Zheng and Chao Mei and Eric J. Bohm and James C. Phillips and Laxmikant V. Kal{\'e} and Terry R. Jones", title = "Optimizing fine-grained communication in a biomolecular simulation application on {Cray XK6}", crossref = "Hollingsworth:2012:SPI", pages = "55:1--55:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a086.pdf", abstract = "Achieving good scaling for fine-grained, communication-intensive applications on modern supercomputers remains challenging. In our previous work, we have shown that such an application --- NAMD --- scales well on the full Jaguar XT5 without long-range interactions; yet, with them, the speedup falters beyond 64K cores. Although the new Gemini interconnect on Cray XK6 has improved network performance, the challenges remain, and are likely to remain for other such networks as well. We analyze communication bottlenecks in NAMD and its CHARM++ runtime, using the Projections performance analysis tool. Based on the analysis, we optimize the runtime, built on the uGNI library for Gemini. We present several techniques to improve the fine-grained communication. Consequently, the performance of running the 92224-atom Apoa1 benchmark with GPUs on TitanDev is improved by 36\%. For the 100-million-atom STMV benchmark, we improve upon the prior Jaguar XT5 result of 26 ms/step to 13 ms/step using 298,992 cores on Jaguar XK6.", acknowledgement = ack-nhfb, articleno = "55", } @InProceedings{Alexeev:2012:HSL, author = "Yuri Alexeev and Ashutosh Mahajan and Sven Leyffer and Graham Fletcher and Dmitri G. Fedorov", title = "Heuristic static load-balancing algorithm applied to the fragment molecular orbital method", crossref = "Hollingsworth:2012:SPI", pages = "56:1--56:13", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a087.pdf", abstract = "In the era of petascale supercomputing, the importance of load balancing is crucial. Although dynamic load balancing is widespread, it is increasingly difficult to implement effectively with thousands of processors or more, prompting a second look at static load-balancing techniques even though the optimal allocation of tasks to processors is an NP-hard problem. We propose a heuristic static load-balancing algorithm, employing fitted benchmarking data, as an alternative to dynamic load balancing. The problem of allocating CPU cores to tasks is formulated as a mixed-integer nonlinear optimization problem, which is solved by using an optimization solver. On 163,840 cores of Blue Gene/P, we achieved a parallel efficiency of 80\% for an execution of the fragment molecular orbital method applied to model protein-ligand complexes quantum-mechanically.
The obtained allocation is shown to outperform dynamic load balancing by at least a factor of 2, thus motivating the use of this approach on other coarse-grained applications.", acknowledgement = ack-nhfb, articleno = "56", } @InProceedings{Li:2012:CSE, author = "Dong Li and Jeffrey S. Vetter and Weikuan Yu", title = "Classifying soft error vulnerabilities in extreme-scale scientific applications using a binary instrumentation tool", crossref = "Hollingsworth:2012:SPI", pages = "57:1--57:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a097.pdf", abstract = "Extreme-scale scientific applications are at a significant risk of being hit by soft errors on supercomputers as the scale of these systems and the component density continue to increase. In order to better understand the specific soft error vulnerabilities in scientific applications, we have built an empirical fault injection and consequence analysis tool --- BIFIT --- that allows us to evaluate how soft errors impact applications. In particular, BIFIT is designed with the capability to inject faults at very specific targets: an arbitrarily-chosen execution point and any specific data structure. We apply BIFIT to three mission-critical scientific applications and investigate the applications' vulnerability to soft errors by performing thousands of statistical tests. We then classify each application's individual data structures based on their sensitivity to these vulnerabilities, and generalize these classifications across applications. Subsequently, these classifications can be used to apply appropriate resiliency solutions to each data structure within an application. Our study reveals that these scientific applications have a wide range of sensitivities to both the time and the location of a soft error; yet, we are able to identify intrinsic relationships between application vulnerabilities and specific types of data objects. In this regard, BIFIT enables new opportunities for future resiliency research.", acknowledgement = ack-nhfb, articleno = "57", } @InProceedings{Chung:2012:CDS, author = "Jinsuk Chung and Ikhwan Lee and Michael Sullivan and Jee Ho Ryoo and Dong Wan Kim and Doe Hyun Yoon and Larry Kaplan and Mattan Erez", title = "Containment domains: a scalable, efficient, and flexible resilience scheme for exascale systems", crossref = "Hollingsworth:2012:SPI", pages = "58:1--58:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a098.pdf", abstract = "This paper describes and evaluates a scalable and efficient resilience scheme based on the concept of containment domains. Containment domains are a programming construct that enables applications to express resilience needs and to interact with the system to tune and specialize error detection, state preservation and restoration, and recovery schemes. Containment domains have weak transactional semantics and are nested to take advantage of the machine and application hierarchies and to enable hierarchical state preservation, restoration, and recovery.
We evaluate the scalability and efficiency of containment domains using generalized trace-driven simulation and analytical analysis and show that containment domains are superior to both checkpoint restart and redundant execution approaches.", acknowledgement = ack-nhfb, articleno = "58", } @InProceedings{Byna:2012:PAV, author = "Surendra Byna and Jerry Chou and Oliver R{\"u}bel and Prabhat and Homa Karimabadi and William S. Daughton and Vadim Roytershteyn and E. Wes Bethel and Mark Howison and Ke-Jou Hsu and Kuan-Wu Lin and Arie Shoshani and Andrew Uselton and Kesheng Wu", title = "Parallel {I/O}, analysis, and visualization of a trillion particle simulation", crossref = "Hollingsworth:2012:SPI", pages = "59:1--59:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a103.pdf", abstract = "Petascale plasma physics simulations have recently entered the regime of simulating trillions of particles. These unprecedented simulations generate massive amounts of data, posing significant challenges in storage, analysis, and visualization. In this paper, we present parallel I/O, analysis, and visualization results from a VPIC trillion particle simulation running on 120,000 cores, which produces ~30 TB of data for a single timestep. We demonstrate the successful application of H5Part, a particle data extension of parallel HDF5, for writing the dataset at a significant fraction of system peak I/O rates. To enable efficient analysis, we develop hybrid parallel FastQuery to index and query data using multi-core CPUs on distributed memory hardware. We show good scalability results for the FastQuery implementation using up to 10,000 cores. Finally, we apply this indexing/query-driven approach to facilitate the first-ever analysis and visualization of the trillion particle dataset.", acknowledgement = ack-nhfb, articleno = "59", } @InProceedings{Kanov:2012:DIS, author = "Kalin Kanov and Randal Burns and Greg Eyink and Charles Meneveau and Alexander Szalay", title = "Data-intensive spatial filtering in large numerical simulation datasets", crossref = "Hollingsworth:2012:SPI", pages = "60:1--60:9", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a101.pdf", abstract = "We present a query processing framework for the efficient evaluation of spatial filters on large numerical simulation datasets stored in a data-intensive cluster. Previously, filtering of large numerical simulations stored in scientific databases has been impractical owing to the immense data requirements. Rather, filtering is done during simulation or by loading snapshots into the aggregate memory of an HPC cluster. Our system performs filtering within the database and supports large filter widths. We present two complementary methods of execution: I/O streaming computes a batch filter query in a single sequential pass using incremental evaluation of decomposable kernels; summed volumes generates an intermediate data set and evaluates each filtered value by accessing only eight points in this dataset. We dynamically choose between these methods depending upon workload characteristics.
The system allows us to perform filters against large data sets with little overhead: query performance scales with the cluster's aggregate I/O throughput.", acknowledgement = ack-nhfb, articleno = "60", } @InProceedings{Nouanesengsy:2012:PPA, author = "Boonthanome Nouanesengsy and Teng-Yok Lee and Kewei Lu and Han-Wei Shen and Tom Peterka", title = "Parallel particle advection and {FTLE} computation for time-varying flow fields", crossref = "Hollingsworth:2012:SPI", pages = "61:1--61:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a102.pdf", abstract = "Flow fields are an important product of scientific simulations. One popular flow visualization technique is particle advection, in which seeds are traced through the flow field. One use of these traces is to compute a powerful analysis tool called the Finite-Time Lyapunov Exponent (FTLE) field, but no existing particle tracing algorithms scale to the particle injection frequency required for high-resolution FTLE analysis. In this paper, a framework to trace the massive number of particles necessary for FTLE computation is presented. A new approach is explored, in which processes are divided into groups, and are responsible for mutually exclusive spans of time. This pipelining over time intervals reduces overall idle time of processes and decreases I/O overhead. Our parallel FTLE framework is capable of advecting hundreds of millions of particles at once, with performance scaling up to tens of thousands of processes.", acknowledgement = ack-nhfb, articleno = "61", } @InProceedings{Patwary:2012:NSP, author = "Mostofa Ali Patwary and Diana Palsetia and Ankit Agrawal and Wei-keng Liao and Fredrik Manne and Alok Choudhary", title = "A new scalable parallel {DBSCAN} algorithm using the disjoint-set data structure", crossref = "Hollingsworth:2012:SPI", pages = "62:1--62:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a053.pdf", abstract = "DBSCAN is a well-known density based clustering algorithm capable of discovering arbitrary shaped clusters and eliminating noise data. However, parallelization of DBSCAN is challenging as it exhibits an inherent sequential data access order. Moreover, existing parallel implementations adopt a master-slave strategy which can easily cause an unbalanced workload and hence result in low parallel efficiency. We present a new parallel DBSCAN algorithm (PDSDBSCAN) using graph algorithmic concepts. More specifically, we employ the disjoint-set data structure to break the access sequentiality of DBSCAN. In addition, we use a tree-based bottom-up approach to construct the clusters. This yields a better-balanced workload distribution. We implement the algorithm both for shared and for distributed memory.
Using data sets containing up to several hundred million high-dimensional points, we show that PDSDBSCAN significantly outperforms the master-slave approach, achieving speedups of up to 25.97 using 40 cores on a shared-memory architecture, and speedups of up to 5,765 using 8,192 cores on a distributed-memory architecture.", acknowledgement = ack-nhfb, articleno = "62", } @InProceedings{Nikolova:2012:PBN, author = "Olga Nikolova and Srinivas Aluru", title = "Parallel {Bayesian} network structure learning with application to gene networks", crossref = "Hollingsworth:2012:SPI", pages = "63:1--63:9", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a052.pdf", abstract = "Bayesian networks (BN) are probabilistic graphical models which are widely utilized in various research areas, including modeling complex biological interactions in the cell. Learning the structure of a BN is an NP-hard problem and exact solutions are limited to a few tens of variables. In this work, we present a parallel BN structure learning algorithm that combines principles of both heuristic and exact approaches and facilitates learning of larger networks. We demonstrate the applicability of our approach by an implementation on a Cray AMD cluster, and present experimental results for the problem of inferring gene networks. Our approach is work-optimal and achieves nearly perfect scaling.", acknowledgement = ack-nhfb, articleno = "63", } @InProceedings{Khan:2012:MAN, author = "Arif M. Khan and David F. Gleich and Alex Pothen and Mahantesh Halappanavar", title = "A multithreaded algorithm for network alignment via approximate matching", crossref = "Hollingsworth:2012:SPI", pages = "64:1--64:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a054.pdf", abstract = "Network alignment is an optimization problem to find the best one-to-one map between the vertices of a pair of graphs that overlaps as many edges as possible. It is a relaxation of the graph isomorphism problem and is closely related to the subgraph isomorphism problem. The best current approaches are entirely heuristic and iterative in nature. They generate real-valued heuristic weights that must be rounded to find integer solutions. This rounding requires solving a bipartite maximum weight matching problem at each iteration in order to avoid missing high quality solutions. We investigate substituting a parallel half-approximation for maximum weight matching instead of an exact computation. Our experiments show that the resulting difference in solution quality is negligible. We demonstrate almost a 20-fold speedup using 40 threads on an 8-processor Intel Xeon E7-8870 system and now solve real-world problems in 36 seconds instead of 10 minutes.", acknowledgement = ack-nhfb, articleno = "64", } @InProceedings{Olivier:2012:CMW, author = "Stephen L. Olivier and Bronis R. de Supinski and Martin Schulz and Jan F.
Prins", title = "Characterizing and mitigating work time inflation in task parallel programs", crossref = "Hollingsworth:2012:SPI", pages = "65:1--65:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a066.pdf", abstract = "Task parallelism raises the level of abstraction in shared-memory parallel programming to simplify the development of complex applications. However, task parallel applications can exhibit poor performance due to thread idleness, scheduling overheads, and work time inflation --- additional time spent by threads in a multithreaded computation beyond the time required to perform the same work in a sequential computation. We identify the contributions of each factor to lost efficiency in various task parallel OpenMP applications and diagnose the causes of work time inflation in those applications. Increased data access latency can cause significant work time inflation in NUMA systems. Our locality framework for task parallel OpenMP programs mitigates this cause of work time inflation. Our extensions to the Qthreads library demonstrate that locality-aware scheduling can improve performance by up to 3X compared to the Intel OpenMP task scheduler.", acknowledgement = ack-nhfb, articleno = "65", } @InProceedings{Bauer:2012:LEL, author = "Michael Bauer and Sean Treichler and Elliott Slaughter and Alex Aiken", title = "{Legion}: expressing locality and independence with logical regions", crossref = "Hollingsworth:2012:SPI", pages = "66:1--66:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a065.pdf", abstract = "Modern parallel architectures have both heterogeneous processors and deep, complex memory hierarchies. We present Legion, a programming model and runtime system for achieving high performance on these machines. Legion is organized around logical regions, which express both locality and independence of program data, and tasks, functions that perform computations on regions. We describe a runtime system that dynamically extracts parallelism from Legion programs, using a distributed, parallel scheduling algorithm that identifies both independent tasks and nested parallelism. Legion also enables explicit, programmer controlled movement of data through the memory hierarchy and placement of tasks based on locality information via a novel mapping interface.
We evaluate our Legion implementation on three applications: fluid-flow on a regular grid, a three-level AMR code solving a heat diffusion equation, and a circuit simulation.", acknowledgement = ack-nhfb, articleno = "66", } @InProceedings{Garland:2012:DUP, author = "Michael Garland and Manjunath Kudlur and Yili Zheng", title = "Designing a unified programming model for heterogeneous machines", crossref = "Hollingsworth:2012:SPI", pages = "67:1--67:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a064.pdf", abstract = "While high-efficiency machines are increasingly embracing heterogeneous architectures and massive multithreading, contemporary mainstream programming languages reflect a mental model in which processing elements are homogeneous, concurrency is limited, and memory is a flat undifferentiated pool of storage. Moreover, the current state of the art in programming heterogeneous machines tends towards using separate programming models, such as OpenMP and CUDA, for different portions of the machine. Both of these factors make programming emerging heterogeneous machines unnecessarily difficult. We describe the design of the Phalanx programming model, which seeks to provide a unified programming model for heterogeneous machines. It provides constructs for bulk parallelism, synchronization, and data placement which operate across the entire machine. Our prototype implementation is able to launch and coordinate work on both CPU and GPU processors within a single node, and by leveraging the GASNet runtime, is able to run across all the nodes of a distributed-memory machine.", acknowledgement = ack-nhfb, articleno = "67", } @InProceedings{Sharma:2012:DII, author = "Sushant Sharma and Dimitrios Katramatos and Dantong Yu and Li Shi", title = "Design and implementation of an intelligent end-to-end network {QoS} system", crossref = "Hollingsworth:2012:SPI", pages = "68:1--68:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a077.pdf", abstract = "End-to-end guaranteed network QoS is a requirement for predictable data transfers between geographically distant end-hosts. Existing QoS systems, however, do not have the capability/intelligence to decide what resources to reserve and which paths to choose when there are multiple and flexible resource reservation requests. In this paper, we design and implement an intelligent system that can guarantee end-to-end network QoS for multiple flexible reservation requests. At the heart of this system is a polynomial-time algorithm called resource reservation and path construction (RRPC). The RRPC algorithm schedules multiple flexible end-to-end data transfer requests by jointly optimizing the path construction and bandwidth reservation along these paths. We show that constructing such schedules is NP-hard. We implement our intelligent QoS system, and present the results of deployment on real-world production networks (ESnet and Internet2).
Our implementation does not require modifications or new software to be deployed on the routers within the network.", acknowledgement = ack-nhfb, articleno = "68", } @InProceedings{Chen:2012:LUH, author = "Dong Chen and Noel Eisley and Philip Heidelberger and Sameer Kumar and Amith Mamidala and Fabrizio Petrini and Robert Senger and Yutaka Sugawara and Robert Walkup and Burkhard Steinmacher-Burow and Anamitra Choudhury and Yogish Sabharwal and Swati Singhal and Jeffrey J. Parker", title = "Looking under the hood of the {IBM Blue Gene/Q} network", crossref = "Hollingsworth:2012:SPI", pages = "69:1--69:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a078.pdf", abstract = "This paper explores the performance and optimization of the IBM Blue Gene/Q (BG/Q) five-dimensional torus network on up to 16K nodes. The BG/Q hardware supports multiple dynamic routing algorithms, and different traffic patterns may require different algorithms to achieve best performance. Between 85\% and 95\% of peak network performance is achieved for all-to-all traffic, while over 85\% of peak is obtained for challenging bisection pairings. A new software-controlled algorithm is developed for bisection traffic that selects which hardware algorithm to employ and achieves better performance than any individual hardware algorithm. The benefit of dynamic routing is shown for a highly non-uniform ``transpose'' traffic pattern. To evaluate memory and network performance, the HPCC Random Access benchmark was tuned for BG/Q and achieved 858 Giga Updates per Second (GUPS) on 16K nodes. To further accelerate message processing, the message libraries on BG/Q enable the offloading of messaging overhead onto dedicated communication threads. Several applications, including Algebraic Multigrid (AMG), exhibit gains of 3\% to 20\% using communication threads.", acknowledgement = ack-nhfb, articleno = "69", } @InProceedings{Subramoni:2012:DSI, author = "H. Subramoni and S. Potluri and K. Kandalla and B. Barth and J. Vienne and J. Keasler and K. Tomko and K. Schulz and A. Moody and D. K. Panda", title = "Design of a scalable {InfiniBand} topology service to enable network-topology-aware placement of processes", crossref = "Hollingsworth:2012:SPI", pages = "70:1--70:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a076.pdf", abstract = "Over the last decade, InfiniBand has become an increasingly popular interconnect for deploying modern supercomputing systems. However, there exists no detection service that can discover the underlying network topology in a scalable manner and expose this information to runtime libraries and users of the high performance computing systems in a convenient way. In this paper, we design a novel and scalable method to detect the InfiniBand network topology by using Neighbor-Joining (NJ) techniques. To the best of our knowledge, this is the first instance where the neighbor joining algorithm has been applied to solve the problem of detecting InfiniBand network topology. We also design a network-topology-aware MPI library that takes advantage of the network topology service.
The library places processes taking part in the MPI job in a network-topology-aware manner, with the dual aim of increasing intra-node communication and reducing long-distance inter-node communication across the InfiniBand fabric.", acknowledgement = ack-nhfb, articleno = "70", } @InProceedings{Chen:2012:CLA, author = "Guancheng Chen and Per Stenstrom", title = "Critical lock analysis: diagnosing critical section bottlenecks in multithreaded applications", crossref = "Hollingsworth:2012:SPI", pages = "71:1--71:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a099.pdf", abstract = "Critical sections are well-known potential performance bottlenecks in multithreaded applications, and identifying the ones that inhibit scalability is important for performance optimization. While previous approaches use idle time as a key measure, we show that such a measure is not reliable. The reason is that idleness does not necessarily mean the critical section is on the critical path. We introduce critical lock analysis, a new method for diagnosing critical section bottlenecks in multithreaded applications. Our method first identifies the critical sections appearing on the critical path, and then quantifies the impact of such critical sections on the overall performance by using quantitative performance metrics. Case studies show that our method can successfully identify critical sections that are most beneficial for improving overall performance as well as quantify their performance impact on the critical path, which results in more reliable identification of the inherent critical section bottlenecks than previous approaches.", acknowledgement = ack-nhfb, articleno = "71", } @InProceedings{Ravishankar:2012:CGP, author = "Mahesh Ravishankar and John Eisenlohr and Louis-No{\"e}l Pouchet and J. Ramanujam and Atanas Rountev and P. Sadayappan", title = "Code generation for parallel execution of a class of irregular loops on distributed memory systems", crossref = "Hollingsworth:2012:SPI", pages = "72:1--72:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a100.pdf", abstract = "Parallelization and locality optimization of affine loop nests have been successfully addressed for shared-memory machines. However, many large-scale simulation applications must be executed in a distributed-memory environment, and use irregular/sparse computations where the control-flow and array-access patterns are data-dependent. In this paper, we propose an approach for effective parallel execution of a class of irregular loop computations in a distributed-memory environment, using a combination of static and runtime analysis. We discuss algorithms that analyze sequential code to generate an inspector and an executor. The inspector captures the data-dependent behavior of the computation in parallel and without requiring complete replication of any of the data structures used in the original computation. The executor performs the computation in parallel.
The effectiveness of the framework is demonstrated on several benchmarks and a climate modeling application.", acknowledgement = ack-nhfb, articleno = "72", } @InProceedings{Alimi:2012:FEF, author = "Jean-Michel Alimi and Vincent Bouillot and Yann Rasera and Vincent Reverdy and Pier-Stefano Corasaniti and Ir{\`e}ne Balm{\`e}s and St{\'e}phane Requena and Xavier Delaruelle and Jean-Noel Richet", title = "First-ever full observable universe simulation", crossref = "Hollingsworth:2012:SPI", pages = "73:1--73:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a034.pdf", abstract = "We have performed the first-ever numerical N-body simulation of the full observable universe (DEUS ``Dark Energy Universe Simulation'' FUR ``Full Universe Run''). This has evolved 550 billion particles on an Adaptive Mesh Refinement grid with more than two trillion computing points along the entire evolutionary history of the universe and across six orders of magnitude in length scale, from the size of the Milky Way to that of the whole observable Universe. To date, this is the largest and most advanced cosmological simulation ever run. It provides unique information on the formation and evolution of the largest structures in the universe and exceptional support to future observational programs dedicated to mapping the distribution of matter and galaxies in the universe. The simulation has run on 4752 (of 5040) thin nodes of the BULL supercomputer CURIE, using more than 300 TB of memory for 10 million hours of computing time. About 50 PBytes of data were generated throughout the run. Using an advanced and innovative reduction workflow, the amount of useful stored data was reduced to 500 TBytes.", acknowledgement = ack-nhfb, articleno = "73", } @InProceedings{March:2012:OCP, author = "William B. March and Kenneth Czechowski and Marat Dukhan and Thomas Benson and Dongryeol Lee and Andrew J. Connolly and Richard Vuduc and Edmond Chow and Alexander G. Gray", title = "Optimizing the computation of $n$-point correlations on large-scale astronomical data", crossref = "Hollingsworth:2012:SPI", pages = "74:1--74:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a033.pdf", abstract = "The n-point correlation functions (npcf) are powerful statistics that are widely used for data analyses in astronomy and other fields. These statistics have played a crucial role in fundamental physical breakthroughs, including the discovery of dark energy. Unfortunately, directly computing the npcf at a single value requires O(N$^n$) time for N points and values of n of 2, 3, 4, or even larger. Astronomical data sets can contain billions of points, and the next generation of surveys will generate terabytes of data per night. To meet these computational demands, we present a highly tuned npcf computation code that shows an order-of-magnitude speedup over the current state of the art. This enables a much larger 3-point correlation computation on the galaxy distribution than was previously possible. We show a detailed performance evaluation on many different architectures.", acknowledgement = ack-nhfb, articleno = "74", } @InProceedings{Wu:2012:HTM, author = "Jingjin Wu and Zhiling Lan and Xuanxing Xiong and Nickolay Y.
Gnedin and Andrey V. Kravtsov", title = "Hierarchical task mapping of cell-based {AMR} cosmology simulations", crossref = "Hollingsworth:2012:SPI", pages = "75:1--75:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a035.pdf", abstract = "Cosmology simulations are highly communication-intensive; thus, it is critical to exploit topology-aware task mapping techniques for performance optimization. To exploit the architectural properties of multiprocessor clusters (the performance gap between inter-node and intra-node communication as well as the gap between inter-socket and intra-socket communication), we design and develop a hierarchical task mapping scheme for cell-based AMR (Adaptive Mesh Refinement) cosmology simulations, in particular, the ART application. Our scheme consists of two parts: (1) an inter-node mapping to map application processes onto nodes with the objective of minimizing network traffic among nodes and (2) an intra-node mapping within each node to minimize the maximum size of messages transmitted between CPU sockets. Experiments on production supercomputers with 3D torus and fat-tree topologies show that our scheme can significantly reduce application communication cost by up to 50\%. More importantly, our scheme is generic and can be extended to many other applications.", acknowledgement = ack-nhfb, articleno = "75", } @InProceedings{Sridharan:2012:SDF, author = "Vilas Sridharan and Dean Liberty", title = "A study of {DRAM} failures in the field", crossref = "Hollingsworth:2012:SPI", pages = "76:1--76:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a047.pdf", abstract = "Most modern computer systems use dynamic random access memory (DRAM) as a main memory store. Recent publications have confirmed that DRAM errors are a common source of failures in the field. Therefore, further attention to the faults experienced by DRAM sub-systems is warranted. In this paper, we present a study of 11 months of DRAM errors in a large high-performance computing cluster. Our goal is to understand the failure modes, rates, and fault types experienced by DRAM in production settings. We identify several unique DRAM failure modes, including single-bit, multi-bit, and multi-chip failures. We also provide a deterministic bound on the rate of transient faults in the DRAM array by exploiting the presence of a hardware scrubber on our nodes. We draw several conclusions from our study. First, DRAM failures are dominated by permanent, rather than transient, faults, although not to the extent found by previous publications. Second, DRAMs are susceptible to large multi-bit failures, such as failures that affect an entire DRAM row or column, indicating faults in shared internal circuitry. Third, we identify a DRAM failure mode that disrupts access to other DRAM devices that share the same board-level circuitry.
Finally, we find that chipkill error-correcting codes (ECC) are extremely effective, reducing the node failure rate from uncorrected DRAM errors by 42x compared to single-error correct/double-error detect (SEC-DED) ECC.", acknowledgement = ack-nhfb, articleno = "76", } @InProceedings{Gainaru:2012:FPU, author = "Ana Gainaru and Franck Cappello and Marc Snir and William Kramer", title = "Fault prediction under the microscope: a closer look into {HPC} systems", crossref = "Hollingsworth:2012:SPI", pages = "77:1--77:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a048.pdf", abstract = "A large percentage of computing capacity in today's large high-performance computing systems is wasted because of failures. Consequently, current research is focusing on providing fault tolerance strategies that aim to minimize the effects of faults on applications. By far the most popular technique is the checkpoint-restart strategy. A complement to this classical approach is failure avoidance, by which the occurrence of a fault is predicted and preventive measures are taken. This requires a reliable prediction system to anticipate failures and their locations. Thus far, research in this field has used ideal predictors that were not implemented in real HPC systems. In this paper, we merge signal analysis concepts with data mining techniques to extend the ELSA (Event Log Signal Analyzer) toolkit and offer an adaptive and more efficient prediction module. Our goal is to provide models that characterize the normal behavior of a system and the way faults affect it. Being able to detect deviations from normality quickly is the foundation of accurate fault prediction. However, this is challenging because component failure dynamics are heterogeneous in space and time. To this end, a large part of the paper is focused on a detailed analysis of the prediction method, by applying it to two large-scale systems and by investigating the characteristics and bottlenecks of each step of the prediction process. Furthermore, we analyze the impact of the prediction's precision and recall on current checkpointing strategies and highlight future improvements and directions for research in this field.", acknowledgement = ack-nhfb, articleno = "77", } @InProceedings{Fiala:2012:DCS, author = "David Fiala and Frank Mueller and Christian Engelmann and Rolf Riesen and Kurt Ferreira and Ron Brightwell", title = "Detection and correction of silent data corruption for large-scale high-performance computing", crossref = "Hollingsworth:2012:SPI", pages = "78:1--78:12", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a046.pdf", abstract = "Faults have become the norm rather than the exception for high-end computing clusters. Exacerbating this situation, some of these faults remain undetected, manifesting themselves as silent errors that allow applications to compute incorrect results. This paper studies the potential for redundancy to detect and correct soft errors in MPI message-passing applications, and investigates the challenges inherent in detecting soft errors within MPI applications by providing transparent MPI redundancy.
By assuming a model wherein corruption in application data manifests itself by producing differing MPI messages between replicas, we study the best-suited protocols for detecting and correcting corrupted MPI messages. Using our fault injector, we observe that even a single error can have profound effects on applications by causing a cascading pattern of corruption which, in most cases, spreads to all other processes. Results indicate that our consistency protocols can successfully protect applications experiencing even high rates of silent data corruption.", acknowledgement = ack-nhfb, articleno = "78", } @InProceedings{Karpenko:2012:AGW, author = "Dmytro Karpenko and Roman Vitenberg and Alexander L. Read", title = "{ATLAS} grid workload on {NDGF} resources: analysis, modeling, and workload generation", crossref = "Hollingsworth:2012:SPI", pages = "79:1--79:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a055.pdf", abstract = "Evaluating new ideas for job scheduling or data transfer algorithms in large-scale grid systems is known to be notoriously challenging. Existing grid simulators expect to receive a realistic workload as an input. Such input is difficult to provide in the absence of an in-depth study of representative grid workloads. In this work, we analyze the ATLAS workload processed on the resources of the Nordic DataGrid Facility (NDGF). ATLAS is one of the biggest grid technology users, with extreme demands for CPU power and bandwidth. The analysis is based on a data sample with approximately 1.6 million jobs, 1,723 TB of data transfer, and 873 years of processor time. Our additional contributions are (a) scalable workload models that can be used to generate a synthetic workload for a given number of jobs, (b) open-source workload generator software integrated with existing grid simulators, and (c) suggestions for grid system designers based on the insights of the data analysis.", acknowledgement = ack-nhfb, articleno = "79", } @InProceedings{Estrada:2012:EAA, author = "Trilce Estrada and Michela Taufer", title = "On the effectiveness of application-aware self-management for scientific discovery in volunteer computing systems", crossref = "Hollingsworth:2012:SPI", pages = "80:1--80:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a057.pdf", abstract = "An important challenge faced by high-throughput, multiscale applications is that human intervention has a central role in driving their success. However, manual intervention is inefficient, error-prone, and promotes resource waste. This paper presents an application-aware modular framework that provides self-management for computational multiscale applications in volunteer computing (VC). Our framework consists of a learning engine and three modules that can be easily adapted to different distributed systems. The learning engine of this framework is based on our novel tree-like structure called KOTree. KOTree is a fully automatic method that organizes statistical information in a multidimensional structure that can be efficiently searched and updated at runtime. Our empirical evaluation shows that our framework can effectively provide application-aware self-management in VC systems.
Additionally, we observed that our KOTree algorithm is able to accurately predict the expected length of new jobs, resulting in an average throughput increase of 85\% with respect to other algorithms.", acknowledgement = ack-nhfb, articleno = "80", } @InProceedings{Liu:2012:UVC, author = "Z. Liu and M. Veeraraghavan and Z. Yan and C. Tracy and J. Tie and I. Foster and J. Dennis and J. Hick and Y. Li and W. Yang", title = "On using virtual circuits for {GridFTP} transfers", crossref = "Hollingsworth:2012:SPI", pages = "81:1--81:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a056.pdf", abstract = "The goal of this work is to characterize scientific data transfers and to determine the suitability of dynamic virtual circuit service for these transfers instead of the currently used IP-routed service. Specifically, logs collected by servers executing a commonly used scientific data transfer application, GridFTP, are obtained from three US supercomputing/scientific research centers, NERSC, SLAC, and NCAR, and analyzed. Dynamic virtual circuit (VC) service, a relatively new offering from providers such as ESnet and Internet2, allows for the selection of a path on which a rate-guaranteed connection is established prior to data transfer. Given VC setup overhead, the first analysis of the GridFTP transfer logs characterizes the duration of sessions, where a session consists of multiple back-to-back transfers executed in batch mode between the same two GridFTP servers. Of the NCAR-NICS sessions analyzed, 56\% of all sessions (90\% of all transfers) would have been long enough to be served with dynamic VC service. An analysis of transfer logs across four paths, NCAR-NICS, SLAC-BNL, NERSC-ORNL, and NERSC-ANL, shows significant throughput variance, where NICS, BNL, ORNL, and ANL are other US national laboratories. For example, on the NERSC-ORNL path, the inter-quartile range was 695 Mbps, with a maximum value of 3.64 Gbps and a minimum value of 758 Mbps. An analysis of the impact of various factors that are potential causes of this variance is also presented.", acknowledgement = ack-nhfb, articleno = "81", } @InProceedings{Meng:2012:DDG, author = "Jiayuan Meng and Vitali A. Morozov and Venkatram Vishwanath and Kalyan Kumaran", title = "Dataflow-driven {GPU} performance projection for multi-kernel transformations", crossref = "Hollingsworth:2012:SPI", pages = "82:1--82:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a092.pdf", abstract = "Applications often have a sequence of parallel operations to be offloaded to graphics processors; each operation can become an individual GPU kernel. Developers typically explore a variety of transformations for each kernel. Furthermore, it is well known that efficient data management is critical in achieving high GPU performance and that ``fusing'' multiple kernels into one may greatly improve data locality. Doing so, however, requires transformations across multiple, potentially nested, parallel loops; at the same time, the original code semantics and data dependencies must be preserved. Since each kernel may have distinct data access patterns, their combined dataflow can be nontrivial.
As a result, the complexity of multi-kernel transformations often leads to significant effort with no guarantee of performance benefits. This paper proposes a dataflow-driven analytical framework to project GPU performance for a sequence of parallel operations. Users need only provide CPU code skeletons for a sequence of parallel loops. The framework can then automatically identify opportunities for multi-kernel transformations and data management. It is also able to project the overall performance without implementing GPU code or using physical hardware.", acknowledgement = ack-nhfb, articleno = "82", } @InProceedings{Dwyer:2012:PME, author = "Tyler Dwyer and Alexandra Fedorova and Sergey Blagodurov and Mark Roth and Fabien Gaud and Jian Pei", title = "A practical method for estimating performance degradation on multicore processors, and its application to {HPC} workloads", crossref = "Hollingsworth:2012:SPI", pages = "83:1--83:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a093.pdf", abstract = "When multiple threads or processes run on a multi-core CPU, they compete for shared resources, such as caches and memory controllers, and can suffer performance degradation as high as 200\%. We design and evaluate a new machine learning model that estimates this degradation online, on previously unseen workloads, and without perturbing the execution. Our motivation is to help data center and HPC cluster operators effectively use workload consolidation. Data center consolidation is about placing many applications on the same server to maximize hardware utilization. In HPC clusters, processes of the same distributed applications run on the same machine. Consolidation improves hardware utilization, but may sacrifice performance as processes compete for resources. Our model helps determine when consolidation is overly harmful to performance. Our work is the first to apply machine learning to this problem domain, and we report on our experience reaping the advantages of machine learning while navigating around its limitations. We demonstrate how the model can be used to improve performance fidelity and save energy for HPC workloads.", acknowledgement = ack-nhfb, articleno = "83", } @InProceedings{Spafford:2012:ADS, author = "Kyle L. Spafford and Jeffrey S. Vetter", title = "{Aspen}: a domain specific language for performance modeling", crossref = "Hollingsworth:2012:SPI", pages = "84:1--84:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a091.pdf", abstract = "We present a new approach to analytical performance modeling using Aspen, a domain specific language. Aspen (Abstract Scalable Performance Engineering Notation) fills an important gap in existing performance modeling techniques and is designed to enable rapid exploration of new algorithms and architectures. It includes a formal specification of an application's performance behavior and an abstract machine model. We provide an overview of Aspen's features and demonstrate how it can be used to express a performance model for a three-dimensional Fast Fourier Transform. We then demonstrate the composability and modularity of Aspen by importing and reusing the FFT model in a molecular dynamics model.
We have also created a number of tools that allow scientists to balance application and system factors quickly and accurately.", acknowledgement = ack-nhfb, articleno = "84", } @InProceedings{Zhang:2012:DAD, author = "Zhao Zhang and Daniel S. Katz and Justin M. Wozniak and Allan Espinosa and Ian Foster", title = "Design and analysis of data management in scalable parallel scripting", crossref = "Hollingsworth:2012:SPI", pages = "85:1--85:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a016.pdf", abstract = "We seek to enable efficient large-scale parallel execution of applications in which a shared filesystem abstraction is used to couple many tasks. Such parallel scripting (many-task computing, MTC) applications suffer poor performance and utilization on large parallel computers because of the volume of filesystem I/O and a lack of appropriate optimizations in the shared filesystem. Thus, we design and implement a scalable MTC data management system that uses aggregated compute-node local storage for more efficient data movement strategies. We co-design the data management system with the data-aware scheduler to enable dataflow pattern identification and automatic optimization. The framework reduces the time to solution of parallel stages of an astronomy data analysis application, Montage, by 83.2\% on 512 cores; decreases the time to solution of a seismology application, CyberShake, by 7.9\% on 2,048 cores; and delivers BLAST performance better than mpiBLAST at various scales up to 32,768 cores, while preserving the flexibility of the original BLAST application.", acknowledgement = ack-nhfb, articleno = "85", } @InProceedings{Adams:2012:UBL, author = "Ian F. Adams and Brian A. Madden and Joel C. Frank and Mark W. Storer and Ethan L. Miller and Gene Harano", title = "Usage behavior of a large-scale scientific archive", crossref = "Hollingsworth:2012:SPI", pages = "86:1--86:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a017.pdf", abstract = "Archival storage systems for scientific data have been growing in both size and relevance over the past two decades, yet researchers and system designers alike must rely on limited and obsolete knowledge to guide archival management and design. To address this issue, we analyzed three years of file-level activities from the NCAR mass storage system, providing valuable insight into a large-scale scientific archive with over 1600 users, tens of millions of files, and petabytes of data. Our examination of system usage showed that, while a subset of users were responsible for most of the activity, this activity was widely distributed at the file level. We also show that the physical grouping of files and directories on media can improve archival storage system performance.
Based on our observations, we provide suggestions and guidance for both future scientific archival system designs as well as improved tracing of archival activity.", acknowledgement = ack-nhfb, articleno = "86", } @InProceedings{LaFon:2012:DFT, author = "Jharrod LaFon and Satyajayant Misra and Jon Bringhurst", title = "On distributed file tree walk of parallel file systems", crossref = "Hollingsworth:2012:SPI", pages = "87:1--87:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a015.pdf", abstract = "Supercomputers generate vast amounts of data, typically organized into large directory hierarchies on parallel file systems. While supercomputing applications are parallel, the tools used to process them, which require complete directory traversals, are typically serial. We present an algorithm framework and three fully distributed algorithms for traversing large parallel file systems and performing file operations in parallel. The first algorithm introduces a randomized work-stealing scheduler; the second improves the first with proximity-awareness; and the third improves upon the second by using a hybrid approach. We have tested our implementation on Cielo, a 1.37-petaflop supercomputer at Los Alamos National Laboratory, and its 7-petabyte file system. Test results show that our algorithms execute orders of magnitude faster than state-of-the-art algorithms while achieving ideal load balancing and low communication cost. We present performance insights from the use of our algorithms in production systems at LANL, performing daily file system operations.", acknowledgement = ack-nhfb, articleno = "87", } @InProceedings{Chung:2012:ADP, author = "I-Hsin Chung and Changhoan Kim and Hui-Fang Wen and Guojing Cong", title = "Application data prefetching on the {IBM Blue Gene/Q} supercomputer", crossref = "Hollingsworth:2012:SPI", pages = "88:1--88:8", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a075.pdf", abstract = "Memory access latency is often a crucial performance limitation for high performance computing. Prefetching is one of the strategies used by system designers to bridge the processor-memory gap. This paper describes a new list prefetching feature introduced in the IBM Blue Gene/Q supercomputer. The list prefetcher records the L1 cache miss addresses and prefetches them in the next iteration. The evaluation shows this list prefetching mechanism reduces data fetching time when L1 cache misses happen and improves performance for high-performance computing applications with repeating non-uniform memory access patterns.
When properly configured, its performance is comparable to that of the classic stream prefetcher.", acknowledgement = ack-nhfb, articleno = "88", } @InProceedings{Alvarez:2012:HSC, author = "Lluc Alvarez and Llu{\'\i}s Vilanova and Marc Gonzalez and Xavier Martorell and Nacho Navarro and Eduard Ayguade", title = "Hardware-software coherence protocol for the coexistence of caches and local memories", crossref = "Hollingsworth:2012:SPI", pages = "89:1--89:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a074.pdf", abstract = "Cache coherence protocols limit the scalability of chip multiprocessors. One solution is to introduce a local memory alongside the cache hierarchy, forming a hybrid memory system. Local memories are more power-efficient than caches and they do not generate coherence traffic, but they suffer from poor programmability. When non-predictable memory access patterns are found, compilers do not succeed in generating code because of the incoherency between the two storages. This paper proposes a coherence protocol for hybrid memory systems that allows the compiler to generate code even in the presence of memory aliasing problems. Coherency is ensured by a simple software/hardware co-design where the compiler identifies potentially incoherent memory accesses and the hardware diverts them to the correct copy of the data. The coherence protocol introduces overheads of 0.24\% in execution time and of 1.06\% in energy consumption to enable the usage of the hybrid memory system.", acknowledgement = ack-nhfb, articleno = "89", } @InProceedings{Schindewolf:2012:WSA, author = "Martin Schindewolf and Barna Bihari and John Gyllenhaal and Martin Schulz and Amy Wang and Wolfgang Karl", title = "What scientific applications can benefit from hardware transactional memory?", crossref = "Hollingsworth:2012:SPI", pages = "90:1--90:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a073.pdf", abstract = "Achieving efficient and correct synchronization of multiple threads is a difficult and error-prone task at small scale and, as we march towards extreme scale computing, will be even more challenging when the resulting application is supposed to utilize millions of cores efficiently. Transactional Memory (TM) is a promising technique to ease the burden on the programmer, but has only recently become available on commercial hardware in the new Blue Gene/Q system; hence, the real benefit for realistic applications has not yet been studied. This paper presents the first performance results of TM embedded into OpenMP on a prototype system of BG/Q and characterizes code properties that will likely lead to benefits when augmented with TM primitives. We first study the influence of thread count, environment variables, and memory layout on TM performance and identify code properties that will yield performance gains with TM. Second, we evaluate the combination of OpenMP with multiple synchronization primitives on top of MPI to determine suitable task-to-thread ratios per node. Finally, we condense our findings into a set of best practices. These are applied to a Monte Carlo Benchmark and a Smoothed Particle Hydrodynamics method.
In both cases, an optimized TM version, executed with 64 threads on one node, outperforms a simple TM implementation. MCB with optimized TM yields a speedup of 27.45 over the baseline.", acknowledgement = ack-nhfb, articleno = "90", } @InProceedings{Grigori:2012:PTL, author = "Laura Grigori and Radek Stompor and Mikolaj Szydlarski", title = "A parallel two-level preconditioner for cosmic microwave background map-making", crossref = "Hollingsworth:2012:SPI", pages = "91:1--91:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a084.pdf", abstract = "Generalized least square problems with non-diagonal weights arise frequently in the estimation of two-dimensional images from data of cosmological as well as astro- or geophysical observations. As the observational data sets keep growing at Moore's rate, with their volumes exceeding tens and hundreds of billions of samples, the need for fast and efficiently parallelizable iterative solvers is generally recognized. In this work we propose a new iterative algorithm for solving generalized least square systems with weights given by a block-diagonal matrix with Toeplitz blocks. Such cases are physically well motivated and correspond to measurement noise being piece-wise stationary --- a common occurrence in many actual observations. Our iterative algorithm is based on the conjugate gradient method and includes a parallel two-level preconditioner (2lvl-PCG) constructed from a limited number of sparse vectors estimated from the coefficients of the initial linear system. Our prototypical application is the map-making problem in Cosmic Microwave Background data analysis. We show experimentally that our parallel implementation of 2lvl-PCG outperforms the standard one-level PCG by a factor of up to 6 in terms of both the convergence rate and the time to solution on up to 12,228 cores of NERSC's Cray XE6 (Hopper) system, displaying nearly perfect strong and weak scaling behavior in this regime.", acknowledgement = ack-nhfb, articleno = "91", } @InProceedings{Speck:2012:MST, author = "R. Speck and D. Ruprecht and R. Krause and M. Emmett and M. Minion and M. Winkel and P. Gibbon", title = "A massively space-time parallel {$N$}-body solver", crossref = "Hollingsworth:2012:SPI", pages = "92:1--92:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a083.pdf", abstract = "We present a novel space-time parallel version of the Barnes--Hut tree code pepc using pfasst, the Parallel Full Approximation Scheme in Space and Time. The naive use of increasingly more processors for a fixed-size N-body problem is prone to saturate as soon as the number of unknowns per core becomes too small. To overcome this intrinsic strong-scaling limit, we introduce temporal parallelism on top of pepc's existing hybrid MPI/PThreads spatial decomposition. Here, we use pfasst, which is based on a combination of the iterations of the parallel-in-time algorithm parareal with the sweeps of spectral deferred correction (SDC) schemes. By combining these sweeps with multiple space-time discretization levels, pfasst relaxes the theoretical bound on parallel efficiency in parareal.
We present results from runs on up to 262,144 cores of the IBM Blue Gene/P installation JUGENE, demonstrating that the space-time parallel code provides speedup beyond the saturation of the purely space-parallel approach.", acknowledgement = ack-nhfb, articleno = "92", } @InProceedings{Fujisawa:2012:HPG, author = "Katsuki Fujisawa and Hitoshi Sato and Satoshi Matsuoka and Toshio Endo and Makoto Yamashita and Maho Nakata", title = "High-performance general solver for extremely large-scale semidefinite programming problems", crossref = "Hollingsworth:2012:SPI", pages = "93:1--93:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a082.pdf", abstract = "Semidefinite programming (SDP) is one of the most important optimization problems at present. It is relevant to a wide range of fields such as combinatorial optimization, structural optimization, control theory, economics, quantum chemistry, sensor network location, and data mining. The capability to solve extremely large-scale SDP problems will have a significant effect on the current and future applications of SDP. In 1995, Fujisawa et al. started the SDPA (Semidefinite Programming Algorithm) Project aimed at solving large-scale SDP problems with high numerical stability and accuracy. SDPA is one of the main codes for solving general SDPs. SDPARA is a parallel version of SDPA on multiple processors with distributed memory, and it replaces the two major bottleneck parts of SDPA (the generation of the Schur complement matrix and its Cholesky factorization) with parallel implementations. In particular, it has been successfully applied to combinatorial optimization and truss topology optimization. The new version of SDPARA (7.5.0-G) on a large-scale supercomputer called TSUBAME 2.0 at the Tokyo Institute of Technology has successfully been used to solve the largest SDP problem (which has over 1.48 million constraints) and created a new world record. Our implementation has also achieved 533 TFlops in double precision for large-scale Cholesky factorization using 2,720 CPUs and 4,080 GPUs.", acknowledgement = ack-nhfb, articleno = "93", } @InProceedings{VanderWijngaart:2012:EBN, author = "Rob F. {Van der Wijngaart} and Srinivas Sridharan and Victor W. Lee", title = "Extending the {BT NAS} parallel benchmark to exascale computing", crossref = "Hollingsworth:2012:SPI", pages = "94:1--94:9", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a094.pdf", abstract = "The NAS Parallel Benchmarks (NPB) are a well-known suite of benchmarks that proxy scientific computing applications. They specify several problem sizes that represent how such applications may run on different sizes of HPC systems. However, even the largest problem (class F) is still far too small to properly exercise a petascale supercomputer. Our work shows how one may scale the Block Tridiagonal (BT) NPB from today's published size to petascale and exascale computing systems. In this paper we discuss the pros and cons of various ways of scaling. We discuss how scaling BT would impact computation, memory access, and communications, and highlight the expected bottleneck, which turns out to be not memory or communication bandwidth, but network latency.
We present two complementary ways to overcome these latency obstacles. We also describe a practical method to gather approximate performance data for BT at exascale on actual hardware, without requiring an exascale system.", acknowledgement = ack-nhfb, articleno = "94", } @InProceedings{Frasca:2012:NAG, author = "Michael Frasca and Kamesh Madduri and Padma Raghavan", title = "{NUMA}-aware graph mining techniques for performance and energy efficiency", crossref = "Hollingsworth:2012:SPI", pages = "95:1--95:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a096.pdf", abstract = "We investigate dynamic methods to improve the power and performance profiles of large irregular applications on modern multi-core systems. In this context, we study a large sparse graph application, Betweenness Centrality, and focus on memory behavior as core count scales. We introduce new techniques to efficiently map the computational demands onto non-uniform memory architectures (NUMA). Our dynamic design adapts to hardware topology and dramatically improves both energy and performance. These gains are more significant at higher core counts. We implement a scheme for adaptive data layout, which reorganizes the graph after observing parallel access patterns, and a dynamic task scheduler that encourages shared data between neighboring cores. We measure performance and energy consumption on a modern multi-core machine and observe that mean execution time is reduced by 51.2\% and energy is reduced by 52.4\%.", acknowledgement = ack-nhfb, articleno = "95", } @InProceedings{Williams:2012:OGM, author = "Samuel Williams and Dhiraj D. Kalamkar and Amik Singh and Anand M. Deshpande and Brian {Van Straalen} and Mikhail Smelyanskiy and Ann Almgren and Pradeep Dubey and John Shalf and Leonid Oliker", title = "Optimization of geometric multigrid for emerging multi- and manycore processors", crossref = "Hollingsworth:2012:SPI", pages = "96:1--96:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a095.pdf", abstract = "Multigrid methods are widely used to accelerate the convergence of iterative solvers for linear systems used in a number of different application areas. In this paper, we explore optimization techniques for geometric multigrid on existing and emerging multicore systems including the Opteron-based Cray XE6, Intel\reg{} Xeon\reg{} E5-2670 and X5550 processor-based InfiniBand clusters, as well as the new Intel\reg{} Xeon Phi\TM{} coprocessor (Knights Corner). Our work examines a variety of novel techniques including communication aggregation, threaded wavefront-based DRAM communication avoidance, dynamic threading decisions, SIMDization, and fusion of operators. We quantify performance through each phase of the V-cycle for both single-node and distributed-memory experiments and provide detailed analysis for each class of optimization. Results show our optimizations yield significant speedups across a variety of subdomain sizes while simultaneously demonstrating the potential of multi- and manycore processors to dramatically accelerate single-node performance.
However, our analysis also indicates that improvements in networks and communication will be essential to reap the potential of manycore processors in large-scale multigrid calculations.", acknowledgement = ack-nhfb, articleno = "96", } @InProceedings{Bhatele:2012:MAC, author = "Abhinav Bhatele and Todd Gamblin and Steven H. Langer and Peer-Timo Bremer and Erik W. Draeger and Bernd Hamann and Katherine E. Isaacs and Aaditya G. Landge and Joshua A. Levine and Valerio Pascucci and Martin Schulz and Charles H. Still", title = "Mapping applications with collectives over sub-communicators on torus networks", crossref = "Hollingsworth:2012:SPI", pages = "97:1--97:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a027.pdf", abstract = "The placement of tasks in a parallel application on specific nodes of a supercomputer can significantly impact performance. Traditionally, this task mapping has focused on reducing the distance between communicating tasks on the physical network. This minimizes the number of hops that point-to-point messages travel and thus reduces link sharing between messages and the resulting contention. However, for applications that use collectives over sub-communicators, this heuristic may not be optimal. Many collectives can benefit from an increase in bandwidth even at the cost of an increase in hop count, especially when sending large messages. For example, placing communicating tasks in a cube configuration rather than a plane or a line on a torus network increases the number of possible paths messages might take. This increases the available bandwidth which can lead to significant performance gains. We have developed Rubik, a tool that provides a simple and intuitive interface to create a wide variety of mappings for structured communication patterns. Rubik supports a number of elementary operations, such as splits, tilts, and shifts, which can be combined into a large number of unique patterns. Each operation can be applied to disjoint groups of processes involved in collectives to increase the effective bandwidth. We demonstrate the use of Rubik for improving performance of two parallel codes, pF3D and Qbox, which use collectives over sub-communicators.", acknowledgement = ack-nhfb, articleno = "97", } @InProceedings{Hoefler:2012:OPC, author = "Torsten Hoefler and Timo Schneider", title = "Optimization principles for collective neighborhood communications", crossref = "Hollingsworth:2012:SPI", pages = "98:1--98:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a028.pdf", abstract = "Many scientific applications operate in a bulk-synchronous mode of iterative communication and computation steps. Even though the communication steps happen at the same logical time, important patterns such as stencil computations cannot be expressed as collective communications in MPI. We demonstrate how neighborhood collective operations allow arbitrary collective communication relations to be specified at run-time, enabling optimizations similar to those for traditional collective calls. We show a number of optimization opportunities and algorithms for different communication scenarios.
We also show how users can assert constraints that provide additional optimization opportunities in a portable way. We demonstrate the utility of all described optimizations in a highly optimized implementation of neighborhood collective operations. Our communication and protocol optimizations result in a performance improvement of up to a factor of two for small stencil communications. We found that, for some patterns, our optimization heuristics automatically generate communication schedules that are comparable to hand-tuned collectives. With those optimizations in place, we are able to accelerate arbitrary collective communication patterns, such as regular and irregular stencils, with optimization methods for collective communications. We expect that our methods will influence the design of future MPI libraries and provide a significant performance benefit on large-scale systems.", acknowledgement = ack-nhfb, articleno = "98", } @InProceedings{Cui:2012:OOB, author = "Zheng Cui and Lei Xia and Patrick G. Bridges and Peter A. Dinda and John R. Lange", title = "Optimizing overlay-based virtual networking through optimistic interrupts and cut-through forwarding", crossref = "Hollingsworth:2012:SPI", pages = "99:1--99:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a029.pdf", abstract = "Overlay-based virtual networking provides a powerful model for realizing virtual distributed and parallel computing systems with strong isolation, portability, and recoverability properties. However, in extremely high throughput and low latency networks, such overlays can suffer from bandwidth and latency limitations, which is of particular concern if we want to apply the model in HPC environments. Through careful study of an existing very high performance overlay-based virtual network system, we have identified two core issues limiting performance: delayed and/or excessive virtual interrupt delivery into guests, and copies between host and guest data buffers done during encapsulation. We respond with two novel optimizations: optimistic, timer-free virtual interrupt injection, and zero-copy cut-through data forwarding. These optimizations improve the latency and bandwidth of the overlay network on 10 Gbps interconnects, resulting in near-native performance for a wide range of microbenchmarks and MPI application benchmarks.", acknowledgement = ack-nhfb, articleno = "99", } @InProceedings{Georganas:2012:CAO, author = "Evangelos Georganas and Jorge Gonz{\'a}lez-Dom{\'\i}nguez and Edgar Solomonik and Yili Zheng and Juan Touri{\~n}o and Katherine Yelick", title = "Communication avoiding and overlapping for numerical linear algebra", crossref = "Hollingsworth:2012:SPI", pages = "100:1--100:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a061.pdf", abstract = "To efficiently scale dense linear algebra problems to future exascale systems, communication cost must be avoided or overlapped. Communication-avoiding 2.5D algorithms improve scalability by reducing inter-processor data transfer volume at the cost of extra memory usage. Communication overlap attempts to hide messaging latency by pipelining messages and overlapping them with computational work.
We study the interaction and compatibility of these two techniques for two matrix multiplication algorithms (Cannon and SUMMA), triangular solve, and Cholesky factorization. For each algorithm, we construct a detailed performance model that considers both critical path dependencies and idle time. We give novel implementations of 2.5D algorithms with overlap for each of these problems. Our software employs UPC, a partitioned global address space (PGAS) language that provides fast one-sided communication. We show communication avoidance and overlap provide a cumulative benefit as core counts scale, including results using over 24K cores of a Cray XE6 system.", acknowledgement = ack-nhfb, articleno = "100", } @InProceedings{Lipshitz:2012:CAP, author = "Benjamin Lipshitz and Grey Ballard and James Demmel and Oded Schwartz", title = "Communication-avoiding parallel {Strassen}: implementation and performance", crossref = "Hollingsworth:2012:SPI", pages = "101:1--101:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a063.pdf", abstract = "Matrix multiplication is a fundamental kernel of many high performance and scientific computing applications. Most parallel implementations use classical O(n$^3$) matrix multiplication, even though there exist algorithms with lower arithmetic complexity. We recently presented a new Communication-Avoiding Parallel Strassen algorithm (CAPS), based on Strassen's fast matrix multiplication, that minimizes communication (SPAA '12). It communicates asymptotically less than all classical and all previous Strassen-based algorithms, and it attains theoretical lower bounds. In this paper we show that CAPS is also faster in practice. We benchmark and compare its performance to previous algorithms on Hopper (Cray XE6), Intrepid (IBM BG/P), and Franklin (Cray XT4). We demonstrate significant speedups over previous algorithms both for large matrices and for small matrices on large numbers of processors. We model and analyze the performance of CAPS and predict its performance on future exascale platforms.", acknowledgement = ack-nhfb, articleno = "101", } @InProceedings{Avron:2012:MDM, author = "Haim Avron and Anshul Gupta", title = "Managing data-movement for effective shared-memory parallelization of out-of-core sparse solvers", crossref = "Hollingsworth:2012:SPI", pages = "102:1--102:11", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a062.pdf", abstract = "Direct methods for solving sparse linear systems are robust and typically exhibit good performance, but often require large amounts of memory due to fill-in. Many industrial applications use out-of-core techniques to mitigate this problem. However, parallelizing sparse out-of-core solvers poses some unique challenges because accessing secondary storage introduces serialization and I/O overhead. We analyze the data-movement costs and memory versus parallelism trade-offs in a shared-memory parallel out-of-core linear solver for sparse symmetric systems. We propose an algorithm that uses a novel memory management scheme and adaptive task parallelism to reduce the data-movement costs.
We present experiments to show that our solver is faster than existing out-of-core sparse solvers on a single core, and is more scalable than the only other known shared-memory parallel out-of-core solver. This work is also directly applicable at the node level in a distributed-memory parallel scenario.", acknowledgement = ack-nhfb, articleno = "102", } @InProceedings{Faanes:2012:CCS, author = "Greg Faanes and Abdulla Bataineh and Duncan Roweth and Tom Court and Edwin Froese and Bob Alverson and Tim Johnson and Joe Kopnick and Mike Higgins and James Reinhard", title = "{Cray Cascade}: a scalable {HPC} system based on a {Dragonfly} network", crossref = "Hollingsworth:2012:SPI", pages = "103:1--103:9", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a079.pdf", abstract = "Higher global bandwidth requirements for many applications and lower network costs have motivated the use of the Dragonfly network topology for high performance computing systems. In this paper we present the architecture of the Cray Cascade system, a distributed memory system based on the Dragonfly [1] network topology. We describe the structure of the system, its Dragonfly network, and the routing algorithms. We describe a set of advanced features supporting both mainstream high performance computing applications and emerging global address space programming models. We present a combination of performance results from prototype systems and simulation data for large systems. We demonstrate the value of the Dragonfly topology and the benefits obtained through extensive use of adaptive routing.", acknowledgement = ack-nhfb, articleno = "103", } @InProceedings{Makino:2012:GAG, author = "Junichiro Makino and Hiroshi Daisaka", title = "{GRAPE-8}: an accelerator for gravitational {$N$}-body simulation with {20.5Gflops/W} performance", crossref = "Hollingsworth:2012:SPI", pages = "104:1--104:10", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a081.pdf", abstract = "In this paper, we describe the design and performance of the GRAPE-8 accelerator processor for gravitational N-body simulations. It is designed to evaluate gravitational interactions with a cutoff between particles. The cutoff function is useful for schemes like TreePM or Particle-Particle Particle-Tree, in which the gravitational force is divided into short-range and long-range components. A single GRAPE-8 processor chip integrates 48 pipeline processors. The effective number of floating-point operations per interaction is around 40. Thus the peak performance of a single GRAPE-8 processor chip is 480 Gflops. A GRAPE-8 processor card houses two GRAPE-8 chips and one FPGA chip for the PCI-Express interface. The total power consumption of the board is 46 W. Thus, the theoretical peak performance per watt is 20.5 Gflops/W. The effective performance of the total system, including the host computer, is around 5 Gflops/W.
This is more than a factor of two higher than the highest number in the current Green500 list.", acknowledgement = ack-nhfb, articleno = "104", } @InProceedings{Thorson:2012:SUF, author = "Greg Thorson and Michael Woodacre", title = "{SGI UV2}: a fused computation and data analysis machine", crossref = "Hollingsworth:2012:SPI", pages = "105:1--105:9", year = "2012", bibdate = "Thu Nov 15 07:38:35 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", URL = "http://conferences.computer.org/sc/2012/papers/1000a080.pdf", abstract = "UV2 is SGI's second-generation data fusion system. UV2 was designed to meet the latest challenges facing users in computation and data analysis. Its unique ability to perform both functions on a single platform enables efficient, easy-to-manage workflows. This platform has a hybrid infrastructure, leveraging the latest Intel\reg{} EP processors to provide industry-leading computational power. Due to its high-bandwidth, extremely low-latency NUMALink\reg{}6 (NL6) interconnect, plus vectorized synchronization and data movement, UV2 provides industry-leading data-intensive capability. It supports a single operating system (OS) image up to 64 TB and 4K threads. Multiple OS images can be deployed on a single NL6 fabric, which has a single flat address space up to 8 PB and 256K threads. These capabilities allow for extreme performance on a broad range of programming models and languages including OpenMP [1], MPI, UPC [2], CAF [3], and SHMEM. The architecture, implementation, and performance of UV2 are detailed.", acknowledgement = ack-nhfb, articleno = "105", } %%% ==================================================================== %%% Cross-referenced entries must come last: @Proceedings{Hollingsworth:2012:SPI, editor = "Jeffrey Hollingsworth", booktitle = "{SC '12: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, Salt Lake Convention Center, Salt Lake City, UT, USA, November 10--16, 2012}", title = "{SC '12: Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis, Salt Lake Convention Center, Salt Lake City, UT, USA, November 10--16, 2012}", publisher = pub-IEEE, address = pub-IEEE:adr, year = "2012", ISBN = "1-4673-0804-8", ISBN-13 = "978-1-4673-0804-5", bibdate = "Thu Nov 15 07:35:55 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib", acknowledgement = ack-nhfb, }
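%%% ====================================================================
%%% Illustrative sketches (kept as comments so the file remains valid
%%% BibTeX; anchors refer to entry keys above, not to any code from
%%% the cited papers).
%%%
%%% March:2012:OCP concerns fast algorithms for the n-point correlation
%%% functions (npcf), whose direct computation costs O(N^n) for N
%%% points. As a hedged illustration only --- not the authors' optimized
%%% method --- the following Python sketch shows the brute-force n = 2
%%% case that such codes accelerate: histogramming all pairwise
%%% separations into radial bins. All names are illustrative
%%% assumptions.
%%%
%%% import numpy as np
%%%
%%% def two_point_histogram(points, edges):
%%%     """Brute-force O(N^2) reference: bin every pairwise distance
%%%     into the radial bins delimited by `edges`."""
%%%     hist = np.zeros(len(edges) - 1, dtype=np.int64)
%%%     for i in range(len(points)):
%%%         # distances from point i to all later points (each pair once)
%%%         d = np.linalg.norm(points[i + 1:] - points[i], axis=1)
%%%         h, _ = np.histogram(d, bins=edges)
%%%         hist += h
%%%     return hist
%%%
%%% rng = np.random.default_rng(0)
%%% pts = rng.random((2000, 3))                 # 2000 random 3D points
%%% print(two_point_histogram(pts, np.linspace(0.0, 0.5, 11)))
%%%
%%% The quadratic loop is exactly the cost the paper's tuned code
%%% avoids; a sketch like this is mainly useful as a correctness check.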
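%%%
%%% Lipshitz:2012:CAP builds on Strassen's fast matrix multiplication.
%%% For readers unfamiliar with the underlying recursion, here is a
%%% minimal serial Python sketch of the classical seven-product
%%% Strassen scheme, assuming square power-of-two matrices; it
%%% illustrates only the arithmetic, not the paper's
%%% communication-avoiding parallel schedule (CAPS).
%%%
%%% import numpy as np
%%%
%%% def strassen(A, B, cutoff=64):
%%%     """Multiply square power-of-two matrices via Strassen's
%%%     recursion; fall back to the classical product below `cutoff`."""
%%%     n = A.shape[0]
%%%     if n <= cutoff:
%%%         return A @ B
%%%     h = n // 2
%%%     A11, A12, A21, A22 = A[:h, :h], A[:h, h:], A[h:, :h], A[h:, h:]
%%%     B11, B12, B21, B22 = B[:h, :h], B[:h, h:], B[h:, :h], B[h:, h:]
%%%     M1 = strassen(A11 + A22, B11 + B22, cutoff)  # seven recursive
%%%     M2 = strassen(A21 + A22, B11, cutoff)        # products instead
%%%     M3 = strassen(A11, B12 - B22, cutoff)        # of eight
%%%     M4 = strassen(A22, B21 - B11, cutoff)
%%%     M5 = strassen(A11 + A12, B22, cutoff)
%%%     M6 = strassen(A21 - A11, B11 + B12, cutoff)
%%%     M7 = strassen(A12 - A22, B21 + B22, cutoff)
%%%     C = np.empty_like(A)
%%%     C[:h, :h] = M1 + M4 - M5 + M7
%%%     C[:h, h:] = M3 + M5
%%%     C[h:, :h] = M2 + M4
%%%     C[h:, h:] = M1 - M2 + M3 + M6
%%%     return C
%%%
%%% The seven half-size products (instead of eight) give the
%%% O(n^{log2 7}) arithmetic complexity that CAPS parallelizes while
%%% minimizing communication.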
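%%%
%%% Chen:2012:LUH reports 858 GUPS on the HPCC Random Access benchmark.
%%% As a hedged sketch only --- the real benchmark prescribes a specific
%%% pseudo-random update stream and an error tolerance, both omitted
%%% here --- this Python fragment shows the kind of kernel being
%%% measured: scattered XOR read-modify-write updates to a large table,
%%% reported as giga-updates per second. Names are illustrative.
%%%
%%% import time
%%% import numpy as np
%%%
%%% def toy_gups(log2_table_size=20, n_updates=1_000_000):
%%%     """Time scattered XOR updates into a 2**log2_table_size table."""
%%%     size = 1 << log2_table_size
%%%     table = np.arange(size, dtype=np.uint64)
%%%     rng = np.random.default_rng(0)
%%%     idx = rng.integers(0, size, n_updates)           # update targets
%%%     val = rng.integers(0, 1 << 63, n_updates, dtype=np.uint64)
%%%     t0 = time.perf_counter()
%%%     np.bitwise_xor.at(table, idx, val)  # unbuffered scatter updates
%%%     return n_updates / (time.perf_counter() - t0) / 1e9
%%%
%%% print(f"{toy_gups():.4f} GUPS (single-node toy measurement)")
%%% ====================================================================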