%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.97", %%% date = "28 March 2026", %%% time = "08:07:03 MDT", %%% filename = "taco.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% URL = "https://www.math.utah.edu/~beebe", %%% checksum = "13192 48904 257236 2426712", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "ACM Transactions on Architecture and Code %%% Optimization; bibliography; TACO", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE BibTeX bibliography for %%% ACM Transactions on Architecture and Code %%% Optimization (CODEN ????, ISSN 1544-3566 %%% (print), 1544-3973 (electronic)), covering %%% all journal issues from 2004 -- date. %%% %%% At version 1.97, the COMPLETE journal %%% coverage looked like this: %%% %%% 2004 ( 17) 2012 ( 61) 2020 ( 47) %%% 2005 ( 17) 2013 ( 103) 2021 ( 56) %%% 2006 ( 19) 2014 ( 34) 2022 ( 61) %%% 2007 ( 19) 2015 ( 66) 2023 ( 62) %%% 2008 ( 21) 2016 ( 91) 2024 ( 91) %%% 2009 ( 20) 2017 ( 55) 2025 ( 168) %%% 2010 ( 21) 2018 ( 39) 2026 ( 38) %%% 2011 ( 17) 2019 ( 63) %%% %%% Article: 1186 %%% %%% Total entries: 1186 %%% %%% The journal Web page can be found at: %%% %%% http://www.acm.org/pubs/taco.html %%% %%% The journal table of contents page is at: %%% %%% http://www.acm.org/taco/ %%% http://portal.acm.org/browse_dl.cfm?idx=J924 %%% https://dl.acm.org/loi/taco %%% %%% Qualified subscribers can retrieve the full %%% text of recent articles in PDF form. %%% %%% The initial draft was extracted from the ACM %%% Web pages. 
%%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. %%% %%% bibsource keys in the bibliography entries %%% below indicate the entry originally came %%% from the computer science bibliography %%% archive, even though it has likely since %%% been corrected and updated. %%% %%% URL keys in the bibliography point to %%% World Wide Web locations of additional %%% information about the entry. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% In this bibliography, entries are sorted in %%% publication order, using ``bibsort -byvolume.'' %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility." %%% } %%% ==================================================================== @Preamble{"\input bibnames.sty" # "\def \TM {${}^{\sc TM}$}" # "\ifx \undefined \pkg \def \pkg #1{{{\tt #1}}} \fi" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. 
Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|https://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Journal abbreviations: @String{j-TACO = "ACM Transactions on Architecture and Code Optimization"} %%% ==================================================================== %%% Bibliography entries: @Article{Calder:2004:I, author = "Brad Calder and Dean Tullsen", title = "Introduction", journal = j-TACO, volume = "1", number = "1", pages = "1--2", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2004:RIC, author = "W. Zhang and J. S. Hu and V. Degalahal and M. Kandemir and N. Vijaykrishnan and M. J. Irwin", title = "Reducing instruction cache energy consumption using a compiler-based strategy", journal = j-TACO, volume = "1", number = "1", pages = "3--33", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Isailovic:2004:DCQ, author = "Nemanja Isailovic and Mark Whitney and Yatish Patel and John Kubiatowicz and Dean Copsey and Frederic T. Chong and Isaac L. 
Chuang and Mark Oskin", title = "Datapath and control for quantum wires", journal = j-TACO, volume = "1", number = "1", pages = "34--61", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sankaralingam:2004:TPA, author = "Karthikeyan Sankaralingam and Ramadass Nagarajan and Haiming Liu and Changkyu Kim and Jaehyuk Huh and Nitya Ranganathan and Doug Burger and Stephen W. Keckler and Robert G. McDonald and Charles R. Moore", title = "{TRIPS}: a polymorphous architecture for exploiting {ILP}, {TLP}, and {DLP}", journal = j-TACO, volume = "1", number = "1", pages = "62--93", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Skadron:2004:TAM, author = "Kevin Skadron and Mircea R. Stan and Karthik Sankaranarayanan and Wei Huang and Sivakumar Velusamy and David Tarjan", title = "Temperature-aware microarchitecture: {Modeling} and implementation", journal = j-TACO, volume = "1", number = "1", pages = "94--125", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Aleta:2004:RCC, author = "Alex Alet{\`a} and Josep M. Codina and Antonio Gonz{\'a}lez and David Kaeli", title = "Removing communications in clustered microarchitectures through instruction replication", journal = j-TACO, volume = "1", number = "2", pages = "127--151", month = jun, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:10 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bai:2004:LPO, author = "Yu Bai and R. Iris Bahar", title = "A low-power in-order\slash out-of-order issue queue", journal = j-TACO, volume = "1", number = "2", pages = "152--179", month = jun, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:10 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Juang:2004:IBP, author = "Philo Juang and Kevin Skadron and Margaret Martonosi and Zhigang Hu and Douglas W. Clark and Philip W. 
Diodato and Stefanos Kaxiras", title = "Implementing branch-predictor decay using quasi-static memory cells", journal = j-TACO, volume = "1", number = "2", pages = "180--219", month = jun, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:10 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Santana:2004:LCF, author = "Oliverio J. Santana and Alex Ramirez and Josep L. Larriba-Pey and Mateo Valero", title = "A low-complexity fetch architecture for high-performance superscalar processors", journal = j-TACO, volume = "1", number = "2", pages = "220--245", month = jun, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 5 07:08:10 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lin:2004:CFS, author = "Jin Lin and Tong Chen and Wei-Chung Hsu and Pen-Chung Yew and Roy Dz-Ching Ju and Tin-Fook Ngai and Sun Chan", title = "A compiler framework for speculative optimizations", journal = j-TACO, volume = "1", number = "3", pages = "247--271", month = sep, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Oct 29 06:39:45 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fields:2004:ICS, author = "Brian A. Fields and Rastislav Bodik and Mark D. Hill and Chris J. Newburn", title = "Interaction cost and shotgun profiling", journal = j-TACO, volume = "1", number = "3", pages = "272--304", month = sep, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Oct 29 06:39:45 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sankaranarayanan:2004:PBA, author = "Karthik Sankaranarayanan and Kevin Skadron", title = "Profile-based adaptation for cache decay", journal = j-TACO, volume = "1", number = "3", pages = "305--322", month = sep, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Oct 29 06:39:45 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xie:2004:IDV, author = "Fen Xie and Margaret Martonosi and Sharad Malik", title = "Intraprogram dynamic voltage scaling: {Bounding} opportunities with analytic modeling", journal = j-TACO, volume = "1", number = "3", pages = "323--367", month = sep, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Oct 29 06:39:45 MDT 2004", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hartstein:2004:OPD, author = "A. Hartstein and Thomas R. Puzak", title = "The optimum pipeline depth considering both power and performance", journal = j-TACO, volume = "1", number = "4", pages = "369--388", month = dec, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 14 12:17:47 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cristal:2004:TKI, author = "Adri{\'a}n Cristal and Oliverio J. Santana and Mateo Valero and Jos{\'e} F. Mart{\'\i}nez", title = "Toward kilo-instruction processors", journal = j-TACO, volume = "1", number = "4", pages = "389--417", month = dec, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 14 12:17:47 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Akkary:2004:ARE, author = "Haitham Akkary and Ravi Rajwar and Srikanth T. Srinivasan", title = "An analysis of a resource efficient checkpoint architecture", journal = j-TACO, volume = "1", number = "4", pages = "418--444", month = dec, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 14 12:17:47 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2004:TML, author = "Chia-Lin Yang and Alvin R. Lebeck and Hung-Wei Tseng and Chien-Hao Lee", title = "Tolerating memory latency through push prefetching for pointer-intensive applications", journal = j-TACO, volume = "1", number = "4", pages = "445--475", month = dec, year = "2004", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 14 12:17:47 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Calder:2005:I, author = "Brad Calder and Dean Tullsen", title = "Introduction", journal = j-TACO, volume = "2", number = "1", pages = "1--2", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2005:EFA, author = "Yuanyuan Zhou and Pin Zhou and Feng Qin and Wei Liu and Josep Torrellas", title = "Efficient and flexible architectural support for dynamic monitoring", journal = j-TACO, volume = "2", number = "1", pages = "3--33", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2005:WHC, author = "Chuanjun Zhang and Frank Vahid and Jun Yang and Walid Najjar", title = "A way-halting cache for low-energy high-performance systems", journal = j-TACO, volume = "2", number = "1", pages = "34--54", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Abella:2005:ISP, author = "Jaume Abella and Antonio Gonz{\'a}lez and Xavier Vera and Michael F. P. O'Boyle", title = "{IATAC}: a smart predictor to turn-off {L2} cache lines", journal = j-TACO, volume = "2", number = "1", pages = "55--77", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Haskins:2005:AWS, author = "John W. {Haskins, Jr.} and Kevin Skadron", title = "Accelerated warmup for sampled microarchitecture simulation", journal = j-TACO, volume = "2", number = "1", pages = "78--108", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2005:ABT, author = "Tao Li and Ravi Bhargava and Lizy Kurian John", title = "Adapting branch-target buffer to improve the target predictability of {Java} code", journal = j-TACO, volume = "2", number = "2", pages = "109--130", month = jun, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 7 14:09:53 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2005:DIE, author = "Lingli Zhang and Chandra Krintz", title = "The design, implementation, and evaluation of adaptive code unloading for resource-constrained devices", journal = j-TACO, volume = "2", number = "2", pages = "131--164", month = jun, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 7 14:09:53 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kulkarni:2005:FES, author = "Prasad A. Kulkarni and Stephen R. Hines and David B. Whalley and Jason D. Hiser and Jack W. Davidson and Douglas L. 
Jones", title = "Fast and efficient searches for effective optimization-phase sequences", journal = j-TACO, volume = "2", number = "2", pages = "165--198", month = jun, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 7 14:09:53 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Salami:2005:DMI, author = "Esther Salam{\'\i} and Mateo Valero", title = "Dynamic memory interval test vs. interprocedural pointer analysis in multimedia applications", journal = j-TACO, volume = "2", number = "2", pages = "199--219", month = jun, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 7 14:09:53 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Meng:2005:ELL, author = "Yan Meng and Timothy Sherwood and Ryan Kastner", title = "Exploring the limits of leakage power reduction in caches", journal = j-TACO, volume = "2", number = "3", pages = "221--246", month = sep, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 5 07:42:22 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Garzaran:2005:TBS, author = "Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Milos Prvulovic and Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and V{\'\i}ctor Vi{\~n}als and Lawrence Rauchwerger and Josep Torrellas", title = "Tradeoffs in buffering speculative memory state for thread-level speculation in multiprocessors", journal = j-TACO, volume = "2", number = "3", pages = "247--279", month = sep, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 5 07:42:22 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tarjan:2005:MPG, author = "David Tarjan and Kevin Skadron", title = "Merging path and gshare indexing in perceptron branch prediction", journal = j-TACO, volume = "2", number = "3", pages = "280--300", month = sep, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 5 07:42:22 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2005:WET, author = "Xiangyu Zhang and Rajiv Gupta", title = "Whole execution traces and their applications", journal = j-TACO, volume = "2", number = "3", pages = "301--334", month = sep, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 5 07:42:22 MDT 2005", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2005:IWA, author = "Wankang Zhao and David Whalley and Christopher Healy and Frank Mueller", title = "Improving {WCET} by applying a {WC} code-positioning optimization", journal = j-TACO, volume = "2", number = "4", pages = "335--365", month = dec, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Feb 16 11:03:13 MST 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "WC (worst case); WCET (worst case execution time)", } @Article{Reis:2005:SCF, author = "George A. Reis and Jonathan Chang and Neil Vachharajani and Ram Rangan and David I. August and Shubhendu S. 
Mukherjee", title = "Software-controlled fault tolerance", journal = j-TACO, volume = "2", number = "4", pages = "366--396", month = dec, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Feb 16 11:03:13 MST 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2005:PPC, author = "Jian Li and Jos{\'e} F. Mart{\'\i}nez", title = "Power-performance considerations of parallel computing on chip multiprocessors", journal = j-TACO, volume = "2", number = "4", pages = "397--422", month = dec, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Feb 16 11:03:13 MST 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sharma:2005:SPE, author = "Saurabh Sharma and Jesse G. Beu and Thomas M. Conte", title = "Spectral prefetcher: {An} effective mechanism for {L2} cache prefetching", journal = j-TACO, volume = "2", number = "4", pages = "423--450", month = dec, year = "2005", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Feb 16 11:03:13 MST 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Calder:2006:I, author = "Brad Calder and Dean Tullsen", title = "Introduction", journal = j-TACO, volume = "3", number = "1", pages = "1--2", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tan:2006:BSS, author = "Lin Tan and Brett Brotherton and Timothy Sherwood", title = "Bit-split string-matching engines for intrusion detection and prevention", journal = j-TACO, volume = "3", number = "1", pages = "3--34", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nagpurkar:2006:ERP, author = "Priya Nagpurkar and Hussam Mousa and Chandra Krintz and Timothy Sherwood", title = "Efficient remote profiling for resource-constrained devices", journal = j-TACO, volume = "3", number = "1", pages = "35--66", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lin:2006:RCG, author = "Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and Roy Dz-Ching Ju and Tin-Fook Ngai", title = "Recovery code generation for general speculative optimizations", journal = j-TACO, volume = "3", number = "1", pages = "67--89", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Choi:2006:ORR, author = "Yoonseo Choi and Hwansoo Han", title = "Optimal register reassignment for register stack overflow minimization", journal = j-TACO, volume = "3", number = "1", pages = "90--114", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xue:2006:LOA, author = "Jingling Xue and Qiong Cai", title = "A lifetime optimal algorithm for speculative {PRE}", journal = j-TACO, volume = "3", number = "2", pages = "115--155", month = jun, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jun 9 06:47:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sharkey:2006:IPT, author = "Joseph J. Sharkey and Dmitry V. Ponomarev and Kanad Ghose and Oguz Ergin", title = "Instruction packing: {Toward} fast and energy-efficient instruction scheduling", journal = j-TACO, volume = "3", number = "2", pages = "156--181", month = jun, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jun 9 06:47:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ceze:2006:CUC, author = "Luis Ceze and Karin Strauss and James Tuck and Josep Torrellas and Jose Renau", title = "{CAVA}: {Using} checkpoint-assisted value prediction to hide {L2} misses", journal = j-TACO, volume = "3", number = "2", pages = "182--208", month = jun, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jun 9 06:47:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2006:EAR, author = "Lixin Zhang and Mike Parker and John Carter", title = "Efficient address remapping in distributed shared-memory systems", journal = j-TACO, volume = "3", number = "2", pages = "209--229", month = jun, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jun 9 06:47:22 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2006:ATP, author = "Min Zhao and Bruce R. Childers and Mary Lou Soffa", title = "An approach toward profit-driven optimization", journal = j-TACO, volume = "3", number = "3", pages = "231--262", month = sep, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1162690.1162691", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 23 07:54:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Although optimizations have been applied for a number of years to improve the performance of software, problems with respect to the application of optimizations have not been adequately addressed. For example, in certain circumstances, optimizations may degrade performance. However, there is no efficient way to know when a degradation will occur. In this research, we investigate the profitability of optimizations, which is useful for determining the benefit of applying optimizations. We develop a framework that enables us to predict profitability using analytic models. The profitability of an optimization depends on code context, the particular optimization, and machine resources. Thus, our framework has analytic models for each of these components. As part of the framework, there is also a profitability engine that uses models to predict the profit. In this paper, we target scalar optimizations and, in particular, describe the models for partial redundancy elimination (PRE), loop invariant code motion (LICM), and value numbering (VN). We implemented the framework for predicting the profitability of these optimizations. Based on the predictions, we can selectively apply profitable optimizations. We compared the profit-driven approach with an approach that uses a heuristic in deciding when optimizations should be applied. 
Our experiments demonstrate that the profitability of scalar optimizations can be accurately predicted by using models. That is, without actually applying a scalar optimization, we can determine if an optimization is beneficial and should be applied.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hazelwood:2006:MBC, author = "Kim Hazelwood and Michael D. Smith", title = "Managing bounded code caches in dynamic binary optimization systems", journal = j-TACO, volume = "3", number = "3", pages = "263--294", month = sep, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1162690.1162692", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 23 07:54:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Dynamic binary optimizers store altered copies of original program instructions in software-managed code caches in order to maximize reuse of transformed code. Code caches store code blocks that may vary in size, reference other code blocks, and carry a high replacement overhead. These unique constraints reduce the effectiveness of conventional cache management policies. Our work directly addresses these unique constraints and presents several contributions to the code-cache management problem. First, we show that evicting more than the minimum number of code blocks from the code cache results in less run-time overhead than the existing alternatives. Such granular evictions reduce overall execution time, as the fixed costs of invoking the eviction mechanism are amortized across multiple cache insertions. Second, a study of the ideal lifetimes of dynamically generated code blocks illustrates the benefit of a replacement algorithm based on a generational heuristic. 
We describe and evaluate a generational approach to code cache management that makes it easy to identify long-lived code blocks and simultaneously avoid any fragmentation because of the eviction of short-lived blocks. Finally, we present results from an implementation of our generational approach in the DynamoRIO framework and illustrate that, as dynamic optimization systems become more prevalent, effective code cache-management policies will be essential for reliable, scalable performance of modern applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rochecouste:2006:CCE, author = "Olivier Rochecouste and Gilles Pokam and Andr{\'e} Seznec", title = "A case for a complexity-effective, width-partitioned microarchitecture", journal = j-TACO, volume = "3", number = "3", pages = "295--326", month = sep, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1162690.1162693", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 23 07:54:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The analysis of program executions reveals that most integer and multimedia applications make heavy use of narrow-width operations, i.e., instructions exclusively using narrow-width operands and producing a narrow-width result. Moreover, this usage is relatively well distributed over the application. We observed this program property on the MediaBench and SPEC2000 benchmarks with about 40\% of the instructions being narrow-width operations. Current superscalar processors use 64-bit datapaths to execute all the instructions of the applications. In this paper, we suggest the use of a width-partitioned microarchitecture (WPM) to master the hardware complexity of a superscalar processor. 
For a four-way issue machine, we split the processor in two two-way clusters: the main cluster executing 64-bit operations, load/store, and complex operations and a narrow cluster executing the 16-bit operations. We resort to partitioning to decouple the treatment of the narrow-width operations from that of the other program instructions. This provides the benefit of greatly simplifying the design of the critical processor components in each cluster (e.g., the register file and the bypass network). The dynamic interleaving of the two instruction types allows maintaining the workload balanced among clusters. WPM also helps to reduce the complexity of the interconnection fabric and of the issue logic. In fact, since the 16-bit cluster can only communicate narrow-width data, the datapath-width of the interconnect fabric can be significantly reduced, yielding a corresponding saving of the interconnect power and area. We explore different possible configurations of WPM, discussing the various implementation tradeoffs. We also examine a speculative steering heuristic to distribute the narrow-width operations among clusters. A detailed analysis of the complexity factors shows using WPM instead of a classical 64-bit two-cluster microarchitecture can save power and silicon area with a minimal impact on the overall performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zmily:2006:BAI, author = "Ahmad Zmily and Christos Kozyrakis", title = "Block-aware instruction set architecture", journal = j-TACO, volume = "3", number = "3", pages = "327--357", month = sep, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1162690.1162694", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 23 07:54:36 MDT 2006", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Instruction delivery is a critical component for wide-issue, high-frequency processors since its bandwidth and accuracy place an upper limit on performance. The processor front-end accuracy and bandwidth are limited by instruction-cache misses, multicycle instruction-cache accesses, and target or direction mispredictions for control-flow operations. This paper presents a block-aware instruction set (BLISS) that allows software to assist with front-end challenges. BLISS defines basic block descriptors that are stored separately from the actual instructions in a program. We show that BLISS allows for a decoupled front-end that tolerates instruction-cache latency, facilitates instruction prefetching, and leads to higher prediction accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Crandall:2006:MAS, author = "Jedidiah R. Crandall and S. Felix Wu and Frederic T. 
Chong", title = "{Minos}: {Architectural} support for protecting control data", journal = j-TACO, volume = "3", number = "4", pages = "359--389", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Marathe:2006:ACC, author = "Jaydeep Marathe and Frank Mueller and Bronis R. de Supinski", title = "Analysis of cache-coherence bottlenecks with hybrid hardware\slash software techniques", journal = j-TACO, volume = "3", number = "4", pages = "390--423", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ganusov:2006:FEP, author = "Ilya Ganusov and Martin Burtscher", title = "Future execution: {A} prefetching mechanism that uses multiple cores to speed up single threads", journal = j-TACO, volume = "3", number = "4", pages = "424--449", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Co:2006:ETC, author = "Michele Co and Dee A. B.
Weikle and Kevin Skadron", title = "Evaluating trace cache energy efficiency", journal = j-TACO, volume = "3", number = "4", pages = "450--476", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hu:2006:EMM, author = "Shiwen Hu and Madhavi Valluri and Lizy Kurian John", title = "Effective management of multiple configurable units using dynamic optimization", journal = j-TACO, volume = "3", number = "4", pages = "477--501", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bentley:2006:IAB, author = "Chris Bentley and Scott A. Watterson and David K. Lowenthal and Barry Rountree", title = "Implicit array bounds checking on 64-bit architectures", journal = j-TACO, volume = "3", number = "4", pages = "502--527", month = dec, year = "2006", CODEN = "????", DOI = "https://doi.org/10.1145/1187976.1187982", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Several programming languages guarantee that array subscripts are checked to ensure they are within the bounds of the array. While this guarantee improves the correctness and security of array-based code, it adds overhead to array references. 
This has been an obstacle to using higher-level languages, such as Java, for high-performance parallel computing, where the language specification requires that all array accesses must be checked to ensure they are within bounds. This is because, in practice, array-bounds checking in scientific applications may increase execution time by more than a factor of 2. Previous research has explored optimizations to statically eliminate bounds checks, but the dynamic nature of many scientific codes makes this difficult or impossible. Our approach is, instead, to create a compiler and operating system infrastructure that does not generate explicit bounds checks. It instead places arrays inside of Index Confinement Regions (ICRs), which are large, isolated, mostly unmapped virtual memory regions. Any array reference outside of its bounds will cause a protection violation; this provides implicit bounds checking. Our results show that when applying this infrastructure to high-performance computing programs written in Java, the overhead of bounds checking relative to a program with no bounds checks is reduced from an average of 63\% to an average of 9\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Calder:2007:I, author = "Brad Calder and Dean Tullsen", title = "Introduction", journal = j-TACO, volume = "4", number = "1", pages = "1:1--1:1", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Constantinides:2007:ARC, author = "Kypros Constantinides and Stephen Plaza and Jason Blome and Valeria Bertacco and Scott Mahlke and Todd Austin and Bin Zhang and Michael Orshansky", title = "Architecting a reliable {CMP} switch architecture", journal = j-TACO, volume = "4", number = "1", pages = "2:1--2:37", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sasanka:2007:AES, author = "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and Yen-Kuang Chen and Eric Debes", title = "{ALP}: {Efficient} support for all levels of parallelism for complex media applications", journal = j-TACO, volume = "4", number = "1", pages = "3:1--3:30", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luo:2007:CNP, author = "Yan Luo and Jia Yu and Jun Yang and Laxmi N. 
Bhuyan", title = "Conserving network processor power consumption by exploiting traffic variability", journal = j-TACO, volume = "4", number = "1", pages = "4:1--4:26", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Soteriou:2007:SDP, author = "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh", title = "Software-directed power-aware interconnection networks", journal = j-TACO, volume = "4", number = "1", pages = "5:1--5:40", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hwang:2007:SSA, author = "Yuan-Shin Hwang and Jia-Jhe Li", title = "Snug set-associative caches: {Reducing} leakage power of instruction and data caches with no performance penalties", journal = j-TACO, volume = "4", number = "1", pages = "6:1--6:28", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rong:2007:SDS, author = "Hongbo Rong and Zhizhong Tang and R. Govindarajan and Alban Douillet and Guang R. Gao", title = "Single-dimension software pipelining for multidimensional loops", journal = j-TACO, volume = "4", number = "1", pages = "7:1--7:44", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bower:2007:ODH, author = "Fred A. Bower and Daniel J. Sorin and Sule Ozev", title = "Online diagnosis of hard faults in microprocessors", journal = j-TACO, volume = "4", number = "2", pages = "8:1--8:??", month = jun, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1250727.1250728", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We develop a microprocessor design that tolerates hard faults, including fabrication defects and in-field faults, by leveraging existing microprocessor redundancy. To do this, we must: detect and correct errors, diagnose hard faults at the field deconfigurable unit (FDU) granularity, and deconfigure FDUs with hard faults. In our reliable microprocessor design, we use DIVA dynamic verification to detect and correct errors. Our new scheme for diagnosing hard faults tracks instructions' core structure occupancy from decode until commit. 
If a DIVA checker detects an error in an instruction, it increments a small saturating error counter for every FDU used by that instruction, including that DIVA checker. A hard fault in an FDU quickly leads to an above-threshold error counter for that FDU and thus diagnoses the fault. For deconfiguration, we use previously developed schemes for functional units and buffers and present a scheme for deconfiguring DIVA checkers. Experimental results show that our reliable microprocessor quickly and accurately diagnoses each hard fault that is injected and continues to function, albeit with somewhat degraded performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "fine-grained diagnosis; hard fault tolerance; processor microarchitecture", } @Article{Michaud:2007:STM, author = "Pierre Michaud and Andr{\'e} Seznec and Damien Fetis and Yiannakis Sazeides and Theofanis Constantinou", title = "A study of thread migration in temperature-constrained multicores", journal = j-TACO, volume = "4", number = "2", pages = "9:1--9:??", month = jun, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1250727.1250729", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Temperature has become an important constraint in high-performance processors, especially multicores. Thread migration will be essential to exploit the full potential of future thermally constrained multicores. We propose and study a thread migration method that maximizes performance under a temperature constraint, while minimizing the number of migrations and ensuring fairness between threads. 
We show that thread migration brings important performance gains and that it is most effective during the first tens of seconds following a decrease of the number of running threads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "multicore processor; power density; temperature; thermal management; thread migration", } @Article{Chen:2007:CRL, author = "Yu Chen and Fuxin Zhang", title = "Code reordering on limited branch offset", journal = j-TACO, volume = "4", number = "2", pages = "10:1--10:??", month = jun, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1250727.1250730", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Since the 1980's code reordering has gained popularity as an important way to improve the spatial locality of programs. While the effect of the processor's microarchitecture and memory hierarchy on this optimization technique has been investigated, little research has focused on the impact of the instruction set. In this paper, we analyze the effect of limited branch offset of the MIPS-like instruction set [Hwu et al. 2004, 2005] on code reordering, explore two simple methods to handle the exceeded branches, and propose the bidirectional code layout (BCL) algorithm to reduce the number of branches exceeding the offset limit. The BCL algorithm sorts the chains according to the position of related chains, avoids cache conflict misses deliberately and lays out the code bidirectionally. It strikes a balance among the distance of related blocks, the instruction cache miss rate, the memory size required, and the control flow transfer. 
Experimental results show that BCL can effectively reduce exceeded branches by 50.1\%, on average, with up to 100\% for some programs. Except for some programs with little spatial locality, the BCL algorithm can achieve almost the same performance as the case with no branch offset limitation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "code reordering; Godson Processor; link-time optimization", } @Article{Terechko:2007:ICC, author = "A. S. Terechko and H. Corporaal", title = "Inter-cluster communication in {VLIW} architectures", journal = j-TACO, volume = "4", number = "2", pages = "11:1--11:??", month = jun, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1250727.1250731", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The traditional VLIW (very long instruction word) architecture with a single register file does not scale up well to address growing performance demands on embedded media processors. However, splitting a VLIW processor in smaller clusters, which are comprised of function units fully connected to local register files, can significantly improve VLSI implementation characteristics of the processor, such as speed, energy consumption, and area. In our paper we reveal that achieving the best characteristics of a clustered VLIW requires a thorough selection of an Inter-cluster Communication (ICC) model, which is the way clustering is exposed in the Instruction Set Architecture. For our study we, first, define a taxonomy of ICC models including copy operations, dedicated issue slots, extended operands, extended results, and multicast. Evaluation of the execution time of the models requires both the dynamic cycle count and clock period.
We developed an advanced instruction scheduler for all the five ICC models in order to quantify the dynamic cycle counts of our multimedia C benchmarks. To assess the clock period of the ICC models we designed and laid out VLIW datapaths using the RTL hardware descriptions derived from a deeply pipelined commercial TriMedia processor. In contrast to prior art, our research shows that fully distributed register file architectures (with eight clusters in our study) often underperform compared to moderately clustered machines with two or four clusters because of explosion of the cycle count overhead in the former. Among the evaluated ICC models, performance of the copy operation model, popular both in academia and industry, is severely limited by the copy operations hampering scheduling of regular operations in high ILP (instruction-level parallelism) code. The dedicated issue slots model combats this limitation by dedicating extra VLIW issue slots purely for ICC, reaching the highest 1.74 execution time speedup relative to the unicluster. Furthermore, our VLSI experiments show that the lowest area and energy consumption of 42 and 57\% relative to the unicluster, respectively, are achieved by the extended operands model, which, nevertheless, provides higher performance than the copy operation model.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "clock frequency; cluster assignment; instruction scheduler; instruction-level parallelism; intercluster communication; optimizing compiler; pipelining; register allocation; VLIW", } @Article{Dou:2007:CCM, author = "Jialin Dou and Marcelo Cintra", title = "A compiler cost model for speculative parallelization", journal = j-TACO, volume = "4", number = "2", pages = "12:1--12:??", month = jun, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1250727.1250732", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Speculative parallelization is a technique that allows code sections that cannot be fully analyzed by the compiler to be aggressively executed in parallel. However, while speculative parallelization can potentially deliver significant speedups, several overheads associated with this technique can limit these speedups in practice. This paper proposes a novel compiler static cost model of speculative multithreaded execution that can be used to predict the resulting performance. This model attempts to predict the expected speedups, or slowdowns, of the candidate speculative sections based on the estimation of the combined runtime effects of various overheads, and taking into account the scheduling restrictions of most speculative execution environments. The model is based on estimating the likely execution duration of threads and considers all the possible permutations of these threads. This model also produces a quantitative estimate of the speedup, which is different from prior heuristics that only qualitatively estimate the benefits of speculative multithreaded execution. 
In previous work, a limited version of the framework was evaluated on a number of loops from a collection of SPEC benchmarks that suffer mainly from load imbalance and thread dispatch and commit overheads. In this work, an extended framework is also evaluated on loops that may suffer from data-dependence violations. Experimental results show that prediction accuracy is lower when loops with violations are included. Nevertheless, accuracy is still very high for a static model: the framework can identify, on average, 45\% of the loops that cause slowdowns and, on average, 96\% of the loops that lead to speedups; it predicts the speedups or slowdowns with an error of less than 20\% for an average of 28\% of the loops across the benchmarks and with an error of less than 50\% for an average of 80\% of the loops. Overall, the framework often outperforms, by as much as 25\%, a naive approach that attempts to speculatively parallelize all the loops considered, and is able to curb the large slowdowns caused in many cases by this naive approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "speculative multithreading; speculative parallelization; thread-level speculation", } @Article{Amme:2007:SBM, author = "Wolfram Amme and Jeffery von Ronne and Michael Franz", title = "{SSA}-based mobile code: {Implementation} and empirical evaluation", journal = j-TACO, volume = "4", number = "2", pages = "13:1--13:??", month = jun, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1250727.1250733", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Although one might expect transportation formats based on static single-assignment form (SSA) to yield faster just-in-time compilation times than those based on stack-based virtual machines, this claim has not previously been validated, in practice. We attempt to quantify the effect of using an SSA-based mobile code representation by integrating support for a verifiable SSA-based IR into Jikes RVM. Performance results, measured with various optimizations and on both the IA32 and PowerPC, show improvements in both compilation time and code quality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "SafeTSA; static single-assignment form; virtual machines", } @Article{Li:2007:CCE, author = "Xiaodong Li and Ritu Gupta and Sarita V. 
Adve and Yuanyuan Zhou", title = "Cross-component energy management: {Joint} adaptation of processor and memory", journal = j-TACO, volume = "4", number = "3", pages = "14:1--14:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1275937.1275938", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Researchers have proposed the use of adaptation to reduce the energy consumption of different hardware components, such as the processor, memory, disk, and display for general-purpose applications. Previous algorithms to control these adaptations, however, have focused on a single component. This work takes the first step toward developing algorithms that can jointly control adaptations in multiple interacting components for general-purpose applications, with the goal of minimizing the total energy consumed within a specified performance loss. Specifically, we develop a joint-adaptation algorithm for processor and memory adaptations. We identify two properties that enable per-component algorithms to be easily used in a cross-component context---the algorithms' performance impact must be guaranteed and composable. We then modify a current processor and a memory algorithm to obey these properties. This allows the cross-component problem to be reduced to determine an appropriate (energy-optimal) allocation of the target performance loss (slack) between the two components. We develop such an optimal slack allocation algorithm that exploits the above properties. The result is an efficient cross-component adaptation framework that minimizes the total energy of the processor and memory without exceeding the target performance loss, while substantially leveraging current per-component algorithms. 
Our experiments show that joint processor and memory adaptation provides significantly more energy savings than adapting either component alone; intelligent slack distribution is specifically effective for highly compute- or memory-intensive applications; and the performance slowdown never exceeds the specification.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "adaptive systems; control algorithms; energy management; low-power design; memory; performance guarantee; processor", } @Article{Gabor:2007:FES, author = "Ron Gabor and Shlomo Weiss and Avi Mendelson", title = "Fairness enforcement in switch on event multithreading", journal = j-TACO, volume = "4", number = "3", pages = "15:1--15:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1275937.1275939", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The need to reduce power and complexity will increase the interest in Switch On Event multithreading (coarse-grained multithreading). Switch On Event multithreading is a low-power and low-complexity mechanism to improve processor throughput by switching threads on execution stalls. Fairness may, however, become a problem in a multithreaded processor. Unless fairness is properly handled, some threads may starve while others consume all of the processor cycles. Heuristics that were devised in order to improve fairness in simultaneous multithreading are not applicable to Switch On Event multithreading. This paper defines the fairness metric using the ratio of the individual threads' speedups and shows how it can be enforced in Switch On Event multithreading. Fairness is controlled by forcing additional thread switch points. 
These switch points are determined dynamically by runtime estimation of the single threaded performance of each of the individual threads. We analyze the impact of the fairness enforcement mechanism on aggregate IPC and weighted speedup. We present simulation results of the performance of Switch On Event multithreading. Switch On Event multithreading achieves an average aggregate IPC increase of 26\% over single thread and 12\% weighted speedup when no fairness is enforced. In this case, a sixth of our runs resulted in poor fairness in which one thread ran extremely slowly (10 to 100 times slower than its single-thread performance), while the other thread's performance was hardly affected. By using the proposed mechanism, we can guarantee fairness at different levels of strictness and, in most cases, even improve the weighted speedup.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "coarse-grained multithreading; fairness; multithreading; performance; SOE; Switch on Event multithreading; throughput; weighted speedup", } @Article{Andrade:2007:PAA, author = "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n Doallo", title = "Precise automatable analytical modeling of the cache behavior of codes with indirections", journal = j-TACO, volume = "4", number = "3", pages = "16:1--16:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1275937.1275940", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The performance of memory hierarchies, in which caches play an essential role, is critical in nowadays general-purpose and embedded computing systems because of the growing memory bottleneck problem. 
Unfortunately, cache behavior is very unstable and difficult to predict. This is particularly true in the presence of irregular access patterns, which exhibit little locality. Such patterns are very common, for example, in applications in which pointers or compressed sparse matrices give place to indirections. Nevertheless, cache behavior in the presence of irregular access patterns has not been widely studied. In this paper we present an extension of a systematic analytical modeling technique based on PMEs (probabilistic miss equations), previously developed by the authors, that allows the automated analysis of the cache behavior for codes with irregular access patterns resulting from indirections. The model generates very accurate predictions despite the irregularities and has very low computing requirements, being the first model that gathers these desirable characteristics that can automatically analyze this kind of codes. These properties enable this model to help drive compiler optimizations, as we show with an example.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "analytical modeling; irregular access patterns; memory hierarchy; performance prediction", } @Article{Venstermans:2007:JOH, author = "Kris Venstermans and Lieven Eeckhout and Koen {De Bosschere}", title = "{Java} object header elimination for reduced memory consumption in 64-bit virtual machines", journal = j-TACO, volume = "4", number = "3", pages = "17:1--17:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1275937.1275941", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Memory performance is an important design issue for contemporary computer systems given the huge processor/memory speed gap. This paper proposes a space-efficient Java object model for reducing the memory consumption of 64-bit Java virtual machines. We completely eliminate the object header through typed virtual addressing (TVA) or implicit typing. TVA encodes the object type in the object's virtual address by allocating all objects of a given type in a contiguous memory segment. This allows for removing the type information as well as the status field from the object header. Whenever type and status information is needed, masking is applied to the object's virtual address for obtaining an offset into type and status information structures. Unlike previous work on implicit typing, we apply TVA to a selected number of frequently allocated object types, hence, the name selective TVA (STVA); this limits the amount of memory fragmentation. In addition to applying STVA, we also compress the type information block (TIB) pointers for all objects that do not fall under TVA. 
We implement the space-efficient Java object model in the 64-bit version of the Jikes RVM on an AIX IBM platform and compare its performance against the traditionally used Java object model using a multitude of Java benchmarks. We conclude that the space-efficient Java object model reduces memory consumption by on average 15\% (and up to 45\% for some benchmarks). About one-half the reduction comes from TIB pointer compression; the other one-half comes from STVA. In terms of performance, the space-efficient object model generally does not affect performance; however, for some benchmarks we observe statistically significant performance speedups, up to 20\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "64-bit implementation; implicit typing; Java object model; typed virtual addressing; Virtual machine", } @Article{Xiao:2007:VIS, author = "Shu Xiao and Edmund M.-K. Lai", title = "{VLIW} instruction scheduling for minimal power variation", journal = j-TACO, volume = "4", number = "3", pages = "18:1--18:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1275937.1275942", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The focus of this paper is on the minimization of the variation in power consumed by a VLIW processor during the execution of a target program through instruction scheduling. The problem is formulated as a mixed-integer program (MIP) and a problem-specific branch-and-bound algorithm has been developed to solve it more efficiently than generic MIP solvers. 
Simulation results based on the TMS320C6711 VLIW digital signal processor using benchmarks from Mediabench and Trimaran showed that over 40\% average reduction in power variation can be achieved without sacrificing execution speed of these benchmarks. Computational requirements and convergence rates of our algorithm are also analyzed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "instruction scheduling; power variation reduction; VLIW processors", } @Article{Tallam:2007:UCF, author = "Sriraman Tallam and Rajiv Gupta", title = "Unified control flow and data dependence traces", journal = j-TACO, volume = "4", number = "3", pages = "19:1--19:??", month = sep, year = "2007", CODEN = "????", DOI = "https://doi.org/10.1145/1275937.1275943", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We describe the design, generation, and compression of the extended whole program path (eWPP), representation that not only captures the control flow history of a program execution but also its data dependence history. This representation is motivated by the observation that, typically, a significant fraction of data dependence history can be recovered from the control flow trace. To capture the remainder of the data dependence history, we introduce disambiguation checks in the program whose control flow signatures capture the results of the checks. The resulting extended control flow trace enables the recovery of otherwise irrecoverable data dependences. The code for the checks is designed to minimize the increase in program execution time and the extended control flow trace size when compared to directly collecting control flow and address traces. 
Our experiments show that compressed eWPPs are only one-quarter of the size of combined compressed control flow and address traces. However, their collection incurs a 5{\times} increase in runtime overhead relative to the overhead required for directly collecting the control flow and address traces, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "address trace; control flow trace; dynamic data dependence trace; profiling", } @Article{Ipek:2008:EAD, author = "Engin Ipek and Sally A. McKee and Karan Singh and Rich Caruana and Bronis R. de Supinski and Martin Schulz", title = "Efficient architectural design space exploration via predictive modeling", journal = j-TACO, volume = "4", number = "4", pages = "1:1--1:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1328195.1328196", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Efficiently exploring exponential-size architectural design spaces with many interacting parameters remains an open problem: the sheer number of experiments required renders detailed simulation intractable. We attack this via an automated approach that builds accurate predictive models. We simulate sampled points, using results to teach our models the function describing relationships among design parameters. The models can be queried and are very fast, enabling efficient design tradeoff discovery. We validate our approach via two uniprocessor sensitivity studies, predicting IPC with only 1--2\% error. In an experimental study using the approach, training on 1\% of a 250-K-point CMP design space allows our models to predict performance with only 4--5\% error. 
Our predictive modeling combines well with techniques that reduce the time taken by each simulation experiment, achieving net time savings of three-four orders of magnitude.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "artificial neural networks; design space exploration; performance prediction; sensitivity studies", } @Article{Shi:2008:VMS, author = "Yunhe Shi and Kevin Casey and M. Anton Ertl and David Gregg", title = "Virtual machine showdown: {Stack} versus registers", journal = j-TACO, volume = "4", number = "4", pages = "2:1--2:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1328195.1328197", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Virtual machines (VMs) enable the distribution of programs in an architecture-neutral format, which can easily be interpreted or compiled. A long-running question in the design of VMs is whether a stack architecture or register architecture can be implemented more efficiently with an interpreter. We extend existing work on comparing virtual stack and virtual register architectures in three ways. First, our translation from stack to register code and optimization are much more sophisticated. The result is that we eliminate an average of more than 46\% of executed VM instructions, with the bytecode size of the register machine being only 26\% larger than that of the corresponding stack one. Second, we present a fully functional virtual-register implementation of the Java virtual machine (JVM), which supports Intel, AMD64, PowerPC and Alpha processors. This register VM supports inline-threaded, direct-threaded, token-threaded, and switch dispatch. 
Third, we present experimental results on a range of additional optimizations such as register allocation and elimination of redundant heap loads. On the AMD64 architecture the register machine using switch dispatch achieves an average speedup of 1.48 over the corresponding stack machine. Even using the more efficient inline-threaded dispatch, the register VM achieves a speedup of 1.15 over the equivalent stack-based VM.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "interpreter; register architecture; stack architecture; virtual machine", } @Article{Yan:2008:EVR, author = "Jun Yan and Wei Zhang", title = "Exploiting virtual registers to reduce pressure on real registers", journal = j-TACO, volume = "4", number = "4", pages = "3:1--3:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1328195.1328198", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "It is well known that a large fraction of variables are short-lived. This paper proposes a novel approach to exploiting this fact to reduce the register pressure for pipelined processors with data-forwarding network. The idea is that the compiler can allocate virtual registers (i.e., place holders to identify dependences among instructions) to short-lived variables, which do not need to be stored to physical storage locations. As a result, real registers (i.e., physically existed registers) can be reserved for long-lived variables for mitigating the register pressure and decreasing the register spills, leading to performance improvement. In this paper, we develop the architectural and compiler support for exploiting virtual registers for statically scheduled processors. 
Our experimental results show that virtual registers are very effective at reducing the register spills, which, in many cases, can achieve the performance close to the processor with twice the number of real registers. Our results also indicate that, for some applications, using 24 virtual, in addition to 8 real registers, can attain even higher performance than that of 16 real registers without any virtual registers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "data forwarding; register allocation; register file; short-lived variables; virtual register", } @Article{Yu:2008:OCL, author = "Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang", title = "Object co-location and memory reuse for {Java} programs", journal = j-TACO, volume = "4", number = "4", pages = "4:1--4:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1328195.1328199", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We introduce a new memory management system, STEMA, which can improve the execution time of Java programs. STEMA detects prolific types on-the-fly and co-locates their objects in a special memory space which supports reuse of memory. We argue and show that memory reuse and co-location of prolific objects can result in improved cache locality, reduced memory fragmentation, reduced GC time, and faster object allocation. We evaluate STEMA using 16 benchmarks. Experimental results show that STEMA performs 2.7\%, 4.0\%, and 8.2\% on average better than MarkSweep, CopyMS, and SemiSpace.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "garbage collector; Java; memory allocator; memory reuse; mutator; object co-location", } @Article{Zhang:2008:RCM, author = "Chuanjun Zhang", title = "Reducing cache misses through programmable decoders", journal = j-TACO, volume = "4", number = "4", pages = "5:1--5:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1328195.1328200", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Level-one caches normally reside on a processor's critical path, which determines clock frequency. Therefore, fast access to level-one cache is important. Direct-mapped caches exhibit faster access time, but poor hit rates, compared with same sized set-associative caches because of nonuniform accesses to the cache sets. The nonuniform accesses generate more cache misses in some sets, while other sets are underutilized. We propose to increase the decoder length and, hence, reduce the accesses to heavily used sets without dynamically detecting the cache set usage information. We increase the access to the underutilized cache sets by incorporating a replacement policy into the cache design using programmable decoders. On average, the proposed techniques achieve as low a miss rate as a traditional 4-way cache on all 26 SPEC2K benchmarks for the instruction and data caches, respectively. This translates into an average IPC improvement of 21.5 and 42.4\% for SPEC2K integer and floating-point benchmarks, respectively. The B-Cache consumes 10.5\% more power per access, but exhibits a 12\% total memory access-related energy savings as a result of the miss rate reductions, and, hence, the reduction to applications' execution time. 
Compared with previous techniques that aim at reducing the miss rate of direct-mapped caches, our technique requires only one cycle to access all cache hits and has the same access time of a direct-mapped cache.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "cache; dynamic optimization; low power", } @Article{Golander:2008:HMP, author = "Amit Golander and Shlomo Weiss", title = "Hiding the misprediction penalty of a resource-efficient high-performance processor", journal = j-TACO, volume = "4", number = "4", pages = "6:1--6:??", month = jan, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1328195.1328201", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Misprediction is a major obstacle for increasing speculative out-of-order processors performance. Performance degradation depends on both the number of misprediction events and the recovery time associated with each one of them. In recent years a few checkpoint based microarchitectures have been proposed. In comparison with ROB-based processors, checkpoint processors are scalable and highly resource efficient. Unfortunately, in these proposals the misprediction recovery time is proportional to the instruction queue size.\par In this paper we analyze methods to reduce the misprediction recovery time. We propose a new register file management scheme and techniques to selectively flush the instruction queue and the load store queue, and to isolate deeply pipelined execution units. The result is a novel checkpoint processor with Constant misprediction RollBack time (CRB). 
We further present a streamlined, cost-efficient solution, which saves complexity at the price of slightly lower performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "checkpoints; misprediction; out-of-order execution; rollback; scalable architecture", } @Article{Calder:2008:E, author = "Brad Calder and Dean Tullsen", title = "Editorial", journal = j-TACO, volume = "5", number = "1", pages = "1:1--1:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1369396.1369397", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mysore:2008:FIP, author = "Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber and Timothy Sherwood and Nisheeth Shrivastava and Subhash Suri", title = "Formulating and implementing profiling over adaptive ranges", journal = j-TACO, volume = "5", number = "1", pages = "2:1--2:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1369396.1369398", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern computer systems are called on to deal with billions of events every second, whether they are executed instructions, accessed memory locations, or forwarded packets. 
This presents a serious challenge to those who seek to quantify, analyze, or optimize such systems, because important trends and behaviors may easily be lost in a sea of data. We present range-adaptive profiling (RAP) as a new and general-purpose profiling method capable of hierarchically efficiently classifying streams of data in hardware. Through the use of RAP, events in an input stream are dynamically classified into increasingly precise categories, based on the frequency with which they occur. The more important a class, or range of events, the more precisely it is quantified. Despite the dynamic nature of our technique, we build upon tight theoretic bounds covering both worst-case error, as well as the required memory. In the limit, it is known that error and the memory bounds can be independent of the stream size and grow only linearly with the level of precision desired. Significantly, we expose the critical constants in these algorithms and through careful engineering, algorithm redesign, and use of heuristics, we show how a high-performance profile system can be implemented for range-adaptive profiling. RAP can be used on various profiles, such as PCs, load values, and memory addresses, and has a broad range of uses, from hot-region profiling to quantifying cache miss value locality. We propose two methods of implementation of RAP, one in software and the other with specialized hardware, for which we also describe our prototype FPGA implementation. We show that with just 8KB of memory, range profiles can be gathered with an average accuracy of 98\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "profiling hardware; range adaptive; value locality", } @Article{Zhai:2008:CHS, author = "Antonia Zhai and J. Gregory Steffan and Christopher B. Colohan and Todd C. 
Mowry", title = "Compiler and hardware support for reducing the synchronization of speculative threads", journal = j-TACO, volume = "5", number = "1", pages = "3:1--3:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1369396.1369399", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Thread-level speculation (TLS) allows us to automatically parallelize general-purpose programs by supporting parallel execution of threads that might not actually be independent. In this article, we focus on one important limitation of program performance under TLS, which stalls as a result of synchronizing and forwarding scalar values between speculative threads that would otherwise cause frequent data dependences and, hence, failed speculation. Using SPECint benchmarks that have been automatically transformed by our compiler to exploit TLS, we present, evaluate in detail, and compare both compiler and hardware techniques for improving the communication of scalar values. We find that through our dataflow algorithms for three increasingly aggressive instruction scheduling techniques, the compiler can drastically reduce the critical forwarding path introduced by the synchronization and forwarding of scalar values. We also show that hardware techniques for reducing synchronization can be complementary to compiler scheduling, but that the additional performance benefits are minimal and are generally not worth the cost.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "automatic parallelization; chip-multiprocessing; instruction scheduling; thread-level speculation", } @Article{Winter:2008:ATN, author = "Jonathan A. Winter and David H. 
Albonesi", title = "Addressing thermal nonuniformity in {SMT} workloads", journal = j-TACO, volume = "5", number = "1", pages = "4:1--4:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1369396.1369400", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We explore DTM techniques within the context of uniform and nonuniform SMT workloads. While DVS is suitable for addressing workloads with uniformly high temperatures, for nonuniform workloads, performance loss occurs because of the slowdown of the cooler thread. To address this, we propose and evaluate DTM mechanisms that exploit the steering-based thread management mechanisms inherent in a clustered SMT architecture. We show that in contrast to DVS, which operates globally, our techniques are more effective at controlling temperature for nonuniform workloads. Furthermore, we devise a DTM technique that combines steering and DVS to achieve consistently good performance across all workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "adaptive microarchitectures; clustered microarchitectures; dynamic thermal management; dynamic voltage scaling; simultaneous multithreading", } @Article{Shahbahrami:2008:VES, author = "Asadollah Shahbahrami and Ben Juurlink and Stamatis Vassiliadis", title = "Versatility of extended subwords and the matrix register file", journal = j-TACO, volume = "5", number = "1", pages = "5:1--5:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1369396.1369401", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Extended subwords and the matrix register file (MRF) are two microarchitectural techniques that address some of the limitations of existing SIMD architectures. Extended subwords are wider than the data stored in memory. Specifically, for every byte of data stored in memory, there are four extra bits in the media register file. This avoids the need for data-type conversion instructions. The MRF is a register file organization that provides both conventional row-wise, as well as column-wise, access to the register file. In other words, it allows to view the register file as a matrix in which corresponding subwords in different registers correspond to a column of the matrix. It was introduced to accelerate matrix transposition which is a very common operation in multimedia applications. In this paper, we show that the MRF is very versatile, since it can also be used for other permutations than matrix transposition. Specifically, it is shown how it can be used to provide efficient access to strided data, as is needed in, e.g., color space conversion. 
Furthermore, it is shown that special-purpose instructions (SPIs), such as the sum-of-absolute differences (SAD) instruction, have limited usefulness when extended subwords and a few general SIMD instructions that we propose are supported, for the following reasons. First, when extended subwords are supported, the SAD instruction provides only a relatively small performance improvement. Second, the SAD instruction processes 8-bit subwords only, which is not sufficient for quarter-pixel resolution nor for cost functions used in image and video retrieval. Results obtained by extending the SimpleScalar toolset show that the proposed techniques provide a speedup of up to 3.00 over the MMX architecture. The results also show that using, at most, 13 extra media registers yields an additional performance improvement ranging from 1.3 to 1.57.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "multimedia standards; SIMD architectures; SIMD programming", } @Article{Guo:2008:EHC, author = "Zhi Guo and Walid Najjar and Betul Buyukkurt", title = "Efficient hardware code generation for {FPGAs}", journal = j-TACO, volume = "5", number = "1", pages = "6:1--6:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1369396.1369402", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The wider acceptance of FPGAs as a computing device requires a higher level of programming abstraction. ROCCC is an optimizing C to HDL compiler. We describe the code generation approach in ROCCC. The smart buffer is a component that reuses input data between adjacent iterations. It significantly improves the performance of the circuit and simplifies loop control. 
The ROCCC-generated datapath can execute one loop iteration per clock cycle when there is no loop dependency or there is only scalar recurrence variable dependency. ROCCC's approach to supporting while-loops operating on scalars makes the compiler able to move scalar iterative computation into hardware.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "data reuse; FPGA; high-level synthesis; reconfigurable computing; VHDL", } @Article{Kotzmann:2008:DJH, author = "Thomas Kotzmann and Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth Russell and David Cox", title = "Design of the {Java HotSpot\TM} client compiler for {Java 6}", journal = j-TACO, volume = "5", number = "1", pages = "7:1--7:??", month = may, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1369396.1370017", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Version 6 of Sun Microsystems' Java HotSpot{\TM} VM ships with a redesigned version of the client just-in-time compiler that includes several research results of the last years. The client compiler is at the heart of the VM configuration used by default for interactive desktop applications. For such applications, low startup and pause times are more important than peak performance. This paper outlines the new architecture of the client compiler and shows how it interacts with the VM. It presents the intermediate representation that now uses static single-assignment (SSA) form and the linear scan algorithm for global register allocation. Efficient support for exception handling and deoptimization fulfills the demands that are imposed by the dynamic features of the Java programming language. 
The evaluation shows that the new client compiler generates better code in less time. The popular SPECjvm98 benchmark suite is executed 45\% faster, while the compilation speed is also up to 40\% better. This indicates that a carefully selected set of global optimizations can also be integrated in just-in-time compilers that focus on compilation speed and not on peak performance. In addition, the paper presents the impact of several optimizations on execution and compilation speed. As the source code is freely available, the Java HotSpot{\TM} VM and the client compiler are the ideal basis for experiments with new feedback-directed optimizations in a production-level Java just-in-time compiler. The paper outlines research projects that add fast algorithms for escape analysis, automatic object inlining, and array bounds check elimination.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "compiler; deoptimization; intermediate representation; Java; just-in-time compilation; optimization; register allocation", } @Article{Rangan:2008:PSD, author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni and David I. August", title = "Performance scalability of decoupled software pipelining", journal = j-TACO, volume = "5", number = "2", pages = "8:1--8:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1400112.1400113", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Any successful solution to using multicore processors to scale general-purpose program performance will have to contend with rising intercore communication costs while exposing coarse-grained parallelism. 
Recently proposed pipelined multithreading (PMT) techniques have been demonstrated to have general-purpose applicability and are also able to effectively tolerate inter-core latencies through pipelined interthread communication. These desirable properties make PMT techniques strong candidates for program parallelization on current and future multicore processors and understanding their performance characteristics is critical to their deployment. To that end, this paper evaluates the performance scalability of a general-purpose PMT technique called decoupled software pipelining (DSWP) and presents a thorough analysis of the communication bottlenecks that must be overcome for optimal DSWP scalability.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "decoupled software pipelining; performance analysis", } @Article{Long:2008:TMM, author = "Jieyi Long and Seda Ogrenci Memik and Gokhan Memik and Rajarshi Mukherjee", title = "Thermal monitoring mechanisms for chip multiprocessors", journal = j-TACO, volume = "5", number = "2", pages = "9:1--9:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1400112.1400114", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With large-scale integration and increasing power densities, thermal management has become an important tool to maintain performance and reliability in modern process technologies. In the core of dynamic thermal management schemes lies accurate reading of on-die temperatures. Therefore, careful planning and embedding of thermal monitoring mechanisms into high-performance systems becomes crucial. 
In this paper, we propose three techniques to create sensor infrastructures for monitoring the maximum temperature on a multicore system. Initially, we extend a nonuniform sensor placement methodology proposed in the literature to handle chip multiprocessors (CMPs) and show its limitations. We then analyze a grid-based approach where the sensors are placed on a static grid covering each core and show that the sensor readings can differ from the actual maximum core temperature by as much as 12.6$^\circ$C when using 16 sensors per core. Also, as large as 10.6\% of the thermal emergencies are not captured using the same number of sensors. Based on this observation, we first develop an interpolation scheme, which estimates the maximum core temperature through interpolation of the readings collected at the static grid points. We show that the interpolation scheme improves the measurement accuracy and emergency coverage compared to grid-based placement when using the same number of sensors. Second, we present a dynamic scheme where only a subset of the sensor readings is collected to predict the maximum temperature of each core. Our results indicate that we can reduce the number of active sensors by as much as 50\%, while maintaining similar measurement accuracy and emergency coverage compared to the case where the entire sensor set on the grid is sampled at all times.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "nonuniform and uniform sensor placement; thermal sensor allocation", } @Article{Joshi:2008:DEP, author = "Ajay Joshi and Lieven Eeckhout and Robert H. {Bell, Jr.} and Lizy K.
John", title = "Distilling the essence of proprietary workloads into miniature benchmarks", journal = j-TACO, volume = "5", number = "2", pages = "10:1--10:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1400112.1400115", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Benchmarks set standards for innovation in computer architecture research and industry product development. Consequently, it is of paramount importance that these workloads are representative of real-world applications. However, composing such representative workloads poses practical challenges to application analysis teams and benchmark developers: (1) real-world workloads are intellectual property and vendors hesitate to share these proprietary applications; and (2) porting and reducing these applications to benchmarks that can be simulated in a tractable amount of time is a nontrivial task. In this paper, we address this problem by proposing a technique that automatically distills key inherent behavioral attributes of a proprietary workload and captures them into a miniature synthetic benchmark clone. The advantage of the benchmark clone is that it hides the functional meaning of the code but exhibits similar performance characteristics as the target application. Moreover, the dynamic instruction count of the synthetic benchmark clone is substantially shorter than the proprietary application, greatly reducing overall simulation time; for SPEC CPU, the simulation time reduction is over five orders of magnitude compared to entire benchmark execution.
Using a set of benchmarks representative of general-purpose, scientific, and embedded applications, we demonstrate that the power and performance characteristics of the synthetic benchmark clone correlate well with those of the original application across a wide range of microarchitecture configurations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "benchmark cloning; benchmarks; workload characterization", } @Article{Catania:2008:RCM, author = "Vincenzo Catania and Maurizio Palesi and Davide Patti", title = "Reducing complexity of multiobjective design space exploration in {VLIW}-based embedded systems", journal = j-TACO, volume = "5", number = "2", pages = "11:1--11:??", month = aug, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1400112.1400116", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Architectures based on very-long instruction word (VLIW) have found fertile ground in multimedia electronic appliances thanks to their ability to exploit high degrees of instruction level parallelism (ILP) with a reasonable trade-off in complexity and silicon cost. Specialization of such architectures involves the configuration of both hardware-related aspects (e.g., register files, functional units, memory subsystem) and software-related issues (e.g., the compilation strategy). The complex interactions between the components of such systems will force a human designer to rely on judgment and experience in designing them, possibly eliminating interesting configurations, and making tuning of the system, for either power, energy, or performance, difficult. 
In this paper we propose tools and methodologies to efficiently cope with this complexity from a multiobjective perspective. We first analyze the impact of ILP-oriented code transformations using two alternative compilation profiles to quantitatively show the effect of such transformations on typical design objectives like performance, power dissipation, and energy consumption. Next, by means of statistical analysis, we collect useful data to predict the effectiveness of a given compilation profile for a specific application. Information gathered from such analysis can be exploited to drastically reduce the computational effort needed to perform the design space exploration.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "design space exploration; energy; genetic algorithms; hyperblock formation; ILP; multiobjective optimization; performances; power; statistical analysis; VLIW architectures", } @Article{Leverich:2008:CEM, author = "Jacob Leverich and Hideho Arakida and Alex Solomatnikov and Amin Firoozshahian and Mark Horowitz and Christos Kozyrakis", title = "Comparative evaluation of memory models for chip multiprocessors", journal = j-TACO, volume = "5", number = "3", pages = "12:1--12:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1455650.1455651", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "There are two competing models for the on-chip memory in Chip Multiprocessor (CMP) systems: {\em hardware-managed coherent caches\/} and {\em software-managed streaming memory}. This paper performs a direct comparison of the two models under the same set of assumptions about technology, area, and computational capabilities.
The goal is to quantify how and when they differ in terms of performance, energy consumption, bandwidth requirements, and latency tolerance for general-purpose CMPs. We demonstrate that for data-parallel applications on systems with up to 16 cores, the cache-based and streaming models perform and scale equally well. For certain applications with little data reuse, streaming scales better due to better bandwidth use and macroscopic software prefetching. However, the introduction of techniques such as hardware prefetching and nonallocating stores to the cache-based model eliminates the streaming advantage. Overall, our results indicate that there is not sufficient advantage in building streaming memory systems where all on-chip memory structures are explicitly managed. On the other hand, we show that streaming at the programming model level is particularly beneficial, even with the cache-based model, as it enhances locality and creates opportunities for bandwidth optimizations. Moreover, we observe that stream programming is actually easier with the cache-based model because the hardware guarantees correct, best-effort execution even when the programmer cannot fully regularize an application's code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "cache coherence; Chip multiprocessors; locality optimizations; parallel programming; streaming memory", } @Article{Sharkey:2008:RRP, author = "Joseph J. Sharkey and Jason Loew and Dmitry V. 
Ponomarev", title = "Reducing register pressure in {SMT} processors through {L2}-miss-driven early register release", journal = j-TACO, volume = "5", number = "3", pages = "13:1--13:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1455650.1455652", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The register file is one of the most critical datapath components limiting the number of threads that can be supported on a simultaneous multithreading (SMT) processor. To allow the use of smaller register files without degrading performance, techniques that maximize the efficiency of using registers through aggressive register allocation/deallocation can be considered. In this article, we propose a novel technique to early deallocate physical registers allocated to threads which experience L2 cache misses. This is accomplished by speculatively committing the load-independent instructions and deallocating the registers corresponding to the previous mappings of their destinations, without waiting for the cache miss request to be serviced. The early deallocated registers are then made immediately available for allocation to instructions within the same thread as well as within other threads, thus improving the overall processor throughput. On the average across the simulated mixes of multiprogrammed SPEC 2000 workloads, our technique results in 33\% improvement in throughput and 25\% improvement in terms of harmonic mean of weighted IPCs over the baseline SMT with the state-of-the-art DCRA policy. This is achieved without creating checkpoints, maintaining per-register counters of pending consumers, performing tag rebroadcasts, register remappings, and/or additional associative searches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "register file; Simultaneous multithreading", } @Article{Mehrara:2008:ESP, author = "Mojtaba Mehrara and Todd Austin", title = "Exploiting selective placement for low-cost memory protection", journal = j-TACO, volume = "5", number = "3", pages = "14:1--14:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1455650.1455653", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many embedded processing applications, such as those found in the automotive or medical field, require hardware designs that are at the same time low cost and reliable. Traditionally, reliable memory systems have been implemented using coded storage techniques, such as ECC. While these designs can effectively detect and correct memory faults such as transient errors and single-bit defects, their use bears a significant cost overhead. In this article, we propose a novel partial memory protection scheme that provides high-coverage fault protection for program code and data, but with much lower cost than traditional approaches. Our approach profiles program code and data usage to assess which program elements are most critical to maintaining program correctness. Critical code and variables are then placed into a limited protected storage resources. To ensure high coverage of program elements, our placement technique considers all program components simultaneously, including code, global variables, stack frames, and heap variables. 
The fault coverage of our approach is gauged using Monte Carlo fault-injection experiments, which confirm that our technique provides high levels of fault protection (99\% coverage) with limited memory protection resources (36\% protected area).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "fault-tolerant design; memory system design; Partial memory protection; selective placement; transient faults", } @Article{Vandierendonck:2008:SRA, author = "Hans Vandierendonck and Andr{\'e} Seznec", title = "Speculative return address stack management revisited", journal = j-TACO, volume = "5", number = "3", pages = "15:1--15:??", month = nov, year = "2008", CODEN = "????", DOI = "https://doi.org/10.1145/1455650.1455654", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Branch prediction feeds a speculative execution processor core with instructions. Branch mispredictions are inevitable and have negative effects on performance and energy consumption. With the advent of highly accurate conditional branch predictors, nonconditional branch instructions are gaining importance.\par In this article, we address the prediction of procedure returns. On modern processors, procedure returns are predicted through a return address stack (RAS). The overwhelming majority of the return mispredictions are due to RAS overflows and/or overwriting the top entries of the RAS on a mispredicted path. These sources of misprediction were addressed by previously proposed speculative return address stacks [Jourdan et al. 1996; Skadron et al. 1998]. 
However, the remaining misprediction rate of these RAS designs is still significant when compared to state-of-the-art conditional predictors.\par We present two low-cost corruption detectors for RAS predictors. They detect RAS overflows and wrong path corruption with 100\% coverage. As a consequence, when such a corruption is detected, another source can be used for predicting the return. On processors featuring a branch target buffer (BTB), this BTB can be used as a free backup predictor for predicting returns when corruption is detected.\par Our experiments show that our proposal can be used to improve the behavior of all previously proposed speculative RASs. For instance, without any specific management of the speculative states on the RAS, an 8-entry BTB-backed up RAS achieves the same performance level as a state-of-the-art, but complex, 64-entry self-checkpointing RAS [Jourdan et al. 1996]. Therefore, our proposal can be used either to improve the performance of the processor or to reduce its hardware complexity.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "back-up predictor; corruption detection; Return address prediction", } @Article{Chhabra:2009:MSP, author = "Siddhartha Chhabra and Brian Rogers and Yan Solihin and Milos Prvulovic", title = "Making secure processors {OS}- and performance-friendly", journal = j-TACO, volume = "5", number = "4", pages = "16:1--16:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1498690.1498691", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Mar 18 21:35:33 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In today's digital world, computer security issues have become increasingly important. 
In particular, researchers have proposed designs for secure processors that utilize hardware-based memory encryption and integrity verification to protect the privacy and integrity of computation even from sophisticated physical attacks. However, currently proposed schemes remain hampered by problems that make them impractical for use in today's computer systems: lack of virtual memory and Inter-Process Communication support as well as excessive storage and performance overheads. In this article, we propose (1) address independent seed encryption (AISE), a counter-mode-based memory encryption scheme using a novel seed composition, and (2) bonsai Merkle trees (BMT), a novel Merkle tree-based memory integrity verification technique, to eliminate these system and performance issues associated with prior counter-mode memory encryption and Merkle tree integrity verification schemes. We present both a qualitative discussion and a quantitative analysis to illustrate the advantages of our techniques over previously proposed approaches in terms of complexity, feasibility, performance, and storage. Our results show that AISE+BMT reduces the overhead of prior memory encryption and integrity verification schemes from 12\% to 2\% on average for single-threaded benchmarks on uniprocessor systems, and from 15\% to 4\% for coscheduled benchmarks on multicore systems while eliminating critical system-level problems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "memory encryption; memory integrity verification; Secure processor architectures; virtualization", } @Article{Jimenez:2009:GNB, author = "Daniel A. 
Jim{\'e}nez", title = "Generalizing neural branch prediction", journal = j-TACO, volume = "5", number = "4", pages = "17:1--17:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1498690.1498692", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Mar 18 21:35:33 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Improved branch prediction accuracy is essential to sustaining instruction throughput with today's deep pipelines. Traditional branch predictors exploit correlations between pattern history and branch outcome to predict branches, but there is a stronger and more natural correlation between path history and branch outcome. We explore the potential for exploiting this correlation. We introduce {\em piecewise linear branch prediction}, an idealized branch predictor that develops a set of linear functions, one for each program path to the branch to be predicted, that separate predicted taken from predicted not taken branches. Taken together, all of these linear functions form a piecewise linear decision surface. We present a limit study of this predictor showing its potential to greatly improve predictor accuracy.\par We then introduce a practical implementable branch predictor based on piecewise linear branch prediction. In making our predictor practical, we show how a parameterized version of it unifies the previously distinct concepts of perceptron prediction and path-based neural prediction. Our new branch predictor has implementation costs comparable to current prominent predictors in the literature while significantly improving accuracy. For a deeply pipelined simulated microarchitecture our predictor with a 256-KB hardware budget improves the harmonic mean normalized instructions-per-cycle rate by 8\% over both the original path-based neural predictor and 2Bc-{\em gskew}. 
The average misprediction rate is decreased by 16\% over the path-based neural predictor and by 22\% over 2Bc-{\em gskew}.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "Branch prediction; machine learning", } @Article{Jeon:2009:AAP, author = "Jinseong Jeon and Keoncheol Shin and Hwansoo Han", title = "Abstracting access patterns of dynamic memory using regular expressions", journal = j-TACO, volume = "5", number = "4", pages = "18:1--18:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1498690.1498693", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Mar 18 21:35:33 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Unless the speed gap between CPU and memory disappears, efficient memory usage remains a decisive factor for performance. To optimize data usage of programs in the presence of the memory hierarchy, we are particularly interested in two compiler techniques: {\em pool allocation\/} and {\em field layout restructuring}. Since foreseeing runtime behaviors of programs at compile time is difficult, most of the previous work relied on profiling. On the contrary, our goal is to develop a fully automatic compiler that statically transforms input codes to use memory efficiently. Noticing that {\em regular expressions}, which denote repetition explicitly, are sufficient for memory access patterns, we describe how to extract memory access patterns as regular expressions in detail. Based on static patterns presented in regular expressions, we apply pool allocation to repeatedly accessed structures and exploit field layout restructuring according to field affinity relations of chosen structures. 
To make a scalable framework, we devise and apply new abstraction techniques, which build and interpret access patterns for the whole programs in a bottom-up fashion. We implement our analyses and transformations with the CIL compiler. To verify the effect and scalability of our scheme, we examine 17 benchmarks including 2 SPECINT 2000 benchmarks whose source lines of code are larger than 10,000. Our experiments demonstrate that the static layout transformations for dynamic memory can reduce L1D cache misses by 16\% and execution times by 14\% on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "Access patterns; field affinity; layout transformation; pool allocation; regular expressions", } @Article{Shobaki:2009:OTS, author = "Ghassan Shobaki and Kent Wilken and Mark Heffernan", title = "Optimal trace scheduling using enumeration", journal = j-TACO, volume = "5", number = "4", pages = "19:1--19:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1498690.1498694", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Mar 18 21:35:33 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article presents the first optimal algorithm for trace scheduling. The trace is a global scheduling region used by compilers to exploit instruction-level parallelism across basic block boundaries. Several heuristic techniques have been proposed for trace scheduling, but the precision of these techniques has not been studied relative to optimality. This article describes a technique for finding provably optimal trace schedules, where optimality is defined in terms of a weighted sum of schedule lengths across all code paths in a trace. 
The optimal algorithm uses branch-and-bound enumeration to efficiently explore the entire solution space. Experimental evaluation of the algorithm shows that, with a time limit of 1 second per problem, 91\% of the hard trace scheduling problems in the SPEC CPU 2006 Integer Benchmarks are solved optimally. For 58\% of these hard problems, the optimal schedule is improved compared to that produced by a heuristic scheduler with a geometric mean improvement of 3.2\% in weighted schedule length and 18\% in compensation code size.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "branch-and-bound enumeration; compiler optimizations; global instruction scheduling; Instruction scheduling; instruction-level parallelism; optimal instruction scheduling; trace scheduling", } @Article{Kulkarni:2009:PEO, author = "Prasad A. Kulkarni and David B. Whalley and Gary S. Tyson and Jack W. Davidson", title = "Practical exhaustive optimization phase order exploration and evaluation", journal = j-TACO, volume = "6", number = "1", pages = "1:1--1:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1509864.1509865", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Choosing the most appropriate optimization phase ordering has been a long-standing problem in compiler optimizations. Exhaustive evaluation of all possible orderings of optimization phases for each function is generally dismissed as infeasible for production-quality compilers targeting accepted benchmarks. 
In this article, we show that it is possible to exhaustively evaluate the optimization phase order space for each function in a reasonable amount of time for most of the functions in our benchmark suite. To achieve this goal, we used various techniques to significantly prune the optimization phase order search space so that it can be inexpensively enumerated in most cases and reduce the number of program simulations required to evaluate program performance for each distinct phase ordering. The techniques described are applicable to other compilers in which it is desirable to find the best phase ordering for most functions in a reasonable amount of time. We also describe some interesting properties of the optimization phase order space, which will prove useful for further studies of related problems in compilers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "exhaustive search; iterative compilation; Phase ordering", } @Article{Hohenauer:2009:SOF, author = "Manuel Hohenauer and Felix Engel and Rainer Leupers and Gerd Ascheid and Heinrich Meyr", title = "A {SIMD} optimization framework for retargetable compilers", journal = j-TACO, volume = "6", number = "1", pages = "2:1--2:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1509864.1509866", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Retargetable C compilers are currently widely used to quickly obtain compiler support for new embedded processors and to perform early processor architecture exploration. 
A partially inherent problem of the retargetable compilation approach, though, is the limited code quality as compared to hand-written compilers or assembly code due to the lack of dedicated optimizations techniques. This problem can be circumvented by designing flexible, retargetable code optimization techniques that apply to a certain range of target architectures. This article focuses on target machines with SIMD instruction support, a common feature in embedded processors for multimedia applications. However, SIMD optimization is known to be a difficult task since SIMD architectures are largely nonuniform, support only a limited set of data types and impose several memory alignment constraints. Additionally, such techniques require complicated loop transformations, which are tailored to the SIMD architecture in order to exhibit the necessary amount of parallelism in the code. Thus, integrating the SIMD optimization {\em and\/} the required loop transformations together in a single retargeting formalism is an ambitious challenge. In this article, we present an efficient and quickly retargetable SIMD code optimization framework that is integrated into an industrial retargetable C compiler. Experimental results for different processors demonstrate that the proposed technique applies to real-life target machines and that it produces code quality improvements close to the theoretical limit.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "ASIP; retargetable compilers; SIMD; subword parallelism; vectorization", } @Article{Eyerman:2009:MLP, author = "Stijn Eyerman and Lieven Eeckhout", title = "Memory-level parallelism aware fetch policies for simultaneous multithreading processors", journal = j-TACO, volume = "6", number = "1", pages = "3:1--3:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1509864.1509867", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A thread executing on a simultaneous multithreading (SMT) processor that experiences a long-latency load will eventually stall while holding execution resources. Existing long-latency load aware SMT fetch policies limit the amount of resources allocated by a stalled thread by identifying long-latency loads and preventing the thread from fetching more instructions --- and in some implementations, instructions beyond the long-latency load are flushed to release allocated resources.\par This article proposes an SMT fetch policy that takes into account the available memory-level parallelism (MLP) in a thread. The key idea proposed in this article is that in case of an isolated long-latency load (i.e., there is no MLP), the thread should be prevented from allocating additional resources. However, in case multiple independent long-latency loads overlap (i.e., there is MLP), the thread should allocate as many resources as needed in order to fully expose the available MLP. 
MLP-aware fetch policies achieve better performance for MLP-intensive threads on SMT processors, leading to higher overall system throughput and shorter average turnaround time than previously proposed fetch policies.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "Fetch Policy; Memory-Level Parallelism (MLP); Simultaneous Multithreading (SMT)", } @Article{Strozek:2009:EAE, author = "Lukasz Strozek and David Brooks", title = "Energy- and area-efficient architectures through application clustering and architectural heterogeneity", journal = j-TACO, volume = "6", number = "1", pages = "4:1--4:??", month = mar, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1509864.1509868", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Customizing architectures for particular applications is a promising approach to yield highly energy-efficient designs for embedded systems. This work explores the benefits of architectural customization for a class of embedded architectures typically used in energy- and area-constrained application domains, such as sensor nodes and multimedia processing. We implement a process flow that performs an automatic synthesis and evaluation of the different architectures based on runtime profiles of applications and determines an efficient architecture, with consideration for both energy and area constraints. An expressive architectural model, used by our engine, is introduced that takes advantage of efficient opcode allocation, several memory addressing modes, and operand types. 
By profiling embedded benchmarks from a variety of sensor and multimedia applications, we show that the energy savings resulting from various architectural optimizations relative to the base architectures (e.g., MIPS and MSP430) are significant and can reach 50\%, depending on the application. We then identify the set of architectures that achieves near-optimal savings for a group of applications. Finally, we propose the use of heterogeneous ISA processors implementing those architectures as a solution to capitalize on energy savings provided by application customization while executing a range of applications efficiently.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "Efficient custom architectures; heterogeneous ISA processors", } @Article{Venkataramani:2009:MAM, author = "Guru Venkataramani and Ioannis Doudalis and Yan Solihin and Milos Prvulovic", title = "{MemTracker}: {An} accelerator for memory debugging and monitoring", journal = j-TACO, volume = "6", number = "2", pages = "5:1--5:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1543753.1543754", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Memory bugs are a broad class of bugs that is becoming increasingly common with increasing software complexity, and many of these bugs are also security vulnerabilities. 
Existing software and hardware approaches for finding and identifying memory bugs have a number of drawbacks including considerable performance overheads, target only a specific type of bug, implementation cost, and inefficient use of computational resources.\par This article describes MemTracker, a new hardware support mechanism that can be configured to perform different kinds of memory access monitoring tasks. MemTracker associates each word of data in memory with a few bits of state, and uses a programmable state transition table to react to different events that can affect this state. The number of state bits per word, the events to which MemTracker reacts, and the transition table are all fully programmable. MemTracker's rich set of states, events, and transitions can be used to implement different monitoring and debugging checkers with minimal performance overheads, even when frequent state updates are needed. To evaluate MemTracker, we map three different checkers onto it, as well as a checker that combines all three. For the most demanding (combined) checker with 8 bits state per memory word, we observe performance overheads of only around 3\%, on average, and 14.5\% worst-case across different benchmark suites. Such low overheads allow continuous (always-on) use of MemTracker-enabled checkers, even in production runs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "Accelerator; debugging; memory access monitoring", } @Article{Gabor:2009:SLA, author = "Ron Gabor and Avi Mendelson and Shlomo Weiss", title = "Service level agreement for multithreaded processors", journal = j-TACO, volume = "6", number = "2", pages = "6:1--6:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1543753.1543755", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Multithreading is widely used to increase processor throughput. As the number of shared resources increase, managing them while guaranteeing predicted performance becomes a major problem. Attempts have been made in previous work to ease this via different fairness mechanisms. In this article, we present a new approach to control the resource allocation and sharing via a service level agreement (SLA)-based mechanism; that is, via an agreement in which multithreaded processors guarantee a minimal level of service to the running threads. We introduce a new metric, {\em C\/}$_{SLA}$, for conformance to SLA in multithreaded processors and show that controlling resources using SLA allows for higher gains than are achievable by previously suggested fairness techniques. It also permits improving one metric (e.g., power) while maintaining SLA in another (e.g., performance). We compare SLA enforcement to schemes based on other fairness metrics, which are mostly targeted at equalizing execution parameters. 
We show that using SLA rather than fairness based algorithms provides a range of acceptable execution points from which we can select the point that best fits our optimization target, such as maximizing the weighted speedup (sum of the speedups of the individual threads) or reducing power. We demonstrate the effectiveness of the new SLA approach using switch-on-event (coarse-grained) multithreading. Our weighted speedup improvement scheme successfully enforces SLA while improving the weighted speedup by an average of 10\% for unbalanced threads. This result is significant when compared with performance losses that may be incurred by fairness enforcement methods. When optimizing for power reduction in unbalanced threads SLA enforcement reduces the power by an average of 15\%. SLA may be complemented by other power reduction methods to achieve further power savings {\em and\/} maintain the same service level for the threads. We also demonstrate differentiated SLA, where weighted speedup is maximized while each thread may have a different throughput constraint.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "fairness; performance; power; Service level agreement; throughput", } @Article{Fung:2009:DWF, author = "Wilson W. L. Fung and Ivan Sham and George Yuan and Tor M. 
Aamodt", title = "Dynamic warp formation: {Efficient MIMD} control flow on {SIMD} graphics hardware", journal = j-TACO, volume = "6", number = "2", pages = "7:1--7:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1543753.1543756", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent advances in graphics processing units (GPUs) have resulted in massively parallel hardware that is easily programmable and widely available in today's desktop and notebook computer systems. GPUs typically use single-instruction, multiple-data (SIMD) pipelines to achieve high performance with minimal overhead for control hardware. Scalar threads running the same computing kernel are grouped together into SIMD batches, sometimes referred to as warps. While SIMD is ideally suited for simple programs, recent GPUs include control flow instructions in the GPU instruction set architecture and programs using these instructions may experience reduced performance due to the way branch execution is supported in hardware. One solution is to add a stack to allow different SIMD processing elements to execute distinct program paths after a branch instruction. The occurrence of diverging branch outcomes for different processing elements significantly degrades performance using this approach. In this article, we propose dynamic warp formation and scheduling, a mechanism for more efficient SIMD branch execution on GPUs. It dynamically regroups threads into new warps on the fly following the occurrence of diverging branch outcomes. We show that a realistic hardware implementation of this mechanism improves performance by 13\%, on average, with 256 threads per core, 24\% with 512 threads, and 47\% with 768 threads for an estimated area increase of 8\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "control flow; fine-grained multithreading; GPU; SIMD", } @Article{Koh:2009:TPV, author = "Cheng-Kok Koh and Weng-Fai Wong and Yiran Chen and Hai Li", title = "Tolerating process variations in large, set-associative caches: {The} buddy cache", journal = j-TACO, volume = "6", number = "2", pages = "8:1--8:??", month = jun, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1543753.1543757", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "One important trend in today's microprocessor architectures is the increase in size of the processor caches. These caches also tend to be set associative. As technology scales, process variations are expected to increase the fault rates of the SRAM cells that compose such caches. As an important component of the processor, the parametric yield of SRAM cells is crucial to the overall performance and yield of the microchip. In this article, we propose a microarchitectural solution, called the buddy cache that permits large, set-associative caches to tolerate faults in SRAM cells due to process variations. In essence, instead of disabling a faulty cache block in a set (as is the current practice), it is paired with another faulty cache block in the same set --- the buddy. Although both cache blocks are faulty, if the faults of the two blocks do not overlap, then instead of losing two blocks, buddying will yield a functional block from the nonfaulty portions of the two blocks. We found that with buddying, caches can better mitigate the negative impacts of process variations on performance and yield, gracefully downgrading performance as opposed to catastrophic failure. 
We will describe the details of the buddy cache and give insights as to why it is both more performance and yield resilient to faults.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "caches; fault recovery; memory structures; Processor architectures", } @Article{Li:2009:CDS, author = "Lian Li and Hui Feng and Jingling Xue", title = "Compiler-directed scratchpad memory management via graph coloring", journal = j-TACO, volume = "6", number = "3", pages = "9:1--9:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1582710.1582711", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Oct 1 09:20:47 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Scratchpad memory (SPM), a fast on-chip SRAM managed by software, is widely used in embedded systems. This article introduces a general-purpose compiler approach, called memory coloring, to assign static data aggregates, such as arrays and structs, in a program to an SPM. The novelty of this approach lies in partitioning the SPM into a pseudo--register file (with interchangeable and aliased registers), splitting the live ranges of data aggregates to create potential data transfer statements between SPM and off-chip memory, and finally, adapting an existing graph coloring algorithm for register allocation to assign the data aggregates to the pseudo--register file. Our experimental results using a set of 10 C benchmarks from MediaBench and MiBench show that our methodology is capable of managing SPMs efficiently and effectively for large embedded applications. 
In addition, our SPM allocator can obtain close to optimal solutions when evaluated and compared against an existing heuristics-based SPM allocator and an ILP-based SPM allocator.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "graph coloring; live range splitting; memory allocation; memory coloring; register coalescing; Scratchpad memory; software-managed cache", } @Article{Golander:2009:CAR, author = "Amit Golander and Shlomo Weiss", title = "Checkpoint allocation and release", journal = j-TACO, volume = "6", number = "3", pages = "10:1--10:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1582710.1582712", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Oct 1 09:20:47 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Out-of-order speculative processors need a bookkeeping method to recover from incorrect speculation. In recent years, several microarchitectures that employ checkpoints have been proposed, either extending the reorder buffer or entirely replacing it. This work presents an in-depth study of checkpointing in checkpoint-based microarchitectures, from the desired content of a checkpoint, via implementation trade-offs, and to checkpoint allocation and release policies. A major contribution of the article is a novel adaptive checkpoint allocation policy that outperforms known policies. The adaptive policy controls checkpoint allocation according to dynamic events, such as second-level cache misses and rollback history. It achieves 6.8\% and 2.2\% speedup for the integer and floating point benchmarks, respectively, and does not require a branch confidence estimator. 
The results show that the proposed adaptive policy achieves most of the potential of an oracle policy whose performance improvement is 9.8\% and 3.9\% for the integer and floating point benchmarks, respectively. We exploit known techniques for saving leakage power by adapting and applying them to checkpoint-based microarchitectures. The proposed applications combine to reduce the leakage power of the register file to about one half of its original value.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "Checkpoint; early register release; leakage; misprediction; out-of-order execution; rollback", } @Article{Xu:2009:TXP, author = "Weifeng Xu and Russell Tessier", title = "{Tetris-XL}: a performance-driven spill reduction technique for embedded {VLIW} processors", journal = j-TACO, volume = "6", number = "3", pages = "11:1--11:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1582710.1582713", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Oct 1 09:20:47 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As technology has advanced, the application space of Very Long Instruction Word (VLIW) processors has grown to include a variety of embedded platforms. Due to cost and power consumption constraints, many embedded VLIW processors contain limited resources, including registers. As a result, a VLIW compiler that maximizes instruction level parallelism (ILP) without considering register constraints may generate excessive register spills, leading to reduced overall system performance. To address this issue, this article presents a new spill reduction technique that improves VLIW runtime performance by reordering operations prior to register allocation and instruction scheduling. 
Unlike earlier algorithms, our approach explicitly considers both register reduction and data dependency in performing operation reordering. Data dependency control limits unexpected schedule length increases during subsequent instruction scheduling. Our technique has been evaluated using Trimaran, an academic VLIW compiler, and evaluated using a set of embedded systems benchmarks. Experimental results show that, on average, this technique improves VLIW performance by 10\% for VLIW processors with 32 registers and 8 functional units compared with previous spill reduction techniques. Limited improvement is seen versus prior approaches for VLIW processors with 64 registers and 8 functional units.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "instruction level parallelism; Register pressure; Very Long Instruction Word (VLIW) processor", } @Article{Jones:2009:ELE, author = "Timothy M. Jones and Michael F. P. O'Boyle and Jaume Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin", title = "Exploring the limits of early register release: {Exploiting} compiler analysis", journal = j-TACO, volume = "6", number = "3", pages = "12:1--12:??", month = sep, year = "2009", CODEN = "????", DOI = "https://doi.org/10.1145/1582710.1582714", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Oct 1 09:20:47 MDT 2009", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Register pressure in modern superscalar processors can be reduced by releasing registers early and by copying their contents to cheap back-up storage. This article quantifies the potential benefits of register occupancy reduction and shows that existing hardware-based schemes typically achieve only a small fraction of this potential. 
This is because they are unable to accurately determine the last use of a register and must wait until the redefining instruction enters the pipeline. On the other hand, compilers have a global view of the program and, using simple dataflow analysis, can determine the last use. This article evaluates the extent to which compiler analysis can aid early releasing, explores the design space, and introduces commit and issue-based early releasing schemes, quantifying their benefits. Using simple compiler analysis and microarchitecture changes, we achieve 70\% of the potential register file occupancy reduction. By adding more hardware support, we can increase this to 94\%. Our schemes are compared to state-of-the-art approaches for varying register file sizes and are shown to outperform these existing techniques.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "compiler; energy efficiency; Low-power design; microarchitecture; register file", } @Article{Jones:2009:EER, author = "Timothy M. Jones and Michael F. P. O'Boyle and Jaume Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin", title = "Energy-efficient register caching with compiler assistance", journal = j-TACO, volume = "6", number = "4", pages = "13:1--13:??", month = oct, year = "2009", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 15 18:49:43 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2009:TUC, author = "Weijia Li and Youtao Zhang and Jun Yang and Jiang Zheng", title = "Towards update-conscious compilation for energy-efficient code dissemination in {WSNs}", journal = j-TACO, volume = "6", number = "4", pages = "14:1--14:??", month = oct, year = "2009", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 15 18:49:43 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wegiel:2009:SRC, author = "Michal Wegiel and Chandra Krintz", title = "The single-referent collector: {Optimizing} compaction for the common case", journal = j-TACO, volume = "6", number = "4", pages = "15:1--15:??", month = oct, year = "2009", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 15 18:49:43 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Subramaniam:2009:DOS, author = "Samantika Subramaniam and Gabriel H. 
Loh", title = "Design and optimization of the store vectors memory dependence predictor", journal = j-TACO, volume = "6", number = "4", pages = "16:1--16:??", month = oct, year = "2009", CODEN = "????", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 15 18:49:43 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2010:PAM, author = "Xiaohang Wang and Mei Yang and Yingtao Jiang and Peng Liu", title = "A power-aware mapping approach to map {IP} cores onto {NoCs} under bandwidth and latency constraints", journal = j-TACO, volume = "7", number = "1", pages = "1:1--1:??", month = apr, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1736065.1736066", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 5 15:38:13 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we investigate the Intellectual Property (IP) mapping problem that maps a given set of IP cores onto the tiles of a mesh-based Network-on-Chip (NoC) architecture such that the power consumption due to intercore communications is minimized. This IP mapping problem is considered under both bandwidth and latency constraints as imposed by the applications and the on-chip network infrastructure. By examining various applications' communication characteristics extracted from their respective communication trace graphs, two distinguishable connectivity templates are realized: the graphs with tightly coupled vertices and those with distributed vertices. These two templates are formally defined in this article, and different mapping heuristics are subsequently developed to map them. 
In general, tightly coupled vertices are mapped onto tiles that are physically close to each other while the distributed vertices are mapped following a graph partition scheme. Experimental results on both random and multimedia benchmarks have confirmed that the proposed template-based mapping algorithm achieves an average of 15\% power savings as compared with MOCA, a fast greedy-based mapping algorithm. Compared with a branch-and-bound--based mapping algorithm, which produces near optimal results but incurs an extremely high computation cost, the proposed algorithm, due to its polynomial runtime complexity, can generate the results of almost the same quality with much less CPU time. As the on-chip network size increases, the superiority of the proposed algorithm becomes more evident.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "bandwidth and latency constraints; IP mapping; Low power; network-on-chip (NoC)", } @Article{Chen:2010:HSF, author = "Zhong-Ho Chen and Alvin W. Y. Su", title = "A hardware\slash software framework for instruction and data scratchpad memory allocation", journal = j-TACO, volume = "7", number = "1", pages = "2:1--2:??", month = apr, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1736065.1736067", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 5 15:38:13 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Previous researches show that a scratchpad memory device consumed less energy than a cache device with the same capacity. In this article, we locate the scratchpad memory (SPM) in the top level of the memory hierarchy to reduce the energy consumption. To take the advantage of a SPM, we address two issues of utilizing a SPM. 
First, the program's locality should be improved. The second issue is SPM management. To tackle these two issues, we present a hardware/software framework for dynamically allocating both instructions and data in SPM. The software flow could be divided into three phases: locality improving, locality extraction, and runtime SPM management. Without modifying the original compiler and the source code, we improve the locality of a program. An optimization algorithm is proposed to extract the SPM allocations. At runtime, an SPM management program is employed. In hardware, an address translation logic (ATL) is proposed to reduce the overhead of SPM management.\par The results show that the proposed framework can reduce energy delay product (EDP) by 63\%, on average, when compared with the traditional cache architecture. The reduction in EDP is contributed by properly allocating both instructions and data in SPM. By allocating only instructions in SPM, the EDPs are reduced by 45\%, on average. By allocating only data in SPM, the EDPs are reduced by 14\%, on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "allocation algorithm; Memory allocation; scratchpad memory", } @Article{Woo:2010:CVI, author = "Dong Hyuk Woo and Joshua B. Fryman and Allan D. Knies and Hsien-Hsin S. 
Lee", title = "{Chameleon}: {Virtualizing} idle acceleration cores of a heterogeneous multicore processor for caching and prefetching", journal = j-TACO, volume = "7", number = "1", pages = "3:1--3:??", month = apr, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1736065.1736068", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 5 15:38:13 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Heterogeneous multicore processors have emerged as an energy- and area-efficient architectural solution to improving performance for domain-specific applications such as those with a plethora of data-level parallelism. These processors typically contain a large number of small, compute-centric cores for acceleration while keeping one or two high-performance ILP cores on the die to guarantee single-thread performance. Although a major portion of the transistors are occupied by the acceleration cores, these resources will sit idle when running unparallelized legacy codes or the sequential part of an application. To address this underutilization issue, in this article, we introduce Chameleon, a flexible heterogeneous multicore architecture to virtualize these resources for enhancing memory performance when running sequential programs. The Chameleon architecture can dynamically virtualize the idle acceleration cores into a last-level cache, a data prefetcher, or a hybrid between these two techniques. In addition, Chameleon can operate in an adaptive mode that dynamically configures the acceleration cores between the hybrid mode and the prefetch-only mode by monitoring the effectiveness of the Chameleon cache mode. In our evaluation with SPEC2006 benchmark suite, different levels of performance improvements were achieved in different modes for different applications. In the case of the adaptive mode, Chameleon improves the performance of SPECint06 and SPECfp06 by 31\% and 15\%, on average. 
When considering only memory-intensive applications, Chameleon improves the system performance by 50\% and 26\% for SPECint06 and SPECfp06, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "cache; Heterogeneous multicore; idle core; prefetching", } @Article{Sanchez:2010:ACI, author = "Daniel Sanchez and George Michelogiannakis and Christos Kozyrakis", title = "An analysis of on-chip interconnection networks for large-scale chip multiprocessors", journal = j-TACO, volume = "7", number = "1", pages = "4:1--4:??", month = apr, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1736065.1736069", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 5 15:38:13 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the number of cores of chip multiprocessors (CMPs) rapidly growing as technology scales down, connecting the different components of a CMP in a scalable and efficient way becomes increasingly challenging. In this article, we explore the architectural-level implications of interconnection network design for CMPs with up to 128 fine-grain multithreaded cores. We evaluate and compare different network topologies using accurate simulation of the full chip, including the memory hierarchy and interconnect, and using a diverse set of scientific and engineering workloads.\par We find that the interconnect has a large impact on performance, as it is responsible for 60\% to 75\% of the miss latency. Latency, and not bandwidth, is the primary performance constraint, since, even with many threads per core and workloads with high miss rates, networks with enough bandwidth can be efficiently implemented for the system scales we consider.
From the topologies we study, the flattened butterfly consistently outperforms the mesh and fat tree on all workloads, leading to performance advantages of up to 22\%. We also show that considering interconnect and memory hierarchy together when designing large-scale CMPs is crucial, and neglecting either of the two can lead to incorrect conclusions. Finally, the effect of the interconnect on overall performance becomes more important as the number of cores increases, making interconnection choices especially critical when scaling up.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "chip multiprocessors; hierarchical networks; Networks-on-chip", } @Article{Zhou:2010:PAT, author = "Xiuyi Zhou and Jun Yang and Marek Chrobak and Youtao Zhang", title = "Performance-aware thermal management via task scheduling", journal = j-TACO, volume = "7", number = "1", pages = "5:1--5:??", month = apr, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1736065.1736070", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 5 15:38:13 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "High on-chip temperature impairs the processor's reliability and reduces its lifetime. Hardware-level dynamic thermal management (DTM) techniques can effectively constrain the chip temperature, but degrades the performance. We propose an OS-level technique that performs thermal-aware job scheduling to reduce DTMs. The algorithm is based on the observation that hot and cool jobs executed in a different order can make a difference in resulting temperature. Real-system implementation in Linux shows that our scheduler can remove 10.5\% to 73.6\% of the hardware DTMs in a medium thermal environment.
The CPU throughput is improved by up to 7.6\% (4.1\%, on average) in a severe thermal environment.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "task scheduling; Thermal management", } @Article{Raghavan:2010:TTP, author = "Arun Raghavan and Colin Blundell and Milo M. K. Martin", title = "Token tenure and {PATCH}: a predictive\slash adaptive token-counting hybrid", journal = j-TACO, volume = "7", number = "2", pages = "6:1--6:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839667.1839668", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Traditional coherence protocols present a set of difficult trade-offs: the reliance of snoopy protocols on broadcast and ordered interconnects limits their scalability, while directory protocols incur a performance penalty on sharing misses due to indirection. This work introduces Patch (Predictive/Adaptive Token-Counting Hybrid), a coherence protocol that provides the scalability of directory protocols while opportunistically sending direct requests to reduce sharing latency. Patch extends a standard directory protocol to track tokens and use token-counting rules for enforcing coherence permissions. Token counting allows Patch to support direct requests on an unordered interconnect, while a mechanism called {\em token tenure\/} provides broadcast-free forward progress using the directory protocol's per-block point of ordering at the home along with either timeouts at requesters or explicit race notification messages.\par Patch makes three main contributions. First, Patch introduces token tenure, which provides broadcast-free forward progress for token-counting protocols. 
Second, Patch deprioritizes best-effort direct requests to match or exceed the performance of directory protocols without restricting scalability. Finally, Patch provides greater scalability than directory protocols when using inexact encodings of sharers because only processors holding tokens need to acknowledge requests. Overall, Patch is a ``one-size-fits-all'' coherence protocol that dynamically adapts to work well for small systems, large systems, and anywhere in between.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "adaptive; bandwidth-efficiency; Cache coherence protocol; predictive; token coherence", } @Article{Wimmer:2010:AFD, author = "Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck", title = "Automatic feedback-directed object fusing", journal = j-TACO, volume = "7", number = "2", pages = "7:1--7:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839667.1839669", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Object fusing is an optimization that embeds certain referenced objects into their referencing object. The order of objects on the heap is changed in such a way that objects that are accessed together are placed next to each other in memory. Their offset is then fixed, that is, the objects are colocated, allowing field loads to be replaced by address arithmetic. Array fusing specifically optimizes arrays, which are frequently used for the implementation of dynamic data structures. Therefore, the length of arrays often varies, and fields referencing such arrays have to be changed.
An efficient code pattern detects these changes and allows the optimized access of such fields.\par We integrated these optimizations into Sun Microsystems' Java HotSpot\TM{} VM. The analysis is performed automatically at runtime, requires no actions on the part of the programmer, and supports dynamic class loading. To safely eliminate a field load, the colocation of the object that holds the field and the object that is referenced by the field must be guaranteed. Two preconditions must be satisfied: The objects must be allocated at the same time, and the field must not be overwritten later. These preconditions are checked by the just-in-time compiler to avoid an interprocedural data flow analysis. The garbage collector ensures that groups of colocated objects are not split by copying groups as a whole. The evaluation shows that the dynamic approach successfully identifies and optimizes frequently accessed fields for several benchmarks with a low compilation and analysis overhead. It leads to a speedup of up to 76\% for simple benchmarks and up to 6\% for complex workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "cache performance; garbage collection; Java; just-in-time compilation; object colocation; object fusing; object inlining; optimization", } @Article{Lee:2010:AIC, author = "Benjamin C. 
Lee and David Brooks", title = "Applied inference: {Case} studies in microarchitectural design", journal = j-TACO, volume = "7", number = "2", pages = "8:1--8:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839667.1839670", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We propose and apply a new simulation paradigm for microarchitectural design evaluation and optimization. This paradigm enables more comprehensive design studies by combining spatial sampling and statistical inference. Specifically, this paradigm (i) defines a large, comprehensive design space, (ii) samples points from the space for simulation, and (iii) constructs regression models based on sparse simulations. This approach greatly improves the computational efficiency of microarchitectural simulation and enables new capabilities in design space exploration.\par We illustrate new capabilities in three case studies for a large design space of approximately 260,000 points: (i) Pareto frontier, (ii) pipeline depth, and (iii) multiprocessor heterogeneity analyses. In particular, regression models are exhaustively evaluated to identify Pareto optimal designs that maximize performance for given power budgets. These models enable pipeline depth studies in which all parameters vary simultaneously with depth, thereby more effectively revealing interactions with nondepth parameters. Heterogeneity analysis combines regression-based optimization with clustering heuristics to identify efficient design compromises between similar optimal architectures. These compromises are potential core designs in a heterogeneous multicore architecture. 
Increasing heterogeneity can improve {\em bips\/}$^3$ / {\em w\/} efficiency by as much as $2.4\times$, a theoretical upper bound on heterogeneity benefits that neglects contention between shared resources as well as design complexity. Collectively these studies demonstrate regression models' ability to expose trends and identify optima in diverse design regions, motivating the application of such models in statistical inference for more effective use of modern simulator infrastructure.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "Microarchitecture; regression; simulation; statistics", } @Article{Rakvic:2010:TMT, author = "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G. Magklis and P. Chaparro and A. Gonz{\'a}lez", title = "Thread-management techniques to maximize efficiency in multicore and simultaneous multithreaded microprocessors", journal = j-TACO, volume = "7", number = "2", pages = "9:1--9:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839667.1839671", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We provide an analysis of thread-management techniques that increase performance or reduce energy in multicore and Simultaneous Multithreaded (SMT) cores. Thread delaying reduces energy consumption by running the core containing the critical thread at maximum frequency while scaling down the frequency and voltage of the cores containing noncritical threads. In this article, we provide an insightful breakdown of thread delaying on a simulated multi-core microprocessor. Thread balancing improves overall performance by giving higher priority to the critical thread in the issue queue of an SMT core.
We provide a detailed breakdown of performance results for thread-balancing, identifying performance benefits and limitations. For those benchmarks where a performance benefit is not possible, we introduce a novel thread-balancing mechanism on an SMT core that can reduce energy consumption. We have performed a detailed study on an Intel microprocessor simulator running parallel applications. Thread delaying can reduce energy consumption by 4\% to 44\% with negligible performance loss. Thread balancing can increase performance by 20\% or can reduce energy consumption by 23\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "critical threads; energy-aware; low-power; Meeting point thread characterization; microarchitecture; multi-threaded application; thread balancing; thread delaying", } @Article{Pao:2010:MEP, author = "Derek Pao and Wei Lin and Bin Liu", title = "A memory-efficient pipelined implementation of the {Aho--Corasick} string-matching algorithm", journal = j-TACO, volume = "7", number = "2", pages = "10:1--10:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839667.1839672", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With rapid advancement in Internet technology and usages, some emerging applications in data communications and network security require matching of huge volume of data against large signature sets with thousands of strings in real time. In this article, we present a memory-efficient hardware implementation of the well-known Aho--Corasick (AC) string-matching algorithm using a pipelining approach called P-AC. 
An attractive feature of the AC algorithm is that it can solve the string-matching problem in time linearly proportional to the length of the input stream, and the computation time is independent of the number of strings in the signature set. A major disadvantage of the AC algorithm is the high memory cost required to store the transition rules of the underlying deterministic finite automaton. By incorporating pipelined processing, the state graph is reduced to a character trie that only contains forward edges. Together with an intelligent implementation of look-up tables, the memory cost of P-AC is only about 18 bits per character for a signature set containing 6,166 strings extracted from Snort. The control structure of P-AC is simple and elegant. The cost of the control logic is very low. With the availability of dual-port memories in FPGA devices, we can double the system throughput by duplicating the control logic such that the system can process two data streams concurrently. Since our method is memory-based, incremental changes to the signature set can be accommodated by updating the look-up tables without reconfiguring the FPGA circuitry.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "deterministic and nondeterministic finite automaton; intrusion detection system; pipelined processing; String-matching", } @Article{Yang:2010:ERS, author = "Xuejun Yang and Ying Zhang and Xicheng Lu and Jingling Xue and Ian Rogers and Gen Li and Guibin Wang and Xudong Fang", title = "Exploiting the reuse supplied by loop-dependent stream references for stream processors", journal = j-TACO, volume = "7", number = "2", pages = "11:1--11:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839667.1839673", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Memory accesses limit the performance of stream processors. By exploiting the reuse of data held in the Stream Register File (SRF), an on-chip, software controlled storage, the number of memory accesses can be reduced. In current stream compilers, reuse exploitation is only attempted for simple stream references, those whose start and end are known. Compiler analysis, from outside of stream processors, does not directly enable the consideration of other more complex stream references. In this article, we propose a transformation to automatically optimize stream programs to exploit the reuse supplied by loop-dependent stream references. The transformation is based on three results: lemmas identifying the reuse supplied by stream references, a new abstract representation called the Stream Reuse Graph (SRG) depicting the identified reuse, and the optimization of the SRG for our transformation. Both the reuse between the whole sequences accessed by stream references and between partial sequences is exploited in the article. 
In particular, partial reuse and its treatment are quite new and have never, to the best of our knowledge, appeared in scalar and vector processing. At the same time, reusing streams increases the pressure on the SRF, and this presents a problem of which reuse should be exploited within limited SRF capacity. We extend our analysis to achieve this objective. Finally, we implement our techniques based on the StreamC/KernelC compiler that has been optimized with the best existing compilation techniques for stream processors. Experimental results show a resultant speed-up of 1.14 to 2.54 times using a range of benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "stream processor; Stream programming model; stream register file; stream reuse; streamc", } @Article{Reddi:2010:EVE, author = "Vijay Janapa Reddi and Simone Campanoni and Meeta S. Gupta and Michael D. Smith and Gu-Yeon Wei and David Brooks and Kim Hazelwood", title = "Eliminating voltage emergencies via software-guided code transformations", journal = j-TACO, volume = "7", number = "2", pages = "12:1--12:??", month = sep, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1839667.1839674", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In recent years, circuit reliability in modern high-performance processors has become increasingly important. Shrinking feature sizes and diminishing supply voltages have made circuits more sensitive to microprocessor supply voltage fluctuations.
These fluctuations result from the natural variation of processor activity as workloads execute, but when left unattended, these voltage fluctuations can lead to timing violations or even transistor lifetime issues. In this article, we present a hardware--software collaborative approach to mitigate voltage fluctuations. A checkpoint-recovery mechanism rectifies errors when voltage violates maximum tolerance settings, while a runtime software layer reschedules the program's instruction stream to prevent recurring violations at the same program location. The runtime layer, combined with the proposed code-rescheduling algorithm, removes 60\% of all violations with minimal overhead, thereby significantly improving overall performance. Our solution is a radical departure from the ongoing industry-standard approach to circumvent the issue altogether by optimizing for the worst-case voltage flux, which compromises power and performance efficiency severely, especially looking ahead to future technology generations. Existing conservative approaches will have severe implications on the ability to deliver efficient microprocessors. The proposed technique reassembles a traditional reliability problem as a runtime performance optimization problem, thus allowing us to design processors for typical case operation by building intelligent algorithms that can prevent recurring violations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", keywords = "dI/dt; inductive noise; voltage emergencies; Voltage noise", } @Article{Zhao:2010:PPP, author = "Qin Zhao and Ioana Cutcutache and Weng-Fai Wong", title = "{PiPA}: {Pipelined} profiling and analysis on multicore systems", journal = j-TACO, volume = "7", number = "3", pages = "13:1--13:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880037.1880038", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Profiling and online analysis are important tasks in program understanding and feedback-directed optimization. However, fine-grained profiling and online analysis tend to seriously slow down the application. To cope with the slowdown, one may have to terminate the process early or resort to sampling. The former tends to distort the result because of warm-up effects. The latter runs the risk of missing important effects because sampling was turned off during the time that these effects appeared. A promising approach is to make use of the parallel processing capabilities of the now ubiquitous multicore processors to speed up the profiling and analysis process.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Guo:2010:QSS, author = "Fei Guo and Yan Solihin and Li Zhao and Ravishankar Iyer", title = "Quality of service shared cache management in chip multiprocessor architecture", journal = j-TACO, volume = "7", number = "3", pages = "14:1--14:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880037.1880039", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The trends in enterprise IT toward service-oriented computing, server consolidation, and virtual computing point to a future in which workloads are becoming increasingly diverse in terms of performance, reliability, and availability requirements. It can be expected that more and more applications with diverse requirements will run on a Chip Multi-Processor (CMP) and share platform resources such as the lowest level cache and off-chip bandwidth. In this environment, it is desirable to have microarchitecture and software support that can provide a guarantee of a certain level of performance, which we refer to as performance Quality of Service. In this article, we investigated a framework would be needed to manage the shared cache resource for fully providing QoS in a CMP.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2010:DEH, author = "Xiaoxia Wu and Jian Li and Lixin Zhang and Evan Speight and Ram Rajamony and Yuan Xie", title = "Design exploration of hybrid caches with disparate memory technologies", journal = j-TACO, volume = "7", number = "3", pages = "15:1--15:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880037.1880040", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Traditional multilevel SRAM-based cache hierarchies, especially in the context of chip multiprocessors (CMPs), present many challenges in area requirements, core--to--cache balance, power consumption, and design complexity. New advancements in technology enable caches to be built from other technologies, such as Embedded DRAM (EDRAM), Magnetic RAM (MRAM), and Phase-change RAM (PRAM), in both 2D chips or 3D stacked chips. Caches fabricated in these technologies offer dramatically different power-performance characteristics when compared with SRAM-based caches, particularly in the areas of access latency, cell density, and overall power consumption. In this article, we propose to take advantage of the best characteristics that each technology has to offer through the use of Hybrid Cache Architecture (HCA) designs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kourtis:2010:ECO, author = "Kornilios Kourtis and Georgios Goumas and Nectarios Koziris", title = "Exploiting compression opportunities to improve {SpMxV} performance on shared memory systems", journal = j-TACO, volume = "7", number = "3", pages = "16:1--16:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880037.1880041", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The Sparse Matrix-Vector Multiplication (SpMxV) kernel exhibits poor scaling on shared memory systems, due to the streaming nature of its data access pattern. To decrease memory contention and improve kernel performance we propose two compression schemes: CSR-DU, that targets the reduction of the matrix structural data by applying coarse-grained delta-encoding, and CSR-VI, that targets the reduction of the values using indirect indexing, applicable to matrices with a small number of unique values.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Buyukkurt:2010:IHL, author = "Betul Buyukkurt and John Cortes and Jason Villarreal and Walid A. 
Najjar", title = "Impact of high-level transformations within the {ROCCC} framework", journal = j-TACO, volume = "7", number = "4", pages = "17:1--17:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880043.1880044", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hwang:2010:DCR, author = "Yuan-Shin Hwang and Tzong-Yen Lin and Rong-Guey Chang", title = "{DisIRer}: {Converting} a retargetable compiler into a multiplatform binary translator", journal = j-TACO, volume = "7", number = "4", pages = "18:1--18:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880043.1880045", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Boyer:2010:FBP, author = "Michael Boyer and David Tarjan and Kevin Skadron", title = "Federation: {Boosting} per-thread performance of throughput-oriented manycore architectures", journal = j-TACO, volume = "7", number = "4", pages = "19:1--19:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880043.1880046", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fursin:2010:COP, author = "Grigori Fursin and Olivier Temam", title = "Collective optimization: a practical collaborative approach", journal = j-TACO, volume = "7", number = "4", pages = "20:1--20:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880043.1880047", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2010:UBI, author = "Fang Liu and Yan Solihin", title = "Understanding the behavior and implications of context switch misses", journal = j-TACO, volume = "7", number = "4", pages = "21:1--21:??", month = dec, year = "2010", CODEN = "????", DOI = "https://doi.org/10.1145/1880043.1880048", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 10 09:37:16 MST 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Eyerman:2011:FGD, author = "Stijn Eyerman and Lieven Eeckhout", title = "Fine-grained {DVFS} using on-chip regulators", journal = j-TACO, volume = "8", number = "1", pages = "1:1--1:??", month = apr, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1952998.1952999", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Apr 27 07:54:03 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Limit studies on Dynamic Voltage and Frequency Scaling (DVFS) provide apparently contradictory conclusions. On the one hand early limit studies report that DVFS is effective at large timescales (on the order of million(s) of cycles) with large scaling overheads (on the order of tens of microseconds), and they conclude that there is no need for small overhead DVFS at small timescales. Recent work on the other hand --- motivated by the surge of on-chip voltage regulator research --- explores the potential of fine-grained DVFS and reports substantial energy savings at timescales of hundreds of cycles (while assuming no scaling overhead). This article unifies these apparently contradictory conclusions through a DVFS limit study that simultaneously explores timescale and scaling speed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cher:2011:EEC, author = "Chen-Yong Cher and Eren Kursun", title = "Exploring the effects of on-chip thermal variation on high-performance multicore architectures", journal = j-TACO, volume = "8", number = "1", pages = "2:1--2:??", month = apr, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1952998.1953000", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Apr 27 07:54:03 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Inherent temperature variation among cores in a multicore architecture can be caused by a number of factors including process variation, cooling and packaging imperfections, and even placement of the chip in the module. Current dynamic thermal management techniques assume identical heating profiles for homogeneous multicore architectures. Our experimental results indicate that inherent thermal variation is very common in existing multicores. While most multicore chips accommodate multiple thermal sensors, the dynamic power/thermal management schemes are oblivious of the inherent heating tendencies. Hence, in the case of variation, the chip faces repetitive hotspots running on such cores. In this article, we propose a technique that leverages the on-chip sensor infrastructure as well as the capabilities of power/thermal management to effectively reduce the heating and minimize local hotspots.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2011:ATR, author = "Carole-Jean Wu and Margaret Martonosi", title = "Adaptive timekeeping replacement: Fine-grained capacity management for shared {CMP} caches", journal = j-TACO, volume = "8", number = "1", pages = "3:1--3:??", month = apr, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1952998.1953001", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Apr 27 07:54:03 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In chip multiprocessors (CMPs), several high-performance cores typically compete for capacity in a shared last-level cache. This causes degraded and unpredictable memory performance for multiprogrammed and parallel workloads. In response, recent schemes apportion cache bandwidth and capacity in ways that offer better aggregate performance for the workloads. These schemes, however, focus primarily on relatively coarse-grained capacity management without concern for operating system process priority levels. In this work, we explore capacity management approaches that are both temporally and spatially more fine-grained than prior work. We also consider operating system priority levels as part of capacity management. We propose a capacity management mechanism based on timekeeping techniques that track the time interval since the last access to cached data.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vespa:2011:DFA, author = "Lucas Vespa and Ning Weng", title = "Deterministic finite automata characterization and optimization for scalable pattern matching", journal = j-TACO, volume = "8", number = "1", pages = "4:1--4:??", month = apr, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1952998.1953002", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Apr 27 07:54:03 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Memory-based Deterministic Finite Automata (DFA) are ideal for pattern matching in network intrusion detection systems due to their deterministic performance and ease of update of new patterns, however severe DFA memory requirements make it impractical to implement thousands of patterns. This article aims to understand the basic relationship between DFA characteristics and memory requirements, and to design a practical memory-based pattern matching engine. We present a methodology that consists of theoretical DFA characterization, encoding optimization, and implementation architecture. Results show the validity of the characterization metrics, effectiveness of the encoding techniques, and efficiency of the memory-based pattern engines.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bhattacharjee:2011:PLC, author = "Abhishek Bhattacharjee and Gilberto Contreras and Margaret Martonosi", title = "Parallelization libraries: Characterizing and reducing overheads", journal = j-TACO, volume = "8", number = "1", pages = "5:1--5:??", month = apr, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1952998.1953003", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Apr 27 07:54:03 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Creating efficient, scalable dynamic parallel runtime systems for chip multiprocessors (CMPs) requires understanding the overheads that manifest at high core counts and small task sizes. In this article, we assess these overheads on Intel's Threading Building Blocks (TBB) and OpenMP. First, we use real hardware and simulations to detail various scheduler and synchronization overheads. We find that these can amount to 47\% of TBB benchmark runtime and 80\% of OpenMP benchmark runtime. Second, we propose load balancing techniques such as occupancy-based and criticality-guided task stealing, to boost performance. Overall, our study provides valuable insights for creating robust, scalable runtime libraries.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dong:2011:HCU, author = "Xiangyu Dong and Yuan Xie and Naveen Muralimanohar and Norman P. 
Jouppi", title = "Hybrid checkpointing using emerging nonvolatile memories for future exascale systems", journal = j-TACO, volume = "8", number = "2", pages = "6:1--6:??", month = jul, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1970386.1970387", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jun 17 18:32:40 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The scalability of future Massively Parallel Processing (MPP) systems is being severely challenged by high failure rates. Current centralized Hard Disk Drive (HDD) checkpointing results in overhead of 25\% or more at petascale. Since systems become more vulnerable as the node count keeps increasing, novel techniques that enable fast and frequent checkpointing are critical to the future exascale system implementation. In this work, we first introduce one of the emerging nonvolatile memory technologies, Phase-Change Random Access Memory (PCRAM), as a proper candidate of the fast checkpointing device. After a thorough analysis of MPP systems, failure rates and failure sources, we propose a PCRAM-based hybrid local/global checkpointing mechanism which not only provides a faster checkpoint storage, but also boosts the effectiveness of other orthogonal techniques such as incremental checkpointing and background checkpointing.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2011:EEM, author = "Jianjun Li and Chenggang Wu and Wei-Chung Hsu", title = "Efficient and effective misaligned data access handling in a dynamic binary translation system", journal = j-TACO, volume = "8", number = "2", pages = "7:1--7:??", month = jul, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1970386.1970388", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jun 17 18:32:40 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Binary Translation (BT) has been commonly used to migrate application software across Instruction Set Architectures (ISAs). Some architectures, such as X86, allow Misaligned Data Accesses (MDAs), while most modern architectures require natural data alignments. In a binary translation system, where the source ISA allows MDA and the target ISA does not, memory operations must be carefully translated. Naive translation may cause frequent misaligned data access traps to occur at runtime on the target machine and severely slow down the migrated application. This article evaluates different approaches in handling MDA in a binary translation system including how to identify MDA candidates and how to translate such memory instructions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Venkataramani:2011:DDS, author = "Guru Venkataramani and Christopher J. 
Hughes and Sanjeev Kumar and Milos Prvulovic", title = "{DeFT}: Design space exploration for on-the-fly detection of coherence misses", journal = j-TACO, volume = "8", number = "2", pages = "8:1--8:??", month = jul, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1970386.1970389", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jun 17 18:32:40 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "While multicore processors promise large performance benefits for parallel applications, writing these applications is notoriously difficult. Tuning a parallel application to achieve good performance, also known as performance debugging, is often more challenging than debugging the application for correctness. Parallel programs have many performance-related issues that are not seen in sequential programs. An increase in cache misses is one of the biggest challenges that programmers face. To minimize these misses, programmers must not only identify the source of the extra misses, but also perform the tricky task of determining if the misses are caused by interthread communication (i.e., coherence misses) and if so, whether they are caused by true or false sharing (since the solutions for these two are quite different).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hiser:2011:EIB, author = "Jason D. Hiser and Daniel W. Williams and Wei Hu and Jack W. Davidson and Jason Mars and Bruce R. 
Childers", title = "Evaluating indirect branch handling mechanisms in software dynamic translation systems", journal = j-TACO, volume = "8", number = "2", pages = "9:1--9:??", month = jul, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/1970386.1970390", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jun 17 18:32:40 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Software Dynamic Translation (SDT) is used for instrumentation, optimization, security, and many other uses. A major source of SDT overhead is the execution of code to translate an indirect branch's target address into the translated destination block's address. This article discusses sources of Indirect Branch (IB) overhead in SDT systems and evaluates techniques for overhead reduction. Measurements using SPEC CPU2000 show that the appropriate choice and configuration of IB translation mechanisms can significantly reduce the overhead. Further, cross-architecture evaluation of these mechanisms reveals that the most efficient implementation and configuration can be highly dependent on the architecture implementation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2011:HAM, author = "Xi E. Chen and Tor M. 
Aamodt", title = "Hybrid analytical modeling of pending cache hits, data prefetching, and {MSHRs}", journal = j-TACO, volume = "8", number = "3", pages = "10:1--10:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019609", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article proposes techniques to predict the performance impact of pending cache hits, hardware prefetching, and miss status holding register resources on superscalar microprocessors using hybrid analytical models. The proposed models focus on timeliness of pending hits and prefetches and account for a limited number of MSHRs. They improve modeling accuracy of pending hits by 3.9{\times} and when modeling data prefetching, a limited number of MSHRs, or both, these techniques result in average errors of 9.5\% to 17.8\%. The impact of non-uniform DRAM memory latency is shown to be approximated well by using a moving average of memory access latency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kleanthous:2011:CMD, author = "Marios Kleanthous and Yiannakis Sazeides", title = "{CATCH}: a mechanism for dynamically detecting cache-content-duplication in instruction caches", journal = j-TACO, volume = "8", number = "3", pages = "11:1--11:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019610", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Cache-content-duplication (CCD) occurs when there is a miss for a block in a cache and the entire content of the missed block is already in the cache in a block with a different tag. Caches aware of content-duplication can have lower miss penalty by fetching, on a miss to a duplicate block, directly from the cache instead of accessing lower in the memory hierarchy, and can have lower miss rates by allowing only blocks with unique content to enter a cache. This work examines the potential of CCD for instruction caches. We show that CCD is a frequent phenomenon and that an idealized duplication-detection mechanism for instruction caches has the potential to increase performance of an out-of-order processor, with a 16KB, 8-way, 8 instructions per block instruction cache, often by more than 10\% and up to 36\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vandierendonck:2011:MSR, author = "Hans Vandierendonck and Andr{\'e} Seznec", title = "Managing {SMT} resource usage through speculative instruction window weighting", journal = j-TACO, volume = "8", number = "3", pages = "12:1--12:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019611", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Simultaneous multithreading processors dynamically share processor resources between multiple threads. In general, shared SMT resources may be managed explicitly, for instance, by dynamically setting queue occupation bounds for each thread as in the DCRA and Hill-Climbing policies. Alternatively, resources may be managed implicitly; that is, resource usage is controlled by placing the desired instruction mix in the resources. In this case, the main resource management tool is the instruction fetch policy which must predict the behavior of each thread (branch mispredictions, long-latency loads, etc.) as it fetches instructions. In this article, we present the use of Speculative Instruction Window Weighting (SIWW) to bridge the gap between implicit and explicit SMT fetch policies.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2011:PGS, author = "Po-Han Wang and Chia-Lin Yang and Yen-Ming Chen and Yu-Jung Cheng", title = "Power gating strategies on {GPUs}", journal = j-TACO, volume = "8", number = "3", pages = "13:1--13:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019612", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As technology continues to shrink, reducing leakage is critical to achieving energy efficiency. Previous studies on low-power GPUs (Graphics Processing Units) focused on techniques for dynamic power reduction, such as DVFS (Dynamic Voltage and Frequency Scaling) and clock gating. In this paper, we explore the potential of adopting architecture-level power gating techniques for leakage reduction on GPUs. We propose three strategies for applying power gating on different modules in GPUs. The Predictive Shader Shutdown technique exploits workload variation across frames to eliminate leakage in shader clusters. Deferred Geometry Pipeline seeks to minimize leakage in fixed-function geometry units by utilizing an imbalance between geometry and fragment computation across batches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Feng:2011:DAD, author = "Min Feng and Chen Tian and Changhui Lin and Rajiv Gupta", title = "Dynamic access distance driven cache replacement", journal = j-TACO, volume = "8", number = "3", pages = "14:1--14:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019613", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we propose a new cache replacement policy that makes the replacement decision based on the reuse information of the cache lines and the requested data. We present the architectural support and evaluate the performance of our approach using SPEC benchmarks. We also develop two reuse information predictors: a profile-based static predictor and a runtime predictor. The applicability of each predictor is discussed in this paper. We further extend our reuse information predictors so that the cache can adaptively choose between the reuse information based replacement policy and an approximation of LRU policy. According to the experimental results, our adaptive reuse information based replacement policy performs either better than or close to the LRU policy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Samih:2011:EPP, author = "Ahmad Samih and Yan Solihin and Anil Krishna", title = "Evaluating placement policies for managing capacity sharing in {CMP} architectures with private caches", journal = j-TACO, volume = "8", number = "3", pages = "15:1--15:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019614", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Chip Multiprocessors (CMP) with distributed L2 caches suffer from a cache fragmentation problem; some caches may be overutilized while others may be underutilized. To avoid such fragmentation, researchers have proposed capacity sharing mechanisms where applications that need additional cache space can place their victim blocks in remote caches. However, we found that only allowing victim blocks to be placed on remote caches tends to cause a high number of remote cache hits relative to local cache hits. In this article, we show that many of the remote cache hits can be converted into local cache hits if we allow newly fetched blocks to be selectively placed directly in a remote cache, rather than in the local cache.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yeh:2011:MPP, author = "Chang-Ching Yeh and Kuei-Chung Chang and Tien-Fu Chen and Chingwei Yeh", title = "Maintaining performance on power gating of microprocessor functional units by using a predictive pre-wakeup strategy", journal = j-TACO, volume = "8", number = "3", pages = "16:1--16:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019615", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Power gating is an effective technique for reducing leakage power in deep submicron CMOS technology. Microarchitectural techniques for power gating of functional units have been developed by detecting suitable idle regions and turning them off to reduce leakage energy consumption; however, wakeup of functional units is needed when instructions are ready for execution such that wakeup overhead is naturally incurred. This study presents time-based power gating with reference pre-wakeup (PGRP), a novel predictive strategy that detects suitable idle periods for power gating and then enables pre-wakeup of needed functional units for avoiding wakeup overhead. The key insight is that most wakeups are repeated due to program locality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2011:DDE, author = "Hyunjin Lee and Sangyeun Cho and Bruce R. 
Childers", title = "{DEFCAM}: a design and evaluation framework for defect-tolerant cache memories", journal = j-TACO, volume = "8", number = "3", pages = "17:1--17:??", month = oct, year = "2011", CODEN = "????", DOI = "https://doi.org/10.1145/2019608.2019616", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Advances in deep submicron technology call for a careful review of existing cache designs and design practices in terms of yield, area, and performance. This article presents a Design and Evaluation Framework for defect-tolerant Cache Memories (DEFCAM), which enables processor architects to consider yield, area, and performance together in a unified framework. Since there is a complex, changing trade-off among these metrics depending on the technology, the cache organization, and the yield enhancement scheme employed, such a design flow is invaluable to processor architects when they assess a design and explore the design space quickly at an early stage. We develop a complete framework supporting the proposed DEFCAM design flow, from injecting defects into a wafer to evaluating program performance of individual processors on the wafer.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Stenstrom:2012:ISI, author = "Per Stenstr{\"o}m and Koen {De Bosschere}", title = "Introduction to the special issue on high-performance and embedded architectures and compilers", journal = j-TACO, volume = "8", number = "4", pages = "18:1--18:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086697", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Albericio:2012:ALC, author = "Jorge Albericio and Rub{\'e}n Gran and Pablo Ib{\'a}{\~n}ez and V{\'\i}ctor Vi{\~n}als and Jose Mar{\'\i}a Llaber{\'\i}a", title = "{ABS}: a low-cost adaptive controller for prefetching in a banked shared last-level cache", journal = j-TACO, volume = "8", number = "4", pages = "19:1--19:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086698", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Hardware data prefetch is a very well known technique for hiding memory latencies. However, in a multicore system fitted with a shared Last-Level Cache (LLC), prefetch induced by a core consumes common resources such as shared cache space and main memory bandwidth. This may degrade the performance of other cores and even the overall system performance unless the prefetch aggressiveness of each core is controlled from a system standpoint. 
On the other hand, LLCs in commercial chip multiprocessors are more and more frequently organized in independent banks. In this contribution, we target for the first time prefetch in a banked LLC organization and propose ABS, a low-cost controller with a hill-climbing approach that runs stand-alone at each LLC bank without requiring inter-bank communication.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bayrak:2012:AII, author = "Ali Galip Bayrak and Nikola Velickovic and Paolo Ienne and Wayne Burleson", title = "An architecture-independent instruction shuffler to protect against side-channel attacks", journal = j-TACO, volume = "8", number = "4", pages = "20:1--20:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086699", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Embedded cryptographic systems, such as smart cards, require secure implementations that are robust to a variety of low-level attacks. Side-Channel Attacks (SCA) exploit the information such as power consumption, electromagnetic radiation and acoustic leaking through the device to uncover the secret information. Attackers can mount successful attacks with very modest resources in a short time period. Therefore, many methods have been proposed to increase the security against SCA. Randomizing the execution order of the instructions that are independent, i.e., random shuffling, is one of the most popular among them. Implementing instruction shuffling in software is either implementation specific or has a significant performance or code size overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Demme:2012:AGC, author = "John Demme and Simha Sethumadhavan", title = "Approximate graph clustering for program characterization", journal = j-TACO, volume = "8", number = "4", pages = "21:1--21:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086700", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "An important aspect of system optimization research is the discovery of program traits or behaviors. In this paper, we present an automated method of program characterization which is able to examine and cluster program graphs, i.e., dynamic data graphs or control flow graphs. Our novel approximate graph clustering technology allows users to find groups of program fragments which contain similar code idioms or patterns in data reuse, control flow, and context. Patterns of this nature have several potential applications including development of new static or dynamic optimizations to be implemented in software or in hardware. For the SPEC CPU 2006 suite of benchmarks, our results show that approximate graph clustering is effective at grouping behaviorally similar functions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pricopi:2012:BPH, author = "Mihai Pricopi and Tulika Mitra", title = "{Bahurupi}: a polymorphic heterogeneous multi-core architecture", journal = j-TACO, volume = "8", number = "4", pages = "22:1--22:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086701", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Computing systems have made an irreversible transition towards parallel architectures with the emergence of multi-cores. Moreover, power and thermal limits in embedded systems mandate the deployment of many simpler cores rather than a few complex cores on chip. Consumer electronic devices, on the other hand, need to support an ever-changing set of diverse applications with varying performance demands. While some applications can benefit from thread-level parallelism offered by multi-core solutions, there still exist a large number of applications with substantial amount of sequential code. The sequential programs suffer from limited exploitation of instruction-level parallelism in simple cores. We propose a reconfigurable multi-core architecture, called Bahurupi, that can successfully reconcile the conflicting demands of instruction-level and thread-level parallelism.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cleemput:2012:CMT, author = "Jeroen V. 
Cleemput and Bart Coppens and Bjorn {De Sutter}", title = "Compiler mitigations for time attacks on modern x86 processors", journal = j-TACO, volume = "8", number = "4", pages = "23:1--23:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086702", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This paper studies and evaluates the extent to which automated compiler techniques can defend against timing-based side channel attacks on modern x86 processors. We study how modern x86 processors can leak timing information through side channels that relate to data flow. We study the efficiency, effectiveness, portability, predictability and sensitivity of several mitigating code transformations that eliminate or minimize key-dependent execution time variations. Furthermore, we discuss the extent to which compiler backends are a suitable tool to provide automated support for the proposed mitigations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mccandless:2012:CTI, author = "Jason Mccandless and David Gregg", title = "Compiler techniques to improve dynamic branch prediction for indirect jump and call instructions", journal = j-TACO, volume = "8", number = "4", pages = "24:1--24:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086703", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Indirect jump instructions are used to implement multiway branch statements and virtual function calls in object-oriented languages. Branch behavior can have significant impact on program performance, but fortunately hardware predictors can alleviate much of the risk. Modern processors include indirect branch predictors which use part of the target address to update a global history. We present a code generation technique to maximize the branch history information available to the predictor. We implement our optimization as an assembly language transformation, and evaluate it for SPEC benchmarks and interpreters using simulated and real hardware, showing indirect branch misprediction decreases.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Garcia-Guirado:2012:DDA, author = "Antonio Garc{\'\i}a-Guirado and Ricardo Fern{\'a}ndez-Pascual and Alberto Ros and Jos{\'e} M. 
Garc{\'\i}a", title = "{DAPSCO}: Distance-aware partially shared cache organization", journal = j-TACO, volume = "8", number = "4", pages = "25:1--25:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086704", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many-core tiled CMP proposals often assume a partially shared last level cache (LLC) since this provides a good compromise between access latency and cache utilization. In this paper, we propose a novel way to map memory addresses to LLC banks that takes into account the average distance between the banks and the tiles that access them. Contrary to traditional approaches, our mapping does not group the tiles in clusters within which all the cores access the same bank for the same addresses. Instead, two neighboring cores access different sets of banks minimizing the average distance travelled by the cache requests. Results for a 64-core CMP show that our proposal improves both execution time and the energy consumed by the network by 13\% when compared to a traditional mapping.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2012:FSS, author = "Zhenjiang Wang and Chenggang Wu and Pen-Chung Yew and Jianjun Li and Di Xu", title = "On-the-fly structure splitting for heap objects", journal = j-TACO, volume = "8", number = "4", pages = "26:1--26:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086705", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the advent of multicore systems, the gap between processor speed and memory latency has grown worse because of their complex interconnect. Sophisticated techniques are needed more than ever to improve an application's spatial and temporal locality. This paper describes an optimization that aims to improve heap data layout by structure-splitting. It also provides runtime address checking by piggybacking on the existing page protection mechanism to guarantee the correctness of such optimization that has eluded many previous attempts due to safety concerns. The technique can be applied to both sequential and parallel programs at either compile time or runtime. However, we focus primarily on sequential programs (i.e., single-threaded programs) at runtime in this paper.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Das:2012:ELC, author = "Dibyendu Das and B. 
Dupont {De Dinechin} and Ramakrishna Upadrasta", title = "Efficient liveness computation using merge sets and {DJ}-graphs", journal = j-TACO, volume = "8", number = "4", pages = "27:1--27:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086706", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this work we devise an efficient algorithm that computes the liveness information of program variables. The algorithm employs SSA form and DJ-graphs as representation to build Merge sets. The Merge set of node n, M(n) is based on the structure of the Control Flow Graph (CFG) and consists of all nodes where a {$\phi$}-function needs to be placed, if a definition of a variable appears in n. The merge sets of a CFG can be computed using DJ-graphs without prior knowledge of how the variables are used and defined. Later, we can answer the liveness query (as a part of other optimization or analysis phase) by utilizing the knowledge of the use/def of variables, the dominator tree and the pre-computed merge sets.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Patsilaras:2012:EEM, author = "George Patsilaras and Niket K. 
Choudhary and James Tuck", title = "Efficiently exploiting memory level parallelism on asymmetric coupled cores in the dark silicon era", journal = j-TACO, volume = "8", number = "4", pages = "28:1--28:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086707", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Extracting high memory-level parallelism (MLP) is essential for speeding up single-threaded applications which are memory bound. At the same time, the projected amount of dark silicon (the fraction of the chip powered off) on a chip is growing. Hence, Asymmetric Multicore Processors (AMP) offer a unique opportunity to integrate many types of cores, each powered at different times, in order to optimize for different regions of execution. In this work, we quantify the potential for exploiting core customization to speedup programs during regions of high MLP. Based on a careful design space exploration, we discover that an AMP that includes a narrow and fast specialized core has the potential to efficiently exploit MLP.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Malits:2012:ELG, author = "Roman Malits and Evgeny Bolotin and Avinoam Kolodny and Avi Mendelson", title = "Exploring the limits of {GPGPU} scheduling in control flow bound applications", journal = j-TACO, volume = "8", number = "4", pages = "29:1--29:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086708", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "GPGPUs are optimized for graphics; for that reason, the hardware is optimized for massively data parallel applications characterized by predictable memory access patterns and little control flow. For such applications, e.g., matrix multiplication, a GPGPU-based system can achieve very high performance. However, many general purpose data parallel applications are characterized as having intensive control flow and unpredictable memory access patterns. Optimizing the code in such problems for current hardware is often ineffective and even impractical since it exhibits low hardware utilization leading to relatively low performance. This work tracks the root causes of execution inefficacies when running control flow intensive CUDA applications on NVIDIA GPGPU hardware.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Orosa:2012:FIF, author = "Lois Orosa and Elisardo Antelo and Javier D. 
Bruguera", title = "{FlexSig}: {Implementing} flexible hardware signatures", journal = j-TACO, volume = "8", number = "4", pages = "30:1--30:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086709", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the advent of chip multiprocessors, new techniques have been developed to make parallel programming easier and more reliable. New parallel programming paradigms and new methods of making the execution of programs more efficient and more reliable have been developed. Usually, these improvements require hardware support to avoid a system slowdown. Signatures based on Bloom filters are widely used as hardware support for parallel programming in chip multiprocessors. Signatures are used in Transactional Memory, thread-level speculation, parallel debugging, deterministic replay and other tools and applications. The main limitation of hardware signatures is the lack of flexibility: if signatures are designed with a given configuration, tailored to the requirements of a specific tool or application, it is likely that they do not fit well for other different requirements.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Titos-Gil:2012:HTM, author = "Ruben Titos-Gil and Manuel E. Acacio and Jose M. 
Garcia and Tim Harris and Adrian Cristal and Osman Unsal and Ibrahim Hur and Mateo Valero", title = "Hardware transactional memory with software-defined conflicts", journal = j-TACO, volume = "8", number = "4", pages = "31:1--31:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086710", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this paper we investigate the benefits of turning the concept of transactional conflict from its traditionally fixed definition into a variable one that can be dynamically controlled in software. We propose the extension of the atomic language construct with an attribute that specifies the definition of conflict, so that programmers can write code which adjusts what kinds of conflicts are to be detected, relaxing or tightening the conditions according to the forms of interference that can be tolerated by a particular algorithm. Using this performance-motivated construct, specific conflict information can be associated with portions of code, as each transaction is provided with a local definition that applies while it executes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kim:2012:IPN, author = "Yongjoo Kim and Jongeun Lee and Toan X. 
Mai and Yunheung Paek", title = "Improving performance of nested loops on reconfigurable array processors", journal = j-TACO, volume = "8", number = "4", pages = "32:1--32:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086711", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Pipelining algorithms are typically concerned with improving only the steady-state performance, or the kernel time. The pipeline setup time happens only once and therefore can be negligible compared to the kernel time. However, for Coarse-Grained Reconfigurable Architectures (CGRAs) used as a coprocessor to a main processor, pipeline setup can take much longer due to the communication delay between the two processors, and can become significant if it is repeated in an outer loop of a loop nest. In this paper we evaluate the overhead of such non-kernel execution times when mapping nested loops for CGRAs, and propose a novel architecture-compiler cooperative scheme to reduce the overhead, while also minimizing the number of extra configurations required.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Purnaprajna:2012:MWI, author = "Madhura Purnaprajna and Paolo Ienne", title = "Making wide-issue {VLIW} processors viable on {FPGAs}", journal = j-TACO, volume = "8", number = "4", pages = "33:1--33:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086712", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Soft and highly-customized processors are emerging as a common way to efficiently control large amount of computing resources available on FPGAs. Yet, some processor architectures of choice for DSP and media applications, such as wide-issue VLIW processors, remain impractical: the multi-ported register file makes a very inefficient use of the resources in the FPGA fabric. This paper proposes modifications to existing FPGAs to make soft-VLIW processor viable. We introduce an embedded multi-ported RAM that can be customized to match the issue-width of VLIW processors. To ascertain the benefits of this approach, we map an extensible VLIW processor onto a standard FPGA from Xilinx.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Radojkovic:2012:EIS, author = "Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud Grasset and Eduardo Qui{\~n}ones and Sami Yehia and Francisco J. 
Cazorla", title = "On the evaluation of the impact of shared resources in multithreaded {COTS} processors in time-critical environments", journal = j-TACO, volume = "8", number = "4", pages = "34:1--34:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086713", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Commercial Off-The-Shelf (COTS) processors are now commonly used in real-time embedded systems. The characteristics of these processors fulfill system requirements in terms of time-to-market, low cost, and high performance-per-watt ratio. However, multithreaded (MT) processors are still not widely used in real-time systems because the timing analysis is too complex. In MT processors, simultaneously-running tasks share and compete for processor resources, so the timing analysis has to estimate the possible impact that the inter-task interferences have on the execution time of the applications. In this paper, we propose a method that quantifies the slowdown that simultaneously-running tasks may experience due to collision in shared processor resources.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Domnitser:2012:NMC, author = "Leonid Domnitser and Aamer Jaleel and Jason Loew and Nael Abu-Ghazaleh and Dmitry Ponomarev", title = "Non-monopolizable caches: Low-complexity mitigation of cache side channel attacks", journal = j-TACO, volume = "8", number = "4", pages = "35:1--35:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086714", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We propose a flexibly-partitioned cache design that either drastically weakens or completely eliminates cache-based side channel attacks. The proposed Non-Monopolizable (NoMo) cache dynamically reserves cache lines for active threads and prevents other co-executing threads from evicting reserved lines. Unreserved lines remain available for dynamic sharing among threads. NoMo requires only simple modifications to the cache replacement logic, making it straightforward to adopt. It requires no software support enabling it to automatically protect pre-existing binaries. NoMo results in performance degradation of about 1\% on average. We demonstrate that NoMo can provide strong security guarantees for the AES and Blowfish encryption algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rico:2012:SLS, author = "Alejandro Rico and Felipe Cabarcas and Carlos Villavieja and Milan Pavlovic and Augusto Vega and Yoav Etsion and Alex Ramirez and Mateo Valero", title = "On the simulation of large-scale architectures using multiple application abstraction levels", journal = j-TACO, volume = "8", number = "4", pages = "36:1--36:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086715", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Simulation is a key tool for computer architecture research. In particular, cycle-accurate simulators are extremely important for microarchitecture exploration and detailed design decisions, but they are slow and, so, not suitable for simulating large-scale architectures, nor are they meant for this. Moreover, microarchitecture design decisions are irrelevant, or even misleading, for early processor design stages and high-level explorations. This allows one to raise the abstraction level of the simulated architecture, and also the application abstraction level, as it does not necessarily have to be represented as an instruction stream. In this paper we introduce a definition of different application abstraction levels, and how these are employed in TaskSim, a multi-core architecture simulator, to provide several architecture modeling abstractions, and simulate large-scale architectures with hundreds of cores.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Saidi:2012:OED, author = "Selma Saidi and Pranav Tendulkar and Thierry Lepley and Oded Maler", title = "Optimizing explicit data transfers for data parallel applications on the {Cell} architecture", journal = j-TACO, volume = "8", number = "4", pages = "37:1--37:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086716", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this paper we investigate a general approach to automate some deployment decisions for a certain class of applications on multi-core computers. We consider data-parallelizable programs that use the well-known double buffering technique to bring the data from the off-chip slow memory to the local memory of the cores via a DMA (direct memory access) mechanism. Based on the computation time and size of elementary data items as well as DMA characteristics, we derive optimal and near optimal values for the number of blocks that should be clustered in a single DMA command. We then extend the results to the case where a computation for one data item needs some data in its neighborhood.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Feng:2012:PPL, author = "Min Feng and Changhui Lin and Rajiv Gupta", title = "{PLDS}: Partitioning linked data structures for parallelism", journal = j-TACO, volume = "8", number = "4", pages = "38:1--38:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086717", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recently, parallelization of computations in the presence of dynamic data structures has shown promising potential. In this paper, we present PLDS, a system for easily expressing and efficiently exploiting parallelism in computations that are based on dynamic linked data structures. PLDS improves the execution efficiency by providing support for data partitioning and then distributing computation across threads based on the partitioning. Such computations often require the use of speculation to exploit dynamic parallelism. PLDS supports a conditional speculation mechanism that reduces the cost of speculation. PLDS can be employed in the context of different forms of parallelism, which cover a wide range of parallel applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pradelle:2012:PPB, author = "Benoit Pradelle and Alain Ketterlin and Philippe Clauss", title = "Polyhedral parallelization of binary code", journal = j-TACO, volume = "8", number = "4", pages = "39:1--39:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086718", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many automatic software parallelization systems have been proposed in the past decades, but most of them are dedicated to source-to-source transformations. This paper shows that parallelizing executable programs is feasible, even if they require complex transformations, and in effect decouples parallelization from compilation, for example, for closed-source or legacy software, where binary code is the only available representation. We propose an automatic parallelizer, which is able to perform advanced parallelization on binary code. It first parses the binary code and extracts high-level information. From this information, a C program is generated. This program captures only a subset of the program semantics, namely, loops and memory accesses.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dong:2012:RAE, author = "Yaozu Dong and Yu Chen and Zhenhao Pan and Jinquan Dai and Yunhong Jiang", title = "{ReNIC}: Architectural extension to {SR-IOV} {I/O} virtualization for efficient replication", journal = j-TACO, volume = "8", number = "4", pages = "40:1--40:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086719", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Virtualization is gaining popularity in cloud computing and has become the key enabling technology in cloud infrastructure. By replicating the virtual server state to multiple independent platforms, virtualization improves the reliability and availability of cloud systems. Unfortunately, existing Virtual Machine (VM) replication solutions were designed only for software virtualized I/O, which suffers from large performance and scalability overheads. Although hardware-assisted I/O virtualization (such as SR-IOV) can achieve close to native performance and very good scalability, they cannot be properly replicated across different physical machines due to architectural limitations (such as lack of efficient device state read/write, buffering outbound packets, etc.).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bruintjes:2012:SLA, author = "Tom M. Bruintjes and Karel H. G. Walters and Sabih H. Gerez and Bert Molenkamp and Gerard J. M. 
Smit", title = "{Sabrewing}: a lightweight architecture for combined floating-point and integer arithmetic", journal = j-TACO, volume = "8", number = "4", pages = "41:1--41:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086720", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In spite of the fact that floating-point arithmetic is costly in terms of silicon area, the joint design of hardware for floating-point and integer arithmetic is seldom considered. While components like multipliers and adders can potentially be shared, floating-point and integer units in contemporary processors are practically disjoint. This work presents a new architecture which tightly integrates floating-point and integer arithmetic in a single datapath. It is mainly intended for use in low-power embedded digital signal processors and therefore the following design constraints were important: limited use of pipelining for the convenience of the compiler; maintaining compatibility with existing technology; minimal area and power consumption for applicability in embedded systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kicherer:2012:SPA, author = "Mario Kicherer and Fabian Nowak and Rainer Buchty and Wolfgang Karl", title = "Seamlessly portable applications: Managing the diversity of modern heterogeneous systems", journal = j-TACO, volume = "8", number = "4", pages = "42:1--42:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086721", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Nowadays, many possible configurations of heterogeneous systems exist, posing several new challenges to application development: different types of processing units usually require individual programming models with dedicated runtime systems and accompanying libraries. If these are absent on an end-user system, e.g. because the respective hardware is not present, an application linked against these will break. This handicaps portability of applications being developed on one system and executed on other, differently configured heterogeneous systems. Moreover, the individual profit of different processing units is normally not known in advance. In this work, we propose a technique to effectively decouple applications from their accelerator-specific parts, respectively code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Premillieu:2012:SSR, author = "Nathanael Premillieu and Andre Seznec", title = "{SYRANT}: {SYmmetric Resource Allocation on Not-taken and Taken} paths", journal = j-TACO, volume = "8", number = "4", pages = "43:1--43:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086722", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In the multicore era, achieving ultimate single process performance is still an issue e.g. for single process workload or for sequential sections in parallel applications. Unfortunately, despite tremendous research effort on branch prediction, substantial performance potential is still wasted due to branch mispredictions. On a branch misprediction resolution, instruction treatment on the wrong path is essentially thrown away. However, in most cases after a conditional branch, the taken and the not-taken paths of execution merge after a few instructions. Instructions that follow the reconvergence point are executed whatever the branch outcome is. We present SYRANT (SYmmetric Resource Allocation on Not-taken and Taken paths), a new technique for exploiting control independence.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hasenplaugh:2012:GBC, author = "William Hasenplaugh and Pritpal S. 
Ahuja and Aamer Jaleel and Simon {Steely, Jr.} and Joel Emer", title = "The gradient-based cache partitioning algorithm", journal = j-TACO, volume = "8", number = "4", pages = "44:1--44:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086723", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This paper addresses the problem of partitioning a cache between multiple concurrent threads and in the presence of hardware prefetching. Cache replacement designed to preserve temporal locality (e.g., LRU) will allocate cache resources proportional to the miss-rate of each competing thread irrespective of whether the cache space will be utilized [Qureshi and Patt 2006]. This is clearly suboptimal as applications vary dramatically in their use of recently accessed data. We address this problem by partitioning a shared cache such that a global goodness metric is optimized. This paper introduces the Gradient-based Cache Partitioning Algorithm (GPA), whose variants optimize either hitrate, total instructions per cycle (IPC) or a weighted IPC metric designed to enforce Quality of Service (QoS) [Iyer 2004].", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lira:2012:MPA, author = "Javier Lira and Timothy M. 
Jones and Carlos Molina and Antonio Gonz{\'a}lez", title = "The migration prefetcher: Anticipating data promotion in dynamic {NUCA} caches", journal = j-TACO, volume = "8", number = "4", pages = "45:1--45:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086724", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The exponential increase in multicore processor (CMP) cache sizes accompanied by growing on-chip wire delays make it difficult to implement traditional caches with a single, uniform access latency. Non-Uniform Cache Architecture (NUCA) designs have been proposed to address this problem. A NUCA divides the whole cache memory into smaller banks and allows banks nearer a processor core to have lower access latencies than those further away, thus mitigating the effects of the cache's internal wires. Determining the best placement for data in the NUCA cache at any particular moment during program execution is crucial for exploiting the benefits that this architecture provides.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pusukuri:2012:TTD, author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N. 
Bhuyan", title = "Thread Tranquilizer: Dynamically reducing performance variation", journal = j-TACO, volume = "8", number = "4", pages = "46:1--46:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086725", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To realize the performance potential of multicore systems, we must effectively manage the interactions between memory reference behavior and the operating system policies for thread scheduling and migration decisions. We observe that these interactions lead to significant variations in the performance of a given application, from one execution to the next, even when the program input remains unchanged and no other applications are being run on the system. Our experiments with multithreaded programs, including the TATP database application, SPECjbb2005, and a subset of PARSEC and SPEC OMP programs, on a 24-core Dell PowerEdge R905 server running OpenSolaris confirms the above observation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2012:TPB, author = "Dongsong Zhang and Deke Guo and Fangyuan Chen and Fei Wu and Tong Wu and Ting Cao and Shiyao Jin", title = "{TL}-plane-based multi-core energy-efficient real-time scheduling algorithm for sporadic tasks", journal = j-TACO, volume = "8", number = "4", pages = "47:1--47:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086726", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As the energy consumption of multi-core systems becomes increasingly prominent, it's a challenge to design an energy-efficient real-time scheduling algorithm in multi-core systems for reducing the system energy consumption while guaranteeing the feasibility of real-time tasks. In this paper, we focus on multi-core processors, with the global Dynamic Voltage Frequency Scaling (DVFS) and Dynamic Power Management (DPM) technologies. In this setting, we propose an energy-efficient real-time scheduling algorithm, the Time Local remaining execution plane based Dynamic Voltage Frequency Scaling (TL-DVFS). TL-DVFS utilizes the concept of Time Local remaining execution (TL) plane to dynamically scale the voltage and frequency of a processor at the initial time of each TL plane as well as at the release time of a sporadic task in each TL plane.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lyons:2012:ASS, author = "Michael J. 
Lyons and Mark Hempstead and Gu-Yeon Wei and David Brooks", title = "The accelerator store: a shared memory framework for accelerator-based systems", journal = j-TACO, volume = "8", number = "4", pages = "48:1--48:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086727", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This paper presents the many-accelerator architecture, a design approach combining the scalability of homogeneous multi-core architectures and system-on-chip's high performance and power-efficient hardware accelerators. In preparation for systems containing tens or hundreds of accelerators, we characterize a diverse pool of accelerators and find each contains significant amounts of SRAM memory (up to 90\% of their area). We take advantage of this discovery and introduce the accelerator store, a scalable architectural component to minimize accelerator area by sharing its memories between accelerators. We evaluate the accelerator store for two applications and find significant system area reductions (30\%) in exchange for small overheads (2\% performance, 0\%--8\% energy).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Orozco:2012:THT, author = "Daniel Orozco and Elkin Garcia and Rishi Khan and Kelly Livingston and Guang R. 
Gao", title = "Toward high-throughput algorithms on many-core architectures", journal = j-TACO, volume = "8", number = "4", pages = "49:1--49:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086728", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Advanced many-core CPU chips already have a few hundreds of processing cores (e.g., 160 cores in an IBM Cyclops-64 chip) and more and more processing cores become available as computer architecture progresses. The underlying runtime systems of such architectures need to efficiently serve hundreds of processors at the same time, requiring all basic data structures within the runtime to maintain unprecedented throughput. In this paper, we analyze the throughput requirements that must be met by algorithms in runtime systems to be able to handle hundreds of simultaneous operations in real time. We reach a surprising conclusion: Many traditional algorithm techniques are poorly suited for highly parallel computing environments because of their low throughput.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Stock:2012:UML, author = "Kevin Stock and Louis-No{\"e}l Pouchet and P. 
Sadayappan", title = "Using machine learning to improve automatic vectorization", journal = j-TACO, volume = "8", number = "4", pages = "50:1--50:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086729", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Automatic vectorization is critical to enhancing performance of compute-intensive programs on modern processors. However, there is much room for improvement over the auto-vectorization capabilities of current production compilers through careful vector-code synthesis that utilizes a variety of loop transformations (e.g., unroll-and-jam, interchange, etc.). As the set of transformations considered is increased, the selection of the most effective combination of transformations becomes a significant challenge: Currently used cost models in vectorizing compilers are often unable to identify the best choices. In this paper, we address this problem using machine learning models to predict the performance of SIMD codes. In contrast to existing approaches that have used high-level features of the program, we develop machine learning models based on features extracted from the generated assembly code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Therdsteerasukdi:2012:URI, author = "Kanit Therdsteerasukdi and Gyungsu Byun and Jason Cong and M.
Frank Chang and Glenn Reinman", title = "Utilizing {RF-I} and intelligent scheduling for better throughput\slash watt in a mobile {GPU} memory system", journal = j-TACO, volume = "8", number = "4", pages = "51:1--51:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086730", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Smartphones and tablets are becoming more and more powerful, replacing desktops and laptops as the users' main computing system. As these systems support higher and higher resolutions with more complex 3D graphics, a high-throughput and low-power memory system is essential for the mobile GPU. In this article, we propose to improve throughput/watt in a mobile GPU memory system by using intelligent scheduling to reduce power and multi-band radio frequency interconnect (MRF-I) to offset any throughput degradation caused by our intelligent scheduling.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ryckbosch:2012:VSM, author = "Frederick Ryckbosch and Stijn Polfliet and Lieven Eeckhout", title = "{VSim}: Simulating multi-server setups at near native hardware speed", journal = j-TACO, volume = "8", number = "4", pages = "52:1--52:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086731", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Simulating contemporary computer systems is a challenging endeavor, especially when it comes to simulating high-end setups involving multiple servers. 
The simulation environment needs to run complete software stacks, including operating systems, middleware, and application software, and it needs to simulate network and disk activity next to CPU performance. In addition, it needs the ability to scale out to a large number of server nodes while attaining good accuracy and reasonable simulation speeds. This paper presents VSim, a novel simulation methodology for multi-server systems. VSim leverages virtualization technology for simulating a target system on a host system. VSim controls CPU, network and disk performance on the host, and it gives the illusion to the software stack to run on a target system through time dilation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2012:WAP, author = "Miao Zhou and Yu Du and Bruce Childers and Rami Melhem and Daniel Moss{\'e}", title = "Writeback-aware partitioning and replacement for last-level caches in phase change main memory systems", journal = j-TACO, volume = "8", number = "4", pages = "53:1--53:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086732", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Phase-Change Memory (PCM) has emerged as a promising low-power main memory candidate to replace DRAM. The main problems of PCM are that writes are much slower and more power hungry than reads, write bandwidth is much lower than read bandwidth, and limited write endurance. Adding an extra layer of cache, which is logically the last-level cache (LLC), can mitigate the drawbacks of PCM. 
However, writebacks from the LLC might (a) overwhelm the limited PCM write bandwidth and stall the application, (b) shorten lifetime, and (c) increase energy consumption. Cache partitioning and replacement schemes are important to achieve high throughput for multi-core systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2012:TMA, author = "Qingping Wang and Sameer Kulkarni and John Cavazos and Michael Spear", title = "A transactional memory with automatic performance tuning", journal = j-TACO, volume = "8", number = "4", pages = "54:1--54:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086733", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A significant obstacle to the acceptance of transactional memory (TM) in real-world parallel programs is the abundance of substantially different TM algorithms. Each TM algorithm appears well-suited to certain workload characteristics, but the best choice of algorithm is sensitive to program inputs, available cores, and program phases. Furthermore, operating system and hardware characteristics can affect which algorithm is best, with tradeoffs changing across iterations of a single ISA. This paper introduces methods for constructing policies to dynamically select the most appropriate TM algorithm based on static and dynamic information. We leverage intraprocedural static analysis to create a static profile of the application.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bogdanski:2012:SFC, author = "Bartosz Bogdanski and Sven-Arne Reinemo and Frank Olaf Sem-Jacobsen and Ernst Gunnar Gran", title = "{sFtree}: a fully connected and deadlock-free switch-to-switch routing algorithm for fat-trees", journal = j-TACO, volume = "8", number = "4", pages = "55:1--55:??", month = jan, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2086696.2086734", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Existing fat-tree routing algorithms fully exploit the path diversity of a fat-tree topology in the context of compute node traffic, but they lack support for deadlock-free and fully connected switch-to-switch communication. Such support is crucial for efficient system management, for example, in InfiniBand (IB) systems. With the general increase in system management capabilities found in modern InfiniBand switches, the lack of deadlock-free switch-to-switch communication is a problem for fat-tree-based IB installations because management traffic might cause routing deadlocks that bring the whole system down. This lack of deadlock-free communication affects all system management and diagnostic tools using LID routing. In this paper, we propose the sFtree routing algorithm that guarantees deadlock-free and fully connected switch-to-switch communication in fat-trees while maintaining the properties of the current fat-tree algorithm.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ghandour:2012:LSB, author = "Walid J. 
Ghandour and Haitham Akkary and Wes Masri", title = "Leveraging Strength-Based Dynamic Information Flow Analysis to Enhance Data Value Prediction", journal = j-TACO, volume = "9", number = "1", pages = "1:1--1:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133382.2133383", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 30 17:45:35 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Value prediction is a technique to increase parallelism by attempting to overcome serialization constraints caused by true data dependences. By predicting the outcome of an instruction before it executes, value prediction allows data dependent instructions to issue and execute speculatively, hence increasing parallelism when the prediction is correct. In case of a misprediction, the execution is redone with the corrected value. If the benefit from increased parallelism outweighs the misprediction recovery penalty, overall performance could be improved. Enhancing performance with value prediction therefore requires highly accurate prediction methods. Most existing general value prediction techniques are local, that is, future outputs of an instruction are predicted based on outputs from previous executions of the same instruction. In this article, we investigate leveraging strength-based dynamic information flow analysis to enhance data value prediction. We use dynamic information flow analysis (DIFA) to determine when a specific value predictor can perform well and even outperform other predictors. We apply information theory to mathematically prove the validity and benefits of correlating value predictors. We also introduce the concept of the linear value predictors, a new technique that predicts a new value from another one using a linear relation. We finally present a variant of stride predictor that we call update stride. 
We then conduct an empirical analysis using Pin, a dynamic binary instrumentation tool, and DynFlow, a dynamic information flow analysis tool, that we apply to programs from the SPECjvm2008 and Siemens benchmarks. Our empirical measurements support our mathematical theory and allow us to make important observations on the relation between predictability of data values and information flow. Our analysis and empirical results show that the values of a set of selected variables can be predicted with a very high accuracy, up to 100\%. Such prediction is based on the previous history and/or the values of one or more other source variables that have strong information flow into the predicted variable. Using our selection criteria, we show that a DIFA-directed predictor outperforms hardware value prediction for all subject programs, and sometimes by a significant margin. This was observed even when using an ideal tagged hardware value prediction table that does not suffer from aliasing or capacity misses.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2012:WPW, author = "Jaekyu Lee and Hyesoon Kim and Richard Vuduc", title = "When Prefetching Works, When It Doesn't, and Why", journal = j-TACO, volume = "9", number = "1", pages = "2:1--2:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133382.2133384", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 30 17:45:35 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In emerging and future high-end processor systems, tolerating increasing cache miss latency and properly managing memory bandwidth will be critical to achieving high performance. 
Prefetching, in both hardware and software, is among our most important available techniques for doing so; yet, we claim that prefetching is perhaps also the least well-understood. Thus, the goal of this study is to develop a novel, foundational understanding of both the benefits and limitations of hardware and software prefetching. Our study includes: source code-level analysis, to help in understanding the practical strengths and weaknesses of compiler- and software-based prefetching; a study of the synergistic and antagonistic effects between software and hardware prefetching; and an evaluation of hardware prefetching training policies in the presence of software prefetching requests. We use both simulation and measurement on real systems. We find, for instance, that although there are many opportunities for compilers to prefetch much more aggressively than they currently do, there is also a tangible risk of interference with training existing hardware prefetching mechanisms. Taken together, our observations suggest new research directions for cooperative hardware/software prefetching.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mazloom:2012:DTI, author = "Bita Mazloom and Shashidhar Mysore and Mohit Tiwari and Banit Agrawal and Tim Sherwood", title = "Dataflow Tomography: Information Flow Tracking For Understanding and Visualizing Full Systems", journal = j-TACO, volume = "9", number = "1", pages = "3:1--3:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133382.2133385", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 30 17:45:35 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "It is not uncommon for modern systems to be composed of a variety of interacting services, running across multiple machines in such a way that most developers do not really understand the whole system. As abstraction is layered atop abstraction, developers gain the ability to compose systems of extraordinary complexity with relative ease. However, many software properties, especially those that cut across abstraction layers, become very difficult to understand in such compositions. The communication patterns involved, the privacy of critical data, and the provenance of information, can be difficult to find and understand, even with access to all of the source code. The goal of Dataflow Tomography is to use the inherent information flow of such systems to help visualize the interactions between complex and interwoven components across multiple layers of abstraction. In the same way that the injection of short-lived radioactive isotopes help doctors trace problems in the cardiovascular system, the use of ``data tagging'' can help developers slice through the extraneous layers of software and pin-point those portions of the system interacting with the data of interest. 
To demonstrate the feasibility of this approach we have developed a prototype system in which tags are tracked both through the machine and in between machines over the network, and from which novel visualizations of the whole system can be derived. We describe the system-level challenges in creating a working system tomography tool and we qualitatively evaluate our system by examining several example real world scenarios.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ahn:2012:ISE, author = "Jung Ho Ahn and Norman P. Jouppi and Christos Kozyrakis and Jacob Leverich and Robert S. Schreiber", title = "Improving System Energy Efficiency with Memory Rank Subsetting", journal = j-TACO, volume = "9", number = "1", pages = "4:1--4:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133382.2133386", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 30 17:45:35 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "VLSI process technology scaling has enabled dramatic improvements in the capacity and peak bandwidth of DRAM devices. However, current standard DDR x DIMM memory interfaces are not well tailored to achieve high energy efficiency and performance in modern chip-multiprocessor-based computer systems. Their suboptimal performance and energy inefficiency can have a significant impact on system-wide efficiency since much of the system power dissipation is due to memory power. New memory interfaces, better suited for future many-core systems, are needed. In response, there are recent proposals to enhance the energy efficiency of main-memory systems by dividing a memory rank into subsets, and making a subset rather than a whole rank serve a memory request. 
We holistically assess the effectiveness of rank subsetting from system-wide performance, energy-efficiency, and reliability perspectives. We identify the impact of rank subsetting on memory power and processor performance analytically, compare two promising rank-subsetting proposals, Multicore DIMM and mini-rank, and verify our analysis by simulating a chip-multiprocessor system using multithreaded and consolidated workloads. We extend the design of Multicore DIMM for high-reliability systems and show that compared with conventional chipkill approaches, rank subsetting can lead to much higher system-level energy efficiency and performance at the cost of additional DRAM devices. This holistic assessment shows that rank subsetting offers compelling alternatives to existing processor-memory interfaces for future DDR systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2012:CGC, author = "Xuejun Yang and Li Wang and Jingling Xue and Qingbo Wu", title = "Comparability Graph Coloring for Optimizing Utilization of Software-Managed Stream Register Files for Stream Processors", journal = j-TACO, volume = "9", number = "1", pages = "5:1--5:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133382.2133387", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 30 17:45:35 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The stream processors represent a promising alternative to traditional cache-based general-purpose processors in achieving high performance in stream applications (media and some scientific applications). In a stream programming model for stream processors, an application is decomposed into a sequence of kernels operating on streams of data. 
During the execution of a kernel on a stream processor, all streams accessed must be communicated through a nonbypassing software-managed on-chip memory, the SRF (Stream Register File). Optimizing utilization of the scarce on-chip memory is crucial for good performance. The key insight is that the interference graphs (IGs) formed by the streams in stream applications tend to be comparability graphs or decomposable into a set of comparability graphs. We present a compiler algorithm for finding optimal or near-optimal colorings, that is, SRF allocations in stream IGs, by computing a maximum spanning forest of the sub-IG formed by long live ranges, if necessary. Our experimental results validate the optimality and near-optimality of our algorithm by comparing it with an ILP solver, and show that our algorithm yields improved SRF utilization over the First-Fit bin-packing algorithm, the best in the literature.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Majumdar:2012:MPE, author = "Abhinandan Majumdar and Srihari Cadambi and Michela Becchi and Srimat T. Chakradhar and Hans Peter Graf", title = "A Massively Parallel, Energy Efficient Programmable Accelerator for Learning and Classification", journal = j-TACO, volume = "9", number = "1", pages = "6:1--6:??", month = mar, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2133382.2133388", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 30 17:45:35 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Applications that use learning and classification algorithms operate on large amounts of unstructured data, and have stringent performance constraints. 
For such applications, the performance of general purpose processors scales poorly with data size because of their limited support for fine-grained parallelism and absence of software-managed caches. The large intermediate data in these applications also limits achievable performance on many-core processors such as GPUs. To accelerate such learning applications, we present a programmable accelerator that can execute multiple learning and classification algorithms. To architect such an accelerator, we profile five representative workloads, and find that their computationally intensive portions can be formulated as matrix or vector operations generating large amounts of intermediate data, which are then reduced by a secondary operation such as array ranking, finding max/min and aggregation. Our proposed accelerator, called MAPLE, has hundreds of simple processing elements (PEs) laid out in a two-dimensional grid, with two key features. First, it uses dynamic in-memory processing where on-chip memory blocks perform the secondary reduction operations. Second, MAPLE uses banked off-chip memory, and organizes its PEs into independent groups each with its own off-chip memory bank. These two features allow MAPLE to scale its performance with data size. We also present an Atom based energy-efficient heterogeneous system with MAPLE as the accelerator that satisfies the application's performance requirements at a lower system power. This article describes the MAPLE architecture, explores its design space with a simulator, illustrates how to automatically map application kernels to the hardware, and presents its performance improvement and energy benefits over classic server-based implementations. We implement a 512-PE FPGA prototype of MAPLE and find that it is 1.5-10x faster than a 2.5 GHz quad-core Xeon processor despite running at a modest 125 MHz clock rate. 
With MAPLE connected to a 1.6GHz dual-core Atom, we show an energy improvement of 38--84\% over the Xeon server coupled to a 1.3 GHz 240 core Tesla GPU.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Eyerman:2012:PMJ, author = "Stijn Eyerman and Lieven Eeckhout", title = "Probabilistic modeling for job symbiosis scheduling on {SMT} processors", journal = j-TACO, volume = "9", number = "2", pages = "7:1--7:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2207222.2207223", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Jun 13 17:20:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Symbiotic job scheduling improves simultaneous multithreading (SMT) processor performance by coscheduling jobs that have ``compatible'' demands on the processor's shared resources. Existing approaches however require a sampling phase, evaluate a limited number of possible coschedules, use heuristics to gauge symbiosis, are rigid in their optimization target, and do not preserve system-level priorities/shares. This article proposes probabilistic job symbiosis modeling, which predicts whether jobs will create positive or negative symbiosis when coscheduled without requiring the coschedule to be evaluated. The model, which uses per-thread cycle stacks computed through a previously proposed cycle accounting architecture, is simple enough to be used in system software. 
Probabilistic job symbiosis modeling provides six key innovations over prior work in symbiotic job scheduling: (i) it does not require a sampling phase, (ii) it readjusts the job coschedule continuously, (iii) it evaluates a large number of possible coschedules at very low overhead, (iv) it is not driven by heuristics, (v) it can optimize a performance target of interest (e.g., system throughput or job turnaround time), and (vi) it preserves system-level priorities/shares. These innovations make symbiotic job scheduling both practical and effective. Our experimental evaluation, which assumes a realistic scenario in which jobs come and go, reports an average 16\% (and up to 35\%) reduction in job turnaround time compared to the previously proposed SOS (sample, optimize, symbios) approach for a two-thread SMT processor, and an average 19\% (and up to 45\%) reduction in job turnaround time for a four-thread SMT processor.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Seghir:2012:IAT, author = "Rachid Seghir and Vincent Loechner and Beno{\^\i}t Meister", title = "Integer affine transformations of parametric {$Z$}-polytopes and applications to loop nest optimization", journal = j-TACO, volume = "9", number = "2", pages = "8:1--8:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2207222.2207224", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Jun 13 17:20:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The polyhedral model is a well-known compiler optimization framework for the analysis and transformation of affine loop nests. We present a new method to solve a difficult geometric operation that is raised by this model: the integer affine transformation of parametric $Z$-polytopes. 
The result of such a transformation is given by a worst-case exponential union of $Z$-polytopes. We also propose a polynomial algorithm (for fixed dimension), to count points in arbitrary unions of a fixed number of parametric $Z$-polytopes. We implemented these algorithms and compared them to other existing algorithms, for a set of applications to loop nest analysis and optimization.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2012:UOC, author = "Yi Yang and Ping Xiang and Jingfei Kong and Mike Mantor and Huiyang Zhou", title = "A unified optimizing compiler framework for different {GPGPU} architectures", journal = j-TACO, volume = "9", number = "2", pages = "9:1--9:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2207222.2207225", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Jun 13 17:20:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article presents a novel optimizing compiler for general purpose computation on graphics processing units (GPGPU). It addresses two major challenges of developing high performance GPGPU programs: effective utilization of GPU memory hierarchy and judicious management of parallelism. The input to our compiler is a na{\"\i}ve GPU kernel function, which is functionally correct but without any consideration for performance optimization. The compiler generates two kernels, one optimized for global memories and the other for texture memories. The proposed compilation process is effective for both AMD/ATI and NVIDIA GPUs. The experiments show that our optimized code achieves very high performance, either superior or very close to highly fine-tuned libraries.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jang:2012:ACO, author = "Choonki Jang and Jaejin Lee and Bernhard Egger and Soojung Ryu", title = "Automatic code overlay generation and partially redundant code fetch elimination", journal = j-TACO, volume = "9", number = "2", pages = "10:1--10:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2207222.2207226", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Jun 13 17:20:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "There is an increasing interest in explicitly managed memory hierarchies, where a hierarchy of distinct memories is exposed to the programmer and managed explicitly in software. These hierarchies can be found in typical embedded systems and an emerging class of multicore architectures. To run an application that requires more code memory than the available higher-level memory, typically an overlay structure is needed. The overlay structure is generated manually by the programmer or automatically by a specialized linker. Manual code overlaying requires the programmer to deeply understand the program structure for maximum memory savings as well as minimum performance degradation. Although the linker can automatically generate the code overlay structure, its memory savings are limited and it even brings significant performance degradation because traditional techniques do not consider the program context. In this article, we propose an automatic code overlay generation technique that overcomes the limitations of traditional automatic code overlaying techniques. We are dealing with a system context that imposes two distinct constraints: (1) no hardware support for address translation and (2) a spatially and temporally coarse grained faulting mechanism at the function level. 
Our approach addresses those two constraints as efficiently as possible. Our technique statically computes the Worst-Case Number of Conflict misses (WCNC) between two different code segments using path expressions. Then, it constructs a static temporal relationship graph with the WCNCs and emits an overlay structure for a given higher-level memory size. We also propose an inter-procedural partial redundancy elimination technique that minimizes redundant code copying caused by the generated overlay structure. Experimental results show that our approach is promising.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Abbasi:2012:TSW, author = "Zahra Abbasi and Georgios Varsamopoulos and Sandeep K. S. Gupta", title = "{TACOMA}: Server and workload management in {Internet} data centers considering cooling-computing power trade-off and energy proportionality", journal = j-TACO, volume = "9", number = "2", pages = "11:1--11:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2207222.2207227", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Jun 13 17:20:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A two-tier Internet data center management scheme, TACOMA, with thermal-aware server provisioning (TASP) in one tier, and thermal-aware workload distribution (TAWD) in the other is proposed. TASP and TAWD coordinate to maximize the energy savings by leveraging the workload dynamics, at coarse and fine time scale, respectively. TACOMA is aware of the QoS constraints, the energy proportionality of servers, and the potential trade-off between cooling and computing power. 
The obtained energy savings are a combination of suspending idle servers, using servers at their peak efficiency, and avoiding heat recirculation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lankes:2012:BSP, author = "Andreas Lankes and Thomas Wild and Stefan Wallentowitz and Andreas Herkersdorf", title = "Benefits of selective packet discard in networks-on-chip", journal = j-TACO, volume = "9", number = "2", pages = "12:1--12:??", month = jun, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2207222.2207228", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Jun 13 17:20:51 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Today, Network on Chip concepts principally assume inherent lossless operation. Considering that future nanometer CMOS technologies will witness increased sensitivity to all forms of manufacturing and environmental variations (e.g., IR drop, soft errors due to radiation, transient temperature induced timing problems, device aging), efforts to cope with data corruption or packet loss will be unavoidable. Possible counter measures against packet loss are the extension of flits with ECC or the introduction of error detection with retransmission. We propose to make use of the perceived deficiency of packet loss as a feature. By selectively discarding stuck packets in the NoC, a proven practice in computer networks, all types of deadlocks can be resolved. This is especially advantageous for solving the problem of message-dependent deadlocks, which otherwise leads to high costs either in terms of throughput or chip area. Strict ordering, the most popular approach to this problem, results in a significant buffer overhead and a more complex router architecture. 
In addition, we will show that eliminating local network congestions by selectively discarding individual packets also can improve the effective throughput of the network. The end-to-end retransmission mechanism required for the reliable communication, then also provides lossless communication for the cores.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luo:2012:DDS, author = "Yangchun Luo and Antonia Zhai", title = "Dynamically dispatching speculative threads to improve sequential execution", journal = j-TACO, volume = "9", number = "3", pages = "13:1--13:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355586", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Efficiently utilizing multicore processors to improve their performance potentials demands extracting thread-level parallelism from the applications. Various novel and sophisticated execution models have been proposed to extract thread-level parallelism from sequential programs. One such execution model, Thread-Level Speculation (TLS), allows potentially dependent threads to execute speculatively in parallel. However, TLS execution is inherently unpredictable, and consequently incorrect speculation could degrade performance for the multicore systems. Existing approaches have focused on using the compilers to select sequential program regions to apply TLS. Our research shows that even the state-of-the-art compiler makes suboptimal decisions, due to the unpredictability of TLS execution. Thus, we propose to dynamically optimize TLS performance. 
This article describes the design, implementation, and evaluation of a runtime thread dispatching mechanism that adjusts the behaviors of speculative threads based on their efficiency. In the proposed system, speculative threads are monitored by hardware-based performance counters and their performance impact is evaluated with a novel methodology that takes into account various unique TLS characteristics. Thread dispatching policies are devised to adjust the behaviors of speculative threads accordingly. With the help of the runtime evaluation, where and how to create speculative threads is better determined. Evaluated with all the SPEC CPU2000 benchmark programs written in C, the dynamic dispatching system outperforms the state-of-the-art compiler-based thread management techniques by 9.4\% on average. Comparing to sequential execution, we achieve 1.37X performance improvement on a four-core CMP-based system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cui:2012:EPO, author = "Huimin Cui and Jingling Xue and Lei Wang and Yang Yang and Xiaobing Feng and Dongrui Fan", title = "Extendable pattern-oriented optimization directives", journal = j-TACO, volume = "9", number = "3", pages = "14:1--14:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355587", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Algorithm-specific, that is, semantic-specific optimizations have been observed to bring significant performance gains, especially for a diverse set of multi/many-core architectures. 
However, current programming models and compiler technologies for the state-of-the-art architectures do not exploit well these performance opportunities. In this article, we propose a pattern-making methodology that enables algorithm-specific optimizations to be encapsulated into ``optimization patterns''. Such optimization patterns are expressed in terms of preprocessor directives so that simple annotations can result in significant performance improvements. To validate this new methodology, a framework, named EPOD, is developed to map these directives into the underlying optimization schemes for a particular architecture. It is difficult to create an exact performance model to determine an optimal or near-optimal optimization scheme (including which optimizations to apply and in which order) for a specific application, due to the complexity of applications and architectures. However, it is tractable to build individual optimization components and let compiler developers synthesize an optimization scheme from these components. Therefore, our EPOD framework provides an Optimization Programming Interface (OPI) for compiler developers to define new optimization schemes. Thus, new patterns can be integrated into EPOD in a flexible manner. We have identified and implemented a number of optimization patterns for three representative computer platforms. Our experimental results show that a pattern-guided compiler can outperform the state-of-the-art compilers and even achieve performance as competitive as hand-tuned code. Therefore, such a pattern-making methodology represents an encouraging direction for domain experts' experience and knowledge to be integrated into general-purpose compilers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lewis:2012:REC, author = "Adam Wade Lewis and Nian-Feng Tzeng and Soumik Ghosh", title = "Runtime energy consumption estimation for server workloads based on chaotic time-series approximation", journal = j-TACO, volume = "9", number = "3", pages = "15:1--15:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355588", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article proposes a runtime model that relates server energy consumption to its overall thermal envelope, using hardware performance counters and experimental measurements. While previous studies have attempted system-wide modeling of server power consumption through subsystem models, our approach is different in that it links system energy input to subsystem energy consumption based on a small set of tightly correlated parameters. The proposed model takes into account processor power, bus activities, and system ambient temperature for real-time prediction on the power consumption of long running jobs. Using the HyperTransport and QuickPath Link structures as case studies and through electrical measurements on example server subsystems, we develop a chaotic time-series approximation for runtime power consumption, arriving at the Chaotic Attractor Predictor (CAP). With polynomial time complexity, CAP exhibits high prediction accuracy, having the prediction errors within 1.6\% (or 3.3\%) for servers based on the HyperTransport bus (or the QuickPath Links), as verified by a set of common processor benchmarks. 
Our CAP is a superior predictive mechanism over existing linear auto-regressive methods, which require expensive and complex corrective steps to address the nonlinear and chaotic aspects of the underlying physical system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Valero:2012:CRI, author = "Alejandro Valero and Julio Sahuquillo and Salvador Petit and Pedro L{\'o}pez and Jos{\'e} Duato", title = "Combining recency of information with selective random and a victim cache in last-level caches", journal = j-TACO, volume = "9", number = "3", pages = "16:1--16:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355589", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Memory latency has become an important performance bottleneck in current microprocessors. This problem aggravates as the number of cores sharing the same memory controller increases. To palliate this problem, a common solution is to implement cache hierarchies with large or huge Last-Level Cache (LLC) organizations. LLC memories are implemented with a high number of ways (e.g., 16) to reduce conflict misses. Typically, caches have implemented the LRU algorithm to exploit temporal locality, but its performance goes away from the optimal as the number of ways increases. In addition, the implementation of a strict LRU algorithm is costly in terms of area and power. This article focuses on a family of low-cost replacement strategies, whose implementation scales with the number of ways while maintaining the performance. The proposed strategies track the accessing order for just a few blocks, which cannot be replaced. 
The victim is randomly selected among those blocks exhibiting poor locality. Although, in general, the random policy helps improving the performance, in some applications the scheme fails with respect to the LRU policy leading to performance degradation. This drawback can be overcome by the addition of a small victim cache of the large LLC. Experimental results show that, using the best version of the family without victim cache, MPKI reduction falls in between 10\% and 11\% compared to a set of the most representative state-of-the-art algorithms, whereas the reduction grows up to 22\% with respect to LRU. The proposal with victim cache achieves speedup improvements, on average, by 4\% compared to LRU. In addition, it reduces dynamic energy, on average, up to 8\%. Finally, compared to the studied algorithms, hardware complexity is largely reduced by the baseline algorithm of the family.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2012:DQM, author = "Bin Li and Li-Shiuan Peh and Li Zhao and Ravi Iyer", title = "Dynamic {QoS} management for chip multiprocessors", journal = j-TACO, volume = "9", number = "3", pages = "17:1--17:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355590", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the continuing scaling of semiconductor technologies, chip multiprocessor (CMP) has become the de facto design for modern high performance computer architectures. It is expected that more and more applications with diverse requirements will run simultaneously on the CMP platform. 
However, this will exert contention on shared resources such as the last level cache, network-on-chip bandwidth and off-chip memory bandwidth, thus affecting the performance and quality-of-service (QoS) significantly. In this environment, efficient resource sharing and a guarantee of a certain level of performance is highly desirable. Researchers have proposed different frameworks for providing QoS. Most of these frameworks focus on individual resource for QoS management. Coordinated management of multiple QoS-aware shared resources at runtime remains an open problem. Recently, there has been work that proposed a class-of-service based framework to jointly manage cache, NoC and memory resources simultaneously. However, the work allocates shared resources statically at the beginning of application runtime, and does not dynamically track, manage and share shared resources across applications. In this article, we address this limitation by proposing dynamic resource management policies that monitor the resource usage of applications at runtime, then steal resources from the high-priority applications for lower-priority ones. The goal is to maintain the targeted level of performance for high-priority applications while improving the performance of lower-priority applications. We use a PI (Proportional-Integral gain) feedback controller based technique to maintain stability in our framework. Our evaluation results show that our policy can improve performance for lower-priority applications significantly while maintaining the performance for high-priority application, thus demonstrating the effectiveness of our dynamic QoS resource management policy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xekalakis:2012:MSM, author = "Polychronis Xekalakis and Nikolas Ioannou and Marcelo Cintra", title = "Mixed speculative multithreaded execution models", journal = j-TACO, volume = "9", number = "3", pages = "18:1--18:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355591", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The current trend toward multicore architectures has placed great pressure on programmers and compilers to generate thread-parallel programs. Improved execution performance can no longer be obtained via traditional single-thread instruction level parallelism (ILP), but, instead, via multithreaded execution. One notable technique that facilitates the extraction of parallel threads from sequential applications is thread-level speculation (TLS). This technique allows programmers/compilers to generate threads without checking for inter-thread data and control dependences, which are then transparently enforced by the hardware. Most prior work on TLS has concentrated on thread selection and mechanisms to efficiently support the main TLS operations, such as squashes, data versioning, and commits. This article seeks to enhance TLS functionality by combining it with other speculative multithreaded execution models. The main idea is that TLS already requires extensive hardware support, which when slightly augmented can accommodate other speculative multithreaded techniques. Recognizing that for different applications, or even program phases, the application bottlenecks may be different, it is reasonable to assume that the more versatile a system is, the more efficiently it will be able to execute the given program. 
Toward this direction, we first show that mixed execution models that combine TLS with Helper Threads (HT), RunAhead execution (RA) and MultiPath execution (MP) perform better than any of the models alone. Based on a simple model that we propose, we show that benefits come from being able to extract additional ILP without harming the TLP extracted by TLS. We then show that by combining all the execution models in a unified one that combines all these speculative multithreaded models, ILP can be further enhanced with only minimal additional cost in hardware.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sharafeddine:2012:DOE, author = "Mageda Sharafeddine and Komal Jothi and Haitham Akkary", title = "Disjoint out-of-order execution processor", journal = j-TACO, volume = "9", number = "3", pages = "19:1--19:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355592", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "High-performance superscalar architectures used to exploit instruction level parallelism in single-thread applications have become too complex and power hungry for the multicore processors era. We propose a new architecture that uses multiple small latency-tolerant out-of-order cores to improve single-thread performance. Improving single-thread performance with multiple small out-of-order cores allows designers to place more of these cores on the same die. Consequently, emerging highly parallel applications can take full advantage of the multicore parallel hardware without sacrificing performance of inherently serial and hard to parallelize applications. 
Our architecture combines speculative multithreading (SpMT) with checkpoint recovery and continual flow pipeline architectures. It splits single-thread program execution into disjoint control and data threads that execute concurrently on multiple cooperating small and latency-tolerant out-of-order cores. Hence we call this style of execution Disjoint Out-of-Order Execution (DOE). DOE uses latency tolerance to overcome performance issues of SpMT caused by interthread data dependences. To evaluate this architecture, we have developed a microarchitecture performance model of DOE based on PTLSim, a simulation infrastructure of the x86 instruction set architecture. We evaluate the potential performance of DOE processor architecture using a simple heuristic to fork control independent threads in hardware at the target addresses of future procedure return instructions. Using applications from SpecInt 2000, we study DOE under ideal as well as realistic architectural constraints. We discuss the performance impact of key DOE architecture and application variables such as number of cores, interthread data dependences, intercore data communication delay, buffers capacity, and branch mispredictions. Without any DOE specific compiler optimizations, our results show that DOE outperforms conventional SpMT architectures by 15\%, on average. We also show that DOE with four small cores can perform on average equally well to a large superscalar core, consuming about the same power. Most importantly, DOE improves throughput performance by a significant amount over a large superscalar core, up to 2.5 times, when running multitasking applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Andrade:2012:SAW, author = "Diego Andrade and Basilio B. 
Fraguela and Ram{\'o}n Doallo", title = "Static analysis of the worst-case memory performance for irregular codes with indirections", journal = j-TACO, volume = "9", number = "3", pages = "20:1--20:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355593", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Real-time systems are subject to timing constraints, whose upper bound is given by the Worst-Case Execution Time (WCET). Cache memory behavior is difficult to predict analytically and estimating a safe and precise worst-case value is even more challenging. The worst-case memory performance (WCMP) component of the WCET can only be estimated with the precise knowledge of the stream of data addresses accessed by the code, which is determined by the access patterns and the base addresses of the data structures accessed. The regularity of strided access patterns simplifies their analysis, as they are characterized by relatively few parameters, which are often available at compile time. Unfortunately codes may exhibit irregular access patterns, which are much more difficult to statically analyze. As for the base addresses of the data structures, they are not always available at compile-time for many reasons: stack variables, dynamically allocated memory, modules compiled separately, etc. This article addresses these problems by presenting a model that predicts a safe upper bound of the data cache performance for codes both with regular and irregular access patterns, which is valid for any possible base addresses of the data structures.
The model analyzes irregular access patterns due to the presence of indirections in the code and it can provide two kinds of predictions: a safe hard boundary that is suitable for hard real-time systems and a soft boundary whose safeness is not guaranteed but which is valid most of the times. In fact, in all our experiments the number of misses was below the soft boundary predicted by the model. This turns this soft boundary prediction into a valuable tool, particularly for non and soft real-time systems, which tolerate a percentage of the runs exceeding their deadlines.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2012:DIO, author = "Yang Chen and Shuangde Fang and Yuanjie Huang and Lieven Eeckhout and Grigori Fursin and Olivier Temam and Chengyong Wu", title = "Deconstructing iterative optimization", journal = j-TACO, volume = "9", number = "3", pages = "21:1--21:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355594", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Iterative optimization is a popular compiler optimization approach that has been studied extensively over the past decade. In this article, we deconstruct iterative optimization by evaluating whether it works across datasets and by analyzing why it works. Up to now, most iterative optimization studies are based on a premise which was never truly evaluated: that it is possible to learn the best compiler optimizations across datasets. In this article, we evaluate this question for the first time with a very large number of datasets. 
We therefore compose KDataSets, a dataset suite with 1000 datasets for 32 programs, which we release to the public. We characterize the diversity of KDataSets, and subsequently use it to evaluate iterative optimization. For all 32 programs, we find that there exists at least one combination of compiler optimizations that achieves at least 83\% or more of the best possible speedup across all datasets on two widely used compilers (Intel's ICC and GNU's GCC). This optimal combination is program-specific and yields speedups up to 3.75$ \times $ (averaged across datasets of a program) over the highest optimization level of the compilers (-O3 for GCC and -fast for ICC). This finding suggests that optimizing programs across datasets might be much easier than previously anticipated. In addition, we evaluate the idea of introducing compiler choice as part of iterative optimization. We find that it can further improve the performance of iterative optimization because different programs favor different compilers. We also investigate why iterative optimization works by analyzing the optimal combinations. We find that only a handful optimizations yield most of the speedup. Finally, we show that optimizations interact in a complex and sometimes counterintuitive way through two case studies, which confirms that iterative optimization is an irreplaceable and important compiler strategy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Guha:2012:MOD, author = "Apala Guha and Kim Hazelwood and Mary Lou Soffa", title = "Memory optimization of dynamic binary translators for embedded systems", journal = j-TACO, volume = "9", number = "3", pages = "22:1--22:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355595", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Dynamic binary translators (DBTs) are becoming increasingly important because of their power and flexibility. DBT-based services are valuable for all types of platforms. However, the high memory demands of DBTs present an obstacle for embedded systems. Most research on DBT design has a performance focus, which often drives up the DBT memory demand. In this article, we present a memory-oriented approach to DBT design. We consider the class of translation-based DBTs and their sources of memory demand; cached translated code, cached auxiliary code and DBT data structures. We explore aspects of DBT design that impact these memory demand sources and present strategies to mitigate memory demand. We also explore performance optimizations for DBTs that handle memory demand by placing a limit on it, and repeatedly flush translations to stay within the limit, thereby replacing the memory demand problem with a performance degradation problem. Our optimizations that mitigate memory demand improve performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Geraci:2012:TFP, author = "James R. Geraci and Sharon M. 
Sacco", title = "A transpose-free in-place {SIMD} optimized {FFT}", journal = j-TACO, volume = "9", number = "3", pages = "23:1--23:??", month = sep, year = "2012", CODEN = "????", DOI = "https://doi.org/10.1145/2355585.2355596", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 22 10:48:53 MDT 2012", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A transpose-free in-place SIMD optimized algorithm for the computation of large FFTs is introduced and implemented on the Cell Broadband Engine. Six different FFT implementations of the algorithm using six different data movement methods are described. Their relative performance is compared for input sizes from $ 2^{17} $ to $ 2^{21} $ complex floating point samples. Large differences in performance are observed among even theoretically equivalent data movement patterns. All six implementations compare favorably with FFTW and other previous FFT implementations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Coppens:2013:FDB, author = "Bart Coppens and Bjorn {De Sutter} and Jonas Maebe", title = "Feedback-driven binary code diversification", journal = j-TACO, volume = "9", number = "4", pages = "24:1--24:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400683", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As described in many blog posts and in the scientific literature, exploits for software vulnerabilities are often engineered on the basis of patches.
For example, ``Microsoft Patch Tuesday'' is often followed by ``Exploit Wednesday'' during which yet unpatched systems become vulnerable to patch-based exploits. Part of the patch engineering includes the identification of the vulnerable binary code by means of reverse-engineering tools and diffing add-ons. In this article we present a feedback-driven compiler tool flow that iteratively transforms code until diffing tools become ineffective enough to close the ``Exploit Wednesday'' window of opportunity. We demonstrate the tool's effectiveness on a set of real-world patches and against the latest version of BinDiff.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fowers:2013:PEC, author = "Jeremy Fowers and Greg Brown and John Wernsing and Greg Stitt", title = "A performance and energy comparison of convolution on {GPUs}, {FPGAs}, and multicore processors", journal = j-TACO, volume = "9", number = "4", pages = "25:1--25:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400684", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent architectural trends have focused on increased parallelism via multicore processors and increased heterogeneity via accelerator devices (e.g., graphics-processing units, field-programmable gate arrays). Although these architectures have significant performance and energy potential, application designers face many device-specific challenges when choosing an appropriate accelerator or when customizing an algorithm for an accelerator. 
To help address this problem, in this article we thoroughly evaluate convolution, one of the most common operations in digital-signal processing, on multicores, graphics-processing units, and field-programmable gate arrays. Whereas many previous application studies evaluate a specific usage of an application, this article assists designers with design space exploration for numerous use cases by analyzing effects of different input sizes, different algorithms, and different devices, while also determining Pareto-optimal trade-offs between performance and energy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rohou:2013:VTI, author = "Erven Rohou and Kevin Williams and David Yuste", title = "Vectorization technology to improve interpreter performance", journal = j-TACO, volume = "9", number = "4", pages = "26:1--26:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400685", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In the present computing landscape, interpreters are in use in a wide range of systems. Recent trends in consumer electronics have created a new category of portable, lightweight software applications. Typically, these applications have fast development cycles and short life spans. They run on a wide range of systems and are deployed in a target independent bytecode format over Internet and cellular networks. Their authors are untrusted third-party vendors, and they are executed in secure managed runtimes or virtual machines. Furthermore, due to security policies or development time constraints, these virtual machines often lack just-in-time compilers and rely on interpreted execution. 
At the other end of the spectrum, interpreters are also a reality in the field of high-performance computations because of the flexibility they provide. The main performance penalty in interpreters arises from instruction dispatch. Each bytecode requires a minimum number of machine instructions to be executed. In this work, we introduce a novel approach for interpreter optimization that reduces instruction dispatch thanks to vectorization technology. We extend the split compilation paradigm to interpreters, thus guaranteeing that our approach exhibits almost no overhead at runtime. We take advantage of the vast research in vectorization and its presence in modern compilers. Complex analyses are performed ahead of time, and their results are conveyed to the executable bytecode. At runtime, the interpreter retrieves this additional information to build the SIMD IR (intermediate representation) instructions that carry the vector semantics. The bytecode language remains unmodified, making this representation compatible with legacy interpreters and previously proposed JIT compilers. We show that this approach drastically reduces the number of instructions to interpret and decreases execution time of vectorizable applications. Moreover, we map SIMD IR instructions to hardware SIMD instructions when available, with a substantial additional improvement. Finally, we finely analyze the impact of our extension on the behavior of the caches and branch predictors.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cleary:2013:FAT, author = "Jimmy Cleary and Owen Callanan and Mark Purcell and David Gregg", title = "Fast asymmetric thread synchronization", journal = j-TACO, volume = "9", number = "4", pages = "27:1--27:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400686", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "For most multi-threaded applications, data structures must be shared between threads. Ensuring thread safety on these data structures incurs overhead in the form of locking and other synchronization mechanisms. Where data is shared among multiple threads these costs are unavoidable. However, a common access pattern is that data is accessed primarily by one dominant thread, and only very rarely by the other, non-dominant threads. Previous research has proposed biased locks, which are optimized for a single dominant thread, at the cost of greater overheads for non-dominant threads. In this article we propose a new family of biased synchronization mechanisms that, using a modified interface, push accesses to shared data from the non-dominant threads to the dominant one, via a novel set of message passing mechanisms. We present mechanisms for protecting critical sections, for queueing work, for caching shared data in registers where it is safe to do so, and for asynchronous critical section accesses. We present results for the conventional Intel\reg{} Sandy Bridge processor and for the emerging network-optimized many-core IBM\reg{} PowerEN\TM{} processor.
We find that our algorithms compete well with existing biased locking algorithms, and, in particular, perform better than existing algorithms as accesses from non-dominant threads increase.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2013:PTL, author = "Yong Li and Rami Melhem and Alex K. Jones", title = "{PS-TLB}: Leveraging page classification information for fast, scalable and efficient translation for future {CMPs}", journal = j-TACO, volume = "9", number = "4", pages = "28:1--28:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400687", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Traversing the page table during virtual to physical address translation causes pipeline stalls when misses occur in the translation-lookaside buffer (TLB). State-of-the-art translation proposals typically optimize a single aspect of translation performance (e.g., translation sharing, context switch performance, etc.) with potential trade-offs of additional hardware complexity, increased translation latency, or reduced scalability. In this article, we propose the partial sharing TLB (PS-TLB), a fast and scalable solution that reduces off-chip translation misses without sacrificing the timing-critical requirement of on-chip translation. We introduce the partial sharing buffer (PSB) which leverages application page sharing characteristics using minimal additional hardware resources. Compared to the leading TLB proposal that leverages sharing, PS-TLB provides a more than 45\% improvement in translation latency with a 9\% application speedup while using fewer storage resources. 
In addition, the page classification and PS-TLB architecture provide further optimizations including an over 30\% reduction of interprocessor interrupts for coherence, and reduced context switch misses with fewer resources compared with existing methods.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{DuBois:2013:PTC, author = "Kristof {Du Bois} and Stijn Eyerman and Lieven Eeckhout", title = "Per-thread cycle accounting in multicore processors", journal = j-TACO, volume = "9", number = "4", pages = "29:1--29:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400688", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "While multicore processors improve overall chip throughput and hardware utilization, resource sharing among the cores leads to unpredictable performance for the individual threads running on a multicore processor. Unpredictable per-thread performance becomes a problem when considered in the context of multicore scheduling: system software assumes that all threads make equal progress, however, this is not what the hardware provides. This may lead to problems at the system level such as missed deadlines, reduced quality-of-service, non-satisfied service-level agreements, unbalanced parallel performance, priority inversion, unpredictable interactive performance, etc. This article proposes a hardware-efficient per-thread cycle accounting architecture for multicore processors. The counter architecture tracks per-thread progress in a multicore processor, detects how inter-thread interference affects per-thread performance, and predicts the execution time for each thread if run in isolation. 
The counter architecture captures the effects of additional conflict misses due to cache sharing as well as increased latency for other memory accesses due to resource and bandwidth contention in the memory subsystem. The proposed method accounts for 74.3\% of the interference cycles, and estimates per-thread progress within 14.2\% on average across a large set of multi-program workloads. Hardware cost is limited to 7.44KB for an 8-core processor, a reduction by almost $ 10 \times $ compared to prior work while being 63.8\% more accurate. Making system software progress aware improves fairness by 22.5\% on average over progress-agnostic scheduling.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wimmer:2013:MAV, author = "Christian Wimmer and Michael Haupt and Michael L. {Van De Vanter} and Mick Jordan and Laurent Dayn{\`e}s and Douglas Simon", title = "{Maxine}: an approachable virtual machine for, and in, {Java}", journal = j-TACO, volume = "9", number = "4", pages = "30:1--30:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400689", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A highly productive platform accelerates the production of research results. The design of a Virtual Machine (VM) written in the Java{\TM} programming language can be simplified through exploitation of interfaces, type and memory safety, automated memory management (garbage collection), exception handling, and reflection. Moreover, modern Java IDEs offer time-saving features such as refactoring, auto-completion, and code navigation. 
Finally, Java annotations enable compiler extensions for low-level ``systems programming'' while retaining IDE compatibility. These techniques collectively make complex system software more ``approachable'' than has been typical in the past. The Maxine VM, a metacircular Java VM implementation, has aggressively used these features since its inception. A co-designed companion tool, the Maxine Inspector, offers integrated debugging and visualization of all aspects of the VM's runtime state. The Inspector's implementation exploits advanced Java language features, embodies intimate knowledge of the VM's design, and even reuses a significant amount of VM code directly. These characteristics make Maxine a highly approachable VM research platform and a productive basis for research and teaching.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Khan:2013:SBA, author = "Malik Khan and Protonu Basu and Gabe Rudy and Mary Hall and Chun Chen and Jacqueline Chame", title = "A script-based autotuning compiler system to generate high-performance {CUDA} code", journal = j-TACO, volume = "9", number = "4", pages = "31:1--31:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400690", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article presents a novel compiler framework for CUDA code generation. The compiler structure is designed to support autotuning, which employs empirical techniques to evaluate a set of alternative mappings of computation kernels and select the mapping that obtains the best performance. 
This article introduces a Transformation Strategy Generator, a meta-optimizer that generates a set of transformation recipes, which are descriptions of the mapping of the sequential code to parallel CUDA code. These recipes comprise a search space of possible implementations. This system achieves performance comparable and sometimes better than manually tuned libraries and exceeds the performance of a state-of-the-art GPU compiler.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{VanCraeynest:2013:UFD, author = "Kenzo {Van Craeynest} and Lieven Eeckhout", title = "Understanding fundamental design choices in single-{ISA} heterogeneous multicore architectures", journal = j-TACO, volume = "9", number = "4", pages = "32:1--32:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400691", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Single-ISA heterogeneous multicore processors have gained substantial interest over the past few years because of their power efficiency, as they offer the potential for high overall chip throughput within a given power budget. Prior work in heterogeneous architectures has mainly focused on how heterogeneity can improve overall system throughput. To what extent heterogeneity affects per-program performance has remained largely unanswered. In this article, we aim at understanding how heterogeneity affects both chip throughput and per-program performance; how heterogeneous architectures compare to homogeneous architectures under both performance metrics; and how fundamental design choices, such as core type, cache size, and off-chip bandwidth, affect performance. 
We use analytical modeling to explore a large space of single-ISA heterogeneous architectures. The analytical model has linear-time complexity in the number of core types and programs of interest, and offers a unique opportunity for exploring the large space of both homogeneous and heterogeneous multicore processors in limited time. Our analysis provides several interesting insights: While it is true that heterogeneity can improve system throughput, it fundamentally trades per-program performance for chip throughput; although some heterogeneous configurations yield better throughput and per-program performance than homogeneous designs, some homogeneous configurations are optimal for particular throughput versus per-program performance trade-offs. Two core types provide most of the benefits from heterogeneity and a larger number of core types does not contribute much; job-to-core mapping is both important and challenging for heterogeneous multicore processors to achieve optimum performance. Limited off-chip bandwidth does alter some of the fundamental design choices in heterogeneous multicore architectures, such as the need for large on-chip caches for achieving high throughput, and per-program performance degrading more relative to throughput under constrained off-chip bandwidth.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Antao:2013:CFA, author = "Samuel Ant{\~a}o and Leonel Sousa", title = "The {CRNS} framework and its application to programmable and reconfigurable cryptography", journal = j-TACO, volume = "9", number = "4", pages = "33:1--33:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400692", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article proposes the Computing with the Residue Number System (CRNS) framework, which aims at the design automation of accelerators for Modular Arithmetic (MA). The framework provides a comprehensive set of tools ranging from a programming language and respective compiler to back-ends targeting parallel computation platforms such as Graphical Processing Units (GPUs) and reconfigurable hardware. Given an input algorithm described with a high-level programming language, the CRNS can be used to obtain in a few seconds the corresponding optimized Parallel Thread Execution (PTX) program ready to be run on GPUs or the Hardware Description Language (HDL) specification of a fully functional accelerator suitable for reconfigurable hardware and embedded systems. The resulting framework's implementations benefit from the Residue Number System (RNS) arithmetic's parallelization properties in a fully automated way. Designers do not need to be familiar with the mathematical details concerning the employed arithmetic, namely the RNS representation.
In order to thoroughly describe and evaluate the proposed framework, experimental results obtained for the supported back-ends (GPU and HDL) are presented targeting the implementation of the modular exponentiation used in the Rivest-Shamir-Adleman (RSA) algorithm and Elliptic Curve (EC) point multiplication. Results suggest competitive latency and throughput with minimum design effort and overcoming all the development issues that arise in the specification and verification of dedicated solutions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Diouf:2013:DLM, author = "Boubacar Diouf and Can Hantas and Albert Cohen and {\"O}zcan {\"O}zturk and Jens Palsberg", title = "A decoupled local memory allocator", journal = j-TACO, volume = "9", number = "4", pages = "34:1--34:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400693", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compilers use software-controlled local memories to provide fast, predictable, and power-efficient access to critical data. We show that the local memory allocation for straight-line, or linearized programs is equivalent to a weighted interval-graph coloring problem. This problem is new when allowing a color interval to ``wrap around,'' and we call it the submarine-building problem. This graph-theoretical decision problem differs slightly from the classical ship-building problem, and exhibits very interesting and unusual complexity properties. We demonstrate that the submarine-building problem is NP-complete, while it is solvable in linear time for not-so-proper interval graphs, an extension of the class of proper interval graphs. 
We propose a clustering heuristic to approximate any interval graph into a not-so-proper interval graph, decoupling spill code generation from local memory assignment. We apply this heuristic to a large number of randomly generated interval graphs reproducing the statistical features of standard local memory allocation benchmarks, comparing with state-of-the-art heuristics.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cui:2013:LOC, author = "Huimin Cui and Qing Yi and Jingling Xue and Xiaobing Feng", title = "Layout-oblivious compiler optimization for matrix computations", journal = j-TACO, volume = "9", number = "4", pages = "35:1--35:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400694", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Most scientific computations serve to apply mathematical operations to a set of preconceived data structures, e.g., matrices, vectors, and grids. In this article, we use a number of widely used matrix computations from the LINPACK library to demonstrate that complex internal organizations of data structures can severely degrade the effectiveness of compiler optimizations. We then present a data-layout-oblivious optimization methodology, where by isolating an abstract representation of the computations from complex implementation details of their data, we enable these computations to be much more accurately analyzed and optimized through varying state-of-the-art compiler technologies. We evaluated our approach on an Intel 8-core platform using two source-to-source compiler infrastructures, Pluto and EPOD. 
Our results show that while the efficiency of a computational kernel differs when using different data layouts, the alternative implementations typically benefit from a common set of optimizations on the operations. Therefore separately optimizing the operations and the data layout of a computation could dramatically enhance the effectiveness of compiler optimizations compared with the conventional approaches of using a unified representation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dolan:2013:CSL, author = "Stephen Dolan and Servesh Muralidharan and David Gregg", title = "Compiler support for lightweight context switching", journal = j-TACO, volume = "9", number = "4", pages = "36:1--36:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400695", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We propose a new language-neutral primitive for the LLVM compiler, which provides efficient context switching and message passing between lightweight threads of control. The primitive, called Swapstack, can be used by any language implementation based on LLVM to build higher-level language structures such as continuations, coroutines, and lightweight threads. As part of adding the primitives to LLVM, we have also added compiler support for passing parameters across context switches. Our modified LLVM compiler produces highly efficient code through a combination of exposing the context switching code to existing compiler optimizations, and adding novel compiler optimizations to further reduce the cost of context switches. 
To demonstrate the generality and efficiency of our primitives, we add one-shot continuations to C++, and provide a simple fiber library that allows millions of fibers to run on multiple cores, with a work-stealing scheduler and fast inter-fiber synchronization. We argue that compiler-supported lightweight context switching can be significantly faster than using a library to switch between contexts, and provide experimental evidence to support the position.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Abad:2013:LLE, author = "Pablo Abad and Valentin Puente and Jose-Angel Gregorio", title = "{LIGERO}: a light but efficient router conceived for cache-coherent chip multiprocessors", journal = j-TACO, volume = "9", number = "4", pages = "37:1--37:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400696", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Although abstraction is the best approach to deal with computing system complexity, sometimes implementation details should be considered. Considering on-chip interconnection networks in particular, underestimating the underlying system specificity could have nonnegligible impact on performance, cost, or correctness. This article presents a very efficient router that has been devised to deal with cache-coherent chip multiprocessor particularities in a balanced way.
Employing the same principles of packet rotation structures as in the rotary router, we present a router configuration with the following novel features: (1) reduced buffering requirements, (2) optimized pipeline under contentionless conditions, (3) more efficient deadlock avoidance mechanism, and (4) optimized in-order delivery guarantee. Putting it all together, our proposal provides a set of features that no other router, to the best of our knowledge, has achieved previously. These are: (1') low implementation cost, (2') low pass-through latency under low load, (3') improved resource utilization through adaptive routing and a buffering scheme free of head-of-line blocking, (4') guarantee of coherence protocol correctness via end-to-end deadlock avoidance and in-order delivery, and (5') improvement of coherence protocol responsiveness through adaptive in-network multicast support. We conduct a thorough evaluation that includes hardware cost estimation and performance evaluation under a wide spectrum of realistic workloads and coherence protocols. Comparing our proposal with VCTM, an optimized state-of-the-art wormhole router, it requires 50\% less area, reduces on-chip cache hierarchy energy delay product on average by 20\%, and improves the cache-coherency chip multiprocessor performance under realistic working conditions by up to 20\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Albericio:2013:ERL, author = "Jorge Albericio and Pablo Ib{\'a}{\~n}ez and V{\'\i}ctor Vi{\~n}als and Jose Mar{\'\i}a Llaber{\'\i}a", title = "Exploiting reuse locality on inclusive shared last-level caches", journal = j-TACO, volume = "9", number = "4", pages = "38:1--38:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400697", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Optimization of the replacement policy used for Shared Last-Level Cache (SLLC) management in a Chip-MultiProcessor (CMP) is critical for avoiding off-chip accesses. Temporal locality, while being exploited by first levels of private cache memories, is only slightly exhibited by the stream of references arriving at the SLLC. Thus, traditional replacement algorithms based on recency are bad choices for governing SLLC replacement. Recent proposals involve SLLC replacement policies that attempt to exploit reuse either by segmenting the replacement list or improving the rereference interval prediction. On the other hand, inclusive SLLCs are commonplace in the CMP market, but the interaction between replacement policy and the enforcement of inclusion has barely been discussed. After analyzing that interaction, this article introduces two simple replacement policies exploiting reuse locality and targeting inclusive SLLCs: Least Recently Reused (LRR) and Not Recently Reused (NRR). NRR has the same implementation cost as NRU, and LRR only adds one bit per line to the LRU cost. 
After considering reuse locality and its interaction with the invalidations induced by inclusion, the proposals are evaluated by simulating multiprogrammed workloads in an 8-core system with two private cache levels and an SLLC. LRR outperforms LRU by 4.5\% (performing better in 97 out of 100 mixes) and NRR outperforms NRU by 4.2\% (performing better in 99 out of 100 mixes). We also show that our mechanisms outperform rereference interval prediction, a recently proposed SLLC replacement policy and that similar conclusions can be drawn by varying the associativity or the SLLC size.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yiapanis:2013:OSR, author = "Paraskevas Yiapanis and Demian Rosas-Ham and Gavin Brown and Mikel Luj{\'a}n", title = "Optimizing software runtime systems for speculative parallelization", journal = j-TACO, volume = "9", number = "4", pages = "39:1--39:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400698", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Thread-Level Speculation (TLS) overcomes limitations intrinsic with conservative compile-time auto-parallelizing tools by extracting parallel threads optimistically and only ensuring absence of data dependence violations at runtime. A significant barrier for adopting TLS (implemented in software) is the overheads associated with maintaining speculative state. Based on previous TLS limit studies, we observe that on future multicore systems we will likely have more cores idle than those which traditional TLS would be able to harness. 
This implies that a TLS system should focus on optimizing for small number of cores and find efficient ways to take advantage of the idle cores. Furthermore, research on optimistic systems has covered two important implementation design points: eager vs. lazy version management. With this knowledge, we propose new simple and effective techniques to reduce the execution time overheads for both of these design points. This article describes a novel compact version management data structure optimized for space overhead when using a small number of TLS threads. Furthermore, we describe two novel software runtime parallelization systems that utilize this compact data structure. The first software TLS system, MiniTLS, relies on eager memory data management (in-place updates) and, thus, when a misspeculation occurs a rollback process is required. MiniTLS takes advantage of the novel compact version management representation to parallelize the rollback process and is able to recover from misspeculation faster than existing software eager TLS systems. The second one, Lector (Lazy inspECTOR) is based on lazy version management. Since we have idle cores, the question is whether we can create ``helper'' tasks to determine whether speculation is actually needed without stopping or damaging the speculative execution. In Lector, for each conventional TLS thread running speculatively with lazy version management, there is associated with it a lightweight inspector. The inspector threads execute alongside to verify quickly whether data dependencies will occur. Inspector threads are generated by standard techniques for inspector/executor parallelization. We have applied both TLS systems to seven Java sequential benchmarks, including three benchmarks from SPECjvm2008. Two out of the seven benchmarks exhibit misspeculations. MiniTLS experiments report average speedups of 1.8x for 4 threads increasing close to 7x speedups with 32 threads. 
Facilitated by our novel compact representation, MiniTLS reduces the space overhead over state-of-the-art software TLS systems between 96\% on 2 threads and 40\% on 32 threads. The experiments for Lector, report average speedups of 1.7x for 2 threads (that is 1 TLS + 1 Inspector threads) increasing close to 8.2x speedups with 32 threads (16 + 16 threads). Compared to a well established software TLS baseline, Lector performs on average 1.7x faster for 32 threads and in no case ( x TLS + x Inspector threads) Lector delivers worse performance than the baseline TLS with the equivalent number of TLS threads (i.e. x TLS threads) nor doubling the equivalent number of TLS threads (i.e., x + x TLS threads).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nugteren:2013:ASC, author = "Cedric Nugteren and Pieter Custers and Henk Corporaal", title = "Algorithmic species: a classification of affine loop nests for parallel programming", journal = j-TACO, volume = "9", number = "4", pages = "40:1--40:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400699", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Code generation and programming have become ever more challenging over the last decade due to the shift towards parallel processing. Emerging processor architectures such as multi-cores and GPUs exploit increasingly parallelism, requiring programmers and compilers to deal with aspects such as threading, concurrency, synchronization, and complex memory partitioning. We advocate that programmers and compilers can greatly benefit from a structured classification of program code. 
Such a classification can help programmers to find opportunities for parallelization, reason about their code, and interact with other programmers. Similarly, parallelising compilers and source-to-source compilers can take threading and optimization decisions based on the same classification. In this work, we introduce algorithmic species, a classification of affine loop nests based on the polyhedral model and targeted for both automatic and manual use. Individual classes capture information such as the structure of parallelism and the data reuse. To make the classification applicable for manual use, a basic vocabulary forms the base for the creation of a set of intuitive classes. To demonstrate the use of algorithmic species, we identify 115 classes in a benchmark set. Additionally, we demonstrate the suitability of algorithmic species for automated uses by showing a tool to automatically extract species from program code, a species-based source-to-source compiler, and a species-based performance prediction model.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gerards:2013:ODD, author = "Marco E. T. Gerards and Jan Kuper", title = "Optimal {DPM} and {DVFS} for frame-based real-time systems", journal = j-TACO, volume = "9", number = "4", pages = "41:1--41:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400700", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Dynamic Power Management (DPM) and Dynamic Voltage and Frequency Scaling (DVFS) are popular techniques for reducing energy consumption. Algorithms for optimal DVFS exist, but optimal DPM and the optimal combination of DVFS and DPM are not yet solved. 
In this article we use well-established models of DPM and DVFS for frame-based systems. We show that it is not sufficient---as some authors argue---to consider only individual invocations of a task. We define a schedule that also takes interactions between invocations into account and prove---in a theoretical fashion---that this schedule is optimal.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yan:2013:IPA, author = "Zhichao Yan and Hong Jiang and Yujuan Tan and Dan Feng", title = "An integrated pseudo-associativity and relaxed-order approach to hardware transactional memory", journal = j-TACO, volume = "9", number = "4", pages = "42:1--42:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400701", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Our experimental study and analysis reveal that the bottlenecks of existing hardware transactional memory systems are largely rooted in the extra data movements in version management and in the inefficient scheduling of conflicting transactions in conflict management, particularly in the presence of high-contention and coarse-grained applications. In order to address this problem, we propose an integrated Pseudo-Associativity and Relaxed-Order approach to hardware Transactional Memory, called PARO-TM. It exploits the extra pseudo-associative space in the data cache to hold the new value of each transactional modification, and maintains the mappings between the old and new versions via an implicit pseudo-associative hash algorithm (i.e., by inverting the specific bit of the SET index). 
PARO-TM can branch out the speculative version from the old version upon each transactional modification on demand without a dedicated hardware component to hold the uncommitted data. This means that it is able to automatically access the proper version upon the transaction's commit or abort. Moreover, PARO-TM augments multi-version support in a chained directory to schedule conflicting transactions in a relaxed-order manner to further reduce their overheads. We compare PARO-TM with the state-of-the-art LogTM-SE, TCC, DynTM, and SUV-TM systems and find that PARO-TM consistently outperforms these four representative HTMs. This performance advantage of PARO-TM is far more pronounced under the high-contention and coarse-grained applications in the STAMP benchmark suite, for which PARO-TM is motivated and designed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2013:PGF, author = "Doris Chen and Deshanand Singh", title = "Profile-guided floating- to fixed-point conversion for hybrid {FPGA}-processor applications", journal = j-TACO, volume = "9", number = "4", pages = "43:1--43:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400702", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The key to enabling widespread use of FPGAs for algorithm acceleration is to allow programmers to create efficient designs without the time-consuming hardware design process. Programmers are used to developing scientific and mathematical algorithms in high-level languages (C/C++) using floating point data types. 
Although easy to implement, the dynamic range provided by floating point is not necessary in many applications; more efficient implementations can be realized using fixed point arithmetic. While this topic has been studied previously [Han et al. 2006; Olson et al. 1999; Gaffar et al. 2004; Aamodt and Chow 1999], the degree of full automation has always been lacking. We present a novel design flow for cases where FPGAs are used to offload computations from a microprocessor. Our LLVM-based algorithm inserts value profiling code into an unmodified C/C++ application to guide its automatic conversion to fixed point. This allows for fast and accurate design space exploration on a host microprocessor before any accelerators are mapped to the FPGA. Through experimental results, we demonstrate that fixed-point conversion can yield resource savings of up to 2x--3x reductions. Embedded RAM usage is minimized, and 13\%--22\% higher $ F_{\rm max} $ than the original floating-point implementation is observed. In a case study, we show that 17\% reduction in logic and 24\% reduction in register usage can be realized by using our algorithm in conjunction with a High-Level Synthesis (HLS) tool.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cui:2013:LCA, author = "Yan Cui and Yingxin Wang and Yu Chen and Yuanchun Shi", title = "Lock-contention-aware scheduler: a scalable and energy-efficient method for addressing scalability collapse on multicore systems", journal = j-TACO, volume = "9", number = "4", pages = "44:1--44:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400703", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In response to the increasing ubiquity of multicore processors, there has been widespread development of multithreaded applications that strive to realize their full potential. Unfortunately, lock contention within operating systems can limit the scalability of multicore systems so severely that an increase in the number of cores can actually lead to reduced performance (i.e., scalability collapse). Existing efforts of solving scalability collapse mainly focus on making critical sections of kernel code fine-grained or designing new synchronization primitives. However, these methods have disadvantages in scalability or energy efficiency. In this article, we observe that the percentage of lock-waiting time over the total execution time for a lock intensive task has a significant correlation with the occurrence of scalability collapse. Based on this observation, a lock-contention-aware scheduler is proposed. Specifically, each task in the scheduler monitors its percentage of lock waiting time continuously. If the percentage exceeds a predefined threshold, this task is considered as lock intensive and migrated to a Special Set of Cores (i.e., SSC). 
In this way, the number of concurrently running lock-intensive tasks is limited to the number of cores in the SSC, and therefore, the degree of lock contention is controlled. A central challenge of using this scheme is how many cores should be allocated in the SSC to handle lock-intensive tasks. In our scheduler, the optimal number of cores is determined online by the model-driven search. The proposed scheduler is implemented in the recent Linux kernel and evaluated using micro- and macrobenchmarks on AMD and Intel 32-core systems. Experimental results suggest that our proposal is able to remove scalability collapse completely and sustains the maximal throughput of the spin-lock-based system for most applications. Furthermore, the percentage of lock-waiting time can be reduced by up to 84\%. When compared with scalability collapse reduction methods such as requester-based locking scheme and sleeping-based synchronization primitives, our scheme exhibits significant advantages in scalability, power consumption, and energy efficiency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pusukuri:2013:AFC, author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N. Bhuyan", title = "{ADAPT}: a framework for coscheduling multithreaded programs", journal = j-TACO, volume = "9", number = "4", pages = "45:1--45:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400704", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Since multicore systems offer greater performance via parallelism, future computing is progressing towards use of multicore machines with large number of cores. 
However, the performance of emerging multithreaded programs often does not scale to fully utilize the available cores. Therefore, simultaneously running multiple multithreaded applications becomes inevitable to fully exploit the computing potential of such machines. However, maximizing the performance and throughput on multicore machines in the presence of multiple multithreaded programs is a challenge for the OS. We have observed that the state-of-the-art contention management algorithms fail to effectively coschedule multithreaded programs on multicore machines. To address the above challenge, we present ADAPT, a scheduling framework that continuously monitors the resource usage of multithreaded programs and adaptively coschedules them such that they interfere with each other's performance as little as possible. In addition, ADAPT selects appropriate memory allocation and scheduling policies according to the workload characteristics. We have implemented ADAPT on a 64-core Supermicro server running Solaris 11 and evaluated it using 26 multithreaded programs including the TATP database application, SPECjbb2005, and programs from Phoenix, PARSEC, and SPEC OMP suites. The experimental results show that ADAPT substantially improves total turnaround time and system utilization relative to the default Solaris 11 scheduler.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tartara:2013:CLC, author = "Michele Tartara and Stefano Crespi Reghizzi", title = "Continuous learning of compiler heuristics", journal = j-TACO, volume = "9", number = "4", pages = "46:1--46:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400705", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Optimizing programs to exploit the underlying hardware architecture is an important task. Much research has been done on enabling compilers to find the best set of code optimizations that can build the fastest and less resource-hungry executable for a given program. A common approach is iterative compilation, sometimes enriched by machine learning techniques. This provides good results, but requires extremely long compilation times and an initial training phase lasting even for days or weeks. We present long-term learning, a new algorithm that allows the compiler user to improve the performance of compiled programs with reduced compilation times with respect to iterative compilation, and without an initial training phase. Our algorithm does not just build good programs: it acquires knowledge every time a program is compiled and it uses such knowledge to learn compiler heuristics, without the need for an expert to manually define them. The heuristics are evolved during every compilation, by evaluating their effect on the generated programs. We present implementations of long-term learning on top of two different compilers, and experimental data gathered on multiple hardware configurations showing its effectiveness.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chrysos:2013:HCP, author = "Grigorios Chrysos and Panagiotis Dagritzikos and Ioannis Papaefstathiou and Apostolos Dollas", title = "{HC-CART}: a parallel system implementation of data mining classification and regression tree {(CART)} algorithm on a multi-{FPGA} system", journal = j-TACO, volume = "9", number = "4", pages = "47:1--47:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400706", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Data mining is a new field of computer science with a wide range of applications. Its goal is to extract knowledge from massive datasets in a human-understandable structure, for example, the decision trees. In this article we present an innovative, high-performance, system-level architecture for the Classification And Regression Tree (CART) algorithm, one of the most important and widely used algorithms in the data mining area. Our proposed architecture exploits parallelism at the decision variable level, and was fully implemented and evaluated on a modern high-performance reconfigurable platform, the Convey HC-1 server, that features four FPGAs and a multicore processor. Our FPGA-based implementation was integrated with the widely used ``rpart'' software library of the R project in order to provide the first fully functional reconfigurable system that can handle real-world large databases. The proposed system, named HC-CART system, achieves a performance speedup of up to two orders of magnitude compared to well-known single-threaded data mining software platforms, such as WEKA and the R platform. 
It also outperforms similar hardware systems which implement parts of the complete application by an order of magnitude. Finally, we show that the HC-CART system offers higher performance speedup than some other proposed parallel software implementations of decision tree construction algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2013:DCD, author = "Jongwon Lee and Yohan Ko and Kyoungwoo Lee and Jonghee M. Youn and Yunheung Paek", title = "Dynamic code duplication with vulnerability awareness for soft error detection on {VLIW} architectures", journal = j-TACO, volume = "9", number = "4", pages = "48:1--48:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400707", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Soft errors are becoming a critical concern in embedded system designs. Code duplication techniques have been proposed to increase the reliability in multi-issue embedded systems such as VLIW by exploiting empty slots for duplicated instructions. However, they increase code size, another important concern, and ignore vulnerability differences in instructions, causing unnecessary or inefficient protection when selecting instructions to be duplicated under constraints. In this article, we propose a compiler-assisted dynamic code duplication method to minimize the code size overhead, and present vulnerability-aware duplication algorithms to maximize the effectiveness of instruction duplication with least overheads for VLIW architecture. 
Our experimental results with SoarGen and Synopsys simulation environments demonstrate that our proposals can reduce the code size by up to 40\% and detect more soft errors by up to 82\% via fault injection experiments over benchmarks from DSPstone and Livermore Loops as compared to the previously proposed instruction duplication technique.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Coelho:2013:ACI, author = "Fabien Coelho and Fran{\c{c}}ois Irigoin", title = "{API} compilation for image hardware accelerators", journal = j-TACO, volume = "9", number = "4", pages = "49:1--49:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400708", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We present an API-based compilation strategy to optimize image applications, developed using a high-level image processing library, onto three different image processing hardware accelerators. We demonstrate that such a strategy is profitable for both development cost and overall performance, especially as it takes advantage of optimization opportunities across library calls otherwise beyond reach. The library API provides the semantics of the image computations. The three image accelerator targets are quite distinct: the first one uses a vector architecture; the second one presents an SIMD architecture; the last one runs both on GPGPU and multicores through OpenCL. We have adapted standard compilation techniques to perform these compilation and code generation tasks automatically. 
Our strategy is implemented in PIPS, a source-to-source compiler which greatly reduces the development cost as standard phases are reused and parameterized. We carried out experiments with applications on hardware functional simulators and GPUs. Our contributions include: (1) a general low-cost compilation strategy for image processing applications, based on the semantics provided by library calls, which improves locality by an order of magnitude; (2) specific heuristics to minimize execution time on the target accelerators; (3) numerous experiments that show the effectiveness of our strategies. We also discuss the conditions required to extend this approach to other application domains.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luque:2013:FCT, author = "Carlos Luque and Miquel Moreto and Francisco J. Cazorla and Mateo Valero", title = "Fair {CPU} time accounting in {CMP+SMT} processors", journal = j-TACO, volume = "9", number = "4", pages = "50:1--50:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400709", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Processor architectures combining several paradigms of Thread-Level Parallelism (TLP), such as CMP processors in which each core is SMT, are becoming more and more popular as a way to improve performance at a moderate cost. However, the complex interaction between running tasks in hardware shared resources in multi-TLP architectures introduces complexities when accounting CPU time (or CPU utilization) to tasks. 
The CPU utilization accounted to a task depends on both the time it runs in the processor and the amount of processor hardware resources it receives. Deploying systems with accurate CPU accounting mechanisms is necessary to increase fairness. Moreover, it will allow users to be fairly charged on a shared data center, facilitating server consolidation in future systems. In this article we analyze the accuracy and hardware cost of previous CPU accounting mechanisms for pure-CMP and pure-SMT processors and we show that they are not adequate for CMP+SMT processors. Consequently, we propose a new accounting mechanism for CMP+SMT processors which: (1) increases the accuracy of accounted CPU utilization; (2) provides much more stable results over a wide range of processor setups; and (3) does not require tracking all hardware shared resources, significantly reducing its implementation cost. In particular, previous proposals lead to inaccuracies between 21\% and 79\% when measuring CPU utilization in an 8-core 2-way SMT processor, while our proposal reduces this inaccuracy to less than 5.0\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mattheakis:2013:SRM, author = "Pavlos M. 
Mattheakis and Ioannis Papaefstathiou", title = "Significantly reducing {MPI} intercommunication latency and power overhead in both embedded and {HPC} systems", journal = j-TACO, volume = "9", number = "4", pages = "51:1--51:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400710", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Highly parallel systems are becoming mainstream in a wide range of sectors ranging from their traditional stronghold high-performance computing, to data centers and even embedded systems. However, despite the quantum leaps of improvements in cost and performance of individual components over the last decade (e.g., processor speeds, memory/interconnection bandwidth, etc.), system manufacturers are still struggling to deliver low-latency, highly scalable solutions. One of the main reasons is that the intercommunication latency grows significantly with the number of processor nodes. This article presents a novel way to reduce this intercommunication delay by implementing, in custom hardware, certain communication tasks. In particular, the proposed novel device implements the two most widely used procedures of the most popular communication protocol in parallel systems, the Message Passing Interface (MPI). Our novel approach has initially been simulated within a pioneering parallel systems simulation framework and then synthesized directly from a high-level description language (i.e., SystemC) using a state-of-the-art synthesis tool. To the best of our knowledge, this is the first article presenting the complete hardware implementation of such a system. 
The proposed novel approach triggers a speedup from one to four orders of magnitude when compared with conventional software-based solutions and from one to three orders of magnitude when compared with a sophisticated software-based approach. Moreover, the performance of our system is from one to two orders of magnitude higher than the simulated performance of a similar but, relatively simpler hardware architecture; at the same time the power consumption of our device is about two orders of magnitude lower than that of a low-power CPU when executing the exact same intercommunication tasks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Baghdadi:2013:ILT, author = "Riyadh Baghdadi and Albert Cohen and Sven Verdoolaege and Konrad Trifunovi{\'c}", title = "Improved loop tiling based on the removal of spurious false dependences", journal = j-TACO, volume = "9", number = "4", pages = "52:1--52:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400711", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To preserve the validity of loop nest transformations and parallelization, data dependences need to be analyzed. Memory dependences come in two varieties: true dependences or false dependences. While true dependences must be satisfied in order to preserve the correct order of computations, false dependences are induced by the reuse of a single memory location to store multiple values. False dependences reduce the degrees of freedom for loop transformations. In particular, loop tiling is severely limited in the presence of these dependences. 
While array expansion removes all false dependences, the overhead on memory and the detrimental impact on register-level reuse can be catastrophic. We propose and evaluate a compilation technique to safely ignore a large number of false dependences in order to enable loop nest tiling in the polyhedral model. It is based on the precise characterization of interferences between live range intervals, and it does not incur any scalar or array expansion. Our algorithms have been implemented in the Pluto polyhedral compiler, and evaluated on the PolyBench suite.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pop:2013:OED, author = "Antoniu Pop and Albert Cohen", title = "{OpenStream}: Expressiveness and data-flow compilation of {OpenMP} streaming programs", journal = j-TACO, volume = "9", number = "4", pages = "53:1--53:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400712", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We present OpenStream, a data-flow extension of OpenMP to express dynamic dependent tasks. The language supports nested task creation, modular composition, variable and unbounded sets of producers/consumers, and first-class streams. These features, enabled by our original compilation flow, allow translating high-level parallel programming patterns, like dependences arising from StarSs' array regions, or universal low-level primitives like futures. In particular, these dynamic features can be embedded efficiently and naturally into an unmanaged imperative language, avoiding the complexity and overhead of a concurrent garbage collector. 
We demonstrate the performance advantages of a data-flow execution model compared to more restricted task and barrier models. We also demonstrate the efficiency of our compilation and runtime algorithms for the support of complex dependence patterns arising from StarSs benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Verdoolaege:2013:PPC, author = "Sven Verdoolaege and Juan Carlos Juega and Albert Cohen and Jos{\'e} Ignacio G{\'o}mez and Christian Tenllado and Francky Catthoor", title = "Polyhedral parallel code generation for {CUDA}", journal = j-TACO, volume = "9", number = "4", pages = "54:1--54:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400713", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article addresses the compilation of a sequential program for parallel execution on a modern GPU. To this end, we present a novel source-to-source compiler called PPCG. PPCG singles out for its ability to accelerate computations from any static control loop nest, generating multiple CUDA kernels when necessary. We introduce a multilevel tiling strategy and a code generation scheme for the parallelization and locality optimization of imperfectly nested loops, managing memory and exposing concurrency according to the constraints of modern GPUs. We evaluate our algorithms and tool on the entire PolyBench suite.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Du:2013:DCC, author = "Yu Du and Miao Zhou and Bruce Childers and Rami Melhem and Daniel Moss{\'e}", title = "Delta-compressed caching for overcoming the write bandwidth limitation of hybrid main memory", journal = j-TACO, volume = "9", number = "4", pages = "55:1--55:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400714", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Limited PCM write bandwidth is a critical obstacle to achieve good performance from hybrid DRAM/PCM memory systems. The write bandwidth is severely restricted in PCM devices, which harms application performance. Indeed, as we show, it is more important to reduce PCM write traffic than to reduce PCM read latency for application performance. To reduce the number of PCM writes, we propose a DRAM cache organization that employs compression. A new delta compression technique for modified data is used to achieve a large compression ratio. Our approach can selectively and predictively apply compression to improve its efficiency and performance. Our approach is designed to facilitate adoption in existing main memory compression frameworks. We describe an instance of how to incorporate delta compression in IBM's MXT memory compression architecture when used for DRAM cache in a hybrid main memory. For fourteen representative memory-intensive workloads, on average, our delta compression technique reduces the number of PCM writes by 54.3\%, and improves IPC performance by 24.4\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Purini:2013:FGO, author = "Suresh Purini and Lakshya Jain", title = "Finding good optimization sequences covering program space", journal = j-TACO, volume = "9", number = "4", pages = "56:1--56:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400715", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The compiler optimizations we enable and the order in which we apply them on a program have a substantial impact on the program execution time. Compilers provide default optimization sequences which can give good program speedup. As the default sequences have to optimize programs with different characteristics, they embed in them multiple subsequences which can optimize different classes of programs. These multiple subsequences may falsely interact with each other and affect the potential program speedup achievable. Instead of searching for a single universally optimal sequence, we can construct a small set of good sequences such that for every program class there exists a near-optimal optimization sequence in the good sequences set. If we can construct such a good sequences set which covers all the program classes in the program space, then we can choose the best sequence for a program by trying all the sequences in the good sequences set. This approach completely circumvents the need to solve the program classification problem. Using a sequence set size of around 10 we got an average speedup up to 14\% on PolyBench programs and up to 12\% on MiBench programs. Our approach is quite different from either the iterative compilation or machine-learning-based prediction modeling techniques proposed in the literature so far. 
We use different training and test datasets for cross-validation as against the Leave-One-Out cross-validation technique.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Belviranli:2013:DSS, author = "Mehmet E. Belviranli and Laxmi N. Bhuyan and Rajiv Gupta", title = "A dynamic self-scheduling scheme for heterogeneous multiprocessor architectures", journal = j-TACO, volume = "9", number = "4", pages = "57:1--57:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400716", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Today's heterogeneous architectures bring together multiple general-purpose CPUs and multiple domain-specific GPUs and FPGAs to provide dramatic speedup for many applications. However, the challenge lies in utilizing these heterogeneous processors to optimize overall application performance by minimizing workload completion time. Operating system and application development for these systems is in their infancy. In this article, we propose a new scheduling and workload balancing scheme, HDSS, for execution of loops having dependent or independent iterations on heterogeneous multiprocessor systems. The new algorithm dynamically learns the computational power of each processor during an adaptive phase and then schedules the remainder of the workload using a weighted self-scheduling scheme during the completion phase. Different from previous studies, our scheme uniquely considers the runtime effects of block sizes on the performance for heterogeneous multiprocessors. 
It finds the right trade-off between large and small block sizes to maintain balanced workload while keeping the accelerator utilization at maximum. Our algorithm does not require offline training or architecture-specific parameters. We have evaluated our scheme on two different heterogeneous architectures: AMD 64-core Bulldozer system with nVidia Fermi C2050 GPU and Intel Xeon 32-core SGI Altix 4700 supercomputer with Xilinx Virtex 4 FPGAs. The experimental results show that our new scheduling algorithm can achieve performance improvements up to over 200\% when compared to the closest existing load balancing scheme. Our algorithm also achieves full processor utilization with all processors completing at nearly the same time which is significantly better than alternative current approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Negi:2013:SCF, author = "Anurag Negi and Ruben Titos-Gil", title = "{SCIN-cache}: Fast speculative versioning in multithreaded cores", journal = j-TACO, volume = "9", number = "4", pages = "58:1--58:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400717", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article describes cache designs for efficiently supporting speculative techniques like transactional memory on chip multiprocessors with multithreaded cores. On-demand allocation and prompt freeing of speculative cache space in the design reduces the burden on nonspeculative execution. Quick access to both clean and speculative versions of data for multiple contexts provides flexibility and greater design freedom to HTM architects. 
Performance analysis shows the designs stand up well against other HTM design proposals, with potential performance gains in high contention applications with small transactions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lutz:2013:PAF, author = "Thibaut Lutz and Christian Fensch and Murray Cole", title = "{PARTANS}: an autotuning framework for stencil computation on multi-{GPU} systems", journal = j-TACO, volume = "9", number = "4", pages = "59:1--59:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400718", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "GPGPUs are a powerful and energy-efficient solution for many problems. For higher performance or larger problems, it is necessary to distribute the problem across multiple GPUs, increasing the already high programming complexity. In this article, we focus on abstracting the complexity of multi-GPU programming for stencil computation. We show that the best strategy depends not only on the stencil operator, problem size, and GPU, but also on the PCI express layout. This adds nonuniform characteristics to a seemingly homogeneous setup, causing up to 23\% performance loss. We address this issue with an autotuner that optimizes the distribution across multiple GPUs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xiao:2013:SAT, author = "Chunhua Xiao and M-C. 
Frank Chang and Jason Cong and Michael Gill and Zhangqin Huang and Chunyue Liu and Glenn Reinman and Hao Wu", title = "Stream arbitration: Towards efficient bandwidth utilization for emerging on-chip interconnects", journal = j-TACO, volume = "9", number = "4", pages = "60:1--60:??", month = jan, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2400682.2400719", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jan 18 10:57:16 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Alternative interconnects are attractive for scaling on-chip communication bandwidth in a power-efficient manner. However, efficient utilization of the bandwidth provided by these emerging interconnects still remains an open problem due to the spatial and temporal communication heterogeneity. In this article, a Stream Arbitration scheme is proposed, where at runtime any source can compete for any communication channel of the interconnect to talk to any destination. We apply stream arbitration to radio frequency interconnect (RF-I). Experimental results show that compared to the representative token arbitration scheme, stream arbitration can provide an average 20\% performance improvement and 12\% power reduction.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2013:DRU, author = "Yunji Chen and Tianshi Chen and Ling Li and Ruiyang Wu and Daofu Liu and Weiwu Hu", title = "Deterministic Replay Using Global Clock", journal = j-TACO, volume = "10", number = "1", pages = "1:1--1:??", month = apr, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2445572.2445573", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Apr 5 18:36:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Debugging parallel programs is a well-known difficult problem. A promising method to facilitate debugging parallel programs is using hardware support to achieve deterministic replay on a Chip Multi-Processor (CMP). As a Design-For-Debug (DFD) feature, a practical hardware-assisted deterministic replay scheme should have low design and verification costs, as well as a small log size. To achieve these goals, we propose a novel and succinct hardware-assisted deterministic replay scheme named LReplay. The key innovation of LReplay is that instead of recording the logical time orders between instructions or instruction blocks as previous investigations, LReplay is built upon recording the pending period information infused by the global clock. By the recorded pending period information, about 99\% execution orders are inferrable, implying that LReplay only needs to record directly the residual 1\% noninferrable execution orders in production run. The 1\% noninferrable orders can be addressed by a simple yet cost-effective direction prediction technique, which further reduces the log size of LReplay. 
Benefiting from the preceding innovations, the overall log size of LReplay over SPLASH-2 benchmarks is about 0.17B/K-Inst (byte per k-instruction) for the sequential consistency, and 0.57B/K-Inst for the Godson-3 consistency. Such log sizes are smaller in an order of magnitude than previous deterministic replay schemes incurring no performance loss. Furthermore, LReplay only consumes about 0.5\% area of the Godson-3 CMP, since it requires only trivial modifications to existing components of Godson-3. The features of LReplay demonstrate the potential of integrating hardware support for deterministic replay into future industrial processors.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lustig:2013:TIC, author = "Daniel Lustig and Abhishek Bhattacharjee and Margaret Martonosi", title = "{TLB} Improvements for Chip Multiprocessors: Inter-Core Cooperative Prefetchers and Shared Last-Level {TLBs}", journal = j-TACO, volume = "10", number = "1", pages = "2:1--2:??", month = apr, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2445572.2445574", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Apr 5 18:36:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Translation Lookaside Buffers (TLBs) are critical to overall system performance. Much past research has addressed uniprocessor TLBs, lowering access times and miss rates. However, as Chip MultiProcessors (CMPs) become ubiquitous, TLB design and performance must be reevaluated. Our article begins by performing a thorough TLB performance evaluation of sequential and parallel benchmarks running on a real-world, modern CMP system using hardware performance counters. 
This analysis demonstrates the need for further improvement of TLB hit rates for both classes of application, and it also points out that the data TLB has a significantly higher miss rate than the instruction TLB in both cases. In response to the characterization data, we propose and evaluate both Inter-Core Cooperative (ICC) TLB prefetchers and Shared Last-Level (SLL) TLBs as alternatives to the commercial norm of private, per-core L2 TLBs. ICC prefetchers eliminate 19\% to 90\% of Data TLB (D-TLB) misses across parallel workloads while requiring only modest changes in hardware. SLL TLBs eliminate 7\% to 79\% of D-TLB misses for parallel workloads and 35\% to 95\% of D-TLB misses for multiprogrammed sequential workloads. This corresponds to 27\% and 21\% increases in hit rates as compared to private, per-core L2 TLBs, respectively, and this is achieved using even more modest hardware requirements. Because of their benefits for parallel applications, their applicability to sequential workloads, and their readily implementable hardware, SLL TLBs and ICC TLB prefetchers hold great promise for CMPs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2013:TME, author = "Rong Chen and Haibo Chen", title = "{Tiled-MapReduce}: Efficient and Flexible {MapReduce} Processing on Multicore with Tiling", journal = j-TACO, volume = "10", number = "1", pages = "3:1--3:??", month = apr, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2445572.2445575", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Apr 5 18:36:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The prevalence of chip multiprocessors opens opportunities of running data-parallel applications originally in clusters on a single machine with many cores. 
MapReduce, a simple and elegant programming model to program large-scale clusters, has recently been shown a promising alternative to harness the multicore platform. The differences such as memory hierarchy and communication patterns between clusters and multicore platforms raise new challenges to design and implement an efficient MapReduce system on multicore. This article argues that it is more efficient for MapReduce to iteratively process small chunks of data in turn than processing a large chunk of data at a time on shared memory multicore platforms. Based on the argument, we extend the general MapReduce programming model with a ``tiling strategy'', called Tiled --- MapReduce (TMR). TMR partitions a large MapReduce job into a number of small subjobs and iteratively processes one subjob at a time with efficient use of resources; TMR finally merges the results of all subjobs for output. Based on Tiled-MapReduce, we design and implement several optimizing techniques targeting multicore, including the reuse of the input buffer among subjobs, a NUCA/NUMA-aware scheduler, and pipelining a subjob's reduce phase with the successive subjob's map phase, to optimize the memory, cache, and CPU resources accordingly. Further, we demonstrate that Tiled-MapReduce supports fine-grained fault tolerance and enables several usage scenarios such as online and incremental computing on multicore machines. Performance evaluation with our prototype system called Ostrich on a 48-core machine shows that Ostrich saves up to 87.6\% memory, causes less cache misses, and makes more efficient use of CPU cores, resulting in a speedup ranging from 1.86x to 3.07x over Phoenix. Ostrich also efficiently supports fine-grained fault tolerance, online, and incremental computing with small performance penalty.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Becchi:2013:DTS, author = "Michela Becchi and Patrick Crowley", title = "{A-DFA}: a Time- and Space-Efficient {DFA} Compression Algorithm for Fast Regular Expression Evaluation", journal = j-TACO, volume = "10", number = "1", pages = "4:1--4:26", month = apr, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2445572.2445576", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Apr 5 18:36:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern network intrusion detection systems need to perform regular expression matching at line rate in order to detect the occurrence of critical patterns in packet payloads. While Deterministic Finite Automata (DFAs) allow this operation to be performed in linear time, they may exhibit prohibitive memory requirements. Kumar et al. [2006a] have proposed Delayed Input DFAs (D2FAs), which provide a trade-off between the memory requirements of the compressed DFA and the number of states visited for each character processed, which in turn affects the memory bandwidth required to evaluate regular expressions. In this article we introduce Amortized time --- bandwidth overhead DFAs ( A --- DFAs ), a general compression technique that results in at most N ( k + 1)/ k state traversals when processing a string of length N, k being a positive integer. In comparison to the D2FA approach, our technique achieves comparable levels of compression with lower provable bounds on memory bandwidth (or greater compression for a given bandwidth bound). Moreover, the A-DFA algorithm has lower complexity, can be applied during DFA creation, and is suitable for scenarios where a compressed DFA needs to be dynamically built or updated. 
Finally, we show how to combine A-DFA with alphabet reduction and multistride DFAs, two techniques aimed at reducing the memory space and bandwidth requirement of DFAs, and discuss memory encoding schemes suitable for A-DFAs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2013:MFM, author = "Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay B. Brockman and Dean M. Tullsen and Norman P. Jouppi", title = "The {McPAT} Framework for Multicore and Manycore Architectures: Simultaneously Modeling Power, Area, and Timing", journal = j-TACO, volume = "10", number = "1", pages = "5:1--5:??", month = apr, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2445572.2445577", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Apr 5 18:36:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article introduces McPAT, an integrated power, area, and timing modeling framework that supports comprehensive design space exploration for multicore and manycore processor configurations ranging from 90nm to 22nm and beyond. At microarchitectural level, McPAT includes models for the fundamental components of a complete chip multiprocessor, including in-order and out-of-order processor cores, networks-on-chip, shared caches, and integrated system components such as memory controllers and Ethernet controllers. At circuit level, McPAT supports detailed modeling of critical-path timing, area, and power. At technology level, McPAT models timing, area, and power for the device types forecast in the ITRS roadmap. McPAT has a flexible XML interface to facilitate its use with many performance simulators. 
Combined with a performance simulator, McPAT enables architects to accurately quantify the cost of new ideas and assess trade-offs of different architectures using new metrics such as Energy-Delay-Area2 Product (EDA2P) and Energy-Delay-Area Product (EDAP). This article explores the interconnect options of future manycore processors by varying the degree of clustering over generations of process technologies. Clustering will bring interesting trade-offs between area and performance because the interconnects needed to group cores into clusters incur area overhead, but many applications can make good use of them due to synergies from cache sharing. Combining power, area, and timing results of McPAT with performance simulation of PARSEC benchmarks for manycore designs at the 22nm technology shows that 8-core clustering gives the best energy-delay product, whereas when die area is taken into account, 4-core clustering gives the best EDA2P and EDAP.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kritikakou:2013:NOM, author = "Angeliki Kritikakou and Francky Catthoor and George S. Athanasiou and Vasilios Kelefouras and Costas Goutis", title = "Near-Optimal Microprocessor and Accelerators Codesign with Latency and Throughput Constraints", journal = j-TACO, volume = "10", number = "2", pages = "6:1--6:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2459316.2459317", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 1 16:38:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A systematic methodology for near-optimal software/hardware codesign mapping onto an FPGA platform with microprocessor and HW accelerators is proposed. 
The mapping steps deal with the inter-organization, the foreground memory management, and the datapath mapping. A step is described by parameters and equations combined in a scalable template. Mapping decisions are propagated as design constraints to prune suboptimal options in next steps. Several performance-area Pareto points are produced by instantiating the parameters. To evaluate our methodology we map a real-time bio-imaging application and loop-dominated benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2013:HAC, author = "Lei Jiang and Yu Du and Bo Zhao and Youtao Zhang and Bruce R. Childers and Jun Yang", title = "Hardware-Assisted Cooperative Integration of Wear-Leveling and Salvaging for Phase Change Memory", journal = j-TACO, volume = "10", number = "2", pages = "7:1--7:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2459316.2459318", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 1 16:38:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Phase Change Memory (PCM) has recently emerged as a promising memory technology. However, PCM's limited write endurance restricts its immediate use as a replacement for DRAM. To extend the lifetime of PCM chips, wear-leveling and salvaging techniques have been proposed. Wear-leveling balances write operations across different PCM regions while salvaging extends the duty cycle and provides graceful degradation for a nonnegligible number of failures. Current wear-leveling and salvaging schemes have not been designed and integrated to work cooperatively to achieve the best PCM device lifetime. In particular, a noncontiguous PCM space generated from salvaging complicates wear-leveling and incurs large overhead. 
In this article, we propose LLS, a Line-Level mapping and Salvaging design. By allocating a dynamic portion of total space in a PCM device as backup space, and mapping failed lines to backup PCM, LLS constructs a contiguous PCM space and masks lower-level failures from the OS and applications. LLS integrates wear-leveling and salvaging and copes well with modern OSes. Our experimental results show that LLS achieves 31\% longer lifetime than the state-of-the-art. It has negligible hardware cost and performance overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Han:2013:PEP, author = "Kyuseung Han and Junwhan Ahn and Kiyoung Choi", title = "Power-Efficient Predication Techniques for Acceleration of Control Flow Execution on {CGRA}", journal = j-TACO, volume = "10", number = "2", pages = "8:1--8:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2459316.2459319", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 1 16:38:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Coarse-grained reconfigurable architecture typically has an array of processing elements which are controlled by a centralized unit. This makes it difficult to execute programs having control divergence among PEs without predication. However, conventional predication techniques have a negative impact on both performance and power consumption due to longer instruction words and unnecessary instruction-fetching decoding nullifying steps. This article reveals performance and power issues in predicated execution which have not been well-addressed yet. Furthermore, it proposes fast and power-efficient predication mechanisms. 
Experiments conducted through gate-level simulation show that our mechanism improves energy-delay product by 11.9\% to 23.8\% on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2013:MTD, author = "Chao Wang and Xi Li and Junneng Zhang and Xuehai Zhou and Xiaoning Nie", title = "{MP-Tomasulo}: a Dependency-Aware Automatic Parallel Execution Engine for Sequential Programs", journal = j-TACO, volume = "10", number = "2", pages = "9:1--9:??", month = may, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2459316.2459320", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 1 16:38:16 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article presents MP-Tomasulo, a dependency-aware automatic parallel task execution engine for sequential programs. Applying the instruction-level Tomasulo algorithm to MPSoC environments, MP-Tomasulo detects and eliminates Write-After-Write (WAW) and Write-After-Read (WAR) inter-task dependencies in the dataflow execution, therefore to operate out-of-order task execution on heterogeneous units. We implemented the prototype system within a single FPGA. Experimental results on EEMBC applications demonstrate that MP-Tomasulo can execute the tasks out-of-order to achieve as high as 93.6\% to 97.6\% of ideal peak speedup. A comparative study against a state-of-the-art dataflow execution scheme is illustrated with a classic JPEG application. The promising results show MP-Tomasulo enables programmers to uncover more task-level parallelism on heterogeneous systems, as well as to ease the burden of programmers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Anonymous:2013:TR, author = "Anonymous", title = "{TACO} Reviewers 2012", journal = j-TACO, volume = "10", number = "3", pages = "9:1--9:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2509420.2509421", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shifer:2013:LLA, author = "Eran Shifer and Shlomo Weiss", title = "Low-latency adaptive mode transitions and hierarchical power management in asymmetric clustered cores", journal = j-TACO, volume = "10", number = "3", pages = "10:1--10:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2499901", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recently, engineering solutions that include asymmetric multicores have been fabricated for low form-factor computing devices, indicating a potential direction for future evolution of processors. In this article we propose an asymmetric clustered core architecture, exhibiting low-latency switching between modes relative to asymmetric multicores, and having similarities with the same asymmetric multicore architecture in the context of a wider dynamic range of the processor power-performance characteristic. 
Asymmetric clustered cores incur additional microarchitectural complexity and area cost inside a core but exhibit better chip-level integration characteristics compared to asymmetric multicores. Focusing on power efficiency of asymmetric clustered cores, we describe: (1) a hierarchical power management partitioning between the operating system and on-die firmware for coarse-grain switch policies, and (2) core-internal tracking hardware for fine-grain switching. The mode switch policies of the core's tracking hardware are dependent on higher-level directives and hints from the operating system, on-die firmware, and compiler or profiling software. We further explore the potential power management benefits of asymmetric clustered cores relative to asymmetric multicores, demonstrating that the ability of asymmetric clustered cores to use tight training periods for adaptive behavior, with low overhead switching between modes, results in a more efficient utilization of power management directives.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{BenAsher:2013:HTL, author = "Yosi {Ben Asher} and Nadav Rotem", title = "Hybrid type legalization for a sparse {SIMD} instruction set", journal = j-TACO, volume = "10", number = "3", pages = "11:1--11:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2509420.2509422", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "SIMD vector units implement only a subset of the operations used by vectorizing compilers, and there are multiple conflicting techniques to legalize arbitrary vector types into register-sized data types. 
Traditionally, type legalization is performed using a set of predefined rules, regardless of the operations used in the program. This method is not suitable to sparse SIMD instruction sets and often prevents the vectorization of programs. In this work we introduce a new technique for type legalization, namely vector element promotion, as well as a hybrid method for combining multiple techniques of type legalization. Our hybrid type legalization method makes decisions based on the knowledge of the available instruction set as well as the operations used in the program. Our experimental results demonstrate that program-dependent hybrid type legalization improves the execution time of vector programs, outperforms the existing legalization method, and allows the vectorization of workloads which were not vectorized before.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lei:2013:VCI, author = "Yuanwu Lei and Yong Dou and Lei Guo and Jinbo Xu and Jie Zhou and Yazhuo Dong and Hongjian Li", title = "{VLIW} coprocessor for {IEEE-754} quadruple-precision elementary functions", journal = j-TACO, volume = "10", number = "3", pages = "12:1--12:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512430", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, a unified VLIW coprocessor, based on a common group of atomic operation units, for Quad arithmetic and elementary functions (QP\_VELP) is presented. The explicitly parallel scheme of VLIW instruction and Estrin's evaluation scheme for polynomials are used to improve the performance. 
A two-level VLIW instruction RAM scheme is introduced to achieve high scalability and customizability, even for more complex key program kernels. Finally, the Quad arithmetic accelerator (QAA) with the QP\_VELP array is implemented on ASIC. Compared with hyper-thread software implementation on an Intel Xeon E5620, QAA with 8 QP\_VELP units achieves improvement by a factor of 18X.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kawahito:2013:IRF, author = "Motohiro Kawahito and Hideaki Komatsu and Takao Moriyama and Hiroshi Inoue and Toshio Nakatani", title = "Idiom recognition framework using topological embedding", journal = j-TACO, volume = "10", number = "3", pages = "13:1--13:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512431", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Modern processors support hardware-assist instructions (such as TRT and TROT instructions on the IBM System z) to accelerate certain functions such as delimiter search and character conversion. Such special instructions are often used in high-performance libraries, but their exploitation in optimizing compilers has been limited. We devised a new idiom recognition technique based on a topological embedding algorithm to detect idiom patterns in the input programs more aggressively than in previous approaches using exact pattern matching. Our approach can detect a pattern even if the code segment does not exactly match the idiom. For example, we can detect a code segment that includes additional code within the idiom pattern. 
We also propose an instruction simplification for the idiom recognition. This optimization analyzes all of the usages of the output of the optimized code for a specific idiom. If we find that we do not need an actual value for the output but only a value in a subrange, then we can assign a value in that subrange as the output. The code generation can generate faster code with this optimization. We implemented our new idiom recognition approach based on the Java Just-In-Time (JIT) compiler that is part of the J9 Java Virtual Machine, and we supported several important idioms for the special hardware-assist instructions on the IBM System z and on some models of the IBM System p. To demonstrate the effectiveness of our technique, we performed two experiments. The first experiment was to see how many more patterns we can detect compared to the previous approach. The second experiment measured the performance improvements over the previous approaches. For the first experiment, we used the Java Compatibility Kit (JCK) API tests. For the second experiment we used the IBM XML parser, SPECjvm98, and SPECjbb2000. In summary, relative to a baseline implementation using exact pattern matching, our algorithm converted 76\% more loops in JCK tests. On a z9, we also observed significant average performance improvement of the XML parser by 54\%, of SPECjvm98 by 1.9\%, and of SPECjbb2000 by 4.4\%. Finally, we observed that the JIT compilation time increased by only 0.32\% to 0.44\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shobaki:2013:PIS, author = "Ghassan Shobaki and Maxim Shawabkeh and Najm Eldeen Abu Rmaileh", title = "Preallocation instruction scheduling with register pressure minimization using a combinatorial optimization approach", journal = j-TACO, volume = "10", number = "3", pages = "14:1--14:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512432", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Balancing Instruction-Level Parallelism (ILP) and register pressure during preallocation instruction scheduling is a fundamentally important problem in code generation and optimization. The problem is known to be NP-complete. Many heuristic techniques have been proposed to solve this problem. However, due to the inherently conflicting requirements of maximizing ILP and minimizing register pressure, heuristic techniques may produce poor schedules in many cases. If such cases occur in hot code, significant performance degradation may result. A few combinatorial optimization approaches have also been proposed, but none of them has been shown to solve large real-world instances within reasonable time. This article presents the first combinatorial algorithm that is efficient enough to optimally solve large instances of this problem (basic blocks with hundreds of instructions) within a few seconds per instance. The proposed algorithm uses branch-and-bound enumeration with a number of powerful pruning techniques to efficiently search the solution space. The search is based on a cost function that incorporates schedule length and register pressure. 
An implementation of the proposed scheduling algorithm has been integrated into the LLVM Compiler and evaluated using SPEC CPU 2006. On x86-64, with a time limit of 10ms per instruction, it optimally schedules 79\% of the hot basic blocks in FP2006. Another 19\% of the blocks are not optimally scheduled but are improved in cost relative to LLVM's heuristic. This improves the execution time of some benchmarks by up to 21\%, with a geometric-mean improvement of 2.4\% across the entire benchmark suite. With the use of precise latency information, the geometric-mean improvement is increased to 2.8\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{She:2013:EEM, author = "Dongrui She and Yifan He and Henk Corporaal", title = "An energy-efficient method of supporting flexible special instructions in an embedded processor with compact {ISA}", journal = j-TACO, volume = "10", number = "3", pages = "15:1--15:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2509420.2509426", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In application-specific processor design, a common approach to improve performance and efficiency is to use special instructions that execute complex operation patterns. However, in a generic embedded processor with compact Instruction Set Architecture (ISA), these special instructions may lead to large overhead such as: ( i ) more bits are needed to encode the extra opcodes and operands, resulting in wider instructions; ( ii ) more Register File (RF) ports are required to provide the extra operands to the function units. Such overhead may increase energy consumption considerably. 
In this article, we propose to support flexible operation pair patterns in a processor with a compact 24-bit RISC-like ISA using: ( i ) a partially reconfigurable decoder that exploits the pattern locality to reduce opcode space requirement; ( ii ) a software-controlled bypass network to reduce operand encoding bit and RF port requirement. An energy-aware compiler backend is designed for the proposed architecture that performs pattern selection and bypass-aware scheduling to generate energy-efficient codes. Though the proposed design imposes extra constraints on the operation patterns, the experimental results show that for benchmark applications from different domains, the average dynamic instruction count is reduced by over 25\%, which is only about 2\% less than the architecture without such constraints. The proposed architecture reduces total energy by an average of 15.8\% compared to the RISC baseline, while the one without constraints achieves almost no improvement due to its high overhead. When high performance is required, the proposed architecture is able to achieve a speedup of 13.8\% with 13.1\% energy reduction compared to the baseline by introducing multicycle SFU operations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nandivada:2013:IBA, author = "V. 
Krishna Nandivada and Rajkishore Barik", title = "Improved bitwidth-aware variable packing", journal = j-TACO, volume = "10", number = "3", pages = "16:1--16:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2509420.2509427", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Bitwidth-aware register allocation has caught the attention of researchers aiming to effectively reduce the number of variables spilled into memory. For general-purpose processors, this improves the execution time performance and reduces runtime memory requirements (which in turn helps in the compilation of programs targeted to systems with constrained memory). Additionally, bitwidth-aware register allocation has been effective in reducing power consumption in embedded processors. One of the key components of bitwidth-aware register allocation is the variable packing algorithm that packs multiple narrow-width variables into one physical register. Tallam and Gupta [2003] have proved that optimal variable packing is an NP-complete problem for arbitrary-width variables and have proposed an approximate solution. In this article, we analyze the complexity of the variable packing problem and present three enhancements that improve the overall packing of variables. In particular, the improvements we describe are: (a) Width Static Single Assignment (W-SSA) form representation that splits the live range of a variable into several fixed-width live ranges (W-SSA) variables; (b) PoTR Representation --- use of powers-of-two representation for bitwidth information for W-SSA variables. Our empirical results have shown that the associated bit wastage resulting from the overapproximation of the widths of variables to the nearest next power of two is a small fraction compared to the total number of bits in use ($ \approx $ 13\%). 
The main advantage of this representation is that it leads to optimal variable packing in polynomial time; (c) Combined Packing and Coalescing --- we discuss the importance of coalescing (combining variables whose live ranges do not interfere) in the context of variable packing and present an iterative algorithm to perform coalescing and packing of W-SSA variables represented in PoTR. Our experimental results show up to 76.00\% decrease in the number of variables compared to the number of variables in the input program in Single Static Assignment (SSA) form. This reduction in the number of variables led to a significant reduction in dynamic spilling, packing, and unpacking instructions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ahn:2013:SHR, author = "Jung Ho Ahn and Young Hoon Son and John Kim", title = "Scalable high-radix router microarchitecture using a network switch organization", journal = j-TACO, volume = "10", number = "3", pages = "17:1--17:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512433", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As the system size of supercomputers and datacenters increases, cost-efficient networks become critical in achieving good scalability on those systems. High-radix routers reduce network cost by lowering the network diameter while providing a high bisection bandwidth and path diversity. The building blocks of these large-scale networks are the routers or the switches and they need to scale accordingly to the increasing port count and increasing pin bandwidth. 
However, as the port count increases, the high-radix router microarchitecture itself needs to scale efficiently. Hierarchical crossbar switch organization has been proposed where a single large crossbar used for a router switch is partitioned into many small crossbars and overcomes the limitations of conventional router microarchitecture. Although the organization provides high performance, it has limited scalability due to excessive power and area overheads by the wires and intermediate buffers. In this article, we propose scalable router microarchitectures that leverage a network within the switch design of the high-radix routers themselves. These alternative designs lower the wiring complexity and buffer requirements. For example, when a folded-Clos switch is used instead of the hierarchical crossbar switch for a radix-64 router, it provides up to 73\%, 58\%, and 87\% reduction in area, energy-delay product, and energy-delay-area product, respectively. We also explore more efficient switch designs by exploiting the traffic-pattern characteristics of the global network and its impact on the local network design within the switch for both folded-Clos and flattened butterfly networks. In particular, we propose a bilateral butterfly switch organization that has fewer crossbars and global wires compared to the topology-agnostic folded-Clos switch while achieving better low-load latency and equivalent saturation throughput.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Huang:2013:ACM, author = "Libo Huang and Zhiying Wang and Nong Xiao and Yongwen Wang and Qiang Dou", title = "Adaptive communication mechanism for accelerating {MPI} functions in {NoC}-based multicore processors", journal = j-TACO, volume = "10", number = "3", pages = "18:1--18:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512434", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Multicore designs have emerged as the dominant organization for future high-performance microprocessors. Communication in such designs is often enabled by Networks-on-Chip (NoCs). A new trend in such architectures is to fit a Message Passing Interface (MPI) programming model on NoCs to achieve optimal parallel application performance. A key issue in designing MPI over NoCs is communication protocol, which has not been explored in previous research. This article advocates a hardware-supported communication mechanism using a protocol-adaptive approach to adjust to varying NoC configurations (e.g., number of buffers) and workload behavior (e.g., number of messages). We propose the ADaptive Communication Mechanism (ADCM), a hybrid protocol that involves behavior similar to buffered communication when sufficient buffer is available in the receiver to that similar to a synchronous protocol when buffers in the receiver are limited. ADCM adapts dynamically by deciding communication protocol on a per-request basis using a local estimate of recent buffer utilization. ADCM attempts to combine both the advantages of buffered and synchronous communication modes to achieve enhanced throughput and performance. 
Simulations of various workloads show that the proposed communication mechanism can be effectively used in future NoC designs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Malik:2013:OSG, author = "Avinash Malik and David Gregg", title = "Orchestrating stream graphs using model checking", journal = j-TACO, volume = "10", number = "3", pages = "19:1--19:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512435", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article we use model checking to statically distribute and schedule Synchronous DataFlow (SDF) graphs on heterogeneous execution architectures. We show that model checking is capable of providing an optimal solution and it arrives at these solutions faster (in terms of algorithm runtime) than equivalent ILP formulations. Furthermore, we also show how different types of optimizations such as task parallelism, data parallelism, and state sharing can be included within our framework. Finally, comparison of our approach with the current state-of-the-art heuristic techniques show the pitfalls of these techniques and gives a glimpse of how these heuristic techniques can be improved.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2013:UML, author = "Zheng Wang and Michael F. P. 
O'Boyle", title = "Using machine learning to partition streaming programs", journal = j-TACO, volume = "10", number = "3", pages = "20:1--20:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512436", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Stream-based parallel languages are a popular way to express parallelism in modern applications. The efficient mapping of streaming parallelism to today's multicore systems is, however, highly dependent on the program and underlying architecture. We address this by developing a portable and automatic compiler-based approach to partitioning streaming programs using machine learning. Our technique predicts the ideal partition structure for a given streaming application using prior knowledge learned offline. Using the predictor we rapidly search the program space (without executing any code) to generate and select a good partition. We applied this technique to standard StreamIt applications and compared against existing approaches. On a 4-core platform, our approach achieves 60\% of the best performance found by iteratively compiling and executing over 3000 different partitions per program. We obtain, on average, a 1.90$ \times $ speedup over the already tuned partitioning scheme of the StreamIt compiler. When compared against a state-of-the-art analytical, model-based approach, we achieve, on average, a 1.77$ \times $ performance improvement. By porting our approach to an 8-core platform, we are able to obtain 1.8$ \times $ improvement over the StreamIt default scheme, demonstrating the portability of our approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bakhoda:2013:DCN, author = "Ali Bakhoda and John Kim and Tor M. Aamodt", title = "Designing on-chip networks for throughput accelerators", journal = j-TACO, volume = "10", number = "3", pages = "21:1--21:??", month = sep, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2512429", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Sep 16 17:20:12 MDT 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As the number of cores and threads in throughput accelerators such as Graphics Processing Units (GPU) increases, so does the importance of on-chip interconnection network design. This article explores throughput-effective Network-on-Chips (NoC) for future compute accelerators that employ Bulk-Synchronous Parallel (BSP) programming models such as CUDA and OpenCL. A hardware optimization is ``throughput effective'' if it improves parallel application-level performance per unit chip area. We evaluate performance of future looking workloads using detailed closed-loop simulations modeling compute nodes, NoC, and the DRAM memory system. We start from a mesh design with bisection bandwidth balanced to off-chip demand. Accelerator workloads tend to demand high off-chip memory bandwidth which results in a many-to-few traffic pattern when coupled with expected technology constraints of slow growth in pins-per-chip. Leveraging these observations we reduce NoC area by proposing a ``checkerboard'' NoC which alternates between conventional full routers and half routers with limited connectivity. Next, we show that increasing network terminal bandwidth at the nodes connected to DRAM controllers alleviates a significant fraction of the remaining imbalance resulting from the many-to-few traffic pattern. 
Furthermore, we propose a ``double checkerboard inverted'' NoC organization which takes advantage of channel slicing to reduce area while maintaining the performance improvements of the aforementioned techniques. This organization also has a simpler routing mechanism and improves average application throughput per unit area by 24.3\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jantz:2013:ESM, author = "Michael R. Jantz and Prasad A. Kulkarni", title = "Exploring single and multilevel {JIT} compilation policy for modern machines", journal = j-TACO, volume = "10", number = "4", pages = "22:1--22:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541229", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Dynamic or Just-in-Time (JIT) compilation is essential to achieve high-performance emulation for programs written in managed languages, such as Java and C\#. It has been observed that a conservative JIT compilation policy is most effective to obtain good runtime performance without impeding application progress on single-core machines. At the same time, it is often suggested that a more aggressive dynamic compilation strategy may perform best on modern machines that provide abundant computing resources, especially with virtual machines (VMs) that are also capable of spawning multiple concurrent compiler threads. However, comprehensive research on the best JIT compilation policy for such modern processors and VMs is currently lacking. 
The goal of this work is to explore the properties of single-tier and multitier JIT compilation policies that can enable existing and future VMs to realize the best program performance on modern machines. In this work, we design novel experiments and implement new VM configurations to effectively control the compiler aggressiveness and optimization levels (if and when methods are compiled) in the industry-standard Oracle HotSpot Java VM to achieve this goal. We find that the best JIT compilation policy is determined by the nature of the application and the speed and effectiveness of the dynamic compilers. We extend earlier results showing the suitability of conservative JIT compilation on single-core machines for VMs with multiple concurrent compiler threads. We show that employing the free compilation resources (compiler threads and hardware cores) to aggressively compile more program methods quickly reaches a point of diminishing returns. At the same time, we also find that using the free resources to reduce compiler queue backup (compile selected hot methods early) significantly benefits program performance, especially for slower (highly optimizing) JIT compilers. For such compilers, we observe that accurately prioritizing JIT method compiles is crucial to realize the most performance benefit with the smallest hardware budget. Finally, we show that a tiered compilation policy, although complex to implement, greatly alleviates the impact of more and early JIT compilation of programs on modern machines.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dong:2013:CAC, author = "Xiangyu Dong and Norman P.
Jouppi and Yuan Xie", title = "A circuit-architecture co-optimization framework for exploring nonvolatile memory hierarchies", journal = j-TACO, volume = "10", number = "4", pages = "23:1--23:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541230", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many new memory technologies are available for building future energy-efficient memory hierarchies. It is necessary to have a framework that can quickly find the optimal memory technology at each hierarchy level. In this work, we first build a circuit-architecture joint design space exploration framework by combining RC circuit analysis and Artificial Neural Network (ANN)-based performance modeling. Then, we use this framework to evaluate some emerging nonvolatile memory hierarchies. We demonstrate that a Resistive RAM (ReRAM)-based cache hierarchy on an 8-core Chip-Multiprocessor (CMP) system can achieve a 24\% Energy Delay Product (EDP) improvement and a 36\% Energy Delay Area Product (EDAP) improvement compared to a conventional hierarchy with SRAM on-chip caches and DRAM main memory.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2013:OGE, author = "Jishen Zhao and Guangyu Sun and Gabriel H. 
Loh and Yuan Xie", title = "Optimizing {GPU} energy efficiency with {$3$D} die-stacking graphics memory and reconfigurable memory interface", journal = j-TACO, volume = "10", number = "4", pages = "24:1--24:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541231", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The performance of graphics processing unit (GPU) systems is improving rapidly to accommodate the increasing demands of graphics and high-performance computing applications. With such a performance improvement, however, power consumption of GPU systems is dramatically increased. Up to 30\% of the total power of a GPU system is consumed by the graphic memory itself. Therefore, reducing graphics memory power consumption is critical to mitigate the power challenge. In this article, we propose an energy-efficient reconfigurable 3D die-stacking graphics memory design that integrates wide-interface graphics DRAMs side-by-side with a GPU processor on a silicon interposer. The proposed architecture is a ``3D+2.5D'' system, where the DRAM memory itself is 3D stacked memory with through-silicon via (TSV), whereas the integration of DRAM and the GPU processor is through the interposer solution (2.5D). Since GPU computing units, memory controllers, and memory are all integrated in the same package, the number of memory I/Os is no longer constrained by the package's pin count. We can reduce the memory power consumption by scaling down the supply voltage and frequency of memory interface while maintaining the same or even higher peak memory bandwidth. In addition, we design a reconfigurable memory interface that can dynamically adapt to the requirements of various applications. 
We propose two reconfiguration mechanisms to optimize the GPU system energy efficiency and throughput, respectively, and thus benefit both memory-intensive and compute-intensive applications. The experimental results show that the proposed GPU memory architecture can effectively improve GPU system energy efficiency by 21\%, without reconfiguration. The reconfigurable memory interface can further improve the system energy efficiency by 26\%, and system throughput by 31\% under a capped system power budget of 240W.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2013:EMT, author = "Chien-Chi Chen and Sheng-De Wang", title = "An efficient multicharacter transition string-matching engine based on the {Aho--Corasick} algorithm", journal = j-TACO, volume = "10", number = "4", pages = "25:1--25:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541232", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A string-matching engine capable of inspecting multiple characters in parallel can multiply the throughput. However, the space required for implementing a matching engine that can process multiple characters in parallel generally grows exponentially with respect to the characters to be processed in parallel. Based on the Aho--Corasick algorithm (AC-algorithm), this work presents a novel multicharacter transition Nondeterministic Finite Automaton (NFA) approach, called multicharacter AC-NFA, to allow for the inspection of multiple characters in parallel. 
This approach first converts an AC-trie to an AC-NFA by allowing for the simultaneous activation of multiple states and then converts the AC-NFA to a $k$-character AC-NFA by an algorithm with concatenation operations and assistant transitions. Additionally, the alignment problem, which occurs while multiple characters are being inspected in parallel, is solved using assistant transitions. Moreover, a corresponding output is provided for each inspected character by introducing priority multiplexers to determine the final matching outputs during implementation of the multicharacter AC-NFA. Consequently, the number of derived $k$-character transitions grows linearly with respect to the number $k$. Furthermore, the derived multicharacter AC-NFA is implemented on FPGAs for evaluation. The resulting throughput grows approximately 14 times and the hardware cost grows about 18 times for 16-character AC-NFA implementation, as compared with that for 1-character AC-NFA implementation. The achievable throughput is 21.4Gbps for the 16-character AC-NFA implementation operating at a 167.36MHz clock.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luo:2013:DIH, author = "Yangchun Luo and Wei-Chung Hsu and Antonia Zhai", title = "The design and implementation of heterogeneous multicore systems for energy-efficient speculative thread execution", journal = j-TACO, volume = "10", number = "4", pages = "26:1--26:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541233", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the emergence of multicore processors, various aggressive execution models have been proposed to exploit fine-grained thread-level parallelism, taking advantage of the fast on-chip interconnection communication. However, the aggressive nature of these execution models often leads to excessive energy consumption incommensurate to execution time reduction. In the context of Thread-Level Speculation, we demonstrated that on a same-ISA heterogeneous multicore system, by dynamically deciding how on-chip resources are utilized, speculative threads can achieve performance gain in an energy-efficient way. Through a systematic design space exploration, we built a multicore architecture that integrates heterogeneous components of processing cores and first-level caches. To cope with processor reconfiguration overheads, we introduced runtime mechanisms to mitigate their impacts. To match program execution with the most energy-efficient processor configuration, the system was equipped with a dynamic resource allocation scheme that characterizes program behaviors using novel processor counters. We evaluated the proposed heterogeneous system with a diverse set of benchmark programs from SPEC CPU2000 and CPU2006 suites.
Compared to the most efficient homogeneous TLS implementation, we achieved similar performance but consumed 18\% less energy. Compared to the most efficient homogeneous uniprocessor running sequential programs, we improved performance by 29\% and reduced energy consumption by 3.6\%, which is a 42\% improvement in energy-delay-squared product.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rolan:2013:VSC, author = "Dyer Rol{\'a}n and Basilio B. Fraguela and Ram{\'o}n Doallo", title = "Virtually split cache: an efficient mechanism to distribute instructions and data", journal = j-TACO, volume = "10", number = "4", pages = "27:1--27:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541234", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "First-level caches are usually split for both instructions and data instead of unifying them in a single cache. Although that approach eases the pipeline design and provides a simple way to independently treat data and instructions, its global hit rate is usually smaller than that of a unified cache. Furthermore, unified lower-level caches usually behave and process memory requests disregarding whether they are data or instruction requests. In this article, we propose a new technique aimed to balance the amount of space devoted to instructions and data for optimizing set-associative caches: the Virtually Split Cache or VSC. Our technique combines the sharing of resources from unified approaches with the bandwidth and parallelism that split configurations provide, thus reducing power consumption while not degrading performance.
Our design dynamically adjusts cache resources devoted to instructions and data depending on their particular demand. Two VSC designs are proposed in order to track the instructions and data requirements. The Shadow Tag VSC (ST-VSC) is based on shadow tags that store the last evicted line related to data and instructions in order to determine how well the cache would work with one more way per set devoted to each kind. The Global Selector VSC (GS-VSC) uses a saturation counter that is updated every time a cache miss occurs either under an instruction or data request applying a duel-like mechanism. Experiments with a variable and a fixed latency VSC show that ST-VSC and GS-VSC reduce on average the cache hierarchy power consumption by 29\% and 24\%, respectively, with respect to a standard baseline. As for performance, while the fixed latency designs virtually match the split baseline in a single-core system, a variable latency ST-VSC and GS-VSC increase the average IPC by 2.5\% and 2\%, respectively. In multicore systems, even the slower fixed latency ST-VSC and GS-VSC designs improve the baseline IPC by 3.1\% and 2.5\%, respectively, in a four-core system thanks to the reduction in the bandwidth demanded from the lower cache levels. This is in contrast with many techniques that trade performance degradation for power consumption reduction. VSC particularly benefits embedded processors with a single level of cache, where up to an average 9.2\% IPC improvement is achieved. Interestingly, we also find that partitioning the LLC for instructions and data can improve performance around 2\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Subramaniam:2013:UFC, author = "Samantika Subramaniam and Simon C. 
Steely and Will Hasenplaugh and Aamer Jaleel and Carl Beckmann and Tryggve Fossum and Joel Emer", title = "Using in-flight chains to build a scalable cache coherence protocol", journal = j-TACO, volume = "10", number = "4", pages = "28:1--28:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541235", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As microprocessor designs integrate more cores, scalability of cache coherence protocols becomes a challenging problem. Most directory-based protocols avoid races by using blocking tag directories that can impact the performance of parallel applications. In this article, we first quantitatively demonstrate that state-of-the-art blocking protocols significantly constrain throughput at large core counts for several parallel applications. Nonblocking protocols address this throughput concern at the expense of scalability in the interconnection network or in the required resource overheads. To address this concern, we enhance nonblocking directory protocols by migrating the point of service of responses. Our approach uses in-flight chains of cores making parallel memory requests to incorporate scalability while maintaining high-throughput. The proposed cache coherence protocol called chained cache coherence, can outperform blocking protocols by up to 20\% on scientific and 12\% on commercial applications. It also has low resource overheads and simple address ordering requirements making it both a high-performance and scalable protocol. Furthermore, in-flight chains provide a scalable solution to building hierarchical and nonblocking tag directories as well as optimize communication latencies.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sanchez:2013:MIP, author = "Daniel S{\'a}nchez and Yiannakis Sazeides and Juan M. Cebri{\'a}n and Jos{\'e} M. Garc{\'\i}a and Juan L. Arag{\'o}n", title = "Modeling the impact of permanent faults in caches", journal = j-TACO, volume = "10", number = "4", pages = "29:1--29:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541236", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The traditional performance cost benefits we have enjoyed for decades from technology scaling are challenged by several critical constraints including reliability. Increases in static and dynamic variations are leading to higher probability of parametric and wear-out failures and are elevating reliability into a prime design constraint. In particular, SRAM cells used to build caches that dominate the processor area are usually minimum sized and more prone to failure. It is therefore of paramount importance to develop effective methodologies that facilitate the exploration of reliability techniques for caches. To this end, we present an analytical model that can determine for a given cache configuration, address trace, and random probability of permanent cell failure the exact expected miss rate and its standard deviation when blocks with faulty bits are disabled. What distinguishes our model is that it is fully analytical, it avoids the use of fault maps, and yet, it is both exact and simpler than previous approaches. The analytical model is used to produce the miss-rate trends (expected miss-rate) for future technology nodes for both uncorrelated and clustered faults.
Some of the key findings based on the proposed model are (i) block disabling has a negligible impact on the expected miss-rate unless probability of failure is equal or greater than 2.6e-4, (ii) the fault map methodology can accurately calculate the expected miss-rate as long as 1,000 to 10,000 fault maps are used, and (iii) the expected miss-rate for execution of parallel applications increases with the number of threads and is more pronounced for a given probability of failure as compared to sequential execution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2013:APF, author = "Sanghoon Lee and James Tuck", title = "Automatic parallelization of fine-grained metafunctions on a chip multiprocessor", journal = j-TACO, volume = "10", number = "4", pages = "30:1--30:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541237", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Due to the importance of reliability and security, prior studies have proposed inlining metafunctions into applications for detecting bugs and security vulnerabilities. However, because these software techniques add frequent, fine-grained instrumentation to programs, they often incur large runtime overheads. In this work, we consider an automatic thread extraction technique for removing these fine-grained checks from a main application and scheduling them on helper threads. In this way, we can leverage the resources available on a CMP to reduce the latency and overhead of fine-grained checking codes. 
Our parallelization strategy extracts metafunctions from a single threaded application and executes them in customized helper threads-threads constructed to mirror relevant fragments of the main program's behavior in order to keep communication and overhead low. To get good performance, we consider optimizations that reduce communication and balance work among many threads. We evaluate our parallelization strategy on Mudflap, a pointer-use checking tool in GCC. To show the benefits of our technique, we compare it to a manually parallelized version of Mudflap. We run our experiments on an architectural simulator with support for fast queueing operations. On a subset of SPECint 2000, our automatically parallelized code using static load balance is only 19\% slower, on average, than the manually parallelized version on a simulated eight-core system. In addition, our automatically parallelized code using dynamic load balance is competitive, on average, to the manually parallelized version on a simulated eight-core system. Furthermore, all the applications except parser achieve better speedups with our automatic algorithms than with the manual approach. Also, our approach introduces very little overhead in the main program-it is kept under 100\%, which is more than a 5.3$ \times $ reduction compared to serial Mudflap.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dubach:2013:DMA, author = "Christophe Dubach and Timothy M. Jones and Edwin V. 
Bonilla", title = "Dynamic microarchitectural adaptation using machine learning", journal = j-TACO, volume = "10", number = "4", pages = "31:1--31:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541238", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Adaptive microarchitectures are a promising solution for designing high-performance, power-efficient microprocessors. They offer the ability to tailor computational resources to the specific requirements of different programs or program phases. They have the potential to adapt the hardware cost-effectively at runtime to any application's needs. However, one of the key challenges is how to dynamically determine the best architecture configuration at any given time, for any new workload. This article proposes a novel control mechanism based on a predictive model for microarchitectural adaptivity control. This model is able to efficiently control adaptivity by monitoring the behaviour of an application's different phases at runtime. We show that by using this model on SPEC 2000, we double the energy\slash performance efficiency of the processor when compared to the best static configuration tuned for the whole benchmark suite. This represents 74\% of the improvement available if we know the best microarchitecture for each program phase ahead of time. In addition, we present an extended analysis of the best configurations found and show that the overheads associated with the implementation of our scheme have a negligible impact on performance and power.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2013:CME, author = "Long Chen and Yanan Cao and Zhao Zhang", title = "{E$^3$CC}: a memory error protection scheme with novel address mapping for subranked and low-power memories", journal = j-TACO, volume = "10", number = "4", pages = "32:1--32:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2541239", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 23 10:31:41 MST 2013", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This study presents and evaluates E$^3$ CC (Enhanced Embedded ECC), a full design and implementation of a generic embedded ECC scheme that enables power-efficient error protection for subranked memory systems. It incorporates a novel address mapping scheme called Biased Chinese Remainder Mapping (BCRM) to resolve the address mapping issue for memories of page interleaving, plus a simple and effective cache design to reduce extra ECC traffic. Our evaluation using SPEC CPU2006 benchmarks confirms the performance and power efficiency of the E$^3$ CC scheme for subranked memories as well as conventional memories.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tian:2013:TBM, author = "Yingying Tian and Samira M. Khan and Daniel A. 
Jim{\'e}nez", title = "Temporal-based multilevel correlating inclusive cache replacement", journal = j-TACO, volume = "10", number = "4", pages = "33:1--33:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555290", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Inclusive caches have been widely used in Chip Multiprocessors (CMPs) to simplify cache coherence. However, they have poor performance compared with noninclusive caches not only because of the limited capacity of the entire cache hierarchy but also due to ignorance of temporal locality of the Last-Level Cache (LLC). Blocks that are highly referenced (referred to as hot blocks) are always hit in higher-level caches (e.g., L1 cache) and are rarely referenced in the LLC. Therefore, they become replacement victims in the LLC. Due to the inclusion property, blocks evicted from the LLC have to also be invalidated from higher-level caches. Invalidation of hot blocks from the entire cache hierarchy introduces costly off-chip misses that makes the inclusive cache perform poorly. Neither blocks that are highly referenced in the LLC nor blocks that are highly referenced in higher-level caches should be the LLC replacement victims. We propose temporal-based multilevel correlating cache replacement for inclusive caches to evict blocks in the LLC that are also not hot in higher-level caches using correlated temporal information acquired from all levels of a cache hierarchy with minimal overhead. Invalidation of these blocks does not hurt the performance. By contrast, replacing them as early as possible with useful blocks helps improve cache performance.
Based on our experiments, in a dual-core CMP, an inclusive cache with temporal-based multilevel correlating cache replacement significantly outperforms an inclusive cache with traditional LRU replacement by yielding an average speedup of 12.7\%, which is comparable to an enhanced noninclusive cache, while requiring less than 1\% of storage overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2013:HSA, author = "Qixiao Liu and Miquel Moreto and Victor Jimenez and Jaume Abella and Francisco J. Cazorla and Mateo Valero", title = "Hardware support for accurate per-task energy metering in multicore systems", journal = j-TACO, volume = "10", number = "4", pages = "34:1--34:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555291", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Accurately determining the energy consumed by each task in a system will become of prominent importance in future multicore-based systems because it offers several benefits, including (i) better application energy/performance optimizations, (ii) improved energy-aware task scheduling, and (iii) energy-aware billing in data centers. Unfortunately, existing methods for energy metering in multicores fail to provide accurate energy estimates for each task when several tasks run simultaneously. This article makes a case for accurate Per-Task Energy Metering (PTEM) based on tracking the resource utilization and occupancy of each task. Different hardware implementations with different trade-offs between energy prediction accuracy and hardware-implementation complexity are proposed. 
Our evaluation shows that the energy consumed in a multicore by each task can be accurately measured. For a 32-core, 2-way, simultaneous multithreaded core setup, PTEM reduces the average accuracy error from more than 12\% when our hardware support is not used to less than 4\% when it is used. The maximum observed error for any task in the workload we used reduces from 58\% down to 9\% when our hardware support is used.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mehta:2013:TSS, author = "Sanyam Mehta and Gautham Beeraka and Pen-Chung Yew", title = "Tile size selection revisited", journal = j-TACO, volume = "10", number = "4", pages = "35:1--35:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555292", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Loop tiling is a widely used loop transformation to enhance data locality and allow data reuse. In the tiled code, however, tiles of different sizes can lead to significant variation in performance. Thus, selection of an optimal tile size is critical to performance of tiled codes. In the past, tile size selection has been attempted using both static analytical and dynamic empirical (auto-tuning) models. Past work using static models assumed a direct-mapped cache for the purpose of analysis and thus proved to be less robust. On the other hand, the auto-tuning models involve an exhaustive search in a large space of tiled codes. In this article, we propose a new analytical model for tile size selection that leverages the high set associativity in modern caches to minimize conflict misses. Our tile size selection model targets data reuse in multiple levels of cache. 
In addition, it considers the interaction of tiling with the SIMD unit in modern processors in estimating the optimal tile size. We find that these factors, not considered in previous models, are critical in developing a robust model for tile size selection. We implement our tile size selection model in a polyhedral compiler and test it on 12 benchmark kernels using two different problem sizes. Our model outperforms the previous analytical models that are based on reusing data in a single level of cache and achieves an average performance improvement of 9.7\% and 20.4\%, respectively, over the best square (cubic) tiles for the two problem sizes. In addition, the tile size chosen by our tile size selection algorithm is similar to the best performing size obtained through an extensive search, validating the analytical model underlying the algorithm.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Prisacari:2013:FPS, author = "Bogdan Prisacari and German Rodriguez and Cyriel Minkenberg and Torsten Hoefler", title = "Fast pattern-specific routing for fat tree networks", journal = j-TACO, volume = "10", number = "4", pages = "36:1--36:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555293", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In the context of eXtended Generalized Fat Tree (XGFT) topologies, widely used in HPC and datacenter network designs, we propose a generic method, based on Integer Linear Programming (ILP), to efficiently determine optimal routes for arbitrary workloads. We propose a novel approach that combines ILP with dynamic programming, effectively reducing the time to solution. 
Specifically, we divide the network into smaller subdomains optimized using a custom ILP formulation that ensures global optimality of local solutions. Local solutions are then combined into an optimal global solution using dynamic programming. Finally, we demonstrate through a series of extensive benchmarks that our approach scales in practice to networks interconnecting several thousands of nodes, using a single-threaded, freely available linear programming solver on commodity hardware, with the potential for higher scalability by means of commercial, parallel solvers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Breughe:2013:SRB, author = "Maximilien B. Breughe and Lieven Eeckhout", title = "Selecting representative benchmark inputs for exploring microprocessor design spaces", journal = j-TACO, volume = "10", number = "4", pages = "37:1--37:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555294", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The design process of a microprocessor requires representative workloads to steer the search process toward an optimum design point for the target application domain. However, considering a broad set of workloads to cover the large space of potential workloads is infeasible given how time-consuming design space exploration typically is. Hence, it is crucial to select a small yet representative set of workloads, which leads to a shorter design cycle while yielding a (near) optimal design. 
Prior work has mostly looked into selecting representative benchmarks; however, limited attention was given to the selection of benchmark inputs and how this affects workload representativeness during design space exploration. Using a set of 1,000 inputs for a number of embedded benchmarks and a design space with around 1,700 design points, we find that selecting a single or three random input(s) per benchmark potentially (in a worst-case scenario) leads to a suboptimal design that is 56\% and 33\% off, on average, relative to the optimal design in our design space in terms of Energy-Delay Product (EDP). We then propose and evaluate a number of methods for selecting representative inputs and show that we can find the optimum design point with as few as three inputs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kerschbaumer:2013:IFT, author = "Christoph Kerschbaumer and Eric Hennigan and Per Larsen and Stefan Brunthaler and Michael Franz", title = "Information flow tracking meets just-in-time compilation", journal = j-TACO, volume = "10", number = "4", pages = "38:1--38:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555295", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Web applications are vulnerable to cross-site scripting attacks that enable data thefts. Information flow tracking in web browsers can prevent communication of sensitive data to unintended recipients and thereby stop such data thefts. Unfortunately, existing solutions have focused on incorporating information flow into browsers' JavaScript interpreters, rather than just-in-time compilers, rendering the resulting performance noncompetitive. 
Few users will switch to a safer browser if it comes at the cost of significantly degrading web application performance. We present the first information flow tracking JavaScript engine that is based on a true just-in-time compiler, and that thereby outperforms all previous interpreter-based information flow tracking JavaScript engines by more than a factor of two. Our JIT-based engine (i) has the same coverage as previous interpreter- based solutions, (ii) requires reasonable implementation effort, and (iii) introduces new optimizations to achieve acceptable performance. When evaluated against three industry-standard JavaScript benchmark suites, there is still an average slowdown of 73\% over engines that do not support information flow, but this is now well within the range that many users will find an acceptable price for obtaining substantially increased security.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nasre:2013:TSE, author = "Rupesh Nasre", title = "Time- and space-efficient flow-sensitive points-to analysis", journal = j-TACO, volume = "10", number = "4", pages = "39:1--39:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555296", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compilation of real-world programs often requires hours. The term nightly build known to industrial researchers is an artifact of long compilation times. Our goal is to reduce the absolute analysis times for large C codes (of the order of millions of lines). Pointer analysis is one of the key analyses performed during compilation. 
Its scalability is paramount to achieve the efficiency of the overall compilation process and its precision directly affects that of the client analyses. In this work, we design a time- and space-efficient flow-sensitive pointer analysis and parallelize it on graphics processing units. Our analysis proposes to use an extended bloom filter, called multibloom, to store points-to information in an approximate manner and develops an analysis in terms of the operations over the multibloom. Since bloom filter is a probabilistic data structure, we develop ways to gain back the analysis precision. We achieve effective parallelization by achieving memory coalescing, reducing thread divergence, and improving load balance across GPU warps. Compared to a state-of-the-art sequential solution, our parallel version achieves a 7.8 $ \times $ speedup with less than 5\% precision loss on a suite of six large programs. Using two client transformations, we show that this loss in precision only minimally affects a client's precision.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ruan:2013:BTB, author = "Wenjia Ruan and Yujie Liu and Michael Spear", title = "Boosting timestamp-based transactional memory by exploiting hardware cycle counters", journal = j-TACO, volume = "10", number = "4", pages = "40:1--40:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555297", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Time-based transactional memories typically rely on a shared memory counter to ensure consistency. Unfortunately, such a counter can become a bottleneck. 
In this article, we identify properties of hardware cycle counters that allow their use in place of a shared memory counter. We then devise algorithms that exploit the x86 cycle counter to enable bottleneck-free transactional memory runtime systems. We also consider the impact of privatization safety and hardware ordering constraints on the correctness, performance, and generality of our algorithms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dey:2013:RMD, author = "Tanima Dey and Wei Wang and Jack W. Davidson and Mary Lou Soffa", title = "{ReSense}: Mapping dynamic workloads of colocated multithreaded applications using resource sensitivity", journal = j-TACO, volume = "10", number = "4", pages = "41:1--41:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555298", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To utilize the full potential of modern chip multiprocessors and obtain scalable performance improvements, it is critical to mitigate resource contention created by multithreaded workloads. In this article, we describe ReSense, the first runtime system that uses application characteristics to dynamically map multithreaded applications from dynamic workloads-workloads where multithreaded applications arrive, execute, and terminate continuously in unpredictable ways. ReSense mitigates contention for the shared resources in the memory hierarchy by applying a novel thread-mapping algorithm that dynamically adjusts the mapping of threads from dynamic workloads using a precalculated sensitivity score. 
The sensitivity score quantifies an application's sensitivity to sharing a particular memory resource and is calculated by an efficient characterization process that involves running the multithreaded application by itself on the target platform. To measure ReSense's effectiveness, sensitivity scores were determined for 21 benchmarks from PARSEC-2.1 and NPB-OMP-3.3 for the shared resources in the memory hierarchy on four different platforms. Using three different-sized dynamic workloads composed of randomly selected two, four, and eight corunning benchmarks with randomly selected start times, ReSense was able to improve the average response time of the three workloads by up to 27.03\%, 20.89\%, and 29.34\% and throughput by up to 19.97\%, 46.56\%, and 29.86\%, respectively, over the native OS on real hardware. By estimating and comparing ReSense's effectiveness with the optimal thread mapping for two different workloads, we found that the maximum average difference with the experimentally determined optimal performance was 1.49\% for average response time and 2.08\% for throughput.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Armejach:2013:TIP, author = "Adri{\`a} Armejach and Ruben Titos-Gil and Anurag Negi and Osman S. 
Unsal and Adri{\'a}n Cristal", title = "Techniques to improve performance in requester-wins hardware transactional memory", journal = j-TACO, volume = "10", number = "4", pages = "42:1--42:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555299", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The simplicity of requester-wins Hardware Transactional Memory (HTM) makes it easy to incorporate in existing chip multiprocessors. Hence, such systems are expected to be widely available in the near future. Unfortunately, these implementations are prone to suffer severe performance degradation due to transient and persistent livelock conditions. This article shows that existing techniques are unable to mitigate this degradation effectively. It then proposes and evaluates four novel techniques-two software-based that employ information provided by the hardware and two that require simple core-local hardware additions-which have the potential to boost the performance of requester-wins HTM designs substantially.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jeon:2013:RDR, author = "Myeongjae Jeon and Conglong Li and Alan L. 
Cox and Scott Rixner", title = "Reducing {DRAM} row activations with eager read\slash write clustering", journal = j-TACO, volume = "10", number = "4", pages = "43:1--43:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555300", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article describes and evaluates a new approach to optimizing DRAM performance and energy consumption that is based on eagerly writing dirty cache lines to DRAM. Under this approach, many dirty cache lines are written to DRAM before they are evicted. In particular, dirty cache lines that have not been recently accessed are eagerly written to DRAM when the corresponding row has been activated by an ordinary, noneager access, such as a read. This approach enables clustering of reads and writes that target the same row, resulting in a significant reduction in row activations. Specifically, for a variety of applications, it reduces the number of DRAM row activations by an average of 42\% and a maximum of 82\%. Moreover, the results from a full-system simulator show compelling performance improvements and energy consumption reductions. Out of 23 applications, 6 have overall performance improvements between 10\% and 20\%, and 3 have improvements in excess of 20\%. Furthermore, 12 consume between 10\% and 20\% less DRAM energy, and 7 have energy consumption reductions in excess of 20\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2013:HPP, author = "Zhijia Zhao and Michael Bebenita and Dave Herman and Jianhua Sun and Xipeng Shen", title = "{HPar}: a practical parallel parser for {HTML} --- taming {HTML} complexities for parallel parsing", journal = j-TACO, volume = "10", number = "4", pages = "44:1--44:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555301", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Parallelizing HTML parsing is challenging due to the complexities of HTML documents and the inherent dependencies in its parsing algorithm. As a result, despite numerous studies in parallel parsing, HTML parsing remains sequential today. It forms one of the final barriers for fully parallelizing browser operations to minimize the browser's response time-an important variable for user experiences, especially on portable devices. This article provides a comprehensive analysis on the special complexities of parallel HTML parsing and presents a systematic exploration in overcoming those difficulties through specially designed speculative parallelizations. This work develops, to the best of our knowledge, the first pipelining and data-level parallel HTML parsers. The data-level parallel parser, named HPar, achieves up to 2.4$ \times $ speedup on quadcore devices. This work demonstrates the feasibility of efficient, parallel HTML parsing for the first time and offers a set of novel insights for parallel HTML parsing", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Totoni:2013:EFE, author = "Ehsan Totoni and Mert Dikmen and Mar{\'\i}a Jes{\'u}s Garzar{\'a}n", title = "Easy, fast, and energy-efficient object detection on heterogeneous on-chip architectures", journal = j-TACO, volume = "10", number = "4", pages = "45:1--45:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555302", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We optimize a visual object detection application (that uses Vision Video Library kernels) and show that OpenCL is a unified programming paradigm that can provide high performance when running on the Ivy Bridge heterogeneous on-chip architecture. We evaluate different mapping techniques and show that running each kernel where it fits the best and using software pipelining can provide 1.91 times higher performance and 42\% better energy efficiency. We also show how to trade accuracy for energy at runtime. Overall, our application can perform accurate object detection at 40 frames per second (fps) in an energy-efficient manner.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fedorov:2013:AAL, author = "Viacheslav V. Fedorov and Sheng Qiu and A. L. Narasimha Reddy and Paul V. 
Gratz", title = "{ARI}: Adaptive {LLC}-memory traffic management", journal = j-TACO, volume = "10", number = "4", pages = "46:1--46:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2543697", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Decreasing the traffic from the CPU LLC to main memory is a very important issue in modern systems. Recent work focuses on cache misses, overlooking the impact of writebacks on the total memory traffic, energy consumption, IPC, and so forth. Policies that foster a balanced approach, between reducing write traffic to memory and improving miss rates, can increase overall performance and improve energy efficiency and memory system lifetime for NVM memory technology, such as phase-change memory (PCM). We propose Adaptive Replacement and Insertion (ARI), an adaptive approach to last-level CPU cache management, optimizing the two parameters (miss rate and writeback rate) simultaneously. Our specific focus is to reduce writebacks as much as possible while maintaining or improving the miss rate relative to conventional LRU replacement policy. ARI reduces LLC writebacks by 33\%, on average, while also decreasing misses by 4.7\%, on average. In a typical system, this boosts IPC by 4.9\%, on average, while decreasing energy consumption by 8.9\%. These results are achieved with minimal hardware overheads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gonzalez-Alvarez:2013:AAD, author = "Cecilia Gonz{\'a}lez-{\'A}lvarez and Jennifer B. 
Sartor and Carlos {\'A}lvarez and Daniel Jim{\'e}nez-Gonz{\'a}lez and Lieven Eeckhout", title = "Accelerating an application domain with specialized functional units", journal = j-TACO, volume = "10", number = "4", pages = "47:1--47:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555303", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Hardware specialization has received renewed interest recently as chips are hitting power limits. Chip designers of traditional processor architectures have primarily focused on general-purpose computing, partially due to time-to-market pressure and simpler design processes. But new power limits require some chip specialization. Although hardware configured for a specific application yields large speedups for low-power dissipation, its design is more complex and less reusable. We instead explore domain-based specialization, a scalable approach that balances hardware's reusability and performance efficiency. We focus on specialization using customized compute units that accelerate particular operations. In this article, we develop automatic techniques to identify code sequences from different applications within a domain that can be targeted to a new custom instruction that will be run inside a configurable specialized functional unit (SFU). We demonstrate that using a canonical representation of computations finds more common code sequences among applications that can be mapped to the same custom instruction, leading to larger speedups while specializing a smaller core area than previous pattern-matching techniques. We also propose new heuristics to narrow the search space of domain-specific custom instructions, finding those that achieve the best performance across applications. 
We estimate the overall performance achieved with our automatic techniques using hardware models on a set of nine media benchmarks, showing that when limiting the core area devoted to specialization, the SFU customization with the largest speedups includes both application- and domain-specific custom instructions. We demonstrate that exploring domain-specific hardware acceleration is key to continued computing system performance improvements.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2013:RMM, author = "Xiaolin Wang and Lingmei Weng and Zhenlin Wang and Yingwei Luo", title = "Revisiting memory management on virtualized environments", journal = j-TACO, volume = "10", number = "4", pages = "48:1--48:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555304", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the evolvement of hardware, 64-bit Central Processing Units (CPUs) and 64-bit Operating Systems (OSs) have dominated the market. This article investigates the performance of virtual memory management of Virtual Machines (VMs) with a large virtual address space in 64-bit OSs, which imposes different pressure on memory virtualization than 32-bit systems. Each of the two conventional memory virtualization approaches, Shadowing Paging (SP) and Hardware-Assisted Paging (HAP), causes different overhead for different applications. Our experiments show that 64-bit applications prefer to run in a VM using SP, while 32-bit applications do not have a uniform preference between SP and HAP. 
In this article, we trace this inconsistency between 32-bit applications and 64-bit applications to its root cause through a systematic empirical study in Linux systems and discover that the major overhead of SP results from memory management in the 32-bit GNU C library (glibc). We propose enhancements to the existing memory management algorithms, which substantially reduce the overhead of SP. Based on the evaluations using SPEC CPU2006, Parsec 2.1, and cloud benchmarks, our results show that SP, with the improved memory allocators, can compete with HAP in almost all cases, in both 64-bit and 32-bit systems. We conclude that without a significant breakthrough in HAP, researchers should pay more attention to SP, which is more flexible and cost effective.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2013:PAP, author = "Chuntao Jiang and Zhibin Yu and Hai Jin and Chengzhong Xu and Lieven Eeckhout and Wim Heirman and Trevor E. Carlson and Xiaofei Liao", title = "{PCantorSim}: Accelerating parallel architecture simulation through fractal-based sampling", journal = j-TACO, volume = "10", number = "4", pages = "49:1--49:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555305", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Computer architects rely heavily on microarchitecture simulation to evaluate design alternatives. Unfortunately, cycle-accurate simulation is extremely slow, being at least 4 to 6 orders of magnitude slower than real hardware. 
This longstanding problem is further exacerbated in the multi-/many-core era, because single-threaded simulation performance has not improved much, while the design space has expanded substantially. Parallel simulation is a promising approach, yet does not completely solve the simulation challenge. Furthermore, existing sampling techniques, which are widely used for single-threaded applications, do not readily apply to multithreaded applications as thread interaction and synchronization must now be taken into account. This work presents PCantorSim, a novel Cantor set (a classic fractal)--based sampling scheme to accelerate parallel simulation of multithreaded applications. Through the use of the proposed methodology, only less than 5\% of an application's execution time is simulated in detail. We have implemented our approach in Sniper (a parallel multicore simulator) and evaluated it by running the PARSEC benchmarks on a simulated 8-core system. The results show that PCantorSim increases simulation speed over detailed parallel simulation by a factor of 20$ \times $, on average, with an average absolute execution time prediction error of 5.3\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Stipic:2013:PGT, author = "Srdan Stipi{\'c} and Vesna Smiljkovi{\'c} and Osman Unsal and Adri{\'a}n Cristal and Mateo Valero", title = "Profile-guided transaction coalescing-lowering transactional overheads by merging transactions", journal = j-TACO, volume = "10", number = "4", pages = "50:1--50:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555306", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Previous studies in software transactional memory mostly focused on reducing the overhead of transactional read and write operations. In this article, we introduce transaction coalescing, a profile-guided compiler optimization technique that attempts to reduce the overheads of starting and committing a transaction by merging two or more small transactions into one large transaction. We develop a profiling tool and a transaction coalescing heuristic to identify candidate transactions suitable for coalescing. We implement a compiler extension to automatically merge the candidate transactions at the compile time. We evaluate the effectiveness of our technique using the hash table micro-benchmark and the STAMP benchmark suite. Transaction coalescing improves the performance of the hash table significantly and the performance of Vacation and SSCA2 benchmarks by 19.4\% and 36.4\%, respectively, when running with 12 threads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2013:WWA, author = "Zhe Wang and Shuchang Shan and Ting Cao and Junli Gu and Yi Xu and Shuai Mu and Yuan Xie and Daniel A. Jim{\'e}nez", title = "{WADE}: Writeback-aware dynamic cache management for {NVM}-based main memory system", journal = j-TACO, volume = "10", number = "4", pages = "51:1--51:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555307", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Emerging Non-Volatile Memory (NVM) technologies are explored as potential alternatives to traditional SRAM/DRAM-based memory architecture in future microprocessor design. One of the major disadvantages for NVM is the latency and energy overhead associated with write operations. Mitigation techniques to minimize the write overhead for NVM-based main memory architecture have been studied extensively. However, most prior work focuses on optimization techniques for NVM-based main memory itself, with little attention paid to cache management policies for the Last-Level Cache (LLC). In this article, we propose a Writeback-Aware Dynamic CachE (WADE) management technique to help mitigate the write overhead in NVM-based memory. The proposal is based on the observation that, when dirty cache blocks are evicted from the LLC and written into NVM-based memory (with PCM as an example), the long latency and high energy associated with write operations to NVM-based memory can cause system performance/power degradation. Thus, reducing the number of writeback requests from the LLC is critical. The proposed WADE cache management technique tries to keep highly reused dirty cache blocks in the LLC. 
The technique predicts blocks that are frequently written back in the LLC. The LLC sets are dynamically partitioned into a frequent writeback list and a nonfrequent writeback list. It keeps a best size of each list in the LLC. Our evaluation shows that the technique can reduce the number of writeback requests by 16.5\% for memory-intensive single-threaded benchmarks and 10.8\% for multicore workloads. It yields a geometric mean speedup of 5.1\% for single-thread applications and 7.6\% for multicore workloads. Due to the reduced number of writeback requests to main memory, the technique reduces the energy consumption by 8.1\% for single-thread applications and 7.6\% for multicore workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2013:CCC, author = "Yong Li and Yaojun Zhang and Hai Li and Yiran Chen and Alex K. Jones", title = "{C1C}: a configurable, compiler-guided {STT-RAM L1} cache", journal = j-TACO, volume = "10", number = "4", pages = "52:1--52:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555308", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Spin-Transfer Torque RAM (STT-RAM), a promising alternative to SRAM for reducing leakage power consumption, has been widely studied to mitigate the impact of its asymmetrically long write latency. Recently, STT-RAM has been proposed for L1 caches by relaxing the data retention time to improve write performance and dynamic energy. However, as the technology scales down from 65nm to 22nm, the performance of the read operation scales poorly due to reduced sense margins and sense amplifier delays. 
In this article, we leverage a dual-mode STT memory cell to design a configurable L1 cache architecture termed C1C to mitigate read performance barriers with technology scaling. Guided by application access characteristics discovered through novel compiler analyses, the proposed cache adaptively switches between a high performance and a low-power access mode. Our evaluation demonstrates that the proposed cache with compiler guidance outperforms a state-of-the-art STT-RAM cache design by 9\% with high dynamic energy efficiency, leading to significant performance/watt improvements over several competing approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fauzia:2013:BRD, author = "Naznin Fauzia and Venmugil Elango and Mahesh Ravishankar and J. Ramanujam and Fabrice Rastello and Atanas Rountev and Louis-No{\"e}l Pouchet and P. Sadayappan", title = "Beyond reuse distance analysis: Dynamic analysis for characterization of data locality potential", journal = j-TACO, volume = "10", number = "4", pages = "53:1--53:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555309", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Emerging computer architectures will feature drastically decreased flops/byte (ratio of peak processing rate to memory bandwidth) as highlighted by recent studies on Exascale architectural trends. Further, flops are getting cheaper, while the energy cost of data movement is increasingly dominant. The understanding and characterization of data locality properties of computations is critical in order to guide efforts to enhance data locality. 
Reuse distance analysis of memory address traces is a valuable tool to perform data locality characterization of programs. A single reuse distance analysis can be used to estimate the number of cache misses in a fully associative LRU cache of any size, thereby providing estimates on the minimum bandwidth requirements at different levels of the memory hierarchy to avoid being bandwidth bound. However, such an analysis only holds for the particular execution order that produced the trace. It cannot estimate potential improvement in data locality through dependence-preserving transformations that change the execution schedule of the operations in the computation. In this article, we develop a novel dynamic analysis approach to characterize the inherent locality properties of a computation and thereby assess the potential for data locality enhancement via dependence-preserving transformations. The execution trace of a code is analyzed to extract a Computational-Directed Acyclic Graph (CDAG) of the data dependences. The CDAG is then partitioned into convex subsets, and the convex partitioning is used to reorder the operations in the execution trace to enhance data locality. The approach enables us to go beyond reuse distance analysis of a single specific order of execution of the operations of a computation in characterization of its data locality properties. It can serve a valuable role in identifying promising code regions for manual transformation, as well as assessing the effectiveness of compiler transformations for data locality enhancement. We demonstrate the effectiveness of the approach using a number of benchmarks, including case studies where the potential shown by the analysis is exploited to achieve lower data movement costs and better performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bardizbanyan:2013:DPD, author = "Alen Bardizbanyan and Magnus Sj{\"a}lander and David Whalley and Per Larsson-Edefors", title = "Designing a practical data filter cache to improve both energy efficiency and performance", journal = j-TACO, volume = "10", number = "4", pages = "54:1--54:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555310", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Conventional Data Filter Cache (DFC) designs improve processor energy efficiency, but degrade performance. Furthermore, the single-cycle line transfer suggested in prior studies adversely affects Level-1 Data Cache (L1 DC) area and energy efficiency. We propose a practical DFC that is accessed early in the pipeline and transfers a line over multiple cycles. Our DFC design improves performance and eliminates a substantial fraction of L1 DC accesses for loads, L1 DC tag checks on stores, and data translation lookaside buffer accesses for both loads and stores. Our evaluation shows that the proposed DFC can reduce the data access energy by 42.5\% and improve execution time by 4.2\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hagiescu:2013:GCG, author = "Andrei Hagiescu and Bing Liu and R. Ramanathan and Sucheendra K. Palaniappan and Zheng Cui and Bipasa Chattopadhyay and P. S. 
Thiagarajan and Weng-Fai Wong", title = "{GPU} code generation for {ODE}-based applications with phased shared-data access patterns", journal = j-TACO, volume = "10", number = "4", pages = "55:1--55:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555311", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We present a novel code generation scheme for GPUs. Its key feature is the platform-aware generation of a heterogeneous pool of threads. This exposes more data-sharing opportunities among the concurrent threads and reduces the memory requirements that would otherwise exceed the capacity of the on-chip memory. Instead of the conventional strategy of focusing on exposing as much parallelism as possible, our scheme leverages on the phased nature of memory access patterns found in many applications that exhibit massive parallelism. We demonstrate the effectiveness of our code generation strategy on a computational systems biology application. This application consists of computing a Dynamic Bayesian Network (DBN) approximation of the dynamics of signalling pathways described as a system of Ordinary Differential Equations (ODEs). The approximation algorithm involves (i) sampling many (of the order of a few million) times from the set of initial states, (ii) generating trajectories through numerical integration, and (iii) storing the statistical properties of this set of trajectories in Conditional Probability Tables (CPTs) of a DBN via a prespecified discretization of the time and value domains. The trajectories can be computed in parallel. However, the intermediate data needed for computing them, as well as the entries for the CPTs, are too large to be stored locally. 
Our experiments show that the proposed code generation scheme scales well, achieving significant performance improvements on three realistic signalling pathways models. These results suggest how our scheme could be extended to deal with other applications involving systems of ODEs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2013:TLS, author = "Junghee Lee and Chrysostomos Nicopoulos and Hyung Gyu Lee and Jongman Kim", title = "{TornadoNoC}: a lightweight and scalable on-chip network architecture for the many-core era", journal = j-TACO, volume = "10", number = "4", pages = "56:1--56:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555312", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The rapid emergence of Chip Multi-Processors (CMP) as the de facto microprocessor archetype has highlighted the importance of scalable and efficient on-chip networks. Packet-based Networks-on-Chip (NoC) are gradually cementing themselves as the medium of choice for the multi-/many-core systems of the near future, due to their innate scalability. However, the prominence of the debilitating power wall requires the NoC to also be as energy efficient as possible. To achieve these two antipodal requirements---scalability and energy efficiency---we propose TornadoNoC, an interconnect architecture that employs a novel flow control mechanism. To prevent livelocks and deadlocks, a sequence numbering scheme and a dynamic ring inflation technique are proposed, and their correctness formally proven. 
The primary objective of TornadoNoC is to achieve substantial gains in (a) scalability to many-core systems and (b) the area/power footprint, as compared to current state-of-the-art router implementations. The new router is demonstrated to provide better scalability to hundreds of cores than an ideal single-cycle wormhole implementation and other scalability-enhanced low-cost routers. Extensive simulations using both synthetic traffic patterns and real applications running in a full-system simulator corroborate the efficacy of the proposed design. Finally, hardware synthesis analysis using commercial 65nm standard-cell libraries indicates that the area and power budgets of the new router are reduced by up to 53\% and 58\%, respectively, as compared to existing state-of-the-art low-cost routers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Strydis:2013:SAP, author = "Christos Strydis and Robert M. Seepers and Pedro Peris-Lopez and Dimitrios Siskos and Ioannis Sourdis", title = "A system architecture, processor, and communication protocol for secure implants", journal = j-TACO, volume = "10", number = "4", pages = "57:1--57:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555313", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Secure and energy-efficient communication between Implantable Medical Devices (IMDs) and authorized external users is attracting increasing attention these days. However, there currently exists no systematic approach to the problem, while solutions from neighboring fields, such as wireless sensor networks, are not directly transferable due to the peculiarities of the IMD domain. 
This work describes an original, efficient solution for secure IMD communication. A new implant system architecture is proposed, where security and main-implant functionality are made completely decoupled by running the tasks onto two separate cores. Wireless communication goes through a custom security ASIP, called SISC (Smart-Implant Security Core), which runs an energy-efficient security protocol. The security core is powered by RF-harvested energy until it performs external-reader authentication, providing an elegant defense mechanism against battery Denial-of-Service (DoS) and other, more common attacks. The system has been evaluated based on a realistic case study involving an artificial pancreas implant. When synthesized for a UMC 90nm CMOS ASIC technology, our system architecture achieves defense against unauthorized accesses having zero energy cost, running entity authentication through harvesting only 7.45 $ \mu $J of RF energy from the requesting entity. In all other successfully authenticated accesses, our architecture achieves secure data exchange without affecting the performance of the main IMD functionality, adding less than 1o/oo (1.3 mJ) to the daily energy consumption of a typical implant. Compared to a single-core, secure reference IMD, which would still be more vulnerable to some types of attacks, our secure system on chip (SoC) achieves high security levels at 56\% energy savings and at an area overhead of less than 15\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kim:2013:FMS, author = "Wonsub Kim and Yoonseo Choi and Haewoo Park", title = "Fast modulo scheduler utilizing patternized routes for coarse-grained reconfigurable architectures", journal = j-TACO, volume = "10", number = "4", pages = "58:1--58:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555314", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Coarse-Grained Reconfigurable Architectures (CGRAs) present a potential of high compute throughput with energy efficiency. A CGRA consists of an array of Functional Units (FUs), which communicate with each other through an interconnect network containing transmission nodes and register files. To achieve high performance from the software solutions mapped onto CGRAs, modulo scheduling of loops is generally employed. One of the key challenges in modulo scheduling for CGRAs is to explicitly handle routings of operands from a source to a destination operations through various routing resources. Existing modulo schedulers for CGRAs are slow because finding a valid routing is generally a searching problem over a large space, even with the guidance of well-defined cost metrics. Applications in traditional embedded multimedia domains are regarded as relatively tolerant to a slow compile time in exchange for a high-quality solution. However, many rapidly growing domains of applications, such as 3D graphics, require a fast compilation. Entrances of CGRAs to these domains have been blocked mainly due to their long compile time. We attack this problem by utilizing patternized routes, for which resources and time slots for a success can be estimated in advance when a source operation is placed. 
By conservatively reserving predefined resources at predefined time slots, future routings originating from the source operation are guaranteed. Experiments on a real-world 3D graphics benchmark suite show that our scheduler improves the compile time up to 6,000 times while achieving an average 70\% throughputs of the state-of-the-art CGRA modulo scheduler, the Edge-centric Modulo Scheduler (EMS).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nuzman:2013:JTC, author = "Dorit Nuzman and Revital Eres and Sergei Dyshel and Marcel Zalmanovici and Jose Castanos", title = "{JIT} technology with {C\slash C++}: Feedback-directed dynamic recompilation for statically compiled languages", journal = j-TACO, volume = "10", number = "4", pages = "59:1--59:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555315", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The growing gap between the advanced capabilities of static compilers as reflected in benchmarking results and the actual performance that users experience in real-life scenarios makes client-side dynamic optimization technologies imperative to the domain of static languages. Dynamic optimization of software distributed in the form of a platform-agnostic Intermediate-Representation (IR) has been very successful in the domain of managed languages, greatly improving upon interpreted code, especially when online profiling is used. However, can such feedback-directed IR-based dynamic code generation be viable in the domain of statically compiled, rather than interpreted, languages? 
We show that fat binaries, which combine the IR together with the statically compiled executable, can provide a practical solution for software vendors, allowing their software to be dynamically optimized without the limitation of binary-level approaches, which lack the high-level IR of the program, and without the warm-up costs associated with the IR-only software distribution approach. We describe and evaluate the fat-binary-based runtime compilation approach using SPECint2006, demonstrating that the overheads it incurs are low enough to be successfully surmounted by dynamic optimization. Building on Java JIT technologies, our results already improve upon common real-world usage scenarios, including very small workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ramashekar:2013:ADA, author = "Thejas Ramashekar and Uday Bondhugula", title = "Automatic data allocation and buffer management for multi-{GPU} machines", journal = j-TACO, volume = "10", number = "4", pages = "60:1--60:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2544100", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Multi-GPU machines are being increasingly used in high-performance computing. Each GPU in such a machine has its own memory and does not share the address space either with the host CPU or other GPUs. Hence, applications utilizing multiple GPUs have to manually allocate and manage data on each GPU. Existing works that propose to automate data allocations for GPUs have limitations and inefficiencies in terms of allocation sizes, exploiting reuse, transfer costs, and scalability. 
We propose a scalable and fully automatic data allocation and buffer management scheme for affine loop nests on multi-GPU machines. We call it the Bounding-Box-based Memory Manager (BBMM). BBMM can perform at runtime, during standard set operations like union, intersection, and difference, finding subset and superset relations on hyperrectangular regions of array data (bounding boxes). It uses these operations along with some compiler assistance to identify, allocate, and manage data required by applications in terms of disjoint bounding boxes. This allows it to (1) allocate exactly or nearly as much data as is required by computations running on each GPU, (2) efficiently track buffer allocations and hence maximize data reuse across tiles and minimize data transfer overhead, and (3) and as a result, maximize utilization of the combined memory on multi-GPU machines. BBMM can work with any choice of parallelizing transformations, computation placement, and scheduling schemes, whether static or dynamic. Experiments run on a four-GPU machine with various scientific programs showed that BBMM reduces data allocations on each GPU by up to 75\% compared to current allocation schemes, yields performance of at least 88\% of manually written code, and allows excellent weak scaling.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vandierendonck:2013:ADT, author = "Hans Vandierendonck and George Tzenakis and Dimitrios S. 
Nikolopoulos", title = "Analysis of dependence tracking algorithms for task dataflow execution", journal = j-TACO, volume = "10", number = "4", pages = "61:1--61:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555316", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Processor architectures has taken a turn toward many-core processors, which integrate multiple processing cores on a single chip to increase overall performance, and there are no signs that this trend will stop in the near future. Many-core processors are harder to program than multicore and single-core processors due to the need for writing parallel or concurrent programs with high degrees of parallelism. Moreover, many-cores have to operate in a mode of strong scaling because of memory bandwidth constraints. In strong scaling, increasingly finer-grain parallelism must be extracted in order to keep all processing cores busy. Task dataflow programming models have a high potential to simplify parallel programming because they alleviate the programmer from identifying precisely all intertask dependences when writing programs. Instead, the task dataflow runtime system detects and enforces intertask dependences during execution based on the description of memory accessed by each task. The runtime constructs a task dataflow graph that captures all tasks and their dependences. Tasks are scheduled to execute in parallel, taking into account dependences specified in the task graph. Several papers report important overheads for task dataflow systems, which severely limits the scalability and usability of such systems. In this article, we study efficient schemes to manage task graphs and analyze their scalability. 
We assume a programming model that supports input, output, and in/out annotations on task arguments, as well as commutative in/out and reductions. We analyze the structure of task graphs and identify versions and generations as key concepts for efficient management of task graphs. Then, we present three schemes to manage task graphs building on graph representations, hypergraphs, and lists. We also consider a fourth edgeless scheme that synchronizes tasks using integers. Analysis using microbenchmarks shows that the graph representation is not always scalable and that the edgeless scheme introduces least overhead in nearly all situations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jeong:2013:EET, author = "Yeonghun Jeong and Seongseok Seo and Jongeun Lee", title = "Evaluator-executor transformation for efficient pipelining of loops with conditionals", journal = j-TACO, volume = "10", number = "4", pages = "62:1--62:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555317", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Control divergence poses many problems in parallelizing loops. While predicated execution is commonly used to convert control dependence into data dependence, it often incurs high overhead because it allocates resources equally for both branches of a conditional statement regardless of their execution frequencies. 
For those loops with unbalanced conditionals, we propose a software transformation that divides a loop into two or three smaller loops so that the condition is evaluated only in the first loop, while the less frequent branch is executed in the second loop in a way that is much more efficient than in the original loop. To reduce the overhead of extra data transfer caused by the loop fission, we also present a hardware extension for a class of Coarse-Grained Reconfigurable Architectures (CGRAs). Our experiments using MiBench and computer vision benchmarks on a CGRA demonstrate that our techniques can improve the performance of loops over predicated execution by up to 65\% (37.5\%, on average), when the hardware extension is enabled. Without any hardware modification, our software-only version can improve performance by up to 64\% (33\%, on average), while simultaneously reducing the energy consumption of the entire CGRA including configuration and data memory by 22\%, on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "62", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Barik:2013:DNS, author = "Rajkishore Barik and Jisheng Zhao and Vivek Sarkar", title = "A decoupled non-{SSA} global register allocation using bipartite liveness graphs", journal = j-TACO, volume = "10", number = "4", pages = "63:1--63:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2544101", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Register allocation is an essential optimization for all compilers. A number of sophisticated register allocation algorithms have been developed over the years. 
The two fundamental classes of register allocation algorithms used in modern compilers are based on Graph Coloring (GC) and Linear Scan (LS). However, these two algorithms have fundamental limitations in terms of precision. For example, the key data structure used in GC-based algorithms, the interference graph, lacks information on the program points at which two variables may interfere. The LS-based algorithms make local decisions regarding spilling, and thereby trade off global optimization for reduced compile-time and space overheads. Recently, researchers have proposed Static Single Assignment (SSA)-based decoupled register allocation algorithms that exploit the live-range split points of the SSA representation to optimally solve the spilling problem. However, SSA-based register allocation often requires extra complexity in repairing register assignments during SSA elimination and in addressing architectural constraints such as aliasing and ABI encoding; this extra overhead can be prohibitively expensive in dynamic compilation contexts. This article proposes a decoupled non-SSA--based global register allocation algorithm for dynamic compilation. It addresses the limitations in current algorithms by introducing a Bipartite Liveness Graph (BLG)-based register allocation algorithm that models the spilling phase as an optimization problem on the BLG itself and the assignment phase as a separate optimization problem. Advanced register allocation optimizations such as move coalescing, live-range splitting, and register class handling are also performed along with the spilling and assignment phases. In the presence of register classes, we propose a bucket-based greedy heuristic for assignment that strikes a balance between spill-cost and register class constraints. 
We present experimental evaluation of our BLG-based register allocation algorithm and compare it with production-quality register allocators in Jikes RVM and LLVM.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "63", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gavin:2013:RIF, author = "Peter Gavin and David Whalley and Magnus Sj{\"a}lander", title = "Reducing instruction fetch energy in multi-issue processors", journal = j-TACO, volume = "10", number = "4", pages = "64:1--64:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2541228.2555318", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jan 9 10:42:35 MST 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The need to minimize power while maximizing performance has led to recent developments of powerful superscalar designs targeted at embedded and portable use. Instruction fetch is responsible for a significant fraction of microprocessor power and energy, and is therefore an attractive target for architectural power optimization. We present novel techniques that take advantage of guarantees so that the instruction translation lookaside buffer, branch target buffer, and branch prediction buffer can frequently be disabled, reducing their energy usage, while simultaneously reducing branch predictor contention. These techniques require no changes to the instruction set and can easily be integrated into most single- and multiple-issue processors.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "64", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Anonymous:2013:LDR, author = "Anonymous", title = "List of distinguished reviewers {ACM TACO}", journal = j-TACO, volume = "10", number = "4", pages = "65:1--65:??", month = dec, year = "2013", CODEN = "????", DOI = "https://doi.org/10.1145/2560216", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:44 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "65", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Goel:2014:SPR, author = "Neeraj Goel and Anshul Kumar and Preeti Ranjan Panda", title = "Shared-port register file architecture for low-energy {VLIW} processors", journal = j-TACO, volume = "11", number = "1", pages = "1:1--1:32", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2533397", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We propose a reduced-port Register File (RF) architecture for reducing RF energy in a VLIW processor. With port reduction, RF ports need to be shared among Function Units (FUs), which may lead to access conflicts, and thus, reduced performance. Our solution includes (i) a carefully designed RF-FU interconnection network that permits port sharing with minimum conflicts and without any delay/energy overheads, and (ii) a novel scheduling and binding algorithm that reduces the performance penalty. 
With our solution, we observed as much as 83\% RF energy savings with no more than a 10\% loss in performance for a set of Mediabench and Mibench benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2014:IPD, author = "Zheng Wang and Georgios Tournavitis and Bj{\"o}rn Franke and Michael F. P. O'Boyle", title = "Integrating profile-driven parallelism detection and machine-learning-based mapping", journal = j-TACO, volume = "11", number = "1", pages = "2:1--2:26", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579561", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compiler-based auto-parallelization is a much-studied area but has yet to find widespread application. This is largely due to the poor identification and exploitation of application parallelism, resulting in disappointing performance far below that which a skilled expert programmer could achieve. We have identified two weaknesses in traditional parallelizing compilers and propose a novel, integrated approach resulting in significant performance improvements of the generated parallel code. Using profile-driven parallelism detection, we overcome the limitations of static analysis, enabling the identification of more application parallelism, and only rely on the user for final approval. We then replace the traditional target-specific and inflexible mapping heuristics with a machine-learning-based prediction mechanism, resulting in better mapping decisions while automating adaptation to different target architectures. 
We have evaluated our parallelization strategy on the NAS and SPEC CPU2000 benchmarks and two different multicore platforms (dual quad-core Intel Xeon SMP and dual-socket QS20 Cell blade). We demonstrate that our approach not only yields significant improvements when compared with state-of-the-art parallelizing compilers but also comes close to and sometimes exceeds the performance of manually parallelized codes. On average, our methodology achieves 96\% of the performance of the hand-tuned OpenMP NAS and SPEC parallel benchmarks on the Intel Xeon platform and gains a significant speedup for the IBM Cell platform, demonstrating the potential of profile-guided and machine-learning-based parallelization for complex multicore platforms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Samadi:2014:LGU, author = "Mehrzad Samadi and Amir Hormati and Janghaeng Lee and Scott Mahlke", title = "Leveraging {GPUs} using cooperative loop speculation", journal = j-TACO, volume = "11", number = "1", pages = "3:1--3:26", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579617", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graphics processing units, or GPUs, provide TFLOPs of additional performance potential in commodity computer systems that frequently go unused by most applications. 
Even with the emergence of languages such as CUDA and OpenCL, programming GPUs remains a difficult challenge for a variety of reasons, including the inherent algorithmic characteristics and data structure choices used by applications as well as the tedious performance optimization cycle that is necessary to achieve high performance. The goal of this work is to increase the applicability of GPUs beyond CUDA/OpenCL to implicitly data-parallel applications written in C/C++ using speculative parallelization. To achieve this goal, we propose Paragon: a static/dynamic compiler platform to speculatively run possibly data-parallel portions of sequential applications on the GPU while cooperating with the system CPU. For such loops, Paragon utilizes the GPU in an opportunistic way while orchestrating a cooperative relation between the CPU and GPU to reduce the overhead of miss-speculations. Paragon monitors the dependencies for the loops running speculatively on the GPU and nonspeculatively on the CPU using a lightweight distributed conflict detection designed specifically for GPUs, and transfers the execution to the CPU in case a conflict is detected. Paragon resumes the execution on the GPU after the CPU resolves the dependency. Our experiments show that Paragon achieves 4x on average and up to 30x speedup compared to unsafe CPU execution with four threads and 7x on average and up to 64x speedup versus sequential execution across a set of sequential but implicitly data-parallel applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2014:EAC, author = "Jue Wang and Xiangyu Dong and Yuan Xie and Norman P. 
Jouppi", title = "Endurance-aware cache line management for non-volatile caches", journal = j-TACO, volume = "11", number = "1", pages = "4:1--4:24", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579671", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Nonvolatile memories (NVMs) have the potential to replace low-level SRAM or eDRAM on-chip caches because NVMs save standby power and provide large cache capacity. However, limited write endurance is a common problem for NVM technologies, and today's cache management might result in unbalanced cache write traffic, causing heavily written cache blocks to fail much earlier than others. Although wear-leveling techniques for NVM-based main memories exist, we cannot simply apply them to NVM-based caches. This is because cache writes have intraset variations as well as interset variations, while writes to main memories only have interset variations. To solve this problem, we propose i$^2$WAP, a new cache management policy that can reduce both inter- and intraset write variations. i$^2$WAP has two features: Swap-Shift, an enhancement based on existing main memory wear leveling to reduce cache interset write variations, and Probabilistic Set Line Flush, a novel technique to reduce cache intraset write variations. Implementing i$^2$WAP only needs two global counters and two global registers. In one of our studies, i$^2$WAP can improve the NVM cache lifetime by 75\% on average and up to 224\%. We also validate that i$^2$WAP is effective in systems with different cache configurations and workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2014:BBS, author = "Lei Liu and Zehan Cui and Yong Li and Yungang Bao and Mingyu Chen and Chengyong Wu", title = "{{BPM\slash BPM+}}: Software-based dynamic memory partitioning mechanisms for mitigating {DRAM} bank-\slash channel-level interferences in multicore systems", journal = j-TACO, volume = "11", number = "1", pages = "5:1--5:28", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579672", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The main memory system is a shared resource in modern multicore machines that can result in serious interference leading to reduced throughput and unfairness. Many new memory scheduling mechanisms have been proposed to address the interference problem. However, these mechanisms usually employ relative complex scheduling logic and need modifications to Memory Controllers (MCs), which incur expensive hardware design and manufacturing overheads. This article presents a practical software approach to effectively eliminate the interference without any hardware modifications. The key idea is to modify the OS memory management system and adopt a page-coloring-based Bank-level Partitioning Mechanism (BPM) that allocates dedicated DRAM banks to each core (or thread). By using BPM, memory requests from distinct programs are segregated across multiple memory banks to promote locality/fairness and reduce interference. We further extend BPM to BPM+ by incorporating channel-level partitioning, on which we demonstrate additional gain over BPM in many cases. 
To achieve benefits in the presence of diverse application memory needs and avoid performance degradation due to resource underutilization, we propose a dynamic mechanism upon BPM/BPM+ that assigns appropriate bank/channel resources based on application memory/bandwidth demands monitored through PMU (performance-monitoring unit) and a low-overhead OS page table scanning process. We implement BPM/BPM+ in Linux 2.6.32.15 kernel and evaluate the technique on four-core and eight-core real machines by running a large amount of randomly generated multiprogrammed and multithreaded workloads. Experimental results show that BPM/BPM+ can improve the overall system throughput by 4.7\%/5.9\%, on average, (up to 8.6\%/9.5\%) and reduce the unfairness by an average of 4.2\%/6.1\% (up to 15.8\%/13.9\%).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Haubl:2014:TTE, author = "Christian H{\"a}ubl and Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck", title = "Trace transitioning and exception handling in a trace-based {JIT} compiler for {Java}", journal = j-TACO, volume = "11", number = "1", pages = "6:1--6:26", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579673", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Trace-based Just-In-Time (JIT) compilation generates machine code for frequently executed paths (so-called traces) instead of whole methods. While this has several advantages, it complicates invocation of compiled traces as well as exception handling, so that previous trace-based compilers limited the way in which traces could be invoked. 
We present a significantly enhanced trace-based compiler where arbitrary transitions between interpreted and compiled traces are possible. For that, we introduce suitable trace calling conventions and extend exception handling to work both within traces and across trace boundaries. Furthermore, we use the recorded trace information for optimizations and combine the tracing ideas with ideas from partial-method compilation to avoid code bloat. An extensive evaluation with the benchmark suites DaCapo 9.12 Bach and SPECjvm2008 shows that our trace-based compiler achieves up to 59\% higher peak performance than the method-based Java HotSpot client compiler. On a few benchmarks, our fairly simple trace-based compiler shows a higher peak performance than the Java HotSpot server compiler, which is one of today's best optimizing JIT compilers for Java.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Huang:2014:HHH, author = "Yongbing Huang and Licheng Chen and Zehan Cui and Yuan Ruan and Yungang Bao and Mingyu Chen and Ninghui Sun", title = "{HMTT}: a hybrid hardware\slash software tracing system for bridging the {DRAM} access trace's semantic gap", journal = j-TACO, volume = "11", number = "1", pages = "7:1--7:25", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579668", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "DRAM access traces (i.e., off-chip memory references) can be extremely valuable for the design of memory subsystems and performance tuning of software. Hardware snooping on the off-chip memory interface is an effective and nonintrusive approach to monitoring and collecting real-life DRAM accesses. 
However, compared with software-based approaches, hardware snooping approaches typically lack semantic information, such as process/function/object identifiers, virtual addresses, and lock contexts, that is essential to the complete understanding of the systems and software under investigation. In this article, we propose a hybrid hardware/software mechanism that is able to collect off-chip memory reference traces with semantic information. We have designed and implemented a prototype system called HMTT (Hybrid Memory Trace Tool), which uses a custom-made DIMM connector to collect off-chip memory references and a high-level event-encoding scheme to correlate semantic information with memory references. In addition to providing complete, undistorted DRAM access traces, the proposed system is also able to perform various types of low-overhead profiling, such as object-relative accesses and multithread lock accesses.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2014:AWA, author = "Quan Chen and Minyi Guo", title = "Adaptive workload-aware task scheduling for single-{ISA} asymmetric multicore architectures", journal = j-TACO, volume = "11", number = "1", pages = "8:1--8:25", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579674", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Single-ISA Asymmetric Multicore (AMC) architectures have shown high performance as well as power efficiency. However, current parallel programming environments do not perform well on AMC because they are designed for symmetric multicore architectures in which all cores provide equal performance. 
Their random task scheduling policies can result in unbalanced workloads in AMC and severely degrade the performance of parallel applications. To balance the workloads of parallel applications in AMC, this article proposes an adaptive Workload-Aware Task Scheduler (WATS) that consists of a history-based task allocator and a preference-based task scheduler. The history-based task allocator is based on a near-optimal, static task allocation using the historical statistics collected during the execution of a parallel application. The preference-based task scheduler, which schedules tasks based on a preference list, can dynamically adjust the workloads in AMC if the task allocation is less optimal due to approximation in the history-based task allocator. Experimental results show that WATS can improve both the performance and energy efficiency of task-based applications, with the performance gain up to 66.1\% compared with traditional task schedulers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Savrun-Yeniceri:2014:EHI, author = "G{\"u}lfem Savrun-Yeni{\c{c}}eri and Wei Zhang and Huahan Zhang and Eric Seckler and Chen Li and Stefan Brunthaler and Per Larsen and Michael Franz", title = "Efficient hosted interpreters on the {JVM}", journal = j-TACO, volume = "11", number = "1", pages = "9:1--9:24", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2532642", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:08:33 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", URL = "https://dl.acm.org/doi/abs/10.1145/2532642", abstract = "Many guest languages are implemented using the Java Virtual Machine (JVM) as a host environment. There are two major implementation choices: custom compilers and so-called hosted interpreters. Custom compilers are complex to build but offer good performance. Hosted interpreters are comparatively simpler to implement but until now have suffered from poor performance.\par We studied the performance of hosted interpreters and identified common bottlenecks preventing their efficient execution. First, similar to interpreters written in C/C++, instruction dispatch is expensive on the JVM. Second, Java's semantics require expensive runtime exception checks that negatively affect array performance essential to interpreters.\par We present two optimizations targeting these bottlenecks and show that the performance of optimized interpreters increases dramatically: we report speedups by a factor of up to 2.45 over the Jython interpreter, 3.57 over the Rhino interpreter, and 2.52 over the JRuby interpreter, respectively. 
The resulting performance is comparable with that of custom compilers. Our optimizations are enabled by a few simple annotations that require only modest implementation effort; in return, performance increases substantially.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nair:2014:RPD, author = "Prashant J. Nair and Chia-Chen Chou and Moinuddin K. Qureshi", title = "Refresh pausing in {DRAM} memory systems", journal = j-TACO, volume = "11", number = "1", pages = "10:1--10:26", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579669", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:08:33 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/2579669", abstract = "Dynamic Random Access Memory (DRAM) cells rely on periodic refresh operations to maintain data integrity. As the capacity of DRAM memories has increased, so has the amount of time consumed in doing refresh. Refresh operations contend with read \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jothi:2014:TCF, author = "Komal Jothi and Haitham Akkary", title = "Tuning the continual flow pipeline architecture with virtual register renaming", journal = j-TACO, volume = "11", number = "1", pages = "11:1--11:27", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579675", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Continual Flow Pipelines (CFPs) allow a processor core to process hundreds of in-flight instructions without increasing cycle-critical pipeline resources. When a load misses the data cache, CFP checkpoints the processor register state and then moves all miss-dependent instructions into a low-complexity WB to unblock the pipeline. Meanwhile, miss-independent instructions execute normally and update the processor state. When the miss data return, CFP replays the miss-dependent instructions from the WB and then merges the miss-dependent and miss-independent execution results. CFP was initially proposed for cache misses to DRAM. Later work focused on reducing the execution overhead of CFP by avoiding the pipeline flush before replaying miss-dependent instructions and executing dependent and independent instructions concurrently. The goal of these improvements was to gain performance by applying CFP to L1 data cache misses that hit the last level on chip cache. However, many applications or execution phases of applications incur excessive amount of replay and/or rollbacks to the checkpoint. This frequently cancels benefits from CFP and reduces performance. 
In this article, we improve the CFP architecture by using a novel virtual register renaming substrate and by tuning the replay policies to mitigate excessive replays and rollbacks to the checkpoint. We describe these new design optimizations and show, using Spec 2006 benchmarks and microarchitecture performance and power models of our design, that our Tuned-CFP architecture improves performance and energy consumption over previous CFP architectures by $\sim$10\% and $\sim$8\%, respectively. We also demonstrate that our proposed architecture gives better performance return on energy per instruction compared to a conventional superscalar as well as previous CFP architectures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Carle:2014:PAM, author = "Thomas Carle and Dumitru Potop-Butucaru", title = "Predicate-aware, makespan-preserving software pipelining of scheduling tables", journal = j-TACO, volume = "11", number = "1", pages = "12:1--12:26", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579676", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:08:33 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/2579676", abstract = "We propose a software pipelining technique adapted to specific hard real-time scheduling problems. Our technique optimizes both computation throughput and execution cycle makespan, with makespan being prioritary. It also takes advantage of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kritikakou:2014:SNO, author = "Angeliki Kritikakou and Francky Catthoor and Vasilios Kelefouras and Costas Goutis", title = "A scalable and near-optimal representation of access schemes for memory management", journal = j-TACO, volume = "11", number = "1", pages = "13:1--13:25", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2579677", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Memory management searches for the resources required to store the concurrently alive elements. The solution quality is affected by the representation of the element accesses: a sub-optimal representation leads to overestimation and a non-scalable representation increases the exploration time. We propose a methodology to near-optimal and scalable represent regular and irregular accesses. The representation consists of a set of pattern entries to compactly describe the behavior of the memory accesses and of pattern operations to consistently combine the pattern entries. The result is a final sequence of pattern entries which represents the global access scheme without unnecessary overestimation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Leather:2014:AFG, author = "Hugh Leather and Edwin Bonilla and Michael O'Boyle", title = "Automatic feature generation for machine learning--based optimising compilation", journal = j-TACO, volume = "11", number = "1", pages = "14:1--14:32", month = feb, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2536688", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 14 17:30:52 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent work has shown that machine learning can automate and in some cases outperform handcrafted compiler optimisations. Central to such an approach is that machine learning techniques typically rely upon summaries or features of the program. The quality of these features is critical to the accuracy of the resulting machine learned algorithm; no machine learning method will work well with poorly chosen features. However, due to the size and complexity of programs, theoretically there are an infinite number of potential features to choose from. The compiler writer now has to expend effort in choosing the best features from this space. This article develops a novel mechanism to automatically find those features that most improve the quality of the machine learned heuristic. The feature space is described by a grammar and is then searched with genetic programming and predictive modelling. We apply this technique to loop unrolling in GCC 4.3.1 and evaluate our approach on a Pentium 6. On a benchmark suite of 57 programs, GCC's hard-coded heuristic achieves only 3\% of the maximum performance available, whereas a state-of-the-art machine learning approach with hand-coded features obtains 59\%. 
Our feature generation technique is able to achieve 76\% of the maximum available speedup, outperforming existing approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kluter:2014:VWL, author = "Theo Kluter and Samuel Burri and Philip Brisk and Edoardo Charbon and Paolo Ienne", title = "Virtual Ways: Low-Cost Coherence for Instruction Set Extensions with Architecturally Visible Storage", journal = j-TACO, volume = "11", number = "2", pages = "15:1--15:26", month = jul, year = "2014", DOI = "https://doi.org/10.1145/2576877", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:13:09 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Instruction set extensions (ISEs) improve the performance and energy consumption of application-specific processors. ISEs can use architecturally visible storage (AVS), localized compiler-controlled memories, to provide higher I/O bandwidth than reading data from the processor pipeline. AVS creates coherence and consistence problems with the data cache. Although a hardware coherence protocol could solve the problem, this approach is costly for a single-processor system. As a low-cost alternative, we introduce Virtual Ways, which ensures coherence through a reduced form of inclusion between the data cache and AVS. Virtual Ways achieve higher performance and lower energy consumption than using a hardware coherence protocol.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ren:2014:POE, author = "Bin Ren and Todd Mytkowicz and Gagan Agrawal", title = "A Portable Optimization Engine for Accelerating Irregular Data-Traversal Applications on {SIMD} Architectures", journal = j-TACO, volume = "11", number = "2", pages = "16:1--16:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2632215", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 30 19:02:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Fine-grained data parallelism is increasingly common in the form of longer vectors integrated with mainstream processors (SSE, AVX) and various GPU architectures. This article develops support for exploiting such data parallelism for a class of nonnumeric, nongraphic applications, which perform computations while traversing many independent, irregular data structures. We address this problem by developing several novel techniques. First, for code generation, we develop an intermediate language for specifying such traversals, followed by a runtime scheduler that maps traversals to various SIMD units. Second, we observe that good data locality is crucial to sustained performance from SIMD architectures, whereas many applications that operate on irregular data structures (e.g., trees and graphs) have poor data locality. To address this challenge, we develop a set of data layout optimizations that improve spatial locality for applications that traverse many irregular data structures. Unlike prior data layout optimizations, our approach incorporates a notion of both interthread and intrathread spatial reuse into data layout. 
Finally, we enable performance portability (i.e., the ability to automatically optimize applications for different architectures) by accurately modeling the impact of inter- and intrathread locality on program performance. As a consequence, our model can predict which data layout optimization to use on a wide variety of SIMD architectures. To demonstrate the efficacy of our approach and optimizations, we first show how they enable up to a 12X speedup on one SIMD architecture for a set of real-world applications. To demonstrate that our approach enables performance portability, we show how our model predicts the optimal layout for applications across a diverse set of three real-world SIMD architectures, which offers as much as 45\% speedup over a suboptimal solution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Qi:2014:VVG, author = "Zhengwei Qi and Jianguo Yao and Chao Zhang and Miao Yu and Zhizhou Yang and Haibing Guan", title = "{VGRIS}: Virtualized {GPU} Resource Isolation and Scheduling in Cloud Gaming", journal = j-TACO, volume = "11", number = "2", pages = "17:1--17:25", month = jul, year = "2014", DOI = "https://doi.org/10.1145/2632216", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:16:31 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To achieve efficient resource management on a graphics processing unit (GPU), there is a demand to develop a framework for scheduling virtualized resources in cloud gaming. In this article, we propose VGRIS, a resource management framework for virtualized GPU resource isolation and scheduling in cloud gaming. 
A set of application programming interfaces (APIs) is provided so that a variety of scheduling algorithms can be implemented within the framework without modifying the framework itself. Three scheduling algorithms are implemented by the APIs within VGRIS. Experimental results show that VGRIS can effectively schedule GPU resources among various workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shen:2014:RSB, author = "Bor-Yeh Shen and Wei-Chung Hsu and Wuu Yang", title = "A Retargetable Static Binary Translator for the {ARM} Architecture", journal = j-TACO, volume = "11", number = "2", pages = "18:1--18:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629335", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 30 19:02:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Machines designed with new but incompatible Instruction Set Architecture (ISA) may lack proper applications. Binary translation can address this incompatibility by migrating applications from one legacy ISA to a new one, although binary translation has problems such as code discovery for variable-length ISA and code location issues for handling indirect branches. Dynamic Binary Translation (DBT) has been widely adopted for migrating applications since it avoids those problems. Static Binary Translation (SBT) is a less general solution and has not been actively researched. However, SBT performs more aggressive optimizations, which could yield more compact code and better code quality. Applications translated by SBT can consume less memory, processor cycles, and power than DBT and can be started more quickly. These advantages are even more critical for embedded systems than for general systems. 
In this article, we designed and implemented a new SBT tool, called LLBT, which translates ARM instructions into LLVM IRs and then retargets the LLVM IRs to various ISAs, including x86, x86-64, ARM, and MIPS. LLBT leverages two important functionalities from LLVM: comprehensive optimizations and retargetability. More importantly, LLBT solves the code discovery problem for ARM/Thumb binaries without resorting to interpretation. LLBT also effectively reduced the size of the address mapping table, making SBT a viable solution for embedded systems. Our experiments based on the EEMBC benchmark suite show that the LLBT-generated code can run more than $ 6 \times $ and $ 2.3 \times $ faster on average than emulation with QEMU and HQEMU, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gracia:2014:RLN, author = "Dar{\'\i}o Su{\'a}rez Gracia and Alexandra Ferrer{\'o}n and Luis Montesano {Del Campo} and Teresa Monreal Arnal and V{\'\i}ctor Vi{\~n}als Y{\'u}fera", title = "Revisiting {LP--NUCA} Energy Consumption: Cache Access Policies and Adaptive Block Dropping", journal = j-TACO, volume = "11", number = "2", pages = "19:1--19:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2632217", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 30 19:02:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Cache working-set adaptation is key as embedded systems move to multiprocessor and Simultaneous Multithreaded Architectures (SMT) because interthread pollution harms system performance and battery life. Light-Power NUCA (LP-NUCA) is a working-set adaptive cache that depends on temporal-locality to save energy. 
This work identifies the sources of energy waste in LP-NUCAs: parallel access to the tag and data arrays of the tiles and low locality phases with useless block migration. To counteract both issues, we prove that switching to serial access reduces energy without harming performance and propose a machine learning Adaptive Drop Rate (ADR) controller that minimizes the amount of replacement and migration when locality is low. This work demonstrates that these techniques efficiently adapt the cache drop and access policies to save energy. They reduce LP-NUCA consumption 22.7\% for 1SMT. With interthread cache contention in 2SMT, the savings rise to 29\%. Versus a conventional organization, energy--delay improves 20.8\% and 25\% for 1- and 2SMT benchmarks, and, in 65\% of the 2SMT mixes, gains are larger than 20\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liang:2014:DCC, author = "Zhibin Liang and Wei Zhang and Yung-Cheng Ma", title = "Deadline-Constrained Clustered Scheduling for {VLIW} Architectures using Power-Gated Register Files", journal = j-TACO, volume = "11", number = "2", pages = "20:1--20:26", month = jul, year = "2014", DOI = "https://doi.org/10.1145/2632218", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:18:32 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Designing energy-efficient Digital Signal Processor (DSP) cores has become a key concern in embedded systems development. This paper proposes an energy-proportional computing scheme for Very Long Instruction Word (VLIW) architectures. To make the processor power scales with adapted parallelism, we propose incorporating distributed Power-Gated Register Files (PGRF) into VLIW to achieve a PGRF-VLIW architecture. 
For energy efficiency, we also propose an instruction scheduling algorithm called the Deadline-Constrained Clustered Scheduling (DCCS) algorithm. The algorithm clusters the data dependence graph to reduce data transfer energy and makes optimal use of low-powered local registers for tree-structured data dependence graphs. The results of evaluations conducted using the MiBench and DSPstone benchmark suites substantiate the expected power saving and scaling effects.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fang:2014:PPA, author = "Shuangde Fang and Zidong Du and Yuntan Fang and Yuanjie Huang and Yang Chen and Lieven Eeckhout and Olivier Temam and Huawei Li and Yunji Chen and Chengyong Wu", title = "Performance Portability Across Heterogeneous {SoCs} Using a Generalized Library-Based Approach", journal = j-TACO, volume = "11", number = "2", pages = "21:1--21:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2608253", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 30 19:02:49 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Because of tight power and energy constraints, industry is progressively shifting toward heterogeneous system-on-chip (SoC) architectures composed of a mix of general-purpose cores along with a number of accelerators. However, such SoC architectures can be very challenging to efficiently program for the vast majority of programmers, due to numerous programming approaches and languages. Libraries, on the other hand, provide a simple way to let programmers take advantage of complex architectures, which does not require programmers to acquire new accelerator-specific or domain-specific languages. 
Increasingly, library-based, also called algorithm-centric, programming approaches propose to generalize the usage of libraries and to compose programs around these libraries, instead of using libraries as mere complements. In this article, we present a software framework for achieving performance portability by leveraging a generalized library-based approach. Inspired by the notion of a component, as employed in software engineering and HW/SW codesign, we advocate nonexpert programmers to write simple wrapper code around existing libraries to provide simple but necessary semantic information to the runtime. To achieve performance portability, the runtime employs machine learning (simulated annealing) to select the most appropriate accelerator and its parameters for a given algorithm. This selection factors in the possibly complex composition of algorithms used in the application, the communication among the various accelerators, and the tradeoff between different objectives (i.e., accuracy, performance, and energy). Using a set of benchmarks run on a real heterogeneous SoC composed of a multicore processor and a GPU, we show that the runtime overhead is fairly small at 5.1\% for the GPU and 6.4\% for the multi-core. We then apply our accelerator selection approach to a simulated SoC platform containing multiple inexact accelerators. We show that accelerator selection together with hardware parameter tuning achieves an average 46.2\% energy reduction and a speedup of 2.1$ \times $ while meeting the desired application error target.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kaitoua:2014:HED, author = "Abdulrahman Kaitoua and Hazem Hajj and Mazen A. R. 
Saghir and Hassan Artail and Haitham Akkary and Mariette Awad and Mageda Sharafeddine and Khaleel Mershad", title = "{Hadoop} Extensions for Distributed Computing on Reconfigurable Active {SSD} Clusters", journal = j-TACO, volume = "11", number = "2", pages = "22:1--22:??", month = jun, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2608199", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:18 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we propose new extensions to Hadoop to enable clusters of reconfigurable active solid-state drives (RASSDs) to process streaming data from SSDs using FPGAs. We also develop an analytical model to estimate the performance of RASSD clusters running under Hadoop. Using the Hadoop RASSD platform and network simulators, we validate our design and demonstrate its impact on performance for different workloads taken from Stanford's Phoenix MapReduce project. Our results show that for a hardware acceleration factor of 20$ \times $, compute-intensive workloads processing 153MB of data can run up to 11$ \times $ faster than a standard Hadoop cluster.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2014:PSR, author = "Jue Wang and Xiangyu Dong and Yuan Xie", title = "Preventing {STT-RAM} Last-Level Caches from Port Obstruction", journal = j-TACO, volume = "11", number = "3", pages = "23:1--23:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2633046", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many new nonvolatile memory (NVM) technologies have been heavily studied to replace the power-hungry SRAM/DRAM-based memory hierarchy in today's computers. Among various emerging NVM technologies, Spin-Transfer Torque RAM (STT-RAM) has many benefits, such as fast read latency, low leakage power, and high density, making it a promising candidate for last-level caches (LLCs). However, STT-RAM write operation is expensive. In particular, a long STT-RAM cache write operation might obstruct other cache accesses and result in severe performance degradation. Consequently, how to mitigate STT-RAM write overhead is critical to the success of STT-RAM adoption. In this article, we propose an obstruction-aware cache management policy called OAP. OAP monitors cache traffic, detects LLC-obstructive processes, and differentiates the cache accesses from different processes. Our experiment on a four-core architecture with an 8MB STT-RAM L3 cache shows a 14\% performance improvement and 64\% energy reduction.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gonzalez-Mesa:2014:ETM, author = "M. A. Gonzalez-Mesa and Eladio Gutierrez and Emilio L. 
Zapata and Oscar Plata", title = "Effective Transactional Memory Execution Management for Improved Concurrency", journal = j-TACO, volume = "11", number = "3", pages = "24:1--24:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2633048", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article describes a transactional memory execution model intended to exploit maximum parallelism from sequential and multithreaded programs. A program code section is partitioned into chunks that will be mapped onto threads and executed transactionally. These transactions run concurrently and out of order, trying to exploit maximum parallelism but managed by a specific fully distributed commit control to meet data dependencies. To accomplish correct parallel execution, a partial precedence order relation is derived from the program code section and/or defined by the programmer. When a conflict between chunks is eagerly detected, the precedence order relation is used to determine the best policy to solve the conflict that preserves the precedence order while maximizing concurrency. The model defines a new transactional state called executed but not committed. This state allows exploiting concurrency on two levels: intrathread and interthread. Intrathread concurrency is improved by having pending uncommitted transactions while executing a new one in the same thread. The new state improves interthread concurrency because it permits out-of-order transaction commits regarding the precedence order. Our model has been implemented in a lightweight software transactional memory system, TinySTM, and has been evaluated on a set of benchmarks obtaining an important performance improvement over the baseline TM system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kumar:2014:EPG, author = "Rakesh Kumar and Alejandro Mart{\'\i}nez and Antonio Gonz{\'a}lez", title = "Efficient Power Gating of {SIMD} Accelerators Through Dynamic Selective Devectorization in an {HW\slash SW} Codesigned Environment", journal = j-TACO, volume = "11", number = "3", pages = "25:1--25:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629681", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Leakage energy is a growing concern in current and future microprocessors. Functional units of microprocessors are responsible for a major fraction of this energy. Therefore, reducing functional unit leakage has received much attention in recent years. Power gating is one of the most widely used techniques to minimize leakage energy. Power gating turns off the functional units during the idle periods to reduce the leakage. Therefore, the amount of leakage energy savings is directly proportional to the idle time duration. This article focuses on increasing the idle interval for the higher SIMD lanes. The applications are profiled dynamically, in a hardware/software codesigned environment, to find the higher SIMD lanes' usage pattern. If the higher lanes need to be turned on for small time periods, the corresponding portion of the code is devectorized to keep the higher lanes off. The devectorized code is executed on the lowest SIMD lane. Our experimental results show that the average energy savings of the proposed mechanism are 15\%, 12\%, and 71\% greater than power gating for SPECFP2006, Physicsbench, and Eigen benchmark suites, respectively. 
Moreover, the slowdown caused by devectorization is negligible.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Carlo:2014:FAA, author = "Stefano {Di Carlo} and Salvatore Galfano and Marco Indaco and Paolo Prinetto and Davide Bertozzi and Piero Olivo and Cristian Zambelli", title = "{FLARES}: an Aging Aware Algorithm to Autonomously Adapt the Error Correction Capability in {NAND} Flash Memories", journal = j-TACO, volume = "11", number = "3", pages = "26:1--26:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2631919", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the advent of solid-state storage systems, NAND flash memories are becoming a key storage technology. However, they suffer from serious reliability and endurance issues during the operating lifetime that can be handled by the use of appropriate error correction codes (ECCs) in order to reconstruct the information when needed. Adaptable ECCs may provide the flexibility to avoid worst-case reliability design, thus leading to improved performance. However, a way to control such adaptable ECCs' strength is required. This article proposes FLARES, an algorithm able to adapt the ECC correction capability of each page of a flash based on a flash RBER prediction model and on a measurement of the number of errors detected in a given time window. FLARES has been fully implemented within the YAFFS 2 filesystem under the Linux operating system. 
This allowed us to perform an extensive set of simulations on a set of standard benchmarks that highlighted the benefit of FLARES on the overall storage subsystem performances.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bartolini:2014:AFG, author = "Davide B. Bartolini and Filippo Sironi and Donatella Sciuto and Marco D. Santambrogio", title = "Automated Fine-Grained {CPU} Provisioning for Virtual Machines", journal = j-TACO, volume = "11", number = "3", pages = "27:1--27:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2637480", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Ideally, the pay-as-you-go model of Infrastructure as a Service (IaaS) clouds should enable users to rent just enough resources (e.g., CPU or memory bandwidth) to fulfill their service level objectives (SLOs). Achieving this goal is hard on current IaaS offers, which require users to explicitly specify the amount of resources to reserve; this requirement is nontrivial for users, because estimating the amount of resources needed to attain application-level SLOs is often complex, especially when resources are virtualized and the service provider colocates virtual machines (VMs) on host nodes. For this reason, users who deploy VMs subject to SLOs are usually prone to overprovisioning resources, thus resulting in inflated business costs. This article tackles this issue with AutoPro: a runtime system that enhances IaaS clouds with automated and fine-grained resource provisioning based on performance SLOs. 
Our main contribution with AutoPro is filling the gap between application-level performance SLOs and allocation of a contended resource, without requiring explicit reservations from users. In this article, we focus on CPU bandwidth allocation to throughput-driven, compute-intensive multithreaded applications colocated on a multicore processor; we show that a theoretically sound, yet simple, control strategy can enable automated fine-grained allocation of this contended resource, without the need for offline profiling. Additionally, AutoPro helps service providers optimize infrastructure utilization by provisioning idle resources to best-effort workloads, so as to maximize node-level utilization. Our extensive experimental evaluation confirms that AutoPro is able to automatically determine and enforce allocations to meet performance SLOs while maximizing node-level utilization by supporting batch workloads on a best-effort basis.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Carlson:2014:EHL, author = "Trevor E. Carlson and Wim Heirman and Stijn Eyerman and Ibrahim Hur and Lieven Eeckhout", title = "An Evaluation of High-Level Mechanistic Core Models", journal = j-TACO, volume = "11", number = "3", pages = "28:1--28:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2629677", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Large core counts and complex cache hierarchies are increasing the burden placed on commonly used simulation and modeling techniques. Although analytical models provide fast results, they do not apply to complex, many-core shared-memory systems. 
In contrast, detailed cycle-level simulation can be accurate but also tends to be slow, which limits the number of configurations that can be evaluated. A middle ground is needed that provides for fast simulation of complex many-core processors while still providing accurate results. In this article, we explore, analyze, and compare the accuracy and simulation speed of high-abstraction core models as a potential solution to slow cycle-level simulation. We describe a number of enhancements to interval simulation to improve its accuracy while maintaining simulation speed. In addition, we introduce the instruction-window centric (IW-centric) core model, a new mechanistic core model that bridges the gap between interval simulation and cycle-accurate simulation by enabling high-speed simulations with higher levels of detail. We also show that using accurate core models like these are important for memory subsystem studies, and that simple, naive models, like a one-IPC core model, can lead to misleading and incorrect results and conclusions in practical design studies. Validation against real hardware shows good accuracy, with an average single-core error of 11.1\% and a maximum of 18.8\% for the IW-centric model with a 1.5$ \times $ slowdown compared to interval simulation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hijaz:2014:NLN, author = "Farrukh Hijaz and Omer Khan", title = "{NUCA-L1}: a Non-Uniform Access Latency Level-1 Cache Architecture for Multicores Operating at Near-Threshold Voltages", journal = j-TACO, volume = "11", number = "3", pages = "29:1--29:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2631918", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Research has shown that operating in the near-threshold region is expected to provide up to 10$ \times $ energy efficiency for future processors. However, reliable operation below a minimum voltage (Vccmin) cannot be guaranteed due to process variations. Because SRAM margins can easily be violated at near-threshold voltages, their bit-cell failure rates are expected to rise steeply. Multicore processors rely on fast private L1 caches to exploit data locality and achieve high performance. In the presence of high bit-cell fault rates, traditionally an L1 cache either sacrifices capacity or incurs additional latency to correct the faults. We observe that L1 cache sensitivity to hit latency offers a design trade-off between capacity and latency. When fault rate is high at extreme Vccmin, it is beneficial to recover L1 cache capacity, even if it comes at the cost of additional latency. However, at low fault rates, the additional constant latency to recover cache capacity degrades performance. With this trade-off in mind, we propose a Non-Uniform Cache Access L1 architecture (NUCA-L1) that avoids additional latency on accesses to fault-free cache lines. To mitigate the capacity bottleneck, it deploys a correction mechanism to recover capacity at the cost of additional latency. 
Using extensive simulations of a 64-core multicore, we demonstrate that at various bit-cell fault rates, our proposed private NUCA-L1 cache architecture performs better than state-of-the-art schemes, along with a significant reduction in energy consumption.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Drebes:2014:TAD, author = "Andi Drebes and Karine Heydemann and Nathalie Drach and Antoniu Pop and Albert Cohen", title = "Topology-Aware and Dependence-Aware Scheduling and Memory Allocation for Task-Parallel Languages", journal = j-TACO, volume = "11", number = "3", pages = "30:1--30:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2641764", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We present a joint scheduling and memory allocation algorithm for efficient execution of task-parallel programs on non-uniform memory architecture (NUMA) systems. Task and data placement decisions are based on a static description of the memory hierarchy and on runtime information about intertask communication. Existing locality-aware scheduling strategies for fine-grained tasks have strong limitations: they are specific to some class of machines or applications, they do not handle task dependences, they require manual program annotations, or they rely on fragile profiling schemes. By contrast, our solution makes no assumption on the structure of programs or on the layout of data in memory. 
Experimental results, based on the OpenStream language, show that locality of accesses to main memory of scientific applications can be increased significantly on a 64-core machine, resulting in a speedup of up to 1.63$ \times $ compared to a state-of-the-art work-stealing scheduler.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tawa:2014:EEF, author = "Venkata Kalyan Tawa and Ravi Kasha and Madhu Mutyam", title = "{EFGR}: an Enhanced Fine Granularity Refresh Feature for High-Performance {DDR4 DRAM} Devices", journal = j-TACO, volume = "11", number = "3", pages = "31:1--31:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2656340", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "High-density DRAM devices spend significant time refreshing the DRAM cells, leading to performance drop. The JEDEC DDR4 standard provides a Fine Granularity Refresh (FGR) feature to tackle refresh. Motivated by the observation that in FGR mode, only a few banks are involved, we propose an Enhanced FGR (EFGR) feature that introduces three optimizations to the basic FGR feature and exposes the bank-level parallelism within the rank even during the refresh. The first optimization decouples the nonrefreshing banks. The second and third optimizations determine the maximum number of nonrefreshing banks that can be active during refresh and selectively precharge the banks before refresh, respectively. Our simulation results show that the EFGR feature is able to recover almost 56.6\% of the performance loss incurred due to refresh operations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yalcin:2014:EEC, author = "Gulay Yalcin and Oguz Ergin and Emrah Islek and Osman Sabri Unsal and Adrian Cristal", title = "Exploiting Existing Comparators for Fine-Grained Low-Cost Error Detection", journal = j-TACO, volume = "11", number = "3", pages = "32:1--32:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2656341", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Fault tolerance has become a fundamental concern in computer design, in addition to performance and power. Although several error detection schemes have been proposed to discover a faulty core in the system, these proposals could waste the whole core, including many error-free structures in it after error detection. Moreover, many fault-tolerant designs require additional hardware for data replication or for comparing the replicated data. In this study, we provide a low-cost, fine-grained error detection scheme by exploiting already existing comparators and data replications in the several pipeline stages such as issue queue, rename logic, and translation lookaside buffer. We reduce the vulnerability of the source register tags in IQ by 60\%, the vulnerability of instruction TLB by 64\%, the vulnerability of data TLB by 45\%, and the vulnerability of the register tags of rename logic by 20\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ramachandran:2014:HFR, author = "Pradeep Ramachandran and Siva Kumar Sastry Hari and Manlap Li and Sarita V. 
Adve", title = "Hardware Fault Recovery for {I/O} Intensive Applications", journal = j-TACO, volume = "11", number = "3", pages = "33:1--33:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2656342", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With continued process scaling, the rate of hardware failures in commodity systems is increasing. Because these commodity systems are highly sensitive to cost, traditional solutions that employ heavy redundancy to handle such failures are no longer acceptable owing to their high associated costs. Detecting such faults by identifying anomalous software execution and recovering through checkpoint-and-replay is emerging as a viable low-cost alternative for future commodity systems. An important but commonly ignored aspect of such solutions is ensuring that external outputs to the system are fault-free. The outputs must be delayed until the detectors guarantee this, influencing fault-free performance. The overheads for resiliency must thus be evaluated while taking these delays into consideration; prior work has largely ignored this relationship. This article concerns recovery for I/O intensive applications from in-core faults. We present a strategy to buffer external outputs using dedicated hardware and show that checkpoint intervals previously considered as acceptable incur exorbitant overheads when hardware buffering is considered. We then present two techniques to reduce the checkpoint interval and demonstrate a practical solution that provides high resiliency while incurring low overheads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Eyerman:2014:MTM, author = "Stijn Eyerman and Pierre Michaud and Wouter Rogiest", title = "Multiprogram Throughput Metrics: a Systematic Approach", journal = j-TACO, volume = "11", number = "3", pages = "34:1--34:??", month = oct, year = "2014", CODEN = "????", DOI = "https://doi.org/10.1145/2663346", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 27 17:02:20 MDT 2014", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Running multiple programs on a processor aims at increasing the throughput of that processor. However, defining meaningful throughput metrics in a simulation environment is not as straightforward as reporting execution time. This has led to an ongoing debate on what forms a meaningful throughput metric for multiprogram workloads. We present a method to construct throughput metrics in a systematic way: we start by expressing assumptions on job size, job distribution, scheduling, and so forth that together define a theoretical throughput experiment. The throughput metric is then the average throughput of this experiment. Different assumptions lead to different metrics, so one should be aware of these assumptions when making conclusions based on results using a specific metric. Throughput metrics should always be defined from explicit assumptions, because this leads to a better understanding of the implications and limits of the results obtained with that metric. We elaborate multiple metrics based on different assumptions. In particular, we identify the assumptions that lead to the commonly used weighted speedup and harmonic mean of speedups. Our study clarifies that they are actual throughput metrics, which was recently questioned. 
We also propose some new throughput metrics, which cannot always be expressed as a closed formula. We use real experimental data to characterize metrics and show how they relate to each other.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nugteren:2015:BAS, author = "Cedric Nugteren and Henk Corporaal", title = "{Bones}: an Automatic Skeleton-Based {C-to-CUDA} Compiler for {GPUs}", journal = j-TACO, volume = "11", number = "4", pages = "35:1--35:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2665079", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The shift toward parallel processor architectures has made programming and code generation increasingly challenging. To address this programmability challenge, this article presents a technique to fully automatically generate efficient and readable code for parallel processors (with a focus on GPUs). This is made possible by combining algorithmic skeletons, traditional compilation, and ``algorithmic species,'' a classification of program code. Compilation starts by automatically annotating C code with class information (the algorithmic species). This code is then fed into the skeleton-based source-to-source compiler bones to generate CUDA code. To generate efficient code, bones also performs optimizations including host-accelerator transfer optimization and kernel fusion. This results in a unique approach, integrating a skeleton-based compiler for the first time into an automated flow. 
The benefits are demonstrated experimentally for PolyBench GPU kernels, showing geometric mean speed-ups of 1.4$ \times $ and 2.4$ \times $ compared to ppcg and Par4All, and for five Rodinia GPU benchmarks, showing a gap of only 1.2$ \times $ compared to hand-optimized code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2015:BOM, author = "Jue Wang and Xiangyu Dong and Yuan Xie", title = "Building and Optimizing {MRAM}-Based Commodity Memories", journal = j-TACO, volume = "11", number = "4", pages = "36:1--36:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2667105", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Emerging non-volatile memory technologies such as MRAM are promising design solutions for energy-efficient memory architecture, especially for mobile systems. However, building commodity MRAM by reusing DRAM designs is not straightforward. The existing memory interfaces are incompatible with MRAM small page size, and they fail to leverage MRAM unique properties, causing unnecessary performance and energy overhead. In this article, we propose four techniques to enable and optimize an LPDDRx-compatible MRAM solution: ComboAS to solve the pin incompatibility; DynLat to avoid unnecessary access latencies; and EarlyPA and BufW to further improve performance by exploiting the MRAM unique features of non-destructive read and independent write path. Combining all these techniques together, we boost the MRAM performance by 17\% and provide a DRAM-compatible MRAM solution consuming 21\% less energy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Komuravelli:2015:RCH, author = "Rakesh Komuravelli and Sarita V. Adve and Ching-Tsun Chou", title = "Revisiting the Complexity of Hardware Cache Coherence and Some Implications", journal = j-TACO, volume = "11", number = "4", pages = "37:1--37:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2663345", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Cache coherence is an integral part of shared-memory systems but is also widely considered to be one of the most complex parts of such systems. Much prior work has addressed this complexity and the verification techniques to prove the correctness of hardware coherence. Given the new multicore era with increasing number of cores, there is a renewed debate about whether the complexity of hardware coherence has been tamed or whether it should be abandoned in favor of software coherence. This article revisits the complexity of hardware cache coherence by verifying a publicly available, state-of-the-art implementation of the widely used MESI protocol, using the Mur$ \varphi $ model checking tool. To our surprise, we found six bugs in this protocol, most of which were hard to analyze and took several days to fix. To compare the complexity, we also verified the recently proposed DeNovo protocol, which exploits disciplined software programming models. We found three relatively easy to fix bugs in this less mature protocol. After fixing these bugs, our verification experiments showed that, compared to DeNovo, MESI had 15X more reachable states leading to a 20X increase in verification (model checking) time. 
Although we were eventually successful in verifying the protocols, the tool required making several simplifying assumptions (e.g., two cores, one address). Our results have several implications: (1) they indicate that hardware coherence protocols remain complex; (2) they reinforce the need for protocol designers to embrace formal verification tools to demonstrate correctness of new protocols and extensions; (3) they reinforce the need for formal verification tools that are both scalable and usable by non-experts; and (4) they show that a system based on hardware-software co-design can offer a simpler approach for cache coherence, thus reducing the overall verification effort and allowing verification of more detailed models and protocol extensions that are otherwise limited by computing resources.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rodriguez:2015:VSR, author = "Gabriel Rodr{\'\i}guez and Juan Touri{\~n}o and Mahmut T. Kandemir", title = "Volatile {STT--RAM} Scratchpad Design and Data Allocation for Low Energy", journal = j-TACO, volume = "11", number = "4", pages = "38:1--38:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2669556", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "On-chip power consumption is one of the fundamental challenges of current technology scaling. Cache memories consume a sizable part of this power, particularly due to leakage energy. STT-RAM is one of several new memory technologies that have been proposed in order to improve power while preserving performance. It features high density and low leakage, but at the expense of write energy and performance. 
This article explores the use of STT-RAM--based scratchpad memories that trade nonvolatility in exchange for faster and less energetically expensive accesses, making them feasible for on-chip implementation in embedded systems. A novel multiretention scratchpad partitioning is proposed, featuring multiple storage spaces with different retention, energy, and performance characteristics. A customized compiler-based allocation algorithm suitable for use with such a scratchpad organization is described. Our experiments indicate that a multiretention STT-RAM scratchpad can provide energy savings of 53\% with respect to an iso-area, hardware-managed SRAM cache.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Camarero:2015:TCH, author = "Crist{\'o}bal Camarero and Enrique Vallejo and Ram{\'o}n Beivide", title = "Topological Characterization of {Hamming} and Dragonfly Networks and Its Implications on Routing", journal = j-TACO, volume = "11", number = "4", pages = "39:1--39:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2677038", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Current High-Performance Computing (HPC) and data center networks rely on large-radix routers. Hamming graphs (Cartesian products of complete graphs) and dragonflies (two-level direct networks with nodes organized in groups) are some direct topologies proposed for such networks. The original definition of the dragonfly topology is very loose, with several degrees of freedom, such as the inter- and intragroup topology, the specific global connectivity, and the number of parallel links between groups (or trunking level). 
This work provides a comprehensive analysis of the topological properties of the dragonfly network, providing balancing conditions for network dimensioning, as well as introducing and classifying several alternatives for the global connectivity and trunking level. From a topological study of the network, it is noted that a Hamming graph can be seen as a canonical dragonfly topology with a high level of trunking. Based on this observation and by carefully selecting the global connectivity, the Dimension Order Routing (DOR) mechanism safely used in Hamming graphs is adapted to dragonfly networks with trunking. The resulting routing algorithms approximate the performance of minimal, nonminimal, and adaptive routings typically used in dragonflies but without requiring virtual channels to avoid packet deadlock, thus allowing for lower cost router implementations. This is obtained by properly selecting the link to route between groups based on a graph coloring of network routers. Evaluations show that the proposed mechanisms are competitive with traditional solutions when using the same number of virtual channels and enable for simpler implementations with lower cost. Finally, multilevel dragonflies are discussed, considering how the proposed mechanisms could be adapted to them.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yoon:2015:EDM, author = "Hanbin Yoon and Justin Meza and Naveen Muralimanohar and Norman P. 
Jouppi and Onur Mutlu", title = "Efficient Data Mapping and Buffering Techniques for Multilevel Cell Phase-Change Memories", journal = j-TACO, volume = "11", number = "4", pages = "40:1--40:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2669365", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "New phase-change memory (PCM) devices have low-access latencies (like DRAM) and high capacities (i.e., low cost per bit, like Flash). In addition to being able to scale to smaller cell sizes than DRAM, a PCM cell can also store multiple bits per cell (referred to as multilevel cell, or MLC), enabling even greater capacity per bit. However, reading and writing the different bits of data from and to an MLC PCM cell requires different amounts of time: one bit is read or written first, followed by another. Due to this asymmetric access process, the bits in an MLC PCM cell have different access latency and energy depending on which bit in the cell is being read or written. We leverage this observation to design a new way to store and buffer data in MLC PCM devices. While traditional devices couple the bits in each cell next to one another in the address space, our key idea is to logically decouple the bits in each cell into two separate regions depending on their read/write characteristics: fast-read/slow-write bits and slow-read/fast-write bits. We propose a low-overhead hardware/software technique to predict and map data that would benefit from being in each region at runtime. In addition, we show how MLC bit decoupling provides more flexibility in the way data is buffered in the device, enabling more efficient use of existing device buffer space. 
Our evaluations for a multicore system show that MLC bit decoupling improves system performance by 19.2\%, memory energy efficiency by 14.4\%, and thread fairness by 19.3\% over a state-of-the-art MLC PCM system that couples the bits in its cells. We show that our results are consistent across a variety of workloads and system configurations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Premillieu:2015:EOE, author = "Nathanael Pr{\'e}millieu and Andr{\'e} Seznec", title = "Efficient Out-of-Order Execution of Guarded {ISAs}", journal = j-TACO, volume = "11", number = "4", pages = "41:1--41:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2677037", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "ARM ISA-based processors are no longer low-cost, low-power processors. Nowadays, ARM ISA-based processor manufacturers are striving to implement medium-end to high-end processor cores, which implies implementing a state-of-the-art out-of-order execution engine. Unfortunately, providing efficient out-of-order execution on legacy ARM codes may be quite challenging due to guarded instructions. Predicting the guarded instructions addresses the main serialization impact associated with guarded instructions execution and the multiple definition problem. Moreover, guard prediction allows one to use a global branch-and-guard history predictor to predict both branches and guards, often improving branch prediction accuracy. Unfortunately, such a global branch-and-guard history predictor requires the systematic use of guard predictions. In that case, poor guard prediction accuracy would lead to poor overall performance on some applications. 
Building on top of recent advances in branch prediction and confidence estimation, we propose a hybrid branch-and-guard predictor, combining a global branch history component and global branch-and-guard history component. The potential gain or loss due to the systematic use of guard prediction is dynamically evaluated at runtime. Two computing modes are enabled: systematic guard prediction use and high-confidence-only guard prediction use. Our experiments show that on most applications, an overwhelming majority of guarded instructions are predicted. Therefore, a simple but relatively inefficient hardware solution can be used to execute the few unpredicted guarded instructions. Significant performance benefits are observed on most applications, while applications with poorly predictable guards do not suffer from performance loss.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2015:APM, author = "Zheng Wang and Dominik Grewe and Michael F. P. O'Boyle", title = "Automatic and Portable Mapping of Data Parallel Programs to {OpenCL} for {GPU}-Based Heterogeneous Systems", journal = j-TACO, volume = "11", number = "4", pages = "42:1--42:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2677036", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "General-purpose GPU-based systems are highly attractive, as they give potentially massive performance at little cost. Realizing such potential is challenging due to the complexity of programming. This article presents a compiler-based approach to automatically generate optimized OpenCL code from data parallel OpenMP programs for GPUs. 
A key feature of our scheme is that it leverages existing transformations, especially data transformations, to improve performance on GPU architectures and uses automatic machine learning to build a predictive model to determine if it is worthwhile running the OpenCL code on the GPU or OpenMP code on the multicore host. We applied our approach to the entire NAS parallel benchmark suite and evaluated it on distinct GPU-based systems. We achieved average (up to) speedups of $ 4.51 \times $ and $ 4.20 \times $ ($ 143 \times $ and $ 67 \times $) on Core i7/NVIDIA GeForce GTX580 and Core i7/AMD Radeon 7970 platforms, respectively, over a sequential baseline. Our approach achieves, on average, greater than $ 10 \times $ speedups over two state-of-the-art automatic GPU code generators.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{He:2015:IHF, author = "Dan He and Fang Wang and Hong Jiang and Dan Feng and Jing Ning Liu and Wei Tong and Zheng Zhang", title = "Improving Hybrid {FTL} by Fully Exploiting Internal {SSD} Parallelism with Virtual Blocks", journal = j-TACO, volume = "11", number = "4", pages = "43:1--43:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2677160", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compared with either block or page-mapping Flash Translation Layer (FTL), hybrid-mapping FTL for flash Solid State Disks (SSDs), such as Fully Associative Section Translation (FAST), has relatively high space efficiency because of its smaller mapping table than the latter and higher flexibility than the former. As a result, hybrid-mapping FTL has become the most commonly used scheme in SSDs. 
But the hybrid-mapping FTL incurs a large number of costly full-merge operations. Thus, a critical challenge to hybrid-mapping FTL is how to reduce the cost of full-merge operations and improve partial merge operations and switch operations. In this article, we propose a novel FTL scheme, called Virtual Block-based Parallel FAST (VBP-FAST), that divides flash area into Virtual Blocks (VBlocks) and Physical Blocks (PBlocks) where VBlocks are used to fully exploit channel-level, die-level, and plane-level parallelism of flash. Leveraging these three levels of parallelism, the cost of full merge in VBP-FAST is significantly reduced from that of FAST. In the meantime, VBP-FAST uses PBlocks to retain the advantages of partial merge and switch operations. Our extensive trace-driven simulation results show that VBP-FAST speeds up FAST by a factor of 5.3--8.4 for random workloads and of 1.7 for sequential workloads with channel-level, die-level, and plane-level parallelism of 8, 2, and 2 (i.e., eight channels, two dies, and two planes).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rubin:2015:MOM, author = "Eri Rubin and Ely Levy and Amnon Barak and Tal Ben-Nun", title = "{MAPS}: Optimizing Massively Parallel Applications Using Device-Level Memory Abstraction", journal = j-TACO, volume = "11", number = "4", pages = "44:1--44:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2680544", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "GPUs play an increasingly important role in high-performance computing. 
While developing naive code is straightforward, optimizing massively parallel applications requires deep understanding of the underlying architecture. The developer must struggle with complex index calculations and manual memory transfers. This article classifies memory access patterns used in most parallel algorithms, based on Berkeley's Parallel ``Dwarfs.'' It then proposes the MAPS framework, a device-level memory abstraction that facilitates memory access on GPUs, alleviating complex indexing using on-device containers and iterators. This article presents an implementation of MAPS and shows that its performance is comparable to carefully optimized implementations of real-world applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cilardo:2015:IMM, author = "Alessandro Cilardo and Luca Gallo", title = "Improving Multibank Memory Access Parallelism with Lattice-Based Partitioning", journal = j-TACO, volume = "11", number = "4", pages = "45:1--45:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2675359", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Emerging architectures, such as reconfigurable hardware platforms, provide the unprecedented opportunity of customizing the memory infrastructure based on application access patterns. This work addresses the problem of automated memory partitioning for such architectures, taking into account potentially parallel data accesses to physically independent banks. Targeted at affine static control parts (SCoPs), the technique relies on the Z-polyhedral model for program analysis and adopts a partitioning scheme based on integer lattices. 
The approach enables the definition of a solution space including previous works as particular cases. The problem of minimizing the total amount of memory required across the partitioned banks, referred to as storage minimization throughout the article, is tackled by an optimal approach yielding asymptotically zero memory waste or, as an alternative, an efficient approach ensuring arbitrarily small waste. The article also presents a prototype toolchain and a detailed step-by-step case study demonstrating the impact of the proposed technique along with extensive comparisons with alternative approaches in the literature.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Martinsen:2015:EPT, author = "Jan Kasper Martinsen and H{\aa}kan Grahn and Anders Isberg", title = "The Effects of Parameter Tuning in Software Thread-Level Speculation in {JavaScript} Engines", journal = j-TACO, volume = "11", number = "4", pages = "46:1--46:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686036", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "JavaScript is a sequential programming language that has a large potential for parallel execution in Web applications. Thread-level speculation can take advantage of this, but it has a large memory overhead. In this article, we evaluate the effects of adjusting various parameters for thread-level speculation. 
Our results clearly show that thread-level speculation is a useful technique for taking advantage of multicore architectures for JavaScript in Web applications, that nested speculation is required in thread-level speculation, and that the execution characteristics of Web applications significantly reduce the needed memory, the number of threads, and the depth of our speculation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Colombet:2015:SOS, author = "Quentin Colombet and Florian Brandner and Alain Darte", title = "Studying Optimal Spilling in the Light of {SSA}", journal = j-TACO, volume = "11", number = "4", pages = "47:1--47:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2685392", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent developments in register allocation, mostly linked to static single assignment (SSA) form, have shown the benefits of decoupling the problem in two phases: a first spilling phase places load and store instructions so that the register pressure at all program points is small enough, and a second assignment and coalescing phase maps the variables to physical registers and reduces the number of move instructions among registers. This article focuses on the first phase, for which many open questions remain: in particular, we study the notion of optimal spilling (what can be expressed?) and the impact of SSA form (does it help?). To identify the important features for optimal spilling on load-store architectures, we develop a new integer linear programming formulation, more accurate and expressive than previous approaches. 
Among other features, we can express SSA $ \phi $-functions, memory-to-memory copies, and the fact that a value can be stored simultaneously in a register and in memory. Based on this formulation, we present a thorough analysis of the results obtained for the SPECINT 2000 and EEMBC 1.1 benchmarks, from which we draw, among others, the following conclusions: (1) rematerialization is extremely important; (2) SSA complicates the formulation of optimal spilling, especially because of memory coalescing when the code is not in conventional SSA (CSSA); (3) microarchitectural features are significant and thus have to be accounted for; and (4) significant savings can be obtained in terms of static spill costs, cache miss rates, and dynamic instruction counts.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Haj-Yihia:2015:CDP, author = "Jawad Haj-Yihia and Yosi {Ben Asher} and Efraim Rotem and Ahmad Yasin and Ran Ginosar", title = "Compiler-Directed Power Management for Superscalars", journal = j-TACO, volume = "11", number = "4", pages = "48:1--48:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2685393", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern superscalar CPUs contain large complex structures and diverse execution units, consuming wide dynamic power range. Building a power delivery network for the worst-case power consumption is not energy efficient and often is impossible to fit in small systems. Instantaneous power excursions can cause voltage droops. Power management algorithms are too slow to respond to instantaneous events. 
In this article, we propose a novel compiler-directed framework to address this problem. The framework is validated on a 4th Generation Intel\reg{} Core\TM{} processor and with simulator on output trace. Up to 16\% performance speedup is measured over baseline for the SPEC CPU2006 benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Trinh:2015:EDE, author = "Hong-Phuc Trinh and Marc Duranton and Michel Paindavoine", title = "Efficient Data Encoding for Convolutional Neural Network application", journal = j-TACO, volume = "11", number = "4", pages = "49:1--49:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2685394", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article presents an approximate data encoding scheme called Significant Position Encoding (SPE). The encoding allows efficient implementation of the recall phase (forward propagation pass) of Convolutional Neural Networks (CNN)---a typical Feed-Forward Neural Network. This implementation uses only 7 bits data representation and achieves almost the same classification performance compared with the initial network: on MNIST handwriting recognition task, using this data encoding scheme losses only 0.03\% in terms of recognition rate (99.27\% vs. 99.3\%). In terms of storage, we achieve a 12.5\% gain compared with an 8 bits fixed-point implementation of the same CNN. Moreover, this data encoding allows efficient implementation of processing unit thanks to the simplicity of scalar product operation---the principal operation in a Feed-Forward Neural Network.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Breugh:2015:MAM, author = "Maximilien B. Breugh and Stijn Eyerman and Lieven Eeckhout", title = "Mechanistic Analytical Modeling of Superscalar In-Order Processor Performance", journal = j-TACO, volume = "11", number = "4", pages = "50:1--50:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2678277", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Superscalar in-order processors form an interesting alternative to out-of-order processors because of their energy efficiency and lower design complexity. However, despite the reduced design complexity, it is nontrivial to get performance estimates or insight in the application--microarchitecture interaction without running slow, detailed cycle-level simulations, because performance highly depends on the order of instructions within the application's dynamic instruction stream, as in-order processors stall on interinstruction dependences and functional unit contention. To limit the number of detailed cycle-level simulations needed during design space exploration, we propose a mechanistic analytical performance model that is built from understanding the internal mechanisms of the processor. The mechanistic performance model for superscalar in-order processors is shown to be accurate with an average performance prediction error of 3.2\% compared to detailed cycle-accurate simulation using gem5. We also validate the model against hardware, using the ARM Cortex-A8 processor and show that it is accurate within 10\% on average. 
We further demonstrate the usefulness of the model through three case studies: (1) design space exploration, identifying the optimum number of functional units for achieving a given performance target; (2) program--machine interactions, providing insight into microarchitecture bottlenecks; and (3) compiler--architecture interactions, visualizing the impact of compiler optimizations on performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Seshadri:2015:MPC, author = "Vivek Seshadri and Samihan Yedkar and Hongyi Xin and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry", title = "Mitigating Prefetcher-Caused Pollution Using Informed Caching Policies for Prefetched Blocks", journal = j-TACO, volume = "11", number = "4", pages = "51:1--51:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2677956", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many modern high-performance processors prefetch blocks into the on-chip cache. Prefetched blocks can potentially pollute the cache by evicting more useful blocks. In this work, we observe that both accurate and inaccurate prefetches lead to cache pollution, and propose a comprehensive mechanism to mitigate prefetcher-caused cache pollution. First, we observe that over 95\% of useful prefetches in a wide variety of applications are not reused after the first demand hit (in secondary caches). Based on this observation, our first mechanism simply demotes a prefetched block to the lowest priority on a demand hit. 
Second, to address pollution caused by inaccurate prefetches, we propose a self-tuning prefetch accuracy predictor to predict if a prefetch is accurate or inaccurate. Only predicted-accurate prefetches are inserted into the cache with a high priority. Evaluations show that our final mechanism, which combines these two ideas, significantly improves performance compared to both the baseline LRU policy and two state-of-the-art approaches to mitigating prefetcher-caused cache pollution (up to 49\%, and 6\% on average for 157 two-core multiprogrammed workloads). The performance improvement is consistent across a wide variety of system configurations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Matheou:2015:ASD, author = "George Matheou and Paraskevas Evripidou", title = "Architectural Support for Data-Driven Execution", journal = j-TACO, volume = "11", number = "4", pages = "52:1--52:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686874", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The exponential growth of sequential processors has come to an end, and thus, parallel processing is probably the only way to achieve performance growth. We propose the development of parallel architectures based on data-driven scheduling. Data-driven scheduling enforces only a partial ordering as dictated by the true data dependencies, which is the minimum synchronization possible. This is very beneficial for parallel processing because it enables it to exploit the maximum possible parallelism. We provide architectural support for data-driven execution for the Data-Driven Multithreading (DDM) model. 
In the past, DDM has been evaluated mostly in the form of virtual machines. The main contribution of this work is the development of a highly efficient hardware support for data-driven execution and its integration into a multicore system with eight cores on a Virtex-6 FPGA. The DDM semantics make barriers and cache coherence unnecessary, which reduces the synchronization latencies significantly and makes the cache simpler. The performance evaluation has shown that the support for data-driven execution is very efficient with negligible overheads. Our prototype can support very small problem sizes (matrix $ 16 \times 16$) and ultra-lightweight threads (block of $ 4 \times 4$) that achieve speedups close to linear. Such results cannot be achieved by software-based systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Morad:2015:GSP, author = "Amir Morad and Leonid Yavits and Ran Ginosar", title = "{GP--SIMD} Processing-in-Memory", journal = j-TACO, volume = "11", number = "4", pages = "53:1--53:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686875", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "GP-SIMD, a novel hybrid general-purpose SIMD computer architecture, resolves the issue of data synchronization by in-memory computing through combining data storage and massively parallel processing. GP-SIMD employs a two-dimensional access memory with modified SRAM storage cells and a bit-serial processing unit per each memory row. An analytic performance model of the GP-SIMD architecture is presented, comparing it to associative processor and to conventional SIMD architectures. 
Cycle-accurate simulation of four workloads supports the analytical comparison. Assuming a moderate die area, GP-SIMD architecture outperforms both the associative processor and conventional SIMD coprocessor architectures by almost an order of magnitude while consuming less power.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Schaub:2015:ISW, author = "Thomas Schaub and Simon Moll and Ralf Karrenberg and Sebastian Hack", title = "The Impact of the {SIMD} Width on Control-Flow and Memory Divergence", journal = j-TACO, volume = "11", number = "4", pages = "54:1--54:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687355", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Power consumption is a prevalent issue in current and future computing systems. SIMD processors amortize the power consumption of managing the instruction stream by executing the same instruction in parallel on multiple data. Therefore, in the past years, the SIMD width has steadily increased, and it is not unlikely that it will continue to do so. In this article, we experimentally study the influence of the SIMD width to the execution of data-parallel programs. We investigate how an increasing SIMD width (up to 1024) influences control-flow divergence and memory-access divergence, and how well techniques to mitigate them will work on larger SIMD widths. We perform our study on 76 OpenCL applications and show that a group of programs scales well up to SIMD width 1024, whereas another group of programs increasingly suffers from control-flow divergence. For those programs, thread regrouping techniques may become increasingly important for larger SIMD widths. 
We show what average speedups can be expected when increasing the SIMD width. For example, when switching from scalar execution to SIMD width 64, one can expect a speedup of 60.11, which increases to 62.46 when using thread regrouping. We also analyze the frequency of regular (uniform, consecutive) memory access patterns and observe a monotonic decrease of regular memory accesses from 82.6\% at SIMD width 4 to 43.1\% at SIMD width 1024.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fang:2015:MMD, author = "Zhenman Fang and Sanyam Mehta and Pen-Chung Yew and Antonia Zhai and James Greensky and Gautham Beeraka and Binyu Zang", title = "Measuring Microarchitectural Details of Multi- and Many-Core Memory Systems through Microbenchmarking", journal = j-TACO, volume = "11", number = "4", pages = "55:1--55:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687356", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As multicore and many-core architectures evolve, their memory systems are becoming increasingly more complex. To bridge the latency and bandwidth gap between the processor and memory, they often use a mix of multilevel private/shared caches that are either blocking or nonblocking and are connected by high-speed network-on-chip. Moreover, they also incorporate hardware and software prefetching and simultaneous multithreading (SMT) to hide memory latency. On such multi- and many-core systems, to incorporate various memory optimization schemes using compiler optimizations and performance tuning techniques, it is crucial to have microarchitectural details of the target memory system. 
Unfortunately, such details are often unavailable from vendors, especially for newly released processors. In this article, we propose a novel microbenchmarking methodology based on short elapsed-time events (SETEs) to obtain comprehensive memory microarchitectural details in multi- and many-core processors. This approach requires detailed analysis of potential interfering factors that could affect the intended behavior of such memory systems. We lay out effective guidelines to control and mitigate those interfering factors. Taking the impact of SMT into consideration, our proposed methodology not only can measure traditional cache/memory latency and off-chip bandwidth but also can uncover the details of software and hardware prefetching units not attempted in previous studies. Using the newly released Intel Xeon Phi many-core processor (with in-order cores) as an example, we show how we can use a set of microbenchmarks to determine various microarchitectural features of its memory system (many are undocumented from vendors). To demonstrate the portability and validate the correctness of such a methodology, we use the well-documented Intel Sandy Bridge multicore processor (with out-of-order cores) as another example, where most data are available and can be validated. Moreover, to illustrate the usefulness of the measured data, we do a multistage coordinated data prefetching case study on both Xeon Phi and Sandy Bridge and show that by using the measured data, we can achieve 1.3X and 1.08X performance speedup, respectively, compared to the state-of-the-art Intel ICC compiler. We believe that these measurements also provide useful insights into memory optimization, analysis, and modeling of such multicore and many-core architectures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chi:2015:LPH, author = "Chi Ching Chi and Mauricio Alvarez-Mesa and Ben Juurlink", title = "Low-Power High-Efficiency Video Decoding using General-Purpose Processors", journal = j-TACO, volume = "11", number = "4", pages = "56:1--56:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2685551", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we investigate how code optimization techniques and low-power states of general-purpose processors improve the power efficiency of HEVC decoding. The power and performance efficiency of the use of SIMD instructions, multicore architectures, and low-power active and idle states are analyzed in detail for offline video decoding. In addition, the power efficiency of techniques such as ``race to idle'' and ``exploiting slack'' with DVFS are evaluated for real-time video decoding. Results show that ``exploiting slack'' is more power efficient than ``race to idle'' for all evaluated platforms representing smartphone, tablet, laptop, and desktop computing systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luporini:2015:CLO, author = "Fabio Luporini and Ana Lucia Varbanescu and Florian Rathgeber and Gheorghe-Teodor Bercea and J. Ramanujam and David A. Ham and Paul H. J. 
Kelly", title = "Cross-Loop Optimization of Arithmetic Intensity for Finite Element Local Assembly", journal = j-TACO, volume = "11", number = "4", pages = "57:1--57:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687415", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We study and systematically evaluate a class of composable code transformations that improve arithmetic intensity in local assembly operations, which represent a significant fraction of the execution time in finite element methods. Their performance optimization is indeed a challenging issue. Even though affine loop nests are generally present, the short trip counts and the complexity of mathematical expressions, which vary among different problems, make it hard to determine an optimal sequence of successful transformations. Our investigation has resulted in the implementation of a compiler (called COFFEE) for local assembly kernels, fully integrated with a framework for developing finite element methods. The compiler manipulates abstract syntax trees generated from a domain-specific language by introducing domain-aware optimizations for instruction-level parallelism and register locality. Eventually, it produces C code including vector SIMD intrinsics. Experiments using a range of real-world finite element problems of increasing complexity show that significant performance improvement is achieved. The generality of the approach and the applicability of the proposed code transformations to other domains is also discussed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2015:OPS, author = "Xing Zhou and Mar{\'\i}a J. Garzar{\'a}n and David A. 
Padua", title = "Optimal Parallelogram Selection for Hierarchical Tiling", journal = j-TACO, volume = "11", number = "4", pages = "58:1--58:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687414", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Loop tiling is an effective optimization to improve performance of multiply nested loops, which are the most time-consuming parts in many programs. Most massively parallel systems today are organized hierarchically, and different levels of the hierarchy differ in the organization of parallelism and the memory models they adopt. To make better use of these machines, it is clear that loop nests should be tiled hierarchically to fit the hierarchical organization of the machine; however, it is not so clear what should be the exact form of these hierarchical tiles. In particular, tile shape selection is of critical importance to expose parallelism of the tiled loop nests. Although loop tiling is a well-known optimization, not much is known about tile shape selection. In this article, we study tile shape selection when the shapes are any type of parallelograms and introduce a model to relate the tile shape of the hierarchy to the execution time. Using this model, we implement a system that automatically finds the tile shapes that minimize the execution time in a hierarchical system. Our experimental results show that in several cases, the tiles automatically selected by our system outperform the most intuitive tiling schemes usually adopted by programmers because of their simplicity.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Porter:2015:MMS, author = "Leo Porter and Michael A. 
Laurenzano and Ananta Tiwari and Adam Jundt and William A. {Ward, Jr.} and Roy Campbell and Laura Carrington", title = "Making the Most of {SMT} in {HPC}: System- and Application-Level Perspectives", journal = j-TACO, volume = "11", number = "4", pages = "59:1--59:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687651", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This work presents an end-to-end methodology for quantifying the performance and power benefits of simultaneous multithreading (SMT) for HPC centers and applies this methodology to a production system and workload. Ultimately, SMT's value system-wide depends on whether users effectively employ SMT at the application level. However, predicting SMT's benefit for HPC applications is challenging; by doubling the number of threads, the application's characteristics may change. This work proposes statistical modeling techniques to predict the speedup SMT confers to HPC applications. This approach, accurate to within 8\%, uses only lightweight, transparent performance monitors collected during a single run of the application.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tong:2015:OMT, author = "Xin Tong and Toshihiko Koju and Motohiro Kawahito and Andreas Moshovos", title = "Optimizing Memory Translation Emulation in Full System Emulators", journal = j-TACO, volume = "11", number = "4", pages = "60:1--60:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2686034", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The emulation speed of a full system emulator (FSE) determines its usefulness. This work quantitatively measures where time is spent in QEMU [Bellard 2005], an industrial-strength FSE. The analysis finds that memory emulation is one of the most heavily exercised emulator components. For workloads studied, 38.1\% of the emulation time is spent in memory emulation on average, even though QEMU implements a software translation lookaside buffer (STLB) to accelerate dynamic address translation. Despite the amount of time spent in memory emulation, there has been no study on how to further improve its speed. This work analyzes where time is spent in memory emulation and studies the performance impact of a number of STLB optimizations. Although there are several performance optimization techniques for hardware TLBs, this work finds that the trade-offs with an STLB are quite different compared to those with hardware TLBs. As a result, not all hardware TLB performance optimization techniques are applicable to STLBs and vice versa. The evaluated STLB optimizations target STLB lookups, as well as refills, and result in an average emulator performance improvement of 24.4\% over the baseline.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kong:2015:CRF, author = "Martin Kong and Antoniu Pop and Louis-No{\"e}l Pouchet and R. Govindarajan and Albert Cohen and P. Sadayappan", title = "Compiler\slash Runtime Framework for Dynamic Dataflow Parallelization of Tiled Programs", journal = j-TACO, volume = "11", number = "4", pages = "61:1--61:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687652", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Task-parallel languages are increasingly popular. Many of them provide expressive mechanisms for intertask synchronization. For example, OpenMP 4.0 will integrate data-driven execution semantics derived from the StarSs research language. Compared to the more restrictive data-parallel and fork-join concurrency models, the advanced features being introduced into task-parallel models in turn enable improved scalability through load balancing, memory latency hiding, mitigation of the pressure on memory bandwidth, and, as a side effect, reduced power consumption. In this article, we develop a systematic approach to compile loop nests into concurrent, dynamically constructed graphs of dependent tasks. We propose a simple and effective heuristic that selects the most profitable parallelization idiom for every dependence type and communication pattern. This heuristic enables the extraction of interband parallelism (cross-barrier parallelism) in a number of numerical computations that range from linear algebra to structured grids and image processing. The proposed static analysis and code generation alleviates the burden of a full-blown dependence resolver to track the readiness of tasks at runtime. 
We evaluate our approach and algorithms in the PPCG compiler, targeting OpenStream, a representative dataflow task-parallel language with explicit intertask dependences and a lightweight runtime. Experimental results demonstrate the effectiveness of the approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Melot:2015:FCS, author = "Nicolas Melot and Christoph Kessler and J{\"o}rg Keller and Patrick Eitschberger", title = "Fast Crown Scheduling Heuristics for Energy-Efficient Mapping and Scaling of Moldable Streaming Tasks on Manycore Systems", journal = j-TACO, volume = "11", number = "4", pages = "62:1--62:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2687653", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Exploiting effectively massively parallel architectures is a major challenge that stream programming can help facilitate. We investigate the problem of generating energy-optimal code for a collection of streaming tasks that include parallelizable or moldable tasks on a generic manycore processor with dynamic discrete frequency scaling. Streaming task collections differ from classical task sets in that all tasks are running concurrently, so that cores typically run several tasks that are scheduled round-robin at user level in a data-driven way. A stream of data flows through the tasks and intermediate results may be forwarded to other tasks, as in a pipelined task graph. 
In this article, we consider crown scheduling, a novel technique for the combined optimization of resource allocation, mapping, and discrete voltage/frequency scaling for moldable streaming task collections in order to optimize energy efficiency given a throughput constraint. We first present optimal offline algorithms for separate and integrated crown scheduling based on integer linear programming (ILP). We make no restricting assumption about speedup behavior. We introduce the fast heuristic Longest Task, Lowest Group (LTLG) as a generalization of the Longest Processing Time (LPT) algorithm to achieve a load-balanced mapping of parallel tasks, and the Height heuristic for crown frequency scaling. We use them in feedback loop heuristics based on binary search and simulated annealing to optimize crown allocation. Our experimental evaluation of the ILP models for a generic manycore architecture shows that at least for small and medium-sized streaming task collections even the integrated variant of crown scheduling can be solved to optimality by a state-of-the-art ILP solver within a few seconds. Our heuristics produce makespan and energy consumption close to optimality within the limits of the phase-separated crown scheduling technique and the crown structure. Their optimization time is longer than the one of other algorithms we test, but our heuristics consistently produce better solutions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "62", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ruan:2015:TRM, author = "Wenjia Ruan and Yujie Liu and Michael Spear", title = "Transactional Read-Modify-Write Without Aborts", journal = j-TACO, volume = "11", number = "4", pages = "63:1--63:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2688904", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Language-level transactions are said to provide ``atomicity,'' implying that the order of operations within a transaction should be invisible to concurrent transactions and thus that independent operations within a transaction should be safe to execute in any order. In this article, we present a mechanism for dynamically reordering memory operations within a transaction so that read-modify-write operations on highly contended locations can be delayed until the very end of the transaction. When integrated with traditional transactional conflict detection mechanisms, our approach reduces aborts on hot memory locations, such as statistics counters, thereby improving throughput and reducing wasted work. We present three algorithms for delaying highly contended read-modify-write operations within transactions, and we evaluate their impact on throughput for eager and lazy transactional systems across multiple workloads. We also discuss complications that arise from the interaction between our mechanism and the need for strong language-level semantics, and we propose algorithmic extensions that prevent errors from occurring when accesses are aggressively reordered in a transactional memory implementation with weak semantics.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "63", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{UlHuda:2015:UTM, author = "Zia {Ul Huda} and Ali Jannesari and Felix Wolf", title = "Using Template Matching to Infer Parallel Design Patterns", journal = j-TACO, volume = "11", number = "4", pages = "64:1--64:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2688905", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The triumphant spread of multicore processors over the past decade increases the pressure on software developers to exploit the growing amount of parallelism available in the hardware. However, writing parallel programs is generally challenging. For sequential programs, the formulation of design patterns marked a turning point in software development, boosting programmer productivity and leading to more reusable and maintainable code. While the literature is now also reporting a rising number of parallel design patterns, programmers confronted with the task of parallelizing an existing sequential program still struggle with the question of which parallel pattern to apply where in their code. In this article, we show how template matching, a technique traditionally used in the discovery of sequential design patterns, can also be used to support parallelization decisions. After looking for matches in a previously extracted dynamic dependence graph, we classify code blocks of the input program according to the structure of the parallel patterns we find. Based on this information, the programmer can easily implement the detected pattern and create a parallel version of his or her program. 
We tested our approach with six programs, in which we successfully detected pipeline and do-all patterns.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "64", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Litz:2015:ECA, author = "Heiner Litz and Ricardo J. Dias and David R. Cheriton", title = "Efficient Correction of Anomalies in Snapshot Isolation Transactions", journal = j-TACO, volume = "11", number = "4", pages = "65:1--65:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2693260", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Transactional memory systems providing snapshot isolation enable concurrent access to shared data without incurring aborts on read-write conflicts. Reducing aborts is extremely relevant as it leads to higher concurrency, greater performance, and better predictability. Unfortunately, snapshot isolation does not provide serializability as it allows certain anomalies that can lead to subtle consistency violations. While some mechanisms have been proposed to verify the correctness of a program utilizing snapshot isolation transactions, it remains difficult to repair incorrect applications. To reduce the programmer's burden in this case, we present a technique based on dynamic code and graph dependency analysis that automatically corrects existing snapshot isolation anomalies in transactional memory programs. Our evaluation shows that corrected applications retain the performance benefits characteristic of snapshot isolation over conventional transactional memory systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "65", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bahmann:2015:PRC, author = "Helge Bahmann and Nico Reissmann and Magnus Jahre and Jan Christian Meyer", title = "Perfect Reconstructability of Control Flow from Demand Dependence Graphs", journal = j-TACO, volume = "11", number = "4", pages = "66:1--66:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2693261", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Demand-based dependence graphs (DDGs), such as the (Regionalized) Value State Dependence Graph ((R)VSDG), are intermediate representations (IRs) well suited for a wide range of program transformations. They explicitly model the flow of data and state, and only implicitly represent a restricted form of control flow. These features make DDGs especially suitable for automatic parallelization and vectorization, but cannot be leveraged by practical compilers without efficient construction and destruction algorithms. Construction algorithms remodel the arbitrarily complex control flow of a procedure to make it amenable to DDG representation, whereas destruction algorithms reestablish control flow for generating efficient object code. Existing literature presents solutions to both problems, but these impose structural constraints on the generatable control flow, and omit qualitative evaluation. The key contribution of this article is to show that there is no intrinsic structural limitation in the control flow directly extractable from RVSDGs. This fundamental result originates from an interpretation of loop repetition and decision predicates as computed continuations, leading to the introduction of the predicate continuation normal form. 
We provide an algorithm for constructing RVSDGs in predicate continuation form, and propose a novel destruction algorithm for RVSDGs in this form. Our destruction algorithm can generate arbitrarily complex control flow; we show this by proving that the original CFG an RVSDG was derived from can, apart from overspecific detail, be reconstructed perfectly. Additionally, we prove termination and correctness of these algorithms. Furthermore, we empirically evaluate the performance, the representational overhead at compile time, and the reduction in branch instructions compared to existing solutions. In contrast to previous work, our algorithms impose no additional overhead on the control flow of the produced object code. To our knowledge, this is the first scheme that allows the original control flow of a procedure to be recovered from a DDG representation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "66", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Elango:2015:URM, author = "Venmugil Elango and Naser Sedaghati and Fabrice Rastello and Louis-No{\"e}l Pouchet and J. Ramanujam and Radu Teodorescu and P. Sadayappan", title = "On Using the Roofline Model with Lower Bounds on Data Movement", journal = j-TACO, volume = "11", number = "4", pages = "67:1--67:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2693656", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The roofline model is a popular approach for ``bound and bottleneck'' performance analysis. It focuses on the limits to the performance of processors because of limited bandwidth to off-chip memory. 
It models upper bounds on performance as a function of operational intensity, the ratio of computational operations per byte of data moved from/to memory. While operational intensity can be directly measured for a specific implementation of an algorithm on a particular target platform, it is of interest to obtain broader insights on bottlenecks, where various semantically equivalent implementations of an algorithm are considered, along with analysis for variations in architectural parameters. This is currently very cumbersome and requires performance modeling and analysis of many variants. In this article, we address this problem by using the roofline model in conjunction with upper bounds on the operational intensity of computations as a function of cache capacity, derived from lower bounds on data movement. This enables bottleneck analysis that holds across all dependence-preserving semantically equivalent implementations of an algorithm. We demonstrate the utility of the approach in assessing fundamental limits to performance and energy efficiency for several benchmark algorithms across a design space of architectural variations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "67", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Anonymous:2015:LDR, author = "Anonymous", title = "List of Distinguished Reviewers {ACM TACO 2014}", journal = j-TACO, volume = "11", number = "4", pages = "68:1--68:??", month = jan, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2714082", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jan 12 11:38:56 MST 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "68", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zimmer:2015:NSM, author = "Christopher Zimmer and Frank Mueller", title = "{NoCMsg}: a Scalable Message-Passing Abstraction for Network-on-Chips", journal = j-TACO, volume = "12", number = "1", pages = "1:1--1:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2701426", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 16 18:39:56 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The number of cores of contemporary processors is constantly increasing and thus continues to deliver ever higher peak performance (following Moore's transistor law). Yet high core counts present a challenge to hardware and software alike. Following this trend, the network-on-chip (NoC) topology has changed from buses over rings and fully connected meshes to 2D meshes. This work contributes NoCMsg, a low-level message-passing abstraction over NoCs, which is specifically designed for large core counts in 2D meshes. NoCMsg ensures deadlock-free messaging for wormhole Manhattan-path routing over the NoC via a polling-based message abstraction and non--flow-controlled communication for selective communication patterns. Experimental results on the TilePro hardware platform show that NoCMsg can significantly reduce communication times by up to 86\% for single packet messages and up to 40\% for larger messages compared to other NoC-based message approaches. On the TilePro platform, NoCMsg outperforms shared memory abstractions by up to 93\% as core counts and interprocess communication increase. Results for fully pipelined double-precision numerical codes show speedups of up to 64\% for message passing over shared memory at 32 cores. 
Overall, we observe that shared memory scales up to about 16 cores on this platform, whereas message passing performs well beyond that threshold. These results generalize to similar NoC-based platforms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Grigorian:2015:ADA, author = "Beayna Grigorian and Glenn Reinman", title = "Accelerating Divergent Applications on {SIMD} Architectures Using Neural Networks", journal = j-TACO, volume = "12", number = "1", pages = "2:1--2:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2717311", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 16 18:39:56 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The purpose of this research is to find a neural-network-based solution to the well-known problem of branch divergence in Single Instruction Multiple Data (SIMD) architectures. Our approach differs from existing techniques that handle branch (or control-flow) divergence, which use costly hardware modifications, low-utilization masking techniques, or static prediction methods. As we examine divergent applications, we characterize the degree of data-dependent control flow seen in each and isolate the code regions (or ``kernels'') that cause the most performance degradation due to branch divergence. We then train neural networks (NNs) offline to approximate these kernels and inject the NN computations directly into the applications as substitutes for the kernels they approximate. This essentially translates control flow into nondivergent computation, trading off precision for performance. 
As our methodology manipulates application source code directly, it is inherently platform agnostic and can be adopted as a general means for accelerating divergent applications on data-parallel architectures. In this article, we present the Neuralizer, an automated software flow for kernel identification, NN training, and NN integration, as well as supplementary user-controlled optimization techniques. Evaluating our approach on a variety of divergent applications run on a Graphics Processing Unit (GPU), we on average achieve performance gains of 13.6 $ \times $ and energy savings of 14.8 $ \times $ with 96\% accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Holey:2015:PEC, author = "Anup Holey and Vineeth Mekkat and Pen-Chung Yew and Antonia Zhai", title = "Performance-Energy Considerations for Shared Cache Management in a Heterogeneous Multicore Processor", journal = j-TACO, volume = "12", number = "1", pages = "3:1--3:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2710019", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 16 18:39:56 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Heterogeneous multicore processors that integrate CPU cores and data-parallel accelerators such as graphic processing unit (GPU) cores onto the same die raise several new issues for sharing various on-chip resources. The shared last-level cache (LLC) is one of the most important shared resources due to its impact on performance. Accesses to the shared LLC in heterogeneous multicore processors can be dominated by the GPU due to the significantly higher number of concurrent threads supported by the architecture. 
Under current cache management policies, the CPU applications' share of the LLC can be significantly reduced in the presence of competing GPU applications. For many CPU applications, a reduced share of the LLC could lead to significant performance degradation. On the contrary, GPU applications can tolerate increase in memory access latency when there is sufficient thread-level parallelism (TLP). In addition to the performance challenge, introduction of diverse cores onto the same die changes the energy consumption profile and, in turn, affects the energy efficiency of the processor. In this work, we propose heterogeneous LLC management (HeLM), a novel shared LLC management policy that takes advantage of the GPU's tolerance for memory access latency. HeLM is able to throttle GPU LLC accesses and yield LLC space to cache-sensitive CPU applications. This throttling is achieved by allowing GPU accesses to bypass the LLC when an increase in memory access latency can be tolerated. The latency tolerance of a GPU application is determined by the availability of TLP, which is measured at runtime as the average number of threads that are available for issuing. For a baseline configuration with two CPU cores and four GPU cores, modeled after existing heterogeneous processor designs, HeLM outperforms least recently used (LRU) policy by 10.4\%. Additionally, HeLM also outperforms competing policies. Our evaluations show that HeLM is able to sustain performance with varying core mix. In addition to the performance benefit, bypassing also reduces total accesses to the LLC, leading to a reduction in the energy consumption of the LLC module. However, LLC bypassing has the potential to increase off-chip bandwidth utilization and DRAM energy consumption. Our experiments show that HeLM exhibits better energy efficiency by reducing the ED$^2$ by 18\% over LRU while impacting only a 7\% increase in off-chip bandwidth utilization.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Suh:2015:DMR, author = "Jinho Suh and Chieh-Ting Huang and Michel Dubois", title = "Dynamic {MIPS} Rate Stabilization for Complex Processors", journal = j-TACO, volume = "12", number = "1", pages = "4:1--4:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2714575", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 16 18:39:56 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern microprocessor cores reach their high performance levels with the help of high clock rates, parallel and speculative execution of a large number of instructions, and vast cache hierarchies. Modern cores also have adaptive features to regulate power and temperature and avoid thermal emergencies. All of these features contribute to highly unpredictable execution times. In this article, we demonstrate that the execution time of in-order (IO), out-of-order (OoO), and OoO simultaneous multithreaded processors can be stable and predictable by stabilizing their mega instructions executed per second (MIPS) rate via a proportional, integral, and differential (PID) gain feedback controller and dynamic voltage and frequency scaling (DVFS). Processor cores in idle cycles are continuously consuming power, which is highly undesirable in systems, especially in real-time systems. In addition to meeting deadlines in real-time systems, our MIPS rate stabilization framework can be applied on top of it to reduce power and energy by avoiding idle cycles. If processors are equipped with MIPS rate stabilization, the execution time can be predicted. 
Because the MIPS rate remains steady, a stabilized processor meets deadlines on time in real-time systems or in systems with quality-of-service execution latency requirements at the lowest possible frequency. To demonstrate and evaluate this capability, we have selected a subset of the MiBench benchmarks with the widest execution rate variations. We stabilize their MIPS rate on a 1GHz Pentium III--like OoO single-thread microarchitecture, a 1.32GHz StrongARM-like IO microarchitecture, and the 1GHz OoO processor augmented with two-way and four-way simultaneous multithreading. Both IO and OoO cores can take advantage of the stabilization framework, but the energy per instruction of the stabilized OoO core is less because it runs at a lower frequency to meet the same deadlines. The MIPS rate stabilization of complex processors using a PID feedback control loop is a general technique applicable to environments in which lower power or energy coupled with steady, predictable performance are desirable, although we target more specifically real-time systems in this article.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Karimi:2015:MMA, author = "Naghmeh Karimi and Arun Karthik Kanuparthi and Xueyang Wang and Ozgur Sinanoglu and Ramesh Karri", title = "{MAGIC}: Malicious Aging in Circuits\slash Cores", journal = j-TACO, volume = "12", number = "1", pages = "5:1--5:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2724718", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 16 18:39:56 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The performance of an IC degrades over its lifetime, ultimately resulting in IC failure. 
In this article, we present a hardware attack (called MAGIC) to maliciously accelerate NBTI aging effects in cores. In this attack, we identify the input patterns that maliciously age the pipestages of a core. We then craft a program that generates these patterns at the inputs of the targeted pipestage. We demonstrate the MAGIC-based attack on the OpenSPARC processor. Executing this program dramatically accelerates the aging process and degrades the processor's performance by 10.92\% in 1 month, bypassing existing aging mitigation and timing-error correction schemes. We also present two low-cost techniques to thwart the proposed attack.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{DeOliveiraCastro:2015:CLB, author = "Pablo {De Oliveira Castro} and Chadi Akel and Eric Petit and Mihail Popov and William Jalby", title = "{CERE}: {LLVM}-Based {Codelet Extractor and REplayer} for Piecewise Benchmarking and Optimization", journal = j-TACO, volume = "12", number = "1", pages = "6:1--6:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2724717", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 16 18:39:56 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article presents Codelet Extractor and REplayer (CERE), an open-source framework for code isolation. CERE finds and extracts the hotspots of an application as isolated fragments of code, called codelets. Codelets can be modified, compiled, run, and measured independently from the original application. Code isolation reduces benchmarking cost and allows piecewise optimization of an application. Unlike previous approaches, CERE isolates codes at the compiler Intermediate Representation (IR) level. 
Therefore CERE is language agnostic and supports many input languages such as C, C++, Fortran, and D. CERE automatically detects codelets invocations that have the same performance behavior. Then, it selects a reduced set of representative codelets and invocations, much faster to replay, which still captures accurately the original application. In addition, CERE supports recompiling and retargeting the extracted codelets. Therefore, CERE can be used for cross-architecture performance prediction or piecewise code optimization. On the SPEC 2006 FP benchmarks, CERE codelets cover 90.9\% and accurately replay 66.3\% of the execution time. We use CERE codelets in a realistic study to evaluate three different architectures on the NAS benchmarks. CERE accurately estimates each architecture performance and is 7.3 $ \times $ to 46.6 $ \times $ cheaper than running the full benchmark.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gaster:2015:HRA, author = "Benedict R. Gaster and Derek Hower and Lee Howes", title = "{HRF}-Relaxed: Adapting {HRF} to the Complexities of Industrial Heterogeneous Memory Models", journal = j-TACO, volume = "12", number = "1", pages = "7:1--7:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2701618", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 16 18:39:56 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Memory consistency models, or memory models, allow both programmers and program language implementers to reason about concurrent accesses to one or more memory locations. Memory model specifications balance the often conflicting needs for precise semantics, implementation flexibility, and ease of understanding. 
Toward that end, popular programming languages like Java, C, and C++ have adopted memory models built on the conceptual foundation of Sequential Consistency for Data-Race-Free programs (SC for DRF). These SC for DRF languages were created with general-purpose homogeneous CPU systems in mind, and all assume a single, global memory address space. Such a uniform address space is usually power and performance prohibitive in heterogeneous Systems on Chips (SoCs), and for that reason most heterogeneous languages have adopted split address spaces and operations with nonglobal visibility. There have recently been two attempts to bridge the disconnect between the CPU-centric assumptions of the SC for DRF framework and the realities of heterogeneous SoC architectures. Hower et al. proposed a class of Heterogeneous-Race-Free (HRF) memory models that provide a foundation for understanding many of the issues in heterogeneous memory models. At the same time, the Khronos Group developed the OpenCL 2.0 memory model that builds on the C++ memory model. The OpenCL 2.0 model includes features not addressed by HRF: primarily support for relaxed atomics and a property referred to as scope inclusion. In this article, we generalize HRF to allow formalization of and reasoning about more complicated models using OpenCL 2.0 as a point of reference. With that generalization, we (1) make the OpenCL 2.0 memory model more accessible by introducing a platform for feature comparisons to other models, (2) consider a number of shortcomings in the current OpenCL 2.0 model, and (3) propose changes that could be adopted by future OpenCL 2.0 revisions or by other, related, models.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Streit:2015:GTP, author = "Kevin Streit and Johannes Doerfert and Clemens Hammacher and Andreas Zeller and Sebastian Hack", title = "Generalized Task Parallelism", journal = j-TACO, volume = "12", number = "1", pages = "8:1--8:??", month = apr, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2723164", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Apr 16 18:39:56 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Existing approaches to automatic parallelization produce good results in specific domains. Yet, it is unclear how to integrate their individual strengths to match the demands and opportunities of complex software. This lack of integration has both practical reasons, as integrating those largely differing approaches into one compiler would impose an engineering hell, as well as theoretical reasons, as no joint cost model exists that would drive the choice between parallelization methods. By reducing the problem of generating parallel code from a program dependence graph to integer linear programming, {\em generalized task parallelization\/} integrates central aspects of existing parallelization approaches into a single unified framework. Implemented on top of LLVM, the framework seamlessly integrates enabling technologies such as speculation, privatization, and the realization of reductions. Evaluating our implementation on various C programs from different domains, we demonstrate the effectiveness and generality of generalized task parallelization. On a quad-core machine with hyperthreading we achieve speedups of up to $ 4.6 \times $.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tabkhi:2015:JSH, author = "Hamed Tabkhi and Gunar Schirner", title = "A Joint {SW\slash HW} Approach for Reducing Register File Vulnerability", journal = j-TACO, volume = "12", number = "2", pages = "9:1--9:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2733378", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The Register File (RF) is a particularly vulnerable component within processor core and at the same time a hotspot with high power density. To reduce RF vulnerability, conventional HW-only approaches such as Error Correction Codes (ECCs) or modular redundancies are not suitable due to their significant power overhead. Conversely, SW-only approaches either have limited improvement on RF reliability or require considerable performance overhead. As a result, new approaches are needed that reduce RF vulnerability with minimal power and performance overhead. This article introduces Application-guided Reliability-enhanced Register file Architecture (ARRA), a novel approach to reduce RF vulnerability of embedded processors. Taking advantage of uneven register utilization, ARRA mirrors, guided by a SW instrumentation, frequently used active registers into passive registers. ARRA is particularly suitable for control applications, as they have a high reliability demand with fairly low (uneven) RF utilization. ARRA is a cross-layer joint HW/SW approach based on an ARRA-extended RF microarchitecture, an ISA extension, as well as static binary analysis and instrumentation. We evaluate ARRA benefits using an ARRA-enhanced Blackfin processor executing a set of DSPBench and MiBench benchmarks. 
We quantify the benefits using RF Vulnerability Factor (RFVF) and Mean Work To Failure (MWTF). ARRA significantly reduces RFVF from 35\% to 6.9\% in cost of 0.5\% performance lost for control applications. With ARRA's register mirroring, it can also correct Multiple Bit Upsets (MBUs) errors, achieving an 8x increase in MWTF. Compared to a partially ECC-protected RF approach, ARRA demonstrates higher efficiency by achieving comparable vulnerability reduction at much lower power consumption.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kanuparthi:2015:RIC, author = "Arun Kanuparthi and Ramesh Karri", title = "Reliable Integrity Checking in Multicore Processors", journal = j-TACO, volume = "12", number = "2", pages = "10:1--10:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2738052", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Security and reliability have become important concerns in the design of computer systems. On one hand, microarchitectural enhancements for security (such as for dynamic integrity checking of code at runtime) have been proposed. On the other hand, independently, microarchitectural enhancements for reliability to detect and tolerate natural faults have also been proposed. A fault in these security enhancements due to alpha particles or aging might potentially pass off maliciously modified instructions as safe, rendering the security enhancements useless. Deliberate fault attacks by attackers can be launched to disable the security enhancements and then launch the well-known security attacks that would otherwise have been detected by these enhancements. 
We report an integrated microarchitecture support for security and reliability in multicore processors. Specifically, we add integrity checkers to protect the code running on the multiple cores in a multicore processor. We then adapt these checkers to check one another periodically to ensure reliable operation. These checkers naturally can check the other parts of the core. The average performance, power, and area costs for these security-reliability enhancements are 6.42\%, 0.73\%, and 0.53\%, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2015:NMD, author = "Do-Heon Lee and Su-Kyung Yoon and Jung-Geun Kim and Charles C. Weems and Shin-Dug Kim", title = "A New Memory-Disk Integrated System with {HW} Optimizer", journal = j-TACO, volume = "12", number = "2", pages = "11:1--11:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2738053", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Current high-performance computer systems utilize a memory hierarchy of on-chip cache, main memory, and secondary storage due to differences in device characteristics. Limiting the amount of main memory causes page swap operations and duplicates data between the main memory and the storage device. The characteristics of next-generation memory, such as nonvolatility, byte addressability, and scaling to greater capacity, can be used to solve these problems. Simple replacement of secondary storage with new forms of nonvolatile memory in a traditional memory hierarchy still causes typical problems, such as memory bottleneck, page swaps, and write overhead. 
Thus, we suggest a single architecture that merges the main memory and secondary storage into a system called a Memory-Disk Integrated System (MDIS). The MDIS architecture is composed of a virtually decoupled NVRAM and a nonvolatile memory performance optimizer combining hardware and software to support this system. The virtually decoupled NVRAM module can support conventional main memory and disk storage operations logically without data duplication and can reduce write operations to the NVRAM. To increase the lifetime and optimize the performance of this NVRAM, another hardware module called a Nonvolatile Performance Optimizer (NVPO) is used that is composed of four small buffers. The NVPO exploits spatial and temporal characteristics of static/dynamic data based on program execution characteristics. Enhanced virtual memory management and address translation modules in the operating system can support these hardware components to achieve a seamless memory-storage environment. Our experimental results show that the proposed architecture can improve execution time by about 89\% over a conventional DRAM main memory/HDD storage system, and 77\% over a state-of-the-art PRAM main memory/HDD disk system with DRAM buffer. Also, the lifetime of the virtually decoupled NVRAM is estimated to be 40\% longer than that of a traditional hierarchy based on the same device technology.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kafshdooz:2015:DSS, author = "Morteza Mohajjel Kafshdooz and Alireza Ejlali", title = "Dynamic Shared {SPM} Reuse for Real-Time Multicore Embedded Systems", journal = j-TACO, volume = "12", number = "2", pages = "12:1--12:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2738051", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Allocating the scratchpad memory (SPM) space to tasks is a challenging problem in real-time multicore embedded systems that use shared SPM. Proper SPM space allocation is important, as it considerably influences the application worst-case execution time (WCET), which is of great importance in real-time applications. To address this problem, in this article we present a dynamic SPM reuse scheme, where SPM space can be reused by other tasks during runtime without requiring any static SPM partitioning. Although the proposed scheme is applied dynamically at runtime, the required decision making is fairly complex and hence cannot be performed at runtime. We have developed techniques to perform the decision making offline at design time in the form of optimization problems combined with task scheduling/mapping. The proposed work is unlike previous works that either exploit static schemes for SPM space allocation or perform task scheduling/mapping and SPM space allocation incoherently. The experimental results show that our dynamic SPM reuse scheme can reduce WCET by up to 55\% as compared to recent previous works on SPM allocation in real-time multicore embedded systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jia:2015:GPP, author = "Wenhao Jia and Elba Garza and Kelly A. Shaw and Margaret Martonosi", title = "{GPU} Performance and Power Tuning Using Regression Trees", journal = j-TACO, volume = "12", number = "2", pages = "13:1--13:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2736287", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "GPU performance and power tuning is difficult, requiring extensive user expertise and time-consuming trial and error. To accelerate design tuning, statistical design space exploration methods have been proposed. This article presents Starchart, a novel design space partitioning tool that uses regression trees to approach GPU tuning problems. Improving on prior work, Starchart offers more automation in identifying key design trade-offs and models design subspaces with distinctly different behaviors. Starchart achieves good model accuracy using very few random samples: less than 0.3\% of a given design space; iterative sampling can more quickly target subspaces of interest.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pananilath:2015:OCG, author = "Irshad Pananilath and Aravind Acharya and Vinay Vasista and Uday Bondhugula", title = "An Optimizing Code Generator for a Class of Lattice-{Boltzmann} Computations", journal = j-TACO, volume = "12", number = "2", pages = "14:1--14:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2739047", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The Lattice-Boltzmann method (LBM), a promising new particle-based simulation technique for complex and multiscale fluid flows, has seen tremendous adoption in recent years in computational fluid dynamics. Even with a state-of-the-art LBM solver such as Palabos, a user has to still manually write the program using library-supplied primitives. We propose an automated code generator for a class of LBM computations with the objective to achieve high performance on modern architectures. Few studies have looked at time tiling for LBM codes. We exploit a key similarity between stencils and LBM to enable polyhedral optimizations and in turn time tiling for LBM. We also characterize the performance of LBM with the Roofline performance model. Experimental results for standard LBM simulations like Lid Driven Cavity, Flow Past Cylinder, and Poiseuille Flow show that our scheme consistently outperforms Palabos-on average by up to $ 3 \times $ while running on 16 cores of an Intel Xeon (Sandybridge). We also obtain an improvement of $ 2.47 \times $ on the SPEC LBM benchmark.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fang:2015:PIO, author = "Shuangde Fang and Wenwen Xu and Yang Chen and Lieven Eeckhout and Olivier Temam and Yunji Chen and Chengyong Wu and Xiaobing Feng", title = "Practical Iterative Optimization for the Data Center", journal = j-TACO, volume = "12", number = "2", pages = "15:1--15:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2739048", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Iterative optimization is a simple but powerful approach that searches the best possible combination of compiler optimizations for a given workload. However, iterative optimization is plagued by several practical issues that prevent it from being widely used in practice: a large number of runs are required to find the best combination, the optimum combination is dataset dependent, and the exploration process incurs significant overhead that needs to be compensated for by performance benefits. Therefore, although iterative optimization has been shown to have a significant performance potential, it seldom is used in production compilers. In this article, we propose iterative optimization for the data center (IODC): we show that the data center offers a context in which all of the preceding hurdles can be overcome. The basic idea is to spawn different combinations across workers and recollect performance statistics at the master, which then evolves to the optimum combination of compiler optimizations. IODC carefully manages costs and benefits, and it is transparent to the end user. To bring IODC to practice, we evaluate it in the presence of co-runners to better reflect real-life data center operation with multiple applications co-running per server. 
We enhance IODC with the capability to find compatible co-runners along with a mechanism to dynamically adjust the level of aggressiveness to improve its robustness in the presence of co-running applications. We evaluate IODC using both MapReduce and compute-intensive throughput server applications. To reflect the large number of users interacting with the system, we gather a very large collection of datasets (up to hundreds of millions of unique datasets per program), for a total storage of 16.4TB and 850 days of CPU time. We report an average performance improvement of $ 1.48 \times $ and up to $ 2.08 \times $ for five MapReduce applications, and $ 1.12 \times $ and up to $ 1.39 \times $ for nine server applications. Furthermore, our experiments demonstrate that IODC is effective in the presence of co-runners, improving performance by greater than 13\% compared to the worst possible co-runner schedule.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2015:BSS, author = "Tao Zhang and Naifeng Jing and Kaiming Jiang and Wei Shu and Min-You Wu and Xiaoyao Liang", title = "{Buddy SM}: Sharing Pipeline Front-End for Improved Energy Efficiency in {GPGPUs}", journal = j-TACO, volume = "12", number = "2", pages = "16:1--16:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2744202", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A modern general-purpose graphics processing unit (GPGPU) usually consists of multiple streaming multiprocessors (SMs), each having a pipeline that incorporates a group of threads executing a common instruction flow. 
Although SMs are designed to work independently, we observe that they tend to exhibit very similar behavior for many workloads. If multiple SMs can be grouped and work in the lock-step manner, it is possible to save energy by sharing the front-end units among multiple SMs, including the instruction fetch, decode, and schedule components. However, such sharing brings architectural challenges and sometime causes performance degradation. In this article, we show our design, implementation, and evaluation for such an architecture, which we call Buddy SM. Specifically, multiple SMs can be opportunistically grouped into a buddy cluster. One SM becomes the master, and the rest become the slaves. The front-end unit of the master works actively for itself as well as for the slaves, whereas the front-end logics of the slaves are power gated. For efficient flow control and program correctness, the proposed architecture can identify unfavorable conditions and ungroup the buddy cluster when necessary. We analyze various techniques to improve the performance and energy efficiency of Buddy SM. Detailed experiments manifest that 37.2\% front-end and 7.5\% total GPU energy reduction can be achieved.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cheng:2015:ECS, author = "Hsiang-Yun Cheng and Matt Poremba and Narges Shahidi and Ivan Stalev and Mary Jane Irwin and Mahmut Kandemir and Jack Sampson and Yuan Xie", title = "{EECache}: a Comprehensive Study on the Architectural Design for Energy-Efficient Last-Level Caches in Chip Multiprocessors", journal = j-TACO, volume = "12", number = "2", pages = "17:1--17:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2756552", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Power management for large last-level caches (LLCs) is important in chip multiprocessors (CMPs), as the leakage power of LLCs accounts for a significant fraction of the limited on-chip power budget. Since not all workloads running on CMPs need the entire cache, portions of a large, shared LLC can be disabled to save energy. In this article, we explore different design choices, from circuit-level cache organization to microarchitectural management policies, to propose a low-overhead runtime mechanism for energy reduction in the large, shared LLC. We first introduce a slice-based cache organization that can shut down parts of the shared LLC with minimal circuit overhead. Based on this slice-based organization, part of the shared LLC can be turned off according to the spatial and temporal cache access behavior captured by low-overhead sampling-based hardware. In order to eliminate the performance penalties caused by flushing data before powering off a cache slice, we propose data migration policies to prevent the loss of useful data in the LLC. 
Results show that our energy-efficient cache design (EECache) provides 14.1\% energy savings at only 1.2\% performance degradation and consumes negligible hardware overhead compared to prior work.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Suresh:2015:IFM, author = "Arjun Suresh and Bharath Narasimha Swamy and Erven Rohou and Andr{\'e} Seznec", title = "Intercepting Functions for Memoization: a Case Study Using Transcendental Functions", journal = j-TACO, volume = "12", number = "2", pages = "18:1--18:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2751559", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Memoization is the technique of saving the results of executions so that future executions can be omitted when the input set repeats. Memoization has been proposed in previous literature at the instruction, basic block, and function levels using hardware, as well as pure software-level approaches including changes to programming language. In this article, we focus on software memoization for procedural languages such as C and Fortran at the granularity of a function. We propose a simple linker-based technique for enabling software memoization of any dynamically linked pure function by function interception and illustrate our framework using a set of computationally expensive pure functions-the transcendental functions. Transcendental functions are those that cannot be expressed in terms of a finite sequence of algebraic operations (trigonometric functions, exponential functions, etc.) and hence are computationally expensive. 
Our technique does not need the availability of source code and thus can even be applied to commercial applications, as well as applications with legacy codes. As far as users are concerned, enabling memoization is as simple as setting an environment variable. Our framework does not make any specific assumptions about the underlying architecture or compiler toolchains and can work with a variety of current architectures. We present experimental results for a x86-64 platform using both gcc and icc compiler toolchains, and an ARM Cortex-A9 platform using gcc. Our experiments include a mix of real-world programs and standard benchmark suites: SPEC and Splash2x. On standard benchmark applications that extensively call the transcendental functions, we report memoization benefits of up to 50\% on Intel Ivy Bridge and up to 10\% on ARM Cortex-A9. Memoization was able to regain a performance loss of 76\% in bwaves due to a known performance bug in the GNU implementation of the pow function. The same benchmark on ARM Cortex-A9 benefited by more than 200\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lin:2015:SSE, author = "Chung-Hsiang Lin and De-Yu Shen and Yi-Jung Chen and Chia-Lin Yang and Cheng-Yuan Michael Wang", title = "{SECRET}: a Selective Error Correction Framework for Refresh Energy Reduction in {DRAMs}", journal = j-TACO, volume = "12", number = "2", pages = "19:1--19:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2747876", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "DRAMs are used as the main memory in most computing systems today. 
Studies show that DRAMs contribute to a significant part of overall system power consumption. One of the main challenges in low-power DRAM design is the inevitable refresh process. Due to process variation, memory cells exhibit retention time variations. Current DRAMs use a single refresh period determined by the cell with the largest leakage. Since prolonging refresh intervals introduces retention errors, a set of previous works adopt conventional error-correcting code (ECC) to correct retention errors. However, these approaches introduce significant area and energy overheads. In this article, we propose a novel error correction framework for retention errors in DRAMs, called SECRET (selective error correction for refresh energy reduction). The key observations we make are that retention errors are hard errors rather than soft errors, and only few DRAM cells have large leakage. Therefore, instead of equipping error correction capability for all memory cells as existing ECC schemes, we only allocate error correction information to leaky cells under a refresh interval. Our SECRET framework contains two parts: an offline phase to identify memory cells with retention errors given a target error rate and a low-overhead error correction mechanism. The experimental results show that among all test cases performed, the proposed SECRET framework can reduce refresh power by 87.2\% and overall DRAM power up to 18.57\% with negligible area and performance overheads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Simon:2015:STH, author = "Doug Simon and Christian Wimmer and Bernhard Urban and Gilles Duboscq and Lukas Stadler and Thomas W{\"u}rthinger", title = "Snippets: Taking the High Road to a Low Level", journal = j-TACO, volume = "12", number = "2", pages = "20:1--20:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2764907", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "When building a compiler for a high-level language, certain intrinsic features of the language must be expressed in terms of the resulting low-level operations. Complex features are often expressed by explicitly weaving together bits of low-level IR, a process that is tedious, error prone, difficult to read, difficult to reason about, and machine dependent. In the Graal compiler for Java, we take a different approach: we use snippets of Java code to express semantics in a high-level, architecture-independent way. Two important restrictions make snippets feasible in practice: they are compiler specific, and they are explicitly prepared and specialized. Snippets make Graal simpler and more portable while still capable of generating machine code that can compete with other compilers of the Java HotSpot VM.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Balasubramanian:2015:EGL, author = "Raghuraman Balasubramanian and Vinay Gangadhar and Ziliang Guo and Chen-Han Ho and Cherin Joseph and Jaikrishnan Menon and Mario Paulo Drumond and Robin Paul and Sharath Prasad and Pradip Valathol and Karthikeyan Sankaralingam", title = "Enabling {GPGPU} Low-Level Hardware Explorations with {MIAOW}: an Open-Source {RTL} Implementation of a {GPGPU}", journal = j-TACO, volume = "12", number = "2", pages = "21:1--21:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2764908", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graphic processing unit (GPU)-based general-purpose computing is developing as a viable alternative to CPU-based computing in many domains. Today's tools for GPU analysis include simulators like GPGPU-Sim, Multi2Sim, and Barra. While useful for modeling first-order effects, these tools do not provide a detailed view of GPU microarchitecture and physical design. Further, as GPGPU research evolves, design ideas and modifications demand detailed estimates of impact on overall area and power. Fueled by this need, we introduce MIAOW (Many-core Integrated Accelerator Of Wisconsin), an open-source RTL implementation of the AMD Southern Islands GPGPU ISA, capable of running unmodified OpenCL-based applications. We present our design motivated by our goals to create a realistic, flexible, OpenCL-compatible GPGPU, capable of emulating a full system. 
We first explore if MIAOW is realistic and then use four case studies to show that MIAOW enables the following: physical design perspective to ``traditional'' microarchitecture, new types of research exploration, and validation/calibration of simulator-based characterization of hardware. The findings and ideas are contributions in their own right, in addition to MIAOW's utility as a tool for others' research.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2015:LAW, author = "Quan Chen and Minyi Guo", title = "Locality-Aware Work Stealing Based on Online Profiling and Auto-Tuning for Multisocket Multicore Architectures", journal = j-TACO, volume = "12", number = "2", pages = "22:1--22:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2766450", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern mainstream powerful computers adopt multisocket multicore CPU architecture and NUMA-based memory architecture. While traditional work-stealing schedulers are designed for single-socket architectures, they incur severe shared cache misses and remote memory accesses in these computers. To solve the problem, we propose a locality-aware work-stealing (LAWS) scheduler, which better utilizes both the shared cache and the memory system. In LAWS, a load-balanced task allocator is used to evenly split and store the dataset of a program to all the memory nodes and allocate a task to the socket where the local memory node stores its data for reducing remote memory accesses. Then, an adaptive DAG packer adopts an auto-tuning approach to optimally pack an execution DAG into cache-friendly subtrees. 
After cache-friendly subtrees are created, every socket executes cache-friendly subtrees sequentially for optimizing shared cache usage. Meanwhile, a triple-level work-stealing scheduler is applied to schedule the subtrees and the tasks in each subtree. Through theoretical analysis, we show that LAWS has comparable time and space bounds compared with traditional work-stealing schedulers. Experimental results show that LAWS can improve the performance of memory-bound programs up to 54.2\% on AMD-based experimental platforms and up to 48.6\% on Intel-based experimental platforms compared with traditional work-stealing schedulers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Das:2015:SBP, author = "Madan Das and Gabriel Southern and Jose Renau", title = "Section-Based Program Analysis to Reduce Overhead of Detecting Unsynchronized Thread Communication", journal = j-TACO, volume = "12", number = "2", pages = "23:1--23:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2766451", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Most systems that test and verify parallel programs, such as deterministic execution engines, data race detectors, and software transactional memory systems, require instrumenting loads and stores in an application. This can cause a very significant runtime and memory overhead compared to executing uninstrumented code. Multithreaded programming typically allows any thread to perform loads and stores to any location in the process's address space independently, and such tools monitor all these memory accesses. 
However, many of the addresses in these unsynchronized memory accesses are only used by a single thread and do not affect other executing threads. We propose Section-Based Program Analysis (SBPA), a novel way to decompose the program into disjoint code sections to identify and eliminate instrumenting such loads and stores during program compilation so that the program runtime overhead is significantly reduced. Our analysis includes improvements to pointer analysis and uses a few user directives to increase the effectiveness of SBPA further. We implemented SBPA for a deterministic execution runtime environment and were able to eliminate 51\% of dynamic memory access instrumentations. When combined with directives, such reduction increased to 63\%. We also integrated SBPA with ThreadSanitizer, a state-of-the-art dynamic race detector, and achieved a speedup of 2.43 (2.74 with directives) on a geometric mean basis.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lotfi:2015:AAC, author = "Atieh Lotfi and Abbas Rahimi and Luca Benini and Rajesh K. Gupta", title = "Aging-Aware Compilation for {GP-GPUs}", journal = j-TACO, volume = "12", number = "2", pages = "24:1--24:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2778984", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "General-purpose graphic processing units (GP-GPUs) offer high computational throughput using thousands of integrated processing elements (PEs). 
These PEs are stressed during workload execution, and negative bias temperature instability (NBTI) adversely affects their reliability by introducing new delay-induced faults. However, the effect of these delay variations is not uniformly spread across the PEs: some are affected more --- hence less reliable --- than others. This variation causes significant reduction in the lifetime of GP-GPU parts. In this article, we address the problem of ``wear leveling'' across processing units to mitigate lifetime uncertainty in GP-GPUs. We propose innovations in the static compiled code that can improve healing in PEs and stream cores (SCs) based on their degradation status. PE healing is a fine-grained very long instruction word (VLIW) slot assignment scheme that balances the stress of instructions across the PEs within an SC. SC healing is a coarse-grained workload allocation scheme that distributes workload across SCs in GP-GPUs. Both schemes share a common property: they adaptively shift workload from less reliable units to more reliable units, either spatially or temporally. These software schemes are based on online calibration with NBTI monitoring that equalizes the expected lifetime of PEs and SCs by regenerating adaptive compiled codes to respond to the specific health state of the GP-GPUs. We evaluate the effectiveness of the proposed schemes for various OpenCL kernels from the AMD APP SDK on Evergreen and Southern Island GPU architectures. The aging-aware healthy kernels generated by the PE (or SC) healing scheme reduce NBTI-induced voltage threshold shift by 30\% (77\% in the case of SCs), with no (moderate) performance penalty compared to the naive kernels.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Railing:2015:CEG, author = "Brian P. Railing and Eric R. Hein and Thomas M. 
Conte", title = "{Contech}: Efficiently Generating Dynamic Task Graphs for Arbitrary Parallel Programs", journal = j-TACO, volume = "12", number = "2", pages = "25:1--25:??", month = jul, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2776893", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 7 09:46:00 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Parallel programs can be characterized by task graphs encoding instructions, memory accesses, and the parallel work's dependencies, while representing any threading library and architecture. This article presents Contech, a high performance framework for generating dynamic task graphs from arbitrary parallel programs, and a novel representation enabling programmers and compiler optimizations to understand and exploit program aspects. The Contech framework supports a variety of languages (including C, C++, and Fortran), parallelization libraries, and ISAs (including x86 and ARM). Running natively for collection speed and minimizing program perturbation, the instrumentation shows $ 4 \times $ improvement over a Pin-based implementation on PARSEC and NAS benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Davari:2015:EGA, author = "Mahdad Davari and Alberto Ros and Erik Hagersten and Stefanos Kaxiras", title = "The Effects of Granularity and Adaptivity on Private\slash Shared Classification for Coherence", journal = j-TACO, volume = "12", number = "3", pages = "26:1--26:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2790301", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 7 18:51:05 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Classification of data into private and shared has proven to be a catalyst for techniques to reduce coherence cost, since private data can be taken out of coherence and resources can be concentrated on providing coherence for shared data. In this article, we examine how granularity-page-level versus cache-line level-and adaptivity-going from shared to private-affect the outcome of classification and its final impact on coherence. We create a classification technique, called Generational Classification, and a coherence protocol called Generational Coherence, which treats data as private or shared based on cache-line generations. We compare two coherence protocols based on self-invalidation/self-downgrade with respect to data classification. Our findings are enlightening: (i) Some programs benefit from finer granularity, some benefit further from adaptivity, but some do not benefit from either. (ii) Reducing the amount of shared data has no perceptible impact on coherence misses caused by self-invalidation of shared data, hence no impact on performance. 
(iii) In contrast, classifying more data as private has implications for protocols that employ write-through as a means of self-downgrade, resulting in network traffic reduction-up to 30\%-by reducing write-through traffic.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gottscho:2015:DDP, author = "Mark Gottscho and Abbas BanaiyanMofrad and Nikil Dutt and Alex Nicolau and Puneet Gupta", title = "{DPCS}: Dynamic Power\slash Capacity Scaling for {SRAM} Caches in the Nanoscale Era", journal = j-TACO, volume = "12", number = "3", pages = "27:1--27:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2792982", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 7 18:51:05 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Fault-Tolerant Voltage-Scalable (FTVS) SRAM cache architectures are a promising approach to improve energy efficiency of memories in the presence of nanoscale process variation. Complex FTVS schemes are commonly proposed to achieve very low minimum supply voltages, but these can suffer from high overheads and thus do not always offer the best power/capacity trade-offs. We observe on our 45nm test chips that the ``fault inclusion property'' can enable lightweight fault maps that support multiple runtime supply voltages. Based on this observation, we propose a simple and low-overhead FTVS cache architecture for power/capacity scaling. Our mechanism combines multilevel voltage scaling with optional architectural support for power gating of blocks as they become faulty at low voltages. A static (SPCS) policy sets the runtime cache VDD once such that a only a few cache blocks may be faulty in order to minimize the impact on performance. 
We describe a Static Power/Capacity Scaling (SPCS) policy and two alternate Dynamic Power/Capacity Scaling (DPCS) policies that opportunistically reduce the cache voltage even further for more energy savings. This architecture achieves lower static power for all effective cache capacities than a recent more complex FTVS scheme. This is due to significantly lower overheads, despite the inability of our approach to match the min-VDD of the competing work at a fixed target yield. Over a set of SPEC CPU2006 benchmarks on two system configurations, the average total cache (system) energy saved by SPCS is 62\% (22\%), while the two DPCS policies achieve roughly similar energy reduction, around 79\% (26\%). On average, the DPCS approaches incur 2.24\% performance and 6\% area penalties.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Michaud:2015:RCM, author = "Pierre Michaud and Andrea Mondelli and Andr{\'e} Seznec", title = "Revisiting Clustered Microarchitecture for Future Superscalar Cores: a Case for Wide Issue Clusters", journal = j-TACO, volume = "12", number = "3", pages = "28:1--28:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2800787", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 7 18:51:05 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "During the past 10 years, the clock frequency of high-end superscalar processors has not increased. Performance keeps growing mainly by integrating more cores on the same chip and by introducing new instruction set extensions. However, this benefits only some applications and requires rewriting and/or recompiling these applications. 
A more general way to accelerate applications is to increase the IPC, the number of instructions executed per cycle. Although the focus of academic microarchitecture research moved away from IPC techniques, the IPC of commercial processors was continuously improved during these years. We argue that some of the benefits of technology scaling should be used to raise the IPC of future superscalar cores further. Starting from microarchitecture parameters similar to recent commercial high-end cores, we show that an effective way to increase the IPC is to allow the out-of-order engine to issue more micro-ops per cycle. But this must be done without impacting the clock cycle. We propose combining two techniques: clustering and register write specialization. Past research on clustered microarchitectures focused on narrow issue clusters, as the emphasis at that time was on allowing high clock frequencies. Instead, in this study, we consider wide issue clusters, with the goal of increasing the IPC under a constant clock frequency. We show that on a wide issue dual cluster, a very simple steering policy that sends 64 consecutive instructions to the same cluster, the next 64 instructions to the other cluster, and so forth, permits tolerating an intercluster delay of three cycles. We also propose a method for decreasing the energy cost of sending results from one cluster to the other cluster.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Natarajan:2015:LTE, author = "Ragavendra Natarajan and Antonia Zhai", title = "Leveraging Transactional Execution for Memory Consistency Model Emulation", journal = j-TACO, volume = "12", number = "3", pages = "29:1--29:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2786980", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 7 18:51:05 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "System emulation is widely used in today's computer systems. This technology opens new opportunities for resource sharing as well as enhancing system security and reliability. System emulation across different instruction set architectures (ISA) can enable further opportunities. For example, cross-ISA emulation can enable workload consolidation over a wide range of microprocessors and potentially facilitate the seamless deployment of new processor architectures. As multicore and manycore processors become pervasive, it is important to address the challenges toward supporting system emulation on these platforms. A key challenge in cross-ISA emulation on multicore systems is ensuring the correctness of emulation when the guest and the host memory consistency models differ. Many existing cross-ISA system emulators are sequential, thus they are able to avoid this problem at the cost of significant performance degradation. Recently proposed parallel emulators are able to address the performance limitation; however, they provide limited support for memory consistency model emulation. When the host system has a weaker memory consistency model compared to the guest system, the emulator can insert memory fences at appropriate locations in the translated code to enforce the guest memory ordering constraints. 
These memory fences can significantly degrade the performance of the translated code. Transactional execution support available on certain recent microprocessors provides an alternative approach. Transactional execution of the translated code enforces sequential consistency (SC) at the coarse-grained transaction level, which in turn ensures that all memory accesses made on the host machine conform to SC. Enforcing SC on the host machine guarantees that the emulated execution will be correct for any guest memory model. In this article, we compare and evaluate the overheads associated with using transactions and fences for memory consistency model emulation on the Intel Haswell processor. Our experience of implementing these two approaches on a state-of-the-art parallel emulator, COREMU, demonstrates that memory consistency model emulation using transactions performs better when the transaction sizes are large enough to amortize the transaction overhead and the transaction conflict rate is low, whereas inserting memory fences is better for applications in which the transaction overhead is high. A hybrid implementation that dynamically determines which approach to invoke can outperform both approaches. Our results, based on the SPLASH-2 and the PARSEC benchmark suites, demonstrate that the proposed hybrid approach is able to outperform the fence insertion mechanism by 4.9\% and the transactional execution approach by 24.9\% for two-thread applications, and outperform them by 4.5\% and 44.7\%, respectively, for four-threaded execution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Panda:2015:CUD, author = "Biswabandan Panda and Shankar Balachandran", title = "{CAFFEINE}: a Utility-Driven Prefetcher Aggressiveness Engine for Multicores", journal = j-TACO, volume = "12", number = "3", pages = "30:1--30:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2806891", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 7 18:51:05 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Aggressive prefetching improves system performance by hiding and tolerating off-chip memory latency. However, on a multicore system, prefetchers of different cores contend for shared resources and aggressive prefetching can degrade the overall system performance. The role of a prefetcher aggressiveness engine is to select appropriate aggressiveness levels for each prefetcher such that shared resource contention caused by prefetchers is reduced, thereby improving system performance. State-of-the-art prefetcher aggressiveness engines monitor metrics such as prefetch accuracy, bandwidth consumption, and last-level cache pollution. They use carefully tuned thresholds for these metrics, and when the thresholds are crossed, they trigger aggressiveness control measures. These engines have three major shortcomings: (1) thresholds are dependent on the system configuration (cache size, DRAM scheduling policy, and cache replacement policy) and have to be tuned appropriately, (2) there is no single threshold that works well across all the workloads, and (3) thresholds are oblivious to the phase change of applications. To overcome these shortcomings, we propose CAFFEINE, a model-based approach that analyzes the effectiveness of a prefetcher and uses a metric called net utility to control the aggressiveness. 
Our metric provides net processor cycles saved because of prefetching by approximating the cycles saved across the memory subsystem, from last-level cache to DRAM. We evaluate CAFFEINE across a wide range of workloads and compare it with the state-of-the-art prefetcher aggressiveness engine. Experimental results demonstrate that, on average (geomean), CAFFEINE achieves 9.5\% (as much as 38.29\%) and 11\% (as much as 20.7\%) better performance than the best-performing aggressiveness engine for four-core and eight-core systems, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2015:BSB, author = "Jishen Zhao and Sheng Li and Jichuan Chang and John L. Byrne and Laura L. Ramirez and Kevin Lim and Yuan Xie and Paolo Faraboschi", title = "{Buri}: Scaling Big-Memory Computing with Hardware-Based Memory Expansion", journal = j-TACO, volume = "12", number = "3", pages = "31:1--31:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2808233", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 7 18:51:05 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Motivated by the challenges of scaling up memory capacity and fully exploiting the benefits of memory compression, we propose Buri, a hardware-based memory compression scheme, which simultaneously achieves cost efficiency, high performance, and ease of adoption. 
Buri combines (1) a self-contained, ready-to-adopt hardware compression module, which manages metadata compression and memory allocation/relocation operations; (2) a set of hardware optimization mechanisms, which reduce the area and performance overheads in accommodating the address indirection required by memory compression; and (3) lightweight BIOS/OS extensions used to handle exceptions. Our evaluation with large memory workload traces shows that Buri can increase capacity by 70\%, in addition to the compression ratio already provided by database software.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lucas:2015:SSS, author = "Jan Lucas and Michael Andersch and Mauricio Alvarez-Mesa and Ben Juurlink", title = "Spatiotemporal {SIMT} and Scalarization for Improving {GPU} Efficiency", journal = j-TACO, volume = "12", number = "3", pages = "32:1--32:??", month = oct, year = "2015", CODEN = "????", DOI = "https://doi.org/10.1145/2811402", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Oct 7 18:51:05 MDT 2015", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Temporal SIMT (TSIMT) has been suggested as an alternative to conventional (spatial) SIMT for improving GPU performance on branch-intensive code. Although TSIMT has been briefly mentioned before, it was not evaluated. We present a complete design and evaluation of TSIMT GPUs, along with the inclusion of scalarization and a combination of temporal and spatial SIMT, named Spatiotemporal SIMT (STSIMT). 
Simulations show that TSIMT alone results in a performance reduction, but a combination of scalarization and STSIMT yields a mean performance enhancement of 19.6\% and improves the energy-delay product by 26.2\% compared to SIMT.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Das:2016:RDB, author = "Subhasis Das and Tor M. Aamodt and William J. Dally", title = "Reuse Distance-Based Probabilistic Cache Replacement", journal = j-TACO, volume = "12", number = "4", pages = "33:1--33:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818374", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article proposes Probabilistic Replacement Policy (PRP), a novel replacement policy that evicts the line with minimum estimated hit probability under optimal replacement instead of the line with maximum expected reuse distance. The latter is optimal under the independent reference model of programs, which does not hold for last-level caches (LLC). PRP requires 7\% and 2\% metadata overheads in the cache and DRAM respectively. Using a sampling scheme makes DRAM overhead negligible, with minimal performance impact. Including detailed overhead modeling and equal cache areas, PRP outperforms SHiP, a state-of-the-art LLC replacement algorithm, by 4\% for memory-intensive SPEC-CPU2006 benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Deniz:2016:MGM, author = "Etem Deniz and Alper Sen", title = "{MINIME-GPU}: Multicore Benchmark Synthesizer for {GPUs}", journal = j-TACO, volume = "12", number = "4", pages = "34:1--34:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818693", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We introduce MINIME-GPU, a novel automated benchmark synthesis framework for graphics processing units (GPUs) that serves to speed up architectural simulation of modern GPU architectures. Our framework captures important characteristics of original GPU applications and generates synthetic GPU benchmarks using the Open Computing Language (OpenCL) library from those applications. To the best of our knowledge, this is the first time synthetic OpenCL benchmarks for GPUs are generated from existing applications. We use several characteristics, including instruction throughput, compute unit occupancy, and memory efficiency, to compare the similarity of original applications and their corresponding synthetic benchmarks. The experimental results show that our synthetic benchmark generation framework is capable of generating synthetic benchmarks that have similar characteristics with the original applications from which they are generated. On average, the similarity (accuracy) is 96\% and the speedup is 541 $ \times $ . In addition, our synthetic benchmarks use the OpenCL library, which allows us to obtain portable human readable benchmarks as opposed to using assembly-level code, and they are faster and smaller than the original applications from which they are generated. 
We experimentally validated that our synthetic benchmarks preserve the characteristics of the original applications across different architectures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tan:2016:SEE, author = "Li Tan and Zizhong Chen and Shuaiwen Leon Song", title = "Scalable Energy Efficiency with Resilience for High Performance Computing Systems: a Quantitative Methodology", journal = j-TACO, volume = "12", number = "4", pages = "35:1--35:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2822893", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Ever-growing performance of supercomputers nowadays brings demanding requirements of energy efficiency and resilience, due to rapidly expanding size and duration in use of the large-scale computing systems. Many application/architecture-dependent parameters that determine energy efficiency and resilience individually have causal effects with each other, which directly affect the trade-offs among performance, energy efficiency and resilience at scale. To enable high-efficiency management for large-scale High-Performance Computing (HPC) systems nowadays, quantitatively understanding the entangled effects among performance, energy efficiency, and resilience is thus required. While previous work focuses on exploring energy-saving and resilience-enhancing opportunities separately, little has been done to theoretically and empirically investigate the interplay between energy efficiency and resilience at scale. 
In this article, by extending the Amdahl's Law and the Karp-Flatt Metric, taking resilience into consideration, we quantitatively model the integrated energy efficiency in terms of performance per Watt and showcase the trade-offs among typical HPC parameters, such as number of cores, frequency/voltage, and failure rates. Experimental results for a wide spectrum of HPC benchmarks on two HPC systems show that the proposed models are accurate in extrapolating resilience-aware performance and energy efficiency, and capable of capturing the interplay among various energy-saving and resilience factors. Moreover, the models can help find the optimal HPC configuration for the highest integrated energy efficiency, in the presence of failures and applied resilience techniques.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pusukuri:2016:TEL, author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N. Bhuyan", title = "{Tumbler}: an Effective Load-Balancing Technique for Multi-{CPU} Multicore Systems", journal = j-TACO, volume = "12", number = "4", pages = "36:1--36:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2827698", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Schedulers used by modern OSs (e.g., Oracle Solaris 11{\trademark} and GNU/Linux) balance load by balancing the number of threads in run queues of different cores. While this approach is effective for a single CPU multicore system, we show that it can lead to a significant load imbalance across CPUs of a multi-CPU multicore system. 
Because different threads of a multithreaded application often exhibit different levels of CPU utilization, load cannot be measured in terms of the number of threads alone. We propose Tumbler that migrates the threads of a multithreaded program across multiple CPUs to balance the load across the CPUs. While Tumbler distributes the threads equally across the CPUs, its assignment of threads to CPUs is aimed at minimizing the variation in utilization of different CPUs to achieve load balance. We evaluated Tumbler using a wide variety of 35 multithreaded applications, and our experimental results show that Tumbler outperforms both Oracle Solaris 11{\trademark} and GNU/Linux.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tomusk:2016:FME, author = "Erik Tomusk and Christophe Dubach and Michael O'Boyle", title = "Four Metrics to Evaluate Heterogeneous Multicores", journal = j-TACO, volume = "12", number = "4", pages = "37:1--37:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2829950", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Semiconductor device scaling has made single-ISA heterogeneous processors a reality. Heterogeneous processors contain a number of different CPU cores that all implement the same Instruction Set Architecture (ISA). This enables greater flexibility and specialization, as runtime constraints and workload characteristics can influence which core a given workload is run on. A major roadblock to the further development of heterogeneous processors is the lack of appropriate evaluation metrics. 
Existing metrics can be used to evaluate individual cores, but to evaluate a heterogeneous processor, the cores must be considered as a collective. Without appropriate metrics, it is impossible to establish design goals for processors, and it is difficult to accurately compare two different heterogeneous processors. We present four new metrics to evaluate user-oriented aspects of sets of heterogeneous cores: localized nonuniformity, gap overhead, set overhead, and generality. The metrics consider sets rather than individual cores. We use examples to demonstrate each metric, and show that the metrics can be used to quantify intuitions about heterogeneous cores.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hoseinzadeh:2016:SSP, author = "Morteza Hoseinzadeh and Mohammad Arjomand and Hamid Sarbazi-Azad", title = "{SPCM}: The Striped Phase Change Memory", journal = j-TACO, volume = "12", number = "4", pages = "38:1--38:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2829951", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Phase Change Memory (PCM) devices are one of the known promising technologies to take the place of DRAM devices with the aim of overcoming the obstacles of reducing feature size and stopping ever growing amounts of leakage power. In exchange for providing high capacity, high density, and nonvolatility, PCM Multilevel Cells (MLCs) impose high write energy and long latency. Many techniques have been proposed to resolve these side effects. However, read performance issues are usually left behind the great importance of write latency, energy, and lifetime. 
In this article, we focus on read performance and improve the critical path latency of the main memory system. To this end, we exploit striping scheme by which multiple lines are grouped and lie on a single MLC line array. In order to achieve more performance gain, an adaptive ordering mechanism is used to sort lines in a group based on their read frequency. This scheme imposes large energy and lifetime overheads due to its intensive demand for higher write bandwidth. Thus, we equipped our design with a grouping/pairing write queue to synchronize write-back requests such that all updates to an MLC array occur at once. The design is also augmented by a directional write scheme that takes benefits of the uniformity of accesses to the PCM device---caused by the large DRAM cache---to determine the writing mode (striped or nonstriped). This adaptation to write operations relaxes the energy and lifetime overheads. We improve the read latency of a 2-bit MLC PCM memory by more than 24\% (and Instructions Per Cycle (IPC) by about 9\%) and energy-delay product by about 20\% for a small lifetime degradation of 8\%, on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2016:TLH, author = "Chuntao Jiang and Zhibin Yu and Lieven Eeckhout and Hai Jin and Xiaofei Liao and Chengzhong Xu", title = "Two-Level Hybrid Sampled Simulation of Multithreaded Applications", journal = j-TACO, volume = "12", number = "4", pages = "39:1--39:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2818353", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Sampled microarchitectural simulation of single-threaded applications is mature technology for over a decade now. Sampling multithreaded applications, on the other hand, is much more complicated. Not until very recently have researchers proposed solutions for sampled simulation of multithreaded applications. Time-Based Sampling (TBS) samples multithreaded application execution based on time---not instructions as is typically done for single-threaded applications---yielding estimates for a multithreaded application's execution time. In this article, we revisit and analyze previously proposed TBS approaches (periodic and cantor fractal based sampling), and we obtain a number of novel and surprising insights, such as (i) accurately estimating fast-forwarding IPC, that is, performance in-between sampling units, is more important than accurately estimating sample IPC, that is, performance within the sampling units; (ii) fast-forwarding IPC estimation accuracy is determined by both the sampling unit distribution and how to use the sampling units to predict fast-forwarding IPC; and (iii) cantor sampling is more accurate at small sampling unit sizes, whereas periodic is more accurate at large sampling unit sizes. 
These insights lead to the development of Two-level Hybrid Sampling (THS), a novel sampling methodology for multithreaded applications that combines periodic sampling's accuracy at large time scales (i.e., uniformly selecting coarse-grain sampling units across the entire program execution) with cantor sampling's accuracy at small time scales (i.e., the ability to accurately predict fast-forwarding IPC in-between small sampling units). The clustered occurrence of small sampling units under cantor sampling also enables shortened warmup and thus enhanced simulation speed. Overall, THS achieves an average absolute execution time prediction error of 4\% while yielding an average simulation speedup of 40 $ \times $ compared to detailed simulation, which is both more accurate and faster than the current state-of-the-art. Case studies illustrate THS' ability to accurately predict relative performance differences across the design space.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dsouza:2016:IMS, author = "Sandeep D'souza and Soumya J. and Santanu Chattopadhyay", title = "Integrated Mapping and Synthesis Techniques for Network-on-Chip Topologies with Express Channels", journal = j-TACO, volume = "12", number = "4", pages = "40:1--40:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2831233", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The addition of express channels to a traditional mesh network-on-chip (NoC) has emerged as a viable solution to solve the problem of high latency. In this article, we address the problem of integrated mapping and synthesis for express channel--based mesh NoC topologies. 
An integer linear programming--based formulation has been presented for the mapping problem followed by a constructive heuristic for simultaneous application mapping and synthesis for an express channel--based NoC. The static and dynamic simulation results indicate that the obtained mappings lead to significant reduction in both average packet delay and network energy consumption. The obtained synthesized topologies were also found to be much more power efficient compared to conventional express channel topologies.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chasapis:2016:PEI, author = "Dimitrios Chasapis and Marc Casas and Miquel Moret{\'o} and Raul Vidal and Eduard Ayguad{\'e} and Jes{\'u}s Labarta and Mateo Valero", title = "{PARSECSs}: Evaluating the Impact of Task Parallelism in the {PARSEC} Benchmark Suite", journal = j-TACO, volume = "12", number = "4", pages = "41:1--41:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2829952", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this work, we show how parallel applications can be implemented efficiently using task parallelism. We also evaluate the benefits of such parallel paradigm with respect to other approaches. We use the PARSEC benchmark suite as our test bed, which includes applications representative of a wide range of domains from HPC to desktop and server applications. We adopt different parallelization techniques, tailored to the needs of each application, to fully exploit the task-based model. Our evaluation shows that task parallelism achieves better performance than thread-based parallelization models, such as Pthreads. 
Our experimental results show that we can obtain scalability improvements up to 42\% on a 16-core system and code size reductions up to 81\%. Such reductions are achieved by removing from the source code application specific schedulers or thread pooling systems and transferring these responsibilities to the runtime system software.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gaspar:2016:FAG, author = "Francisco Gaspar and Luis Tani{\c{c}}a and Pedro Tom{\'a}s and Aleksandar Ilic and Leonel Sousa", title = "A Framework for Application-Guided Task Management on Heterogeneous Embedded Systems", journal = j-TACO, volume = "12", number = "4", pages = "42:1--42:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2835177", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we propose a general framework for fine-grain application-aware task management in heterogeneous embedded platforms, which allows integration of different mechanisms for an efficient resource utilization, frequency scaling, and task migration. The proposed framework incorporates several components for accurate runtime monitoring by relying on the OS facilities and performance self-reporting for parallel and iterative applications. The framework efficiency is experimentally evaluated on a real hardware platform, where significant power and energy savings are attained for SPEC CPU2006 and PARSEC benchmarks, by guiding frequency scaling and intercluster migrations according to the runtime application behavior and predefined performance targets.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ardestani:2016:MMV, author = "Ehsan K. Ardestani and Rafael Trapani Possignolo and Jose Luis Briz and Jose Renau", title = "Managing Mismatches in Voltage Stacking with {CoreUnfolding}", journal = j-TACO, volume = "12", number = "4", pages = "43:1--43:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2835178", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Five percent to 25\% of power could be wasted before it is delivered to the computational resources on a die, due to inefficiencies of voltage regulators and resistive loss. The power delivery could benefit if, at the same power, the delivered voltage increases and the current decreases. This article presents CoreUnfolding, a technique that leverages voltage Stacking to improve power delivery efficiency. Our experiments show that about 10\% system-wide power can be saved, the voltage regulator area can be reduced by 30\%, di / dt improves 49\%, and the power pin count is reduced by 40\% ({\SGMLap} 20\% reduction in packaging costs), with negligible performance degradation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nair:2016:FFC, author = "Prashant J. Nair and David A. Roberts and Moinuddin K. 
Qureshi", title = "{FaultSim}: a Fast, Configurable Memory-Reliability Simulator for Conventional and {$3$D}-Stacked Systems", journal = j-TACO, volume = "12", number = "4", pages = "44:1--44:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2831234", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As memory systems scale, maintaining their Reliability Availability and Serviceability (RAS) is becoming more complex. To make matters worse, recent studies of DRAM failures in data centers and supercomputer environments have highlighted that large-granularity failures are common in DRAM chips. Furthermore, the move toward 3D-stacked memories can make the system vulnerable to newer failure modes, such as those occurring from faults in Through-Silicon Vias (TSVs). To architect future systems and to use emerging technology, system designers will need to employ strong error correction and repair techniques. Unfortunately, evaluating the relative effectiveness of these reliability mechanisms is often difficult and is traditionally done with analytical models, which are both error prone and time-consuming to develop. To this end, this article proposes FaultSim, a fast configurable memory-reliability simulation tool for 2D and 3D-stacked memory systems. FaultSim employs Monte Carlo simulations, which are driven by real-world failure statistics. We discuss the novel algorithms and data structures used in FaultSim to accelerate the evaluation of different resilience schemes. We implement BCH-1 (SECDED) and ChipKill codes using FaultSim and validate against an analytical model. FaultSim implements BCH-1 and ChipKill codes with a deviation of only 0.032\% and 8.41\% from the analytical model. 
FaultSim can simulate 1 million Monte Carlo trials (each for a period of 7 years) of BCH-1 and ChipKill codes in only 34 seconds and 33 seconds, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2016:ACS, author = "Byeongcheol Lee", title = "Adaptive Correction of Sampling Bias in Dynamic Call Graphs", journal = j-TACO, volume = "12", number = "4", pages = "45:1--45:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2840806", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article introduces a practical low-overhead adaptive technique of correcting sampling bias in profiling dynamic call graphs. Timer-based sampling keeps the overhead low but sampling bias lowers the accuracy when either observable call events or sampling actions are not equally spaced in time. To mitigate sampling bias, our adaptive correction technique weights each sample by monitoring time-varying spacing of call events and sampling actions. We implemented and evaluated our adaptive correction technique in Jikes RVM, a high-performance virtual machine. In our empirical evaluation, our technique significantly improved the sampling accuracy without measurable overhead and resulted in effective feedback directed inlining.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mcpherson:2016:FPL, author = "Andrew J. 
McPherson and Vijay Nagarajan and Susmit Sarkar and Marcelo Cintra", title = "Fence Placement for Legacy Data-Race-Free Programs via Synchronization Read Detection", journal = j-TACO, volume = "12", number = "4", pages = "46:1--46:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2835179", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Shared-memory programmers traditionally assumed Sequential Consistency (SC), but modern systems have relaxed memory consistency. Here, the trend in languages is toward Data-Race-Free (DRF) models, where, assuming annotated synchronizations and the program being well-synchronized by those synchronizations, the hardware and compiler guarantee SC. However, legacy programs lack annotations, so even well-synchronized (legacy DRF) programs aren't recognized. For legacy DRF programs, we can significantly prune the set of memory orderings determined by automated fence placement by automatically identifying synchronization reads. We prove our rules for identifying them conservatively, implement them within LLVM, and observe a 30\% average performance improvement over previous techniques.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hong:2016:OCT, author = "Ding-Yong Hong and Chun-Chen Hsu and Cheng-Yi Chou and Wei-Chung Hsu and Pangfeng Liu and Jan-Jan Wu", title = "Optimizing Control Transfer and Memory Virtualization in Full System Emulators", journal = j-TACO, volume = "12", number = "4", pages = "47:1--47:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2837027", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Full system emulators provide virtual platforms for several important applications, such as kernel and system software development, co-verification with cycle accurate CPU simulators, or application development for hardware still in development. Full system emulators usually use dynamic binary translation to obtain reasonable performance. This paper focuses on optimizing the performance of full system emulators. First, we optimize performance by enabling classic control transfer optimizations of dynamic binary translation in full system emulation, such as indirect branch target caching and block chaining. Second, we improve the performance of memory virtualization of cross-ISA virtual machines by improving the efficiency of the software translation lookaside buffer (software TLB). We implement our optimizations on QEMU, an industrial-strength full system emulator, along with the Android emulator. Experimental results show that our optimizations achieve an average speedup of 1.98X for ARM-to-X86-64 QEMU running SPEC CINT2006 benchmarks with train inputs. Our optimizations also achieve an average speedup of 1.44X and 1.40X for IA32-to-X86-64 QEMU and AArch64-to-X86-64 QEMU on SPEC CINT2006. 
We use a set of real applications downloaded from Google Play as benchmarks for the Android emulator. Experimental results show that our optimizations achieve an average speedup of 1.43X for the Android emulator running these applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sukumaran-Rajam:2016:PMN, author = "Aravind Sukumaran-Rajam and Philippe Clauss", title = "The Polyhedral Model of Nonlinear Loops", journal = j-TACO, volume = "12", number = "4", pages = "48:1--48:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2838734", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Runtime code optimization and speculative execution are becoming increasingly prominent to leverage performance in the current multi- and many-core era. However, a wider and more efficient use of such techniques is mainly hampered by the prohibitive time overhead induced by centralized data race detection, dynamic code behavior modeling, and code generation. Most of the existing Thread Level Speculation (TLS) systems rely on naively slicing the target loops into chunks and trying to execute the chunks in parallel with the help of a centralized performance-penalizing verification module that takes care of data races. Due to the lack of a data dependence model, these speculative systems are not capable of doing advanced transformations, and, more importantly, the chances of rollback are high. The polyhedral model is a well-known mathematical model to analyze and optimize loop nests. The current state-of-the-art tools limit the application of the polyhedral model to static control codes. 
Thus, none of these tools can generally handle codes with while loops, indirect memory accesses, or pointers. Apollo (Automatic POLyhedral Loop Optimizer) is a framework that goes one step beyond and applies the polyhedral model dynamically by using TLS. Apollo can predict, at runtime, whether the codes are behaving linearly or not, and it applies polyhedral transformations on-the-fly. This article presents a novel system that enables Apollo to handle codes whose memory accesses and loop bounds are not necessarily linear. More generally, this approach expands the applicability of the polyhedral model at runtime to a wider class of codes. Plugging together both linear and nonlinear accesses to the dependence prediction model enables the application of polyhedral loop optimizing transformations even for nonlinear code kernels while also allowing a low-cost speculation verification.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nair:2016:CEP, author = "Prashant J. Nair and David A. Roberts and Moinuddin K. Qureshi", title = "Citadel: Efficiently Protecting Stacked Memory from {TSV} and Large Granularity Failures", journal = j-TACO, volume = "12", number = "4", pages = "49:1--49:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2840807", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Stacked memory modules are likely to be tightly integrated with the processor. It is vital that these memory modules operate reliably, as memory failure can require the replacement of the entire socket. 
To make matters worse, stacked memory designs are susceptible to newer failure modes (e.g., due to faulty through-silicon vias, or TSVs) that can cause large portions of memory, such as a bank, to become faulty. To avoid data loss from large-granularity failures, the memory system may use symbol-based codes that stripe the data for a cache line across several banks (or channels). Unfortunately, such data-striping reduces memory-level parallelism, causing significant slowdown and higher power consumption. This article proposes Citadel, a robust memory architecture that allows the memory system to retain each cache line within one bank. By retaining cache lines within banks, Citadel enables a high-performance and low-power memory system and also efficiently protects the stacked memory system from large-granularity failures. Citadel consists of three components: TSV-Swap, which can tolerate both faulty data-TSVs and faulty address-TSVs; Tri-Dimensional Parity (3DP), which can tolerate column failures, row failures, and bank failures; and Dynamic Dual-Granularity Sparing (DDS), which can mitigate permanent faults by dynamically sparing faulty memory regions either at a row granularity or at a bank granularity. Our evaluations with real-world data for DRAM failures show that Citadel provides performance and power similar to maintaining the entire cache line in the same bank, and yet provides 700 $ \times $ higher reliability than ChipKill-like ECC codes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Anderson:2016:AVI, author = "Andrew Anderson and Avinash Malik and David Gregg", title = "Automatic Vectorization of Interleaved Data Revisited", journal = j-TACO, volume = "12", number = "4", pages = "50:1--50:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2838735", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Automatically exploiting short vector instruction sets (SSE, AVX, NEON) is a critically important task for optimizing compilers. Vector instructions typically work best on data that is contiguous in memory, and operating on non-contiguous data requires additional work to gather and scatter the data. There are several varieties of non-contiguous access, including interleaved data access. An existing approach used by GCC generates extremely efficient code for loops with power-of-2 interleaving factors (strides). In this paper we propose a generalization of this approach that produces similar code for any compile-time constant interleaving factor. In addition, we propose several novel program transformations, which were made possible by our generalized representation of the problem. Experiments show that our approach achieves significant speedups for both power-of-2 and non--power-of-2 interleaving factors. Our vectorization approach results in mean speedups over scalar code of 1.77x on Intel SSE and 2.53x on Intel AVX2 in real-world benchmarking on a selection of BLAS Level 1 routines. On the same benchmark programs, GCC 5.0 achieves mean improvements of 1.43x on Intel SSE and 1.30x on Intel AVX2. 
In synthetic benchmarking on Intel SSE, our maximum improvement on data movement is over 4x for gathering operations and over 6x for scattering operations versus scalar code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2016:FMR, author = "Lihang Zhao and Lizhong Chen and Woojin Choi and Jeffrey Draper", title = "A Filtering Mechanism to Reduce Network Bandwidth Utilization of Transaction Execution", journal = j-TACO, volume = "12", number = "4", pages = "51:1--51:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2837028", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Hardware Transactional Memory (HTM) relies heavily on the on-chip network for intertransaction communication. However, the network bandwidth utilization of transactions has been largely neglected in HTM designs. In this work, we propose a cost model to analyze network bandwidth in transaction execution. The cost model identifies a set of key factors that can be optimized through system design to reduce the communication cost of HTM. Based on the model and network traffic characterization of a representative HTM design, we identify a huge source of superfluous traffic due to failed requests in transaction conflicts. As observed in a spectrum of workloads, 39\% of the transactional requests fail due to conflicts, which renders 58\% of the transactional network traffic futile. To combat this pathology, a novel in-network filtering mechanism is proposed. The on-chip router is augmented to predict conflicts among transactions and proactively filter out those requests that have a high probability to fail. 
Experimental results show the proposed mechanism reduces total network traffic by 24\% on average for a set of high-contention TM applications, thereby reducing energy consumption by an average of 24\%. Meanwhile, the contention in the coherence directory is reduced by 68\%, on average. These improvements are achieved with only 5\% area added to a conventional on-chip router design.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Serres:2016:EPP, author = "Olivier Serres and Abdullah Kayi and Ahmad Anbar and Tarek El-Ghazawi", title = "Enabling {PGAS} Productivity with Hardware Support for Shared Address Mapping: a {UPC} Case Study", journal = j-TACO, volume = "12", number = "4", pages = "52:1--52:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2842686", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Due to its rich memory model, the partitioned global address space (PGAS) parallel programming model strikes a balance between locality-awareness and the ease of use of the global address space model. Although locality-awareness can lead to high performance, supporting the PGAS memory model is associated with penalties that can hinder PGAS's potential for scalability and speed of execution. This is because mapping the PGAS memory model to the underlying system requires a mapping process that is done in software, thereby introducing substantial overhead for shared accesses even when they are local. Compiler optimizations have not been sufficient to offset this overhead. On the other hand, manual code optimizations can help, but this eliminates the productivity edge of PGAS. 
This article proposes a processor microarchitecture extension that can perform such address mapping in hardware with nearly no performance overhead. These extensions are then availed to compilers through extensions to the processor instructions. Thus, the need for manual optimizations is eliminated and the productivity of PGAS languages is unleashed. Using Unified Parallel C (UPC), a PGAS language, we present a case study of a prototype compiler and architecture support. Two different implementations of the system were realized. The first uses a full-system simulator, gem5, which evaluates the overall performance gain of the new hardware support. The second uses an FPGA Leon3 soft-core processor to verify implementation feasibility and to parameterize the cost of the new hardware. The new instructions show promising results on all tested codes, including the NAS Parallel Benchmark kernels in UPC. Performance improvements of up to 5.5 $ \times $ for unmodified codes, sometimes surpassing hand-optimized performance, were demonstrated. We also show that our four-core FPGA prototype requires less than 2.4\% of the overall chip's area.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cattaneo:2016:HAI, author = "Riccardo Cattaneo and Giuseppe Natale and Carlo Sicignano and Donatella Sciuto and Marco Domenico Santambrogio", title = "On How to Accelerate Iterative Stencil Loops: a Scalable Streaming-Based Approach", journal = j-TACO, volume = "12", number = "4", pages = "53:1--53:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2842615", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In high-performance systems, stencil computations play a crucial role as they appear in a variety of different fields of application, ranging from partial differential equation solving, to computer simulation of particles' interaction, to image processing and computer vision. The computationally intensive nature of those algorithms created the need for solutions to efficiently implement them in order to save both execution time and energy. This, in combination with their regular structure, has justified their widespread study and the proposal of largely different approaches to their optimization. However, most of these works are focused on aggressive compile time optimization, cache locality optimization, and parallelism extraction for the multicore/multiprocessor domain, while fewer works are focused on the exploitation of custom architectures to further exploit the regular structure of Iterative Stencil Loops (ISLs), specifically with the goal of improving power efficiency. This work introduces a methodology to systematically design power-efficient hardware accelerators for the optimal execution of ISL algorithms on Field-programmable Gate Arrays (FPGAs). 
As part of the methodology, we introduce the notion of Streaming Stencil Time-step (SST), a streaming-based architecture capable of achieving both low resource usage and efficient data reuse thanks to an optimal data buffering strategy, and we introduce a technique called SSTs queuing that is capable of delivering a pseudolinear execution time speedup with constant bandwidth. The methodology has been validated on significant benchmarks on a Virtex-7 FPGA using the Xilinx Vivado suite. Results demonstrate how the efficient usage of the on-chip memory resources realized by an SST allows one to treat problem sizes whose implementation would otherwise not be possible via direct synthesis of the original, unmanipulated code via High-Level Synthesis (HLS). We also show how the SSTs queuing effectively ensures a pseudolinear throughput speedup while consuming constant off-chip bandwidth.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{C:2016:FGM, author = "Unnikrishnan C and Rupesh Nasre and Y. N. Srikant", title = "{Falcon}: a Graph Manipulation Language for Heterogeneous Systems", journal = j-TACO, volume = "12", number = "4", pages = "54:1--54:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2842618", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graph algorithms have been shown to possess enough parallelism to keep several computing resources busy---even hundreds of cores on a GPU. Unfortunately, tuning their implementation for efficient execution on a particular hardware configuration of heterogeneous systems consisting of multicore CPUs and GPUs is challenging, time consuming, and error prone. 
To address these issues, we propose a domain-specific language (DSL), Falcon, for implementing graph algorithms that (i) abstracts the hardware, (ii) provides constructs to write explicitly parallel programs at a higher level, and (iii) can work with general algorithms that may change the graph structure (morph algorithms). We illustrate the usage of our DSL to implement local computation algorithms (that do not change the graph structure) and morph algorithms such as Delaunay mesh refinement, survey propagation, and dynamic SSSP on GPU and multicore CPUs. Using a set of benchmark graphs, we illustrate that the generated code performs close to the state-of-the-art hand-tuned implementations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", remark = "Yes, the first author name is correct as given: Unnikrishnan C.", } @Article{Kalayappan:2016:FRT, author = "Rajshekar Kalayappan and Smruti R. Sarangi", title = "{FluidCheck}: a Redundant Threading-Based Approach for Reliable Execution in Manycore Processors", journal = j-TACO, volume = "12", number = "4", pages = "55:1--55:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2842620", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Soft errors have become a serious cause of concern with reducing feature sizes. The ability to accommodate complex, Simultaneous Multithreading (SMT) cores on a single chip presents a unique opportunity to achieve reliable execution, safe from soft errors, with low performance penalties. 
In this context, we present FluidCheck, a checker architecture that allows highly flexible assignment and migration of checking duties across cores. In this article, we present a mechanism to dynamically use the resources of SMT cores for checking the results of other threads, and propose a variety of heuristics for migration of such checker threads across cores. Secondly, to make the process of checking more efficient, we propose a set of architectural enhancements that reduce power consumption, decrease the length of the critical path, and reduce the load on the Network-on-Chip (NoC). Based on our observations, we design a 16 core system for running SPEC2006 based bag-of-tasks applications. Our experiments demonstrate that fully reliable execution can be attained with a mere 27\% slowdown, surpassing traditional redundant threading based techniques by roughly 42\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Elwell:2016:RMP, author = "Jesse Elwell and Ryan Riley and Nael Abu-Ghazaleh and Dmitry Ponomarev and Iliano Cervesato", title = "Rethinking Memory Permissions for Protection Against Cross-Layer Attacks", journal = j-TACO, volume = "12", number = "4", pages = "56:1--56:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2842621", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The inclusive permissions structure (e.g., the Intel ring model) of modern commodity CPUs provides privileged system software layers with arbitrary permissions to access and modify client processes, allowing them to manage these clients and the system resources efficiently. 
Unfortunately, these inclusive permissions allow a compromised high-privileged software layer to perform arbitrary malicious activities. In this article, our goal is to prevent attacks that cross system layers while maintaining the abilities of system software to manage the system and allocate resources. In particular, we present a hardware-supported page permission framework for physical pages that is based on the concept of noninclusive sets of memory permissions for different layers of system software (such as hypervisors, operating systems, and user-level applications). Instead of viewing privilege levels as an ordered hierarchy with each successive level being more privileged, we view them as distinct levels each with its own set of permissions. In order to enable system software to manage client processes, we define a set of legal permission transitions that support resource allocation but preserve security. We show that the model prevents a range of recent attacks. We also show that it can be implemented with negligible performance overhead (both at load time and at runtime), low hardware complexity, and minimal changes to the commodity OS and hypervisor code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Morad:2016:RGS, author = "Amir Morad and Leonid Yavits and Shahar Kvatinsky and Ran Ginosar", title = "Resistive {GP-SIMD} Processing-In-Memory", journal = j-TACO, volume = "12", number = "4", pages = "57:1--57:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2845084", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "GP-SIMD, a novel hybrid general-purpose SIMD architecture, addresses the challenge of data synchronization by in-memory computing, through combining data storage and massive parallel processing. In this article, we explore a resistive implementation of the GP-SIMD architecture. In resistive GP-SIMD, a novel resistive row and column addressable 4F$^2$ crossbar is utilized, replacing the modified CMOS 190F$^2$ SRAM storage previously proposed for GP-SIMD architecture. The use of the resistive crossbar allows scaling the GP-SIMD from few millions to few hundred millions of processing units on a single silicon die. The performance, power consumption and power efficiency of a resistive GP-SIMD are compared with the CMOS version. We find that PiM architectures and, specifically, GP-SIMD benefit more than other many-core architectures from using resistive memory. A framework for in-place arithmetic operation on a single multivalued resistive cell is explored, demonstrating a potential to become a building block for next-generation PiM architectures.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2016:IIB, author = "Yaohua Wang and Dong Wang and Shuming Chen and Zonglin Liu and Shenggang Chen and Xiaowen Chen and Xu Zhou", title = "Iteration Interleaving--Based {SIMD} Lane Partition", journal = j-TACO, volume = "12", number = "4", pages = "58:1--58:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2847253", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The efficacy of single instruction, multiple data (SIMD) architectures is limited when handling divergent control flows. This circumstance results in SIMD fragments using only a subset of the available lanes. We propose an iteration interleaving--based SIMD lane partition (IISLP) architecture that interleaves the execution of consecutive iterations and dynamically partitions SIMD lanes into branch paths with comparable execution time. The benefits are twofold: SIMD fragments under divergent branches can execute in parallel, and the pathology of fragment starvation can also be well eliminated. Our experiments show that IISLP doubles the performance of a baseline mechanism and provides a speedup of 28\% versus instruction shuffle.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Aijo:2016:ILP, author = "Tomi {\"A}ij{\"o} and Pekka J{\"a}{\"a}skel{\"a}inen and Tapio Elomaa and Heikki Kultala and Jarmo Takala", title = "Integer Linear Programming-Based Scheduling for Transport Triggered Architectures", journal = j-TACO, volume = "12", number = "4", pages = "59:1--59:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2845082", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Static multi-issue machines, such as traditional Very Long Instruction Word (VLIW) architectures, move complexity from the hardware to the compiler. This is motivated by the ability to support high degrees of instruction-level parallelism without requiring complicated scheduling logic in the processor hardware. The simpler-control hardware results in reduced area and power consumption, but leads to a challenge of engineering a compiler with good code-generation quality. Transport triggered architectures (TTA), and other so-called exposed datapath architectures, take the compiler-oriented philosophy even further by pushing more details of the datapath under software control. The main benefit of this is the reduced register file pressure, with a drawback of adding even more complexity to the compiler side. In this article, we propose an Integer Linear Programming (ILP) -based instruction scheduling model for TTAs. The model describes the architecture characteristics, the particular processor resource constraints, and the operation dependencies of the scheduled program. The model is validated and measured by compiling application kernels to various TTAs with a different number of datapath components and connectivity. 
In the best case, the cycle count is reduced to 52\% when compared to a heuristic scheduler. In addition to producing shorter schedules, the number of register accesses in the compiled programs is generally notably less than those with the heuristic scheduler; in the best case, the ILP scheduler reduced the number of register file reads to 33\% of the heuristic results and register file writes to 18\%. On the other hand, as expected, the ILP-based scheduler uses distinctly more time to produce a schedule than the heuristic scheduler, but the compilation time is within tolerable limits for production-code generation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2016:SEA, author = "Qixiao Liu and Miquel Moreto and Jaume Abella and Francisco J. Cazorla and Daniel A. Jimenez and Mateo Valero", title = "Sensible Energy Accounting with Abstract Metering for Multicore Systems", journal = j-TACO, volume = "12", number = "4", pages = "60:1--60:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2842616", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Chip multicore processors (CMPs) are the preferred processing platform across different domains such as data centers, real-time systems, and mobile devices. In all those domains, energy is arguably the most expensive resource in a computing system. Accurately quantifying energy usage in a multicore environment presents a challenge as well as an opportunity for optimization. 
Standard metering approaches are not capable of delivering consistent results with shared resources, since the same task with the same inputs may have different energy consumption based on the mix of co-running tasks. However, it is reasonable for data-center operators to charge on the basis of estimated energy usage rather than time since energy is more correlated with their actual cost. This article introduces the concept of Sensible Energy Accounting (SEA). For a task running in a multicore system, SEA accurately estimates the energy the task would have consumed running in isolation with a given fraction of the CMP shared resources. We explain the potential benefits of SEA in different domains and describe two hardware techniques to implement it for a shared last-level cache and on-core resources in SMT processors. Moreover, with SEA, an energy-aware scheduler can find a highly efficient on-chip resource assignment, reducing by up to 39\% the total processor energy for a 4-core system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2016:SAC, author = "Miao Zhou and Yu Du and Bruce Childers and Daniel Mosse and Rami Melhem", title = "Symmetry-Agnostic Coordinated Management of the Memory Hierarchy in Multicore Systems", journal = j-TACO, volume = "12", number = "4", pages = "61:1--61:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2847254", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In a multicore system, many applications share the last-level cache (LLC) and memory bandwidth. These resources need to be carefully managed in a coordinated way to maximize performance. 
DRAM is still the technology of choice in most systems. However, as traditional DRAM technology faces energy, reliability, and scalability challenges, nonvolatile memory (NVM) technologies are gaining traction. While DRAM is read/write symmetric (a read operation has comparable latency and energy consumption as a write operation), many NVM technologies (such as Phase-Change Memory, PCM) experience read/write asymmetry: write operations are typically much slower and more power hungry than read operations. Whether the memory's characteristics are symmetric or asymmetric influences the way shared resources are managed. We propose two symmetry-agnostic schemes to manage a shared LLC through way partitioning and memory through bandwidth allocation. The proposals work well for both symmetric and asymmetric memory. First, an exhaustive search is proposed to find the best combination of a cache way partition and bandwidth allocation. Second, an approximate scheme, derived from a theoretical model, is proposed without the overhead of exhaustive search. Simulation results show that the approximate scheme improves weighted speedup by at least 14\% on average (regardless of the memory symmetry) over a state-of-the-art way partitioning and memory bandwidth allocation. Simulation results also show that the approximate scheme achieves comparable weighted speedup as a state-of-the-art multiple resource management scheme, XChange, for symmetric memory, and outperforms it by an average of 10\% for asymmetric memory.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yazdanbakhsh:2016:RRF, author = "Amir Yazdanbakhsh and Gennady Pekhimenko and Bradley Thwaites and Hadi Esmaeilzadeh and Onur Mutlu and Todd C. 
Mowry", title = "{RFVP}: Rollback-Free Value Prediction with Safe-to-Approximate Loads", journal = j-TACO, volume = "12", number = "4", pages = "62:1--62:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2836168", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article aims to tackle two fundamental memory bottlenecks: limited off-chip bandwidth (bandwidth wall) and long access latency (memory wall). To achieve this goal, our approach exploits the inherent error resilience of a wide range of applications. We introduce an approximation technique, called Rollback-Free Value Prediction (RFVP). When certain safe-to-approximate load operations miss in the cache, RFVP predicts the requested values. However, RFVP does not check for or recover from load-value mispredictions, hence, avoiding the high cost of pipeline flushes and re-executions. RFVP mitigates the memory wall by enabling the execution to continue without stalling for long-latency memory accesses. To mitigate the bandwidth wall, RFVP drops a fraction of load requests that miss in the cache after predicting their values. Dropping requests reduces memory bandwidth contention by removing them from the system. The drop rate is a knob to control the trade-off between performance/energy efficiency and output quality. Our extensive evaluations show that RFVP, when used in GPUs, yields significant performance improvement and energy reduction for a wide range of quality-loss levels. We also evaluate RFVP's latency benefits for a single core CPU. The results show performance improvement and energy reduction for a wide variety of applications with less than 1\% loss in quality.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "62", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2016:SML, author = "Donghyuk Lee and Saugata Ghose and Gennady Pekhimenko and Samira Khan and Onur Mutlu", title = "Simultaneous Multi-Layer Access: Improving {$3$D}-Stacked Memory Bandwidth at Low Cost", journal = j-TACO, volume = "12", number = "4", pages = "63:1--63:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2832911", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "3D-stacked DRAM alleviates the limited memory bandwidth bottleneck that exists in modern systems by leveraging through silicon vias (TSVs) to deliver higher external memory channel bandwidth. Today's systems, however, cannot fully utilize the higher bandwidth offered by TSVs, due to the limited internal bandwidth within each layer of the 3D-stacked DRAM. We identify that the bottleneck to enabling higher bandwidth in 3D-stacked DRAM is now the global bitline interface, the connection between the DRAM row buffer and the peripheral IO circuits. The global bitline interface consists of a limited and expensive set of wires and structures, called global bitlines and global sense amplifiers, whose high cost makes it difficult to simply scale up the bandwidth of the interface within a single DRAM layer in the 3D stack. We alleviate this bandwidth bottleneck by exploiting the observation that several global bitline interfaces already exist across the multiple DRAM layers in current 3D-stacked designs, but only a fraction of them are enabled at the same time. 
We propose a new 3D-stacked DRAM architecture, called Simultaneous Multi-Layer Access (SMLA), which increases the internal DRAM bandwidth by accessing multiple DRAM layers concurrently, thus making much greater use of the bandwidth that the TSVs offer. To avoid channel contention, the DRAM layers must coordinate with each other when simultaneously transferring data. We propose two approaches to coordination, both of which deliver four times the bandwidth for a four-layer DRAM, over a baseline that accesses only one layer at a time. Our first approach, Dedicated-IO, statically partitions the TSVs by assigning each layer to a dedicated set of TSVs that operate at a higher frequency. Unfortunately, Dedicated-IO requires a nonuniform design for each layer (increasing manufacturing costs), and its DRAM energy consumption scales linearly with the number of layers. Our second approach, Cascaded-IO, solves both issues by instead time multiplexing all of the TSVs across layers. Cascaded-IO reduces DRAM energy consumption by lowering the operating frequency of higher layers. Our evaluations show that SMLA provides significant performance improvement and energy reduction across a variety of workloads (55\%/18\% on average for multiprogrammed workloads, respectively) over a baseline 3D-stacked DRAM, with low overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "63", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Na:2016:JPC, author = "Yeoul Na and Seon Wook Kim and Youngsun Han", title = "{JavaScript} Parallelizing Compiler for Exploiting Parallelism from Data-Parallel {HTML5} Applications", journal = j-TACO, volume = "12", number = "4", pages = "64:1--64:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2846098", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the advent of the HTML5 standard, JavaScript is increasingly processing computationally intensive, data-parallel workloads. Thus, the enhancement of JavaScript performance has been emphasized because the performance gap between JavaScript and native applications is still substantial. Despite this urgency, conventional JavaScript compilers do not exploit much of parallelism even from data-parallel JavaScript applications, despite contemporary mobile devices being equipped with expensive parallel hardware platforms, such as multicore processors and GPGPUs. In this article, we propose an automatically parallelizing JavaScript compiler that targets emerging, data-parallel HTML5 applications by leveraging the mature affine loop analysis of conventional static compilers. We identify that the most critical issues when parallelizing JavaScript with a conventional static analysis are ensuring correct parallelization, minimizing compilation overhead, and conducting low-cost recovery when there is a speculation failure during parallel execution. We propose a mechanism for safely handling the failure at a low cost, based on compiler techniques and the property of idempotence. 
Our experiment shows that the proposed JavaScript parallelizing compiler detects most affine parallel loops. Also, we achieved a maximum speedup of 3.22 times on a quad-core system, while incurring negligible compilation and recovery overheads with various sets of data-parallel HTML5 applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "64", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Usui:2016:DDA, author = "Hiroyuki Usui and Lavanya Subramanian and Kevin Kai-Wei Chang and Onur Mutlu", title = "{DASH}: Deadline-Aware High-Performance Memory Scheduler for Heterogeneous Systems with Hardware Accelerators", journal = j-TACO, volume = "12", number = "4", pages = "65:1--65:??", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2847255", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern SoCs integrate multiple CPU cores and hardware accelerators (HWAs) that share the same main memory system, causing interference among memory requests from different agents. The result of this interference, if it is not controlled well, is missed deadlines for HWAs and low CPU performance. Few previous works have tackled this problem. State-of-the-art mechanisms designed for CPU-GPU systems strive to meet a target frame rate for GPUs by prioritizing the GPU close to the time when it has to complete a frame. We observe two major problems when such an approach is adapted to a heterogeneous CPU-HWA system. First, HWAs miss deadlines because they are prioritized only when close to their deadlines. 
Second, such an approach does not consider the diverse memory access characteristics of different applications running on CPUs and HWAs, leading to low performance for latency-sensitive CPU applications and deadline misses for some HWAs, including GPUs. In this article, we propose a Deadline-Aware memory Scheduler for Heterogeneous systems (DASH), which overcomes these problems using three key ideas, with the goal of meeting HWAs' deadlines while providing high CPU performance. First, DASH prioritizes an HWA when it is not on track to meet its deadline any time during a deadline period, instead of prioritizing it only when close to a deadline. Second, DASH prioritizes HWAs over memory-intensive CPU applications based on the observation that memory-intensive applications' performance is not sensitive to memory latency. Third, DASH treats short-deadline HWAs differently as they are more likely to miss their deadlines and schedules their requests based on worst-case memory access time estimates. Extensive evaluations across a wide variety of different workloads and systems show that DASH achieves significantly better CPU performance than the best previous scheduler while always meeting the deadlines for all HWAs, including GPUs, thereby largely improving frame rates.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "65", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kafshdooz:2016:CTO, author = "Morteza Mohajjel Kafshdooz and Mohammadkazem Taram and Sepehr Assadi and Alireza Ejlali", title = "A Compile-Time Optimization Method for {WCET} Reduction in Real-Time Embedded Systems through Block Formation", journal = j-TACO, volume = "12", number = "4", pages = "66:1--66:25", month = jan, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2845083", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Feb 16 15:36:38 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compile-time optimizations play an important role in the efficient design of real-time embedded systems. Usually, compile-time optimizations are designed to reduce average-case execution time (ACET). While ACET is a main concern in high-performance computing systems, in real-time embedded systems, concerns are different and worst-case execution time (WCET) is much more important than ACET. Therefore, WCET reduction is more desirable than ACET reduction in many real-time embedded systems. In this article, we propose a compile-time optimization method aimed at reducing WCET in real-time embedded systems. In the proposed method, based on the predicated execution capability of embedded processors, program code blocks that are in the worst-case paths of the program are merged to increase instruction-level parallelism and opportunity for WCET reduction. The use of predicated execution enables merging code blocks from different worst-case paths that can be very effective in WCET reduction. The experimental results show that the proposed method can reduce WCET by up to 45\% as compared to previous compile-time block formation methods. 
It is noteworthy that compared to previous works, while the proposed method usually achieves more WCET reduction, it has considerably less negative impact on ACET and code size.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "66", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Koukos:2016:BHU, author = "Konstantinos Koukos and Alberto Ros and Erik Hagersten and Stefanos Kaxiras", title = "Building Heterogeneous {Unified Virtual Memories (UVMs)} without the Overhead", journal = j-TACO, volume = "13", number = "1", pages = "1:1--1:22", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2889488", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This work proposes a novel scheme to facilitate heterogeneous systems with unified virtual memory. Research proposals implement coherence protocols for sequential consistency (SC) between central processing unit (CPU) cores and between devices. Such mechanisms introduce severe bottlenecks in the system; therefore, we adopt the heterogeneous-race-free (HRF) memory model. The use of HRF simplifies the coherency protocol and the graphics processing unit (GPU) memory management unit (MMU). Our protocol optimizes CPU and GPU demands separately, with the GPU part being simpler while the CPU is more elaborate and latency aware. We achieve an average 45\% speedup and 45\% energy-delay product (EDP) reduction (20\% energy) over the corresponding SC implementation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2016:DMB, author = "Zhigang Wang and Xiaolin Wang and Fang Hou and Yingwei Luo and Zhenlin Wang", title = "Dynamic Memory Balancing for Virtualization", journal = j-TACO, volume = "13", number = "1", pages = "2:1--2:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2851501", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Allocating memory dynamically for virtual machines (VMs) according to their demands provides significant benefits as well as great challenges. Efficient memory resource management requires knowledge of the memory demands of applications or systems at runtime. A widely proposed approach is to construct a miss ratio curve (MRC) for a VM, which not only summarizes the current working set size (WSS) of the VM but also models the relationship between its performance and the target memory allocation size. Unfortunately, the cost of monitoring and maintaining the MRC structures is nontrivial. This article first introduces a low-cost WSS tracking system with effective optimizations on data structures, as well as an efficient mechanism to decrease the frequency of monitoring. We also propose a Memory Balancer (MEB), which dynamically reallocates guest memory based on the predicted WSS. Our experimental results show that our prediction schemes yield a high accuracy of 95.2\% and low overhead of 2\%. Furthermore, the overall system throughput can be significantly improved with MEB, which brings a speedup up to 7.4 for two to four VMs and 4.54 for an overcommitted system with 16 VMs.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2016:HPC, author = "Xueyang Wang and Sek Chai and Michael Isnardi and Sehoon Lim and Ramesh Karri", title = "Hardware Performance Counter-Based Malware Identification and Detection with Adaptive Compressive Sensing", journal = j-TACO, volume = "13", number = "1", pages = "3:1--3:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2857055", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Hardware Performance Counter-based (HPC) runtime checking is an effective way to identify malicious behaviors of malware and detect malicious modifications to a legitimate program's control flow. To reduce the overhead in the monitored system which has limited storage and computing resources, we present a ``sample-locally-analyze-remotely'' technique. The sampled HPC data are sent to a remote server for further analysis. To minimize the I/O bandwidth required for transmission, the fine-grained HPC profiles are compressed into much smaller vectors with Compressive Sensing. The experimental results demonstrate an 80\% I/O bandwidth reduction after applying Compressive Sensing, without compromising the detection and identification capabilities.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Akram:2016:BPG, author = "Shoaib Akram and Jennifer B. 
Sartor and Kenzo {Van Craeynest} and Wim Heirman and Lieven Eeckhout", title = "Boosting the Priority of Garbage: Scheduling Collection on Heterogeneous Multicore Processors", journal = j-TACO, volume = "13", number = "1", pages = "4:1--4:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2875424", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "While hardware is evolving toward heterogeneous multicore architectures, modern software applications are increasingly written in managed languages. Heterogeneity was born of a need to improve energy efficiency; however, we want the performance of our applications not to suffer from limited resources. How best to schedule managed language applications on a mix of big, out-of-order cores and small, in-order cores is an open question, complicated by the host of service threads that perform key tasks such as memory management. These service threads compete with the application for core and memory resources, and garbage collection (GC) must sometimes suspend the application if there is not enough memory available for allocation. In this article, we explore concurrent garbage collection's behavior, particularly when it becomes critical, and how to schedule it on a heterogeneous system to optimize application performance. While some applications see no difference in performance when GC threads are run on big versus small cores, others --- those with GC criticality --- see up to an 18\% performance improvement. We develop a new, adaptive scheduling algorithm that responds to GC criticality signals from the managed runtime, giving more big-core cycles to the concurrent collector when it is under pressure and in danger of suspending the application. 
Our experimental results show that our GC-criticality-aware scheduler is robust across a range of heterogeneous architectures with different core counts and frequency scaling and across heap sizes. Our algorithm is performance and energy neutral for GC-uncritical Java applications and significantly speeds up GC-critical applications by 16\%, on average, while being 20\% more energy efficient for a heterogeneous multicore with three big cores and one small core.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yilmaz:2016:ARS, author = "Buse Yilmaz and Baris Aktemur and Mar{\'\i}a J. Garzar{\'a}n and Sam Kamin and Furkan Kira{\c{c}}", title = "Autotuning Runtime Specialization for Sparse Matrix-Vector Multiplication", journal = j-TACO, volume = "13", number = "1", pages = "5:1--5:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2851500", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Runtime specialization is used for optimizing programs based on partial information available only at runtime. In this paper we apply autotuning on runtime specialization of Sparse Matrix-Vector Multiplication to predict a best specialization method among several. In 91\% to 96\% of the predictions, either the best or the second-best method is chosen. Predictions achieve average speedups that are very close to the speedups achievable when only the best methods are used. By using an efficient code generator and a carefully designed set of matrix features, we show the runtime costs can be amortized to bring performance benefits for many real-world cases.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2016:ERI, author = "Mingzhou Zhou and Bo Wu and Xipeng Shen and Yaoqing Gao and Graham Yiu", title = "Examining and Reducing the Influence of Sampling Errors on Feedback-Driven Optimizations", journal = j-TACO, volume = "13", number = "1", pages = "6:1--6:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2851502", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Feedback-driven optimization (FDO) is an important component in mainstream compilers. By allowing the compiler to reoptimize the program based on some profiles of the program's dynamic behaviors, it often enhances the quality of the generated code substantially. A barrier for using FDO is that it often requires many training runs to collect enough profiles to amortize the sensitivity of program optimizations to program input changes. Various sampling techniques have been explored to alleviate this time-consuming process. However, the lowered profile accuracy caused by sampling often hurts the benefits of FDO. This article gives the first systematic study in how sampling rates affect the accuracy of collected profiles and how the accuracy correlates with the usefulness of the profile for modern FDO. Studying basic block and edge profiles for FDO in two mature compilers reveals several counterintuitive observations, one of which is that profiling accuracy does not strongly correlate with the benefits of the FDO. A detailed analysis identifies three types of sampling-caused errors that critically impair the quality of the profiles for FDO. It then introduces a simple way to rectify profiles based on the findings. 
Experiments demonstrate that the simple rectification fixes most of those critical errors in sampled profiles and significantly enhances the effectiveness of FDO.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dantras:2016:OIB, author = "Amanieu D'antras and Cosmin Gorgovan and Jim Garside and Mikel Luj{\'a}n", title = "Optimizing Indirect Branches in Dynamic Binary Translators", journal = j-TACO, volume = "13", number = "1", pages = "7:1--7:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2866573", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Dynamic binary translation is a technology for transparently translating and modifying a program at the machine code level as it is running. A significant factor in the performance of a dynamic binary translator is its handling of indirect branches. Unlike direct branches, which have a known target at translation time, an indirect branch requires translating a source program counter address to a translated program counter address every time the branch is executed. This translation can impose a serious runtime penalty if it is not handled efficiently. MAMBO-X64, a dynamic binary translator that translates 32-bit ARM (AArch32) code to 64-bit ARM (AArch64) code, uses three novel techniques to improve the performance of indirect branch translation. Together, these techniques allow MAMBO-X64 to achieve a very low performance overhead of only 10\% on average compared to native execution of 32-bit programs. 
Hardware-assisted function returns use a software return address stack to predict the targets of function returns, making use of several novel optimizations while also exploiting hardware return address prediction. This technique has a significant impact on most benchmarks, reducing binary translation overhead compared to native execution by 40\% on average and by 90\% on some benchmarks. Branch table inference, an algorithm for detecting and translating branch tables, can reduce the overhead of translated code by up to 40\% on some SPEC CPU2006 benchmarks. The remaining indirect branches are handled using a fast atomic hash table, which is optimized to work with multiple threads. This last technique translates indirect branches using a single shared hash table while avoiding expensive synchronization in performance-critical lookup code. This allows the performance to be on par with thread-private hash tables while having superior memory scalability.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Martins:2016:CBS, author = "Luiz G. A. Martins and Ricardo Nobre and Jo{\~a}o M. P. Cardoso and Alexandre C. B. Delbem and Eduardo Marques", title = "Clustering-Based Selection for the Exploration of Compiler Optimization Sequences", journal = j-TACO, volume = "13", number = "1", pages = "8:1--8:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2883614", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A large number of compiler optimizations are nowadays available to users. These optimizations interact with each other and with the input code in several and complex ways. 
The sequence of application of optimization passes can have a significant impact on the performance achieved. The effect of the optimizations is both platform and application dependent. The exhaustive exploration of all viable sequences of compiler optimizations for a given code fragment is not feasible. As this exploration is a complex and time-consuming task, several researchers have focused on Design Space Exploration (DSE) strategies both to select optimization sequences to improve the performance of each function of the application and to reduce the exploration time. In this article, we present a DSE scheme based on a clustering approach for grouping functions with similarities and exploration of a reduced search space resulting from the combination of optimizations previously suggested for the functions in each group. The identification of similarities between functions uses a data mining method that is applied to a symbolic code representation. The data mining process combines three algorithms to generate clusters: the Normalized Compression Distance, the Neighbor Joining, and a new ambiguity-based clustering algorithm. Our experiments for evaluating the effectiveness of the proposed approach address the exploration of optimization sequences in the context of the ReflectC compiler, considering 49 compilation passes while targeting a Xilinx MicroBlaze processor, and aiming at performance improvements for 51 functions and four applications. Experimental results reveal that the use of our clustering-based DSE approach achieves a significant reduction in the total exploration time of the search space ($ 20 \times $ over a Genetic Algorithm approach) at the same time that considerable performance speedups (41\% over the baseline) were obtained using the optimized codes. Additional experiments were performed considering the LLVM compiler, considering 124 compilation passes, and targeting a LEON3 processor. 
The results show that our approach achieved geometric mean speedups of $ 1.49 \times $, $ 1.32 \times $, and $ 1.24 \times $ for the best 10, 20, and 30 functions, respectively, and a global improvement of 7\% over the performance obtained when compiling with -O2.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Do:2016:PEH, author = "Sang Wook Stephen Do and Michel Dubois", title = "Power Efficient Hardware Transactional Memory: Dynamic Issue of Transactions", journal = j-TACO, volume = "13", number = "1", pages = "9:1--9:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2875425", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Transactional Memory (TM) is no longer just an academic interest as industry has started to adopt the idea in its commercial products. In this paper, we propose Dynamic Transaction Issue (DTI), a new scheme that can be easily implemented on top of existing Hardware TM (HTM) systems, provided additional messages. Instead of wasting power and energy in transaction aborts, Dynamic Transaction Issue puts a processor core into a low-power state when there is a reasonable suspicion that the current transaction running on it will be aborted soon in the future. We have implemented Dynamic Transaction Issue on a cycle-accurate simulator of a multicore processor system with out-of-order superscalar cores, augmented with a power package and a TM package which add accurate dynamic power estimates and a TM framework to the simulator. 
Our simulation results show that Dynamic Transaction Issue can achieve energy savings up to 37\% from the energy consumption of a base machine with no mechanism to suppress useless aborts. We also compare Dynamic Transaction Issue with various alternative hardware TM mechanisms.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Evtyushkin:2016:UMC, author = "Dmitry Evtyushkin and Dmitry Ponomarev and Nael Abu-Ghazaleh", title = "Understanding and Mitigating Covert Channels Through Branch Predictors", journal = j-TACO, volume = "13", number = "1", pages = "10:1--10:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2870636", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Covert channels through shared processor resources provide secret communication between two malicious processes: the trojan and the spy. In this article, we classify, analyze, and compare covert channels through dynamic branch prediction units in modern processors. Through experiments on a real hardware platform, we compare contention-based channel and the channel that is based on exploiting the branch predictor's residual state. We analyze these channels in SMT and single-threaded environments under both clean and noisy conditions. Our results show that the residual state-based channel provides a cleaner signal and is effective even in noisy execution environments with another application sharing the same physical core with the trojan and the spy. 
We also estimate the capacity of the branch predictor covert channels and describe a software-only mitigation technique that is based on randomizing the state of the predictor tables on context switches. We show that this protection eliminates all covert channels through the branch prediction unit with minimal impact on performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2016:CAE, author = "Hao Zhou and Jingling Xue", title = "A Compiler Approach for Exploiting Partial {SIMD} Parallelism", journal = j-TACO, volume = "13", number = "1", pages = "11:1--11:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2886101", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Existing vectorization techniques are ineffective for loops that exhibit little loop-level parallelism but some limited superword-level parallelism (SLP). We show that effectively vectorizing such loops requires partial vector operations to be executed correctly and efficiently, where the degree of partial SIMD parallelism is smaller than the SIMD datapath width. We present a simple yet effective SLP compiler technique called Paver (PArtial VEctorizeR), formulated and implemented in LLVM as a generalization of the traditional SLP algorithm, to optimize such partially vectorizable loops. The key idea is to maximize SIMD utilization by widening vector instructions used while minimizing the overheads caused by memory access, packing/unpacking, and/or masking operations, without introducing new memory errors or new numeric exceptions.
For a set of 9 C/C++/Fortran applications with partial SIMD parallelism, Paver achieves significantly better kernel and whole-program speedups than LLVM on both Intel's AVX and ARM's NEON.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{VanDenBraak:2016:RGR, author = "Gert-Jan {Van Den Braak} and Henk Corporaal", title = "{R-GPU}: a Reconfigurable {GPU} Architecture", journal = j-TACO, volume = "13", number = "1", pages = "12:1--12:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2890506", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Over the last decade, Graphics Processing Unit (GPU) architectures have evolved from a fixed-function graphics pipeline to a programmable, energy-efficient compute accelerator for massively parallel applications. The compute power arises from the GPU's Single Instruction/Multiple Threads architecture: concurrently running many threads and executing them as Single Instruction/Multiple Data--style vectors. However, compute power is still lost due to cycles spent on data movement and control instructions instead of data computations. Even more cycles are lost on pipeline stalls resulting from long latency (memory) operations. To improve not only performance but also energy efficiency, we introduce R-GPU: a reconfigurable GPU architecture with communicating cores. R-GPU is an addition to a GPU, which can still be used as such, but also has the ability to reorganize the cores of a GPU in a reconfigurable network. In R-GPU data movement and control is implicit in the configuration of the network. 
Each core executes a fixed instruction, reducing instruction decode count and increasing energy efficiency. On a number of benchmarks we show an average performance improvement of $ 2.1 \times $ over the same GPU without modifications. We further make a conservative power estimation of R-GPU which shows that power consumption can be reduced by 6\%, leading to an energy consumption reduction of 55\%, while area only increases by a mere 4\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2016:TAA, author = "Peng Liu and Jiyang Yu and Michael C. Huang", title = "Thread-Aware Adaptive Prefetcher on Multicore Systems: Improving the Performance for Multithreaded Workloads", journal = j-TACO, volume = "13", number = "1", pages = "13:1--13:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2890505", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Most processors employ hardware data prefetching techniques to hide memory access latencies. However, the prefetching requests from different threads on a multicore processor can cause severe interference with prefetching and/or demand requests of others. The data prefetching can lead to significant performance degradation due to shared resource contention on shared memory multicore systems. This article proposes a thread-aware data prefetching mechanism based on low-overhead runtime information to tune prefetching modes and aggressiveness, mitigating the resource contention in the memory system. 
Our solution has three new components: (1) a self-tuning prefetcher that uses runtime feedback to dynamically adjust data prefetching modes and arguments of each thread, (2) a filtering mechanism that informs the hardware about which prefetching request can cause shared data invalidation and should be discarded, and (3) a limiter thread acceleration mechanism to estimate and accelerate the critical thread which has the longest completion time in the parallel region of execution. On a set of multithreaded parallel benchmarks, our thread-aware data prefetching mechanism improves the overall performance of 64-core system by 13\% over a multimode prefetch baseline system with two-level cache organization and conventional modified, exclusive, shared, and invalid-based directory coherence protocol. We compare our approach with the feedback directed prefetching technique and find that it provides 9\% performance improvement on multicore systems, while saving the memory bandwidth consumption.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gorgovan:2016:MLO, author = "Cosmin Gorgovan and Amanieu D'Antras and Mikel Luj{\'a}n", title = "{MAMBO}: a Low-Overhead Dynamic Binary Modification Tool for {ARM}", journal = j-TACO, volume = "13", number = "1", pages = "14:1--14:??", month = apr, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2896451", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Apr 5 16:27:36 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As the ARM architecture expands beyond its traditional embedded domain, there is a growing interest in dynamic binary modification (DBM) tools for general-purpose multicore processors that are part of the ARM family.
Existing DBM tools for ARM suffer from introducing large overheads in the execution of applications. The specific questions that this article addresses are (i) how to develop such DBM tools for the ARM architecture and (ii) whether new optimisations are plausible and needed. We describe the general design of MAMBO, a new DBM tool for ARM, which we release together with this publication, and introduce novel optimisations to handle indirect branches. In addition, we explore scenarios in which it may be possible to relax the transparency offered by DBM tools to allow extra optimisations to be applied. These scenarios arise from analysing the most typical usages: for example, application binaries without handcrafted assembly. The performance evaluation shows that MAMBO introduces small overheads for SPEC CPU2006 and PARSEC 3.0 when comparing with the execution times of the unmodified programs: a geometric mean overhead of 28\% on a Cortex-A9 and of 34\% on a Cortex-A15 for CPU2006, and between 27\% and 32\%, depending on the number of threads, for PARSEC on a Cortex-A15.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Theocharis:2016:BSC, author = "Panagiotis Theocharis and Bjorn {De Sutter}", title = "A Bimodal Scheduler for Coarse-Grained Reconfigurable Arrays", journal = j-TACO, volume = "13", number = "2", pages = "15:1--15:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2893475", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 27 16:18:10 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compilers for Coarse-Grained Reconfigurable Array (CGRA) architectures suffer from long compilation times and code quality levels far below the theoretical upper bounds.
This article presents a new scheduler, called the Bimodal Modulo Scheduler (BMS), to map inner loops onto (heterogeneous) CGRAs of the Architecture for Dynamically Reconfigurable Embedded Systems (ADRES) family. BMS significantly outperforms existing schedulers for similar architectures in terms of generated code quality and compilation time. This is achieved by combining new schemes for backtracking with extended and adapted forms of priority functions and cost functions, as described in the article. BMS is evaluated by mapping multimedia and software-defined radio benchmarks onto tuned ADRES instances.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Anbar:2016:EHL, author = "Ahmad Anbar and Olivier Serres and Engin Kayraklioglu and Abdel-Hameed A. Badawy and Tarek El-Ghazawi", title = "Exploiting Hierarchical Locality in Deep Parallel Architectures", journal = j-TACO, volume = "13", number = "2", pages = "16:1--16:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2897783", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 27 16:18:10 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Parallel computers are becoming deeply hierarchical. Locality-aware programming models allow programmers to control locality at one level through establishing affinity between data and executing activities. This, however, does not enable locality exploitation at other levels. Therefore, we must conceive an efficient abstraction of hierarchical locality and develop techniques to exploit it. Techniques applied directly by programmers, beyond the first level, burden the programmer and hinder productivity. In this article, we propose the Parallel Hierarchical Locality Abstraction Model for Execution (PHLAME). 
PHLAME is an execution model to abstract and exploit machine hierarchical properties through locality-aware programming and a runtime that takes into account machine characteristics, as well as a data sharing and communication profile of the underlying application. This article presents and experiments with concepts and techniques that can drive such runtime system in support of PHLAME. Our experiments show that our techniques scale up and achieve performance gains of up to 88\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gonzalez-alvarez:2016:MEF, author = "Cecilia Gonz{\'a}lez-{\'A}lvarez and Jennifer B. Sartor and Carlos {\'A}lvarez and Daniel Jim{\'e}nez-Gonz{\'a}lez and Lieven Eeckhout", title = "{MInGLE}: an Efficient Framework for Domain Acceleration Using Low-Power Specialized Functional Units", journal = j-TACO, volume = "13", number = "2", pages = "17:1--17:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2898356", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 27 16:18:10 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The end of Dennard scaling leads to new research directions that try to cope with the utilization wall in modern chips, such as the design of specialized architectures. Processor customization utilizes transistors more efficiently, optimizing not only for performance but also for power. However, hardware specialization for each application is costly and impractical due to time-to-market constraints. Domain-specific specialization is an alternative that can increase hardware reutilization across applications that share similar computations.
This article explores the specialization of low-power processors with custom instructions (CIs) that run on a specialized functional unit. We are the first, to our knowledge, to design CIs for an application domain and across basic blocks, selecting CIs that maximize both performance and energy efficiency improvements. We present the Merged Instructions Generator for Large Efficiency (MInGLE), an automated framework that identifies and selects CIs. Our framework analyzes large sequences of code (across basic blocks) to maximize acceleration potential while also performing partial matching across applications to optimize for reuse of the specialized hardware. To do this, we convert the code into a new canonical representation, the Merging Diagram, which represents the code's functionality instead of its structure. This is key to being able to find similarities across such large code sequences from different applications with different coding styles. Groups of potential CIs are clustered depending on their similarity score to effectively reduce the search space. Additionally, we create new CIs that cover not only whole-body loops but also fragments of the code to optimize hardware reutilization further. For a set of 11 applications from the media domain, our framework generates CIs that significantly improve the energy-delay product (EDP) and performance speedup. CIs with the highest utilization opportunities achieve an average EDP improvement of 3.8 $ \times $ compared to a baseline processor modeled after an Intel Atom. We demonstrate that we can efficiently accelerate a domain with partially matched CIs, and that their design time, from identification to selection, stays within tractable bounds.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Andreetta:2016:FPF, author = "Christian Andreetta and Vivien B{\'e}got and Jost Berthold and Martin Elsman and Fritz Henglein and Troels Henriksen and Maj-Britt Nordfang and Cosmin E. Oancea", title = "{FinPar}: a Parallel Financial Benchmark", journal = j-TACO, volume = "13", number = "2", pages = "18:1--18:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2898354", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 27 16:18:10 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Commodity many-core hardware is now mainstream, but parallel programming models are still lagging behind in efficiently utilizing the application parallelism. There are (at least) two principal reasons for this. First, real-world programs often take the form of a deeply nested composition of parallel operators, but mapping the available parallelism to the hardware requires a set of transformations that are tedious to do by hand and beyond the capability of the common user. Second, the best optimization strategy, such as what to parallelize and what to efficiently sequentialize, is often sensitive to the input dataset and therefore requires multiple code versions that are optimized differently, which also raises maintainability problems. This article presents three array-based applications from the financial domain that are suitable for gpgpu execution. Common benchmark-design practice has been to provide the same code for the sequential and parallel versions that are optimized for only one class of datasets. 
In comparison, we document (1) all available parallelism via nested map-reduce functional combinators, in a simple Haskell implementation that closely resembles the original code structure, (2) the invariants and code transformations that govern the main trade-offs of a data-sensitive optimization space, and (3) report target cpu and multiversion gpgpu code together with an evaluation that demonstrates optimization trade-offs and other difficulties. We believe that this work provides useful insight into the language constructs and compiler infrastructure capable of expressing and optimizing such applications, and we report in-progress work in this direction.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dardaillon:2016:NCF, author = "Micka{\"e}l Dardaillon and Kevin Marquet and Tanguy Risset and J{\'e}r{\^o}me Martin and Henri-Pierre Charles", title = "A New Compilation Flow for Software-Defined Radio Applications on Heterogeneous {MPSoCs}", journal = j-TACO, volume = "13", number = "2", pages = "19:1--19:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2910583", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 27 16:18:10 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The advent of portable software-defined radio ( sdr) technology is tightly linked to the resolution of a difficult problem: efficient compilation of signal processing applications on embedded computing devices. Modern wireless communication protocols use packet processing rather than infinite stream processing and also introduce dependencies between data value and computation behavior leading to dynamic dataflow behavior. 
Recently, parametric dataflow has been proposed to support dynamicity while maintaining the high level of analyzability needed for efficient real-life implementations of signal processing computations. This article presents a new compilation flow that is able to compile parametric dataflow graphs. Built on the LLVM compiler infrastructure, the compiler offers an actor-based C++ programming model to describe parametric graphs, a compilation front end for graph analysis, and a back end that currently matches the Magali platform: a prototype heterogeneous MPSoC dedicated to LTE-Advanced. We also introduce an innovative scheduling technique, called microscheduling, allowing one to adapt the mapping of parametric dataflow programs to the specificities of the different possible MPSoCs targeted. A specific focus on FIFO sizing on the target architecture is presented. The experimental results show compilation of 3GPP LTE-Advanced demodulation on Magali with tight memory size constraints. The compiled programs achieve performance similar to handwritten code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liao:2016:DPM, author = "Jianwei Liao and Fran{\c{c}}ois Trahay and Guoqiang Xiao", title = "Dynamic Process Migration Based on Block Access Patterns Occurring in Storage Servers", journal = j-TACO, volume = "13", number = "2", pages = "20:1--20:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2899002", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 27 16:18:10 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "An emerging trend in developing large and complex applications on today's high-performance computers is to couple independent components into a comprehensive application.
The components may employ the global file system to exchange their data when executing the application. In order to reduce the time required for input/output (I/O) data exchange and data transfer in the coupled systems or other applications, this article proposes a dynamic process migration mechanism on the basis of block access pattern similarity for utilizing the local file cache to exchange the data. We first introduce the scheme of the block access counting diagram to profile the process access pattern during a time period on the storage server. Next, we propose an algorithm that compares the access patterns of processes running on different computing nodes. Last, processes are migrated in order to group processes with similar access patterns. Consequently, the processes on the computing node can exchange their data by accessing the local file cache, instead of the global file system. The experimental results show that the proposed process migration mechanism can reduce the execution time required by the application because of the shorter I/O time, as well as yield attractive I/O throughput. In summary, this dynamic process migration technique can work fairly well for distributed applications whose data dependency rely on distributed file systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ashouri:2016:CCA, author = "Amir Hossein Ashouri and Giovanni Mariani and Gianluca Palermo and Eunjung Park and John Cavazos and Cristina Silvano", title = "{COBAYN}: Compiler Autotuning Framework Using {Bayesian} Networks", journal = j-TACO, volume = "13", number = "2", pages = "21:1--21:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2928270", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 27 16:18:10 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The variety of today's architectures forces programmers to spend a great deal of time porting and tuning application codes across different platforms. Compilers themselves need additional tuning, which has considerable complexity as the standard optimization levels, usually designed for the average case and the specific target architecture, often fail to bring the best results. This article proposes COBAYN: Compiler autotuning framework using BAYesian Networks, an approach for a compiler autotuning methodology using machine learning to speed up application performance and to reduce the cost of the compiler optimization phases. The proposed framework is based on the application characterization done dynamically by using independent microarchitecture features and Bayesian networks. The article also presents an evaluation based on using static analysis and hybrid feature collection approaches. In addition, the article compares Bayesian networks with respect to several state-of-the-art machine-learning models. Experiments were carried out on an ARM embedded platform and GCC compiler by considering two benchmark suites with 39 applications. 
The set of compiler configurations, selected by the model (less than 7\% of the search space), demonstrated an application performance speedup of up to 4.6 $ \times $ on Polybench (1.85 $ \times $ on average) and 3.1 $ \times $ on cBench (1.54 $ \times $ on average) with respect to standard optimization levels. Moreover, the comparison of the proposed technique with (i) random iterative compilation, (ii) machine learning--based iterative compilation, and (iii) noniterative predictive modeling techniques shows, on average, 1.2 $ \times $ , 1.37 $ \times $ , and 1.48 $ \times $ speedup, respectively. Finally, the proposed method demonstrates 4 $ \times $ and 3 $ \times $ speedup, respectively, on cBench and Polybench in terms of exploration efficiency given the same quality of the solutions generated by the random iterative compilation model.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chrysanthou:2016:ORT, author = "Kypros Chrysanthou and Panayiotis Englezakis and Andreas Prodromou and Andreas Panteli and Chrysostomos Nicopoulos and Yiannakis Sazeides and Giorgos Dimitrakopoulos", title = "An Online and Real-Time Fault Detection and Localization Mechanism for Network-on-Chip Architectures", journal = j-TACO, volume = "13", number = "2", pages = "22:1--22:??", month = jun, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2930670", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 27 16:18:10 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Networks-on-Chip (NoC) are becoming increasingly susceptible to emerging reliability threats. The need to detect and localize the occurrence of faults at runtime is steadily becoming imperative. 
In this work, we propose NoCAlert, a comprehensive online and real-time fault detection and localization mechanism that demonstrates 0\% false negatives within the interconnect for the fault models and stimulus set used in this study. Based on the concept of invariance checking, NoCAlert employs a group of lightweight microchecker modules that collectively implement real-time hardware assertions. The checkers operate concurrently with normal NoC operation, thus eliminating the need for periodic, or triggered-based, self-testing. Based on the pattern/signature of asserted checkers, NoCAlert can pinpoint the location of the fault at various granularity levels. Most important, 97\% of the transient and 90\% of the permanent faults are detected instantaneously, within a single clock cycle upon fault manifestation. The fault localization accuracy ranges from 90\% to 100\%, depending on the desired localization granularity. Extensive cycle-accurate simulations in a 64-node CMP and analysis at the RTL netlist-level demonstrate the efficacy of the proposed technique.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mehta:2016:VL, author = "Sanyam Mehta and Pen-Chung Yew", title = "Variable Liberalization", journal = j-TACO, volume = "13", number = "3", pages = "23:1--23:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2963101", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In the wake of the current trend of increasing the number of cores on a chip, compiler optimizations for improving the memory performance have assumed increased importance. 
Loop fusion is one such key optimization that can alleviate memory and bandwidth wall and thus improve parallel performance. However, we find that loop fusion in interesting memory-intensive applications is prevented by the existence of dependences between temporary variables that appear in different loop nests. Furthermore, known techniques of allowing useful transformations in the presence of temporary variables, such as privatization and expansion, prove insufficient in such cases. In this work, we introduce variable liberalization, a technique that selectively removes dependences on temporary variables in different loop nests to achieve loop fusion while preserving the semantical correctness of the optimized program. This removal of extra-stringent dependences effectively amounts to variable expansion, thus achieving the benefit of an increased degree of freedom for program transformation but without an actual expansion. Hence, there is no corresponding increase in the memory footprint incurred. We implement liberalization in the Pluto polyhedral compiler and evaluate its performance on nine hot regions in five real applications. Results demonstrate parallel performance improvement of 1.92 $ \times $ over the Intel compiler, averaged over the nine hot regions, and an overall improvement of as much as 2.17 $ \times $ for an entire application, on an eight-core Intel Xeon processor.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2016:RER, author = "Hsing-Min Chen and Carole-Jean Wu and Trevor Mudge and Chaitali Chakrabarti", title = "{RATT-ECC}: Rate Adaptive Two-Tiered Error Correction Codes for Reliable {$3$D} Die-Stacked Memory", journal = j-TACO, volume = "13", number = "3", pages = "24:1--24:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2957758", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article proposes a rate-adaptive, two-tiered error-correction scheme (RATT-ECC) that provides strong reliability (10$^{10}$ x reduction in raw FIT rate) for an HBM-like 3D DRAM system. The tier-1 code is a strong symbol-based code that can correct errors due to small granularity faults and detect errors caused by large granularity faults; the tier-2 code is an XOR-based code that corrects errors detected by the tier-1 code. The rate-adaptive feature of RATT-ECC enables permanent bank failures to be handled through sparing. It can also be used to significantly reduce the refresh power consumption without decreasing reliability and timing performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2016:IDO, author = "Wenjie Chen and Zhibin Wang and Qin Wu and Jiuzhen Liang and Zhilei Chai", title = "Implementing Dense Optical Flow Computation on a Heterogeneous {FPGA SoC} in {C}", journal = j-TACO, volume = "13", number = "3", pages = "25:1--25:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2948976", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "High-quality optical flow computation algorithms are computationally intensive. The low computational speed of such algorithms causes difficulties for real-world applications. In this article, we propose an optimized implementation of the classical Combine-Brightness-Gradient (CBG) model on the Xilinx ZYNQ FPGA-SoC, by taking advantage of the inherent algorithmic parallelism and ZYNQ architecture. The execution time decreases to 0.82 second with a lower power consumption (1.881W). It is better than software implementation on PC (Intel i7-3520M, 2.9GHz), which costs 2.635 seconds and 35W. We use C rather than HDLs to describe the algorithm for rapid prototyping.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vaish:2016:OMT, author = "Nilay Vaish and Michael C. Ferris and David A. 
Wood", title = "Optimization Models for Three On-Chip Network Problems", journal = j-TACO, volume = "13", number = "3", pages = "26:1--26:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2943781", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We model three on-chip network design problems-memory controller placement, resource allocation in heterogeneous on-chip networks, and their combination-as mathematical optimization problems. We model the first two problems as mixed integer linear programs. We model the third problem as a mixed integer nonlinear program, which we then linearize exactly. Sophisticated optimization algorithms enable solutions to be obtained much more efficiently. Detailed simulations using synthetic traffic and benchmark applications validate that our designs provide better performance than solutions proposed previously. Our work provides further evidence toward suitability of optimization models in searching/pruning architectural design space.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sardashti:2016:YAC, author = "Somayeh Sardashti and Andre Seznec and David A. 
Wood", title = "Yet Another Compressed Cache: a Low-Cost Yet Effective Compressed Cache", journal = j-TACO, volume = "13", number = "3", pages = "27:1--27:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2976740", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Cache memories play a critical role in bridging the latency, bandwidth, and energy gaps between cores and off-chip memory. However, caches frequently consume a significant fraction of a multicore chip's area and thus account for a significant fraction of its cost. Compression has the potential to improve the effective capacity of a cache, providing the performance and energy benefits of a larger cache while using less area. The design of a compressed cache must address two important issues: (i) a low-latency, low-overhead compression algorithm that can represent a fixed-size cache block using fewer bits and (ii) a cache organization that can efficiently store the resulting variable-size compressed blocks. This article focuses on the latter issue. Here, we propose Yet Another Compressed Cache (YACC), a new compressed cache design that targets improving effective cache capacity with a simple design. YACC uses super-blocks to reduce tag overheads while packing variable-size compressed blocks to reduce internal fragmentation. YACC achieves the benefits of two state-of-the art compressed caches-Decoupled Compressed Cache (DCC) [Sardashti and Wood 2013a, 2013b] and Skewed Compressed Cache (SCC) [Sardashti et al. 2014]-with a more practical and simpler design. YACC's cache layout is similar to conventional caches, with a largely unmodified tag array and unmodified data array. 
Compared to DCC and SCC, YACC requires neither the significant extra metadata (i.e., back pointers) needed by DCC to track blocks nor the complexity and overhead of skewed associativity (i.e., indexing ways differently) needed by SCC. An additional advantage over previous work is that YACC enables modern replacement mechanisms, such as RRIP. For our benchmark set, compared to a conventional uncompressed 8MB LLC, YACC improves performance by 8\% on average and up to 26\%, and reduces total energy by 6\% on average and up to 20\%. An 8MB YACC achieves approximately the same performance and energy improvements as a 16MB conventional cache at a much smaller silicon footprint, with only 1.6\% greater area than an 8MB conventional cache. YACC performs comparably to DCC and SCC but is much simpler to implement.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cruz:2016:HAT, author = "Eduardo H. M. Cruz and Matthias Diener and La{\'e}rcio L. Pilla and Philippe O. A. Navaux", title = "Hardware-Assisted Thread and Data Mapping in Hierarchical Multicore Architectures", journal = j-TACO, volume = "13", number = "3", pages = "28:1--28:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2975587", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The performance and energy efficiency of modern architectures depend on memory locality, which can be improved by thread and data mappings considering the memory access behavior of parallel applications. 
In this article, we propose intense pages mapping, a mechanism that analyzes the memory access behavior using information about the time the entry of each page resides in the translation lookaside buffer. It provides accurate information with a very low overhead. We present experimental results with simulation and real machines, with average performance improvements of 13.7\% and energy savings of 4.4\%, which come from reductions in cache misses and interconnection traffic.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Adileh:2016:MHP, author = "Almutaz Adileh and Stijn Eyerman and Aamer Jaleel and Lieven Eeckhout", title = "Maximizing Heterogeneous Processor Performance Under Power Constraints", journal = j-TACO, volume = "13", number = "3", pages = "29:1--29:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2976739", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Heterogeneous processors (e.g., ARM's big.LITTLE) improve performance in power-constrained environments by executing applications on the `little' low-power core and move them to the `big' high-performance core when there is available power budget. The total time spent on the big core depends on the rate at which the application dissipates the available power budget. When applications with different big-core power consumption characteristics concurrently execute on a heterogeneous processor, it is best to give a larger share of the power budget to applications that can run longer on the big core, and a smaller share to applications that run for a very short duration on the big core. 
This article investigates mechanisms to manage the available power budget on power-constrained heterogeneous processors. We show that existing proposals that schedule applications onto a big core based on various performance metrics are not high performing, as these strategies do not optimize over an entire power period and are unaware of the applications' power/performance characteristics. We use linear programming to design the DPDP power management technique, which guarantees optimal performance on heterogeneous processors. We mathematically derive a metric (Delta Performance by Delta Power) that takes into account the power/performance characteristics of each running application and allows our power-management technique to decide how best to distribute the available power budget among the co-running applications at minimal overhead. Our evaluations with a 4-core heterogeneous processor consisting of big.LITTLE pairs show that DPDP improves performance by 16\% on average and up to 40\% compared to a strategy that globally and greedily optimizes the power budget. We also show that DPDP outperforms existing heterogeneous scheduling policies that use performance metrics to decide how best to schedule applications on the big core.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wibowo:2016:ACL, author = "Bagus Wibowo and Abhinav Agrawal and Thomas Stanton and James Tuck", title = "An Accurate Cross-Layer Approach for Online Architectural Vulnerability Estimation", journal = j-TACO, volume = "13", number = "3", pages = "30:1--30:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2975588", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Processor soft-error rates are projected to increase as feature sizes scale down, necessitating the adoption of reliability-enhancing techniques, but power and performance overhead remain a concern of such techniques. Dynamic cross-layer techniques are a promising way to improve the cost-effectiveness of resilient systems. As a foundation for making such a system, we propose a cross-layer approach for estimating the architectural vulnerability of a processor core online that works by combining information from software, compiler, and microarchitectural layers at runtime. The hardware layer combines the metadata from software and compiler layers with microarchitectural measurements to estimate architectural vulnerability online. We describe our design and evaluate it in detail on a set of SPEC CPU 2006 applications. We find that our online AVF estimate is highly accurate with respect to a postmortem AVF analysis, with only 0.46\% average absolute error. Also, our design incurs negligible performance impact for SPEC2006 applications and about 1.2\% for a Monte Carlo application, requires approximately 1.4\% area overhead, and costs about 3.3\% more power on average. 
We compare our technique against two prior online AVF estimation techniques, one using a linear regression to estimate AVF and another based on PVF-HVF; our evaluation finds that our approach, on average, is more accurate. Our case study of a Monte Carlo simulation shows that our AVF estimate can adapt to the inherent resiliency of the algorithm. Finally, we demonstrate the effectiveness of our approach using a dynamic protection scheme that limits vulnerability to soft errors while reducing the energy consumption by an average of 4.8\%, and with a target normalized SER of 10\%, compared to enabling a simple parity+ECC protection at all times.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Acacio:2016:LDR, author = "Manuel Acacio", title = "List of Distinguished Reviewers {ACM TACO 2014}", journal = j-TACO, volume = "13", number = "3", pages = "31:1--31:??", month = sep, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2989990", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 17 16:20:58 MDT 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vora:2016:SAE, author = "Keval Vora and Rajiv Gupta and Guoqing Xu", title = "Synergistic Analysis of Evolving Graphs", journal = j-TACO, volume = "13", number = "4", pages = "32:1--32:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2992784", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Evolving graph processing involves repeating analyses, which are often iterative, over multiple snapshots of the graph corresponding to different points in time. Since the snapshots of an evolving graph share a great number of vertices and edges, traditional approaches that process these snapshots one at a time without exploiting this overlap contain much wasted effort on both data loading and computation, making them extremely inefficient. In this article, we identify major sources of inefficiencies and present two optimization techniques to address them. First, we propose a technique for amortizing the fetch cost by merging fetching of values for different snapshots of the same vertex. Second, we propose a technique for amortizing the processing cost by feeding values computed by earlier snapshots into later snapshots. We have implemented these optimizations in two distributed graph processing systems, namely, GraphLab and ASPIRE. Our experiments with multiple real evolving graphs and algorithms show that, on average fetch amortization speeds up execution of GraphLab and ASPIRE by 5.2$ \times $ and 4.1$ \times $ , respectively. Amortizing the processing cost yields additional average speedups of 2$ \times $ and 7.9$ \times $, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2016:CPS, author = "Yunquan Zhang and Shigang Li and Shengen Yan and Huiyang Zhou", title = "A Cross-Platform {SpMV} Framework on Many-Core Architectures", journal = j-TACO, volume = "13", number = "4", pages = "33:1--33:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2994148", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Sparse Matrix-Vector multiplication (SpMV) is a key operation in engineering and scientific computing. Although the previous work has shown impressive progress in optimizing SpMV on many-core architectures, load imbalance and high memory bandwidth remain the critical performance bottlenecks. We present our novel solutions to these problems, for both GPUs and Intel MIC many-core architectures. First, we devise a new SpMV format, called Blocked Compressed Common Coordinate (BCCOO). BCCOO extends the blocked Common Coordinate (COO) by using bit flags to store the row indices to alleviate the bandwidth problem. We further improve this format by partitioning the matrix into vertical slices for better data locality. Then, to address the load imbalance problem, we propose a highly efficient matrix-based segmented sum/scan algorithm for SpMV, which eliminates global synchronization. At last, we introduce an autotuning framework to choose optimization parameters. Experimental results show that our proposed framework has a significant advantage over the existing SpMV libraries. 
In single precision, our proposed scheme outperforms clSpMV COCKTAIL format by 255\% on average on AMD FirePro W8000, and outperforms CUSPARSE V7.0 by 73.7\% on average and outperforms CSR5 by 53.6\% on average on GeForce Titan X; in double precision, our proposed scheme outperforms CUSPARSE V7.0 by 34.0\% on average and outperforms CSR5 by 16.2\% on average on Tesla K20, and has equivalent performance compared with CSR5 on Intel MIC.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ahn:2016:AEE, author = "Junwhan Ahn and Sungjoo Yoo and Kiyoung Choi", title = "{AIM}: Energy-Efficient Aggregation Inside the Memory Hierarchy", journal = j-TACO, volume = "13", number = "4", pages = "34:1--34:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2994149", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we propose Aggregation-in-Memory (AIM), a new processing-in-memory system designed for energy efficiency and near-term adoption. In order to efficiently perform aggregation, we implement simple aggregation operations in main memory and develop a locality-adaptive host architecture for in-memory aggregation, called cache-conscious aggregation. Through this, AIM executes aggregation at the most energy-efficient location among all levels of the memory hierarchy. Moreover, AIM minimally changes existing sequential programming models and provides fully automated compiler toolchain, thereby allowing unmodified legacy software to use AIM. Evaluations show that AIM greatly improves the energy efficiency of main memory and the system performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ziabari:2016:UHB, author = "Amir Kavyan Ziabari and Yifan Sun and Yenai Ma and Dana Schaa and Jos{\'e} L. Abell{\'a}n and Rafael Ubal and John Kim and Ajay Joshi and David Kaeli", title = "{UMH}: a Hardware-Based Unified Memory Hierarchy for Systems with Multiple Discrete {GPUs}", journal = j-TACO, volume = "13", number = "4", pages = "35:1--35:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2996190", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we describe how to ease memory management between a Central Processing Unit (CPU) and one or multiple discrete Graphic Processing Units (GPUs) by architecting a novel hardware-based Unified Memory Hierarchy (UMH). Adopting UMH, a GPU accesses the CPU memory only if it does not find its required data in the directories associated with its high-bandwidth memory, or the NMOESI coherency protocol limits the access to that data. Using UMH with NMOESI improves performance of a CPU-multiGPU system by at least 1.92 $ \times $ in comparison to alternative software-based approaches. It also allows the CPU to access GPUs modified data by at least 13 $ \times $ faster.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Spink:2016:HAC, author = "Tom Spink and Harry Wagstaff and Bj{\"o}rn Franke", title = "Hardware-Accelerated Cross-Architecture Full-System Virtualization", journal = j-TACO, volume = "13", number = "4", pages = "36:1--36:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2996798", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Hardware virtualization solutions provide users with benefits ranging from application isolation through server consolidation to improved disaster recovery and faster server provisioning. While hardware assistance for virtualization is supported by all major processor architectures, including Intel, ARM, PowerPC, and MIPS, these extensions are targeted at virtualization of the same architecture, for example, an x86 guest on an x86 host system. Existing techniques for cross-architecture virtualization, for example, an ARM guest on an x86 host, still incur a substantial overhead for CPU, memory, and I/O virtualization due to the necessity for software emulation of these mismatched system components. In this article, we present a new hardware-accelerated hypervisor called Captive, employing a range of novel techniques that exploit existing hardware virtualization extensions for improving the performance of full-system cross-platform virtualization. 
We illustrate how (1) guest memory management unit (MMU) events and operations can be mapped onto host memory virtualization extensions, eliminating the need for costly software MMU emulation, (2) a block-based dynamic binary translation engine inside the virtual machine can improve CPU virtualization performance, (3) memory-mapped guest I/O can be efficiently translated to fast I/O specific calls to emulated devices, and (4) the cost for asynchronous guest interrupts can be reduced. For an ARM-based Linux guest system running on an x86 host with Intel VT support, we demonstrate application performance levels, based on SPEC CPU2006 benchmarks, of up to 5.88$ \times $ over state-of-the-art Qemu and 2.5$ \times $ on average, achieving a guest dynamic instruction throughput of up to 1280 MIPS (million instructions per second) and 915.52 MIPS, on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shi:2016:LLA, author = "Qingchuan Shi and George Kurian and Farrukh Hijaz and Srinivas Devadas and Omer Khan", title = "{LDAC}: Locality-Aware Data Access Control for Large-Scale Multicore Cache Hierarchies", journal = j-TACO, volume = "13", number = "4", pages = "37:1--37:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2983632", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The trend of increasing the number of cores to achieve higher performance has challenged efficient management of on-chip data. Moreover, many emerging applications process massive amounts of data with varying degrees of locality. Therefore, exploiting locality to improve on-chip traffic and resource utilization is of fundamental importance. 
Conventional multicore cache management schemes either manage the private cache (L1) or the Last-Level Cache (LLC), while ignoring the other. We propose a holistic locality-aware cache hierarchy management protocol for large-scale multicores. The proposed scheme improves on-chip data access latency and energy consumption by intelligently bypassing cache line replication in the L1 caches, and/or intelligently replicating cache lines in the LLC. The approach relies on low overhead yet highly accurate in-hardware runtime classification of data locality at both L1 cache and the LLC. The decision to bypass L1 and/or replicate in LLC is then based on the measured reuse at the fine granularity of cache lines. The locality tracking mechanism is decoupled from the sharer tracking structures that cause scalability concerns in traditional cache coherence protocols. Moreover, the complexity of the protocol is low since no additional coherence states are created. However, the proposed classifier incurs a 5.6 KB per-core storage overhead. On a set of parallel benchmarks, the locality-aware protocol reduces average energy consumption by 26\% and completion time by 16\%, when compared to the state-of-the-art Reactive-NUCA multicore cache management scheme.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fernandes:2016:EHO, author = "Fernando Fernandes and Lucas Weigel and Claudio Jung and Philippe Navaux and Luigi Carro and Paolo Rech", title = "Evaluation of Histogram of Oriented Gradients Soft Errors Criticality for Automotive Applications", journal = j-TACO, volume = "13", number = "4", pages = "38:1--38:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/2998573", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Pedestrian detection reliability is a key problem for autonomous or aided driving, and methods that use Histogram of Oriented Gradients (HOG) are very popular. Embedded Graphics Processing Units (GPUs) are exploited to run HOG in a very efficient manner. Unfortunately, GPUs architecture has been shown to be particularly vulnerable to radiation-induced failures. This article presents an experimental evaluation and analytical study of HOG reliability. We aim at quantifying and qualifying the radiation-induced errors on pedestrian detection applications executed in embedded GPUs. We analyze experimental results obtained executing HOG on embedded GPUs from two different vendors, exposed for about 100 hours to a controlled neutron beam at Los Alamos National Laboratory. We consider the number and position of detected objects as well as precision and recall to discriminate critical erroneous computations. The reported analysis shows that, while being intrinsically resilient (65\% to 85\% of output errors only slightly impact detection), HOG experienced some particularly critical errors that could result in undetected pedestrians or unnecessary vehicle stops. 
Additionally, we perform a fault-injection campaign to identify HOG critical procedures. We observe that Resize and Normalize are the most sensitive and critical phases, as about 20\% of injections generate an output error that significantly impacts HOG detection. With our insights, we are able to find those limited portions of HOG that, if hardened, are more likely to increase reliability without introducing unnecessary overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dublish:2016:CCG, author = "Saumay Dublish and Vijay Nagarajan and Nigel Topham", title = "Cooperative Caching for {GPUs}", journal = j-TACO, volume = "13", number = "4", pages = "39:1--39:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3001589", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The rise of general-purpose computing on GPUs has influenced architectural innovation on them. The introduction of an on-chip cache hierarchy is one such innovation. High L1 miss rates on GPUs, however, indicate inefficient cache usage due to myriad factors, such as cache thrashing and extensive multithreading. Such high L1 miss rates in turn place high demands on the shared L2 bandwidth. Extensive congestion in the L2 access path therefore results in high memory access latencies. In memory-intensive applications, these latencies get exposed due to a lack of active compute threads to mask such high latencies. In this article, we aim to reduce the pressure on the shared L2 bandwidth, thereby reducing the memory access latencies that lie in the critical path. 
We identify significant replication of data among private L1 caches, presenting an opportunity to reuse data among L1s. We further show how this reuse can be exploited via an L1 Cooperative Caching Network (CCN), thereby reducing the bandwidth demand on L2. In the proposed architecture, we connect the L1 caches with a lightweight ring network to facilitate intercore communication of shared data. We show that this technique reduces traffic to the L2 cache by an average of 29\%, freeing up the bandwidth for other accesses. We also show that the CCN reduces the average memory latency by 24\%, thereby reducing core stall cycles by 26\% on average. This translates into an overall performance improvement of 14.7\% on average (and up to 49\%) for applications that exhibit reuse across L1 caches. In doing so, the CCN incurs a nominal area and energy overhead of 1.3\% and 2.5\%, respectively. Notably, the performance improvement with our proposed CCN compares favorably to the performance improvement achieved by simply doubling the number of L2 banks by up to 34\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tampouratzis:2016:AIH, author = "Nikolaos Tampouratzis and Pavlos M. 
Mattheakis and Ioannis Papaefstathiou", title = "Accelerating Intercommunication in Highly Parallel Systems", journal = j-TACO, volume = "13", number = "4", pages = "40:1--40:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3005717", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Every HPC system consists of numerous processing nodes interconnected using a number of different inter-process communication protocols such as Message Passing Interface (MPI) and Global Arrays (GA). Traditionally, research has focused on optimizing these protocols and identifying the most suitable ones for each system and/or application. Recently, there has been a proposal to unify the primitive operations of the different inter-processor communication protocols through the Portals library. Portals offer a set of low-level communication routines which can be composed in order to implement the functionality of different intercommunication protocols. However, Portals modularity comes at a performance cost, since it adds one more layer in the actual protocol implementation. This work aims at closing the performance gap between a generic and reusable intercommunication layer, such as Portals, and the several monolithic and highly optimized intercommunication protocols. This is achieved through the development of a novel hardware offload engine efficiently implementing the basic Portals' modules. Our innovative system is up to two orders of magnitude faster than the conventional software implementation of Portals' while the speedup achieved over the conventional monolithic software implementations of MPI and GAs is more than an order of magnitude. 
The power consumption of our hardware system is less than 1/100th of what a low-power CPU consumes when executing the Portal's software while its silicon cost is less than 1/10th of that of a very simple RISC CPU. Moreover, our design process is also innovative since we have first modeled the hardware within an untimed virtual prototype which allowed for rapid design space exploration; then we applied a novel methodology to transform the untimed description into an efficient timed hardware description, which was then transformed into a hardware netlist through a High-Level Synthesis (HLS) tool.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Park:2016:CJP, author = "Hyukwoo Park and Myungsu Cha and Soo-Mook Moon", title = "Concurrent {JavaScript} Parsing for Faster Loading of {Web} Apps", journal = j-TACO, volume = "13", number = "4", pages = "41:1--41:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3004281", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "JavaScript is a dynamic language mainly used as a client-side web script. Nowadays, web is evolving into an application platform with its web apps, and JavaScript increasingly undertakes complex computations and interactive user interfaces, requiring a high-performance JavaScript engine. There have been many optimizations for efficient JavaScript engines, but one component that has not been optimized much is JavaScript parsing. A JavaScript function needs to be parsed before being executed, and the parsing overhead takes a substantial portion of JavaScript execution time for web apps, especially during app loading. 
This article proposes concurrent parsing of JavaScript, which performs the parsing of JavaScript functions in advance on different threads, while the main thread is executing the parsed JavaScript functions. This can hide the parsing overhead from the main execution thread, reducing the JavaScript execution time, thus reducing the overall app loading time. More specifically, we separated JavaScript parsing and made it run on different threads without violating the execution semantics of JavaScript. We also designed an efficient multi-threaded parsing architecture, which reduces the synchronization overhead and schedules the parsing requests appropriately. Finally, we explored two methods of choosing the target functions for concurrent parsing: one based on profiled information and the other based on speculative heuristics. We performed experiments on the WebKit browser with the JSC engine for real web apps. The result shows that the proposed concurrent parsing can improve the JavaScript performance during app loading by as much as 64\% and by 39.7\% on average. This improves the whole app loading performance tangibly, by as much as 32.7\% and by 18.2\%, on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xiong:2016:MAS, author = "Dongliang Xiong and Kai Huang and Xiaowen Jiang and Xiaolang Yan", title = "Memory Access Scheduling Based on Dynamic Multilevel Priority in Shared {DRAM} Systems", journal = j-TACO, volume = "13", number = "4", pages = "42:1--42:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3007647", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Interapplication interference at shared main memory severely degrades performance and increasing DRAM frequency calls for simple memory schedulers. Previous memory schedulers employ a per-application ranking scheme for high system performance or a per-group ranking scheme for low hardware cost, but few provide a balance. We propose DMPS, a memory scheduler based on dynamic multilevel priority. First, DMPS uses ``memory occupancy'' to measure interference quantitatively. Second, DMPS groups applications, favors latency-sensitive groups, and dynamically prioritizes applications by employing a per-level ranking scheme. The simulation results show that DMPS has 7.2\% better system performance and 22\% better fairness over FRFCFS at low hardware complexity and cost.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{DeSensi:2016:RAP, author = "Daniele {De Sensi} and Massimo Torquati and Marco Danelutto", title = "A Reconfiguration Algorithm for Power-Aware Parallel Applications", journal = j-TACO, volume = "13", number = "4", pages = "43:1--43:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3004054", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In current computing systems, many applications require guarantees on their maximum power consumption to not exceed the available power budget. On the other hand, for some applications, it could be possible to decrease their performance, yet maintain an acceptable level, in order to reduce their power consumption. To provide such guarantees, a possible solution consists in changing the number of cores assigned to the application, their clock frequency, and the placement of application threads over the cores. However, power consumption and performance have different trends depending on the application considered and on its input. Finding a configuration of resources satisfying user requirements is, in the general case, a challenging task. In this article, we propose Nornir, an algorithm to automatically derive, without relying on historical data about previous executions, performance and power consumption models of an application in different configurations. By using these models, we are able to select a close-to-optimal configuration for the given user requirement, either performance or power consumption. The configuration of the application will be changed on-the-fly throughout the execution to adapt to workload fluctuations, external interferences, and/or application's phase changes. 
We validate the algorithm by simulating it over the applications of the Parsec benchmark suite. Then, we implement our algorithm and we analyse its accuracy and overhead over some of these applications on a real execution environment. Eventually, we compare the quality of our proposal with that of the optimal algorithm and of some state-of-the-art solutions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jantz:2016:IIP, author = "Michael R. Jantz and Forrest J. Robinson and Prasad A. Kulkarni", title = "Impact of Intrinsic Profiling Limitations on Effectiveness of Adaptive Optimizations", journal = j-TACO, volume = "13", number = "4", pages = "44:1--44:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3008661", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many performance optimizations rely on or are enhanced by runtime profile information. However, both offline and online profiling techniques suffer from intrinsic and practical limitations that affect the quality of delivered profile data. The quality of profile data is its ability to accurately predict (relevant aspects of) future program behavior. While these limitations are known, their impact on the effectiveness of profile-guided optimizations, compared to the ideal performance, is not as well understood. We define ideal performance for adaptive optimizations as that achieved with a precise profile of future program behavior. 
In this work, we study and quantify the performance impact of fundamental profiling limitations by comparing the effectiveness of typical adaptive optimizations when using the best profiles generated by offline and online schemes against a baseline where the adaptive optimization is given access to profile information about the future execution of the program. We model and compare the behavior of three adaptive JVM optimizations-heap memory management using object usage profiles, code cache management using method usage profiles, and selective just-in-time compilation using method hotness profiles-for the Java DaCapo benchmarks. Our results provide insight into the advantages and drawbacks of current profiling strategies and shed light on directions for future profiling research.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Damschen:2016:EWP, author = "Marvin Damschen and Lars Bauer and J{\"o}rg Henkel", title = "Extending the {WCET} Problem to Optimize for Runtime-Reconfigurable Processors", journal = j-TACO, volume = "13", number = "4", pages = "45:1--45:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3014059", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The correctness of a real-time system does not depend on the correctness of its calculations alone but also on the non-functional requirement of adhering to deadlines. Guaranteeing these deadlines by static timing analysis, however, is practically infeasible for current microarchitectures with out-of-order scheduling pipelines, several hardware threads, and multiple (shared) cache layers. 
Novel timing-analyzable features are required to sustain the strongly increasing demand for processing power in real-time systems. Recent advances in timing analysis have shown that runtime-reconfigurable instruction set processors are one way to escape the scarcity of analyzable processing power while preserving the flexibility of the system. When moving calculations from software to hardware by means of reconfigurable custom instructions (CIs)-additional to a considerable speedup-the overestimation of a task's worst-case execution time (WCET) can be reduced. CIs typically implement functionality that corresponds to several hundred instructions on the central processing unit (CPU) pipeline. While analyzing instructions for worst-case latency may introduce pessimism, the latency of CIs-executed on the reconfigurable fabric-is precisely known. In this work, we introduce the problem of selecting reconfigurable CIs to optimize the WCET of an application. We model this problem as an extension to state-of-the-art integer linear programming (ILP)-based program path analysis. This way, we enable optimization based on accurate WCET estimates with integration of information about global program flow, for example, infeasible paths. We present an optimal solution with effective techniques to prune the search space and a greedy heuristic that performs a maximum number of steps linear in the number of partitions of reconfigurable area available. Finally, we show the effectiveness of optimizing the WCET on a reconfigurable processor by evaluating a complex multimedia application with multiple reconfigurable CIs for several hardware parameters.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2016:MAP, author = "Zheng Li and Fang Wang and Dan Feng and Yu Hua and Jingning Liu and Wei Tong", title = "{MaxPB}: Accelerating {PCM} Write by Maximizing the Power Budget Utilization", journal = j-TACO, volume = "13", number = "4", pages = "46:1--46:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3012007", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Phase Change Memory (PCM) is one of the promising memory technologies but suffers from some critical problems such as poor write performance and high write energy consumption. Due to the high write energy consumption and limited power supply, the size of concurrent bit-write is restricted inside one PCM chip. Typically, the size of concurrent bit-write is much less than the cache line size and it is normal that many serially executed write units are consumed to write down the data block to PCM when using it as the main memory. Existing state-of-the-art PCM write schemes, such as FNW (Flip-N-Write) and two-stage-write, address the problem of poor performance by improving the write parallelism under the power constraints. The parallelism is obtained via reducing the data amount and leveraging power as well as time asymmetries, respectively. However, due to the extremely pessimistic assumptions of current utilization (FNW) and optimistic assumptions of asymmetries (two-stage-write), these schemes fail to maximize the power supply utilization and hence improve the write parallelism. In this article, we propose a novel PCM write scheme, called MaxPB (Maximize the Power Budget utilization) to maximize the power budget utilization with minimum changes about the circuits design. 
MaxPB is a ``think before acting'' method. The main idea of MaxPB is to monitor the actual power needs of all data units first and then effectively package them into the least number of write units under the power constraints. Experimental results show the efficiency and performance improvements on MaxPB. For example, four-core PARSEC and SPEC experimental results show that MaxPB gets 32.0\% and 20.3\% more read latency reduction, 26.5\% and 16.1\% more write latency reduction, 24.3\% and 15.6\% more running time decrease, 1.32$ \times $ and 0.92$ \times $ more speedup, as well as 30.6\% and 18.4\% more energy consumption reduction on average compared with the state-of-the-art FNW and two-stage-write write schemes, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Muralidharan:2016:DTN, author = "Saurav Muralidharan and Michael Garland and Albert Sidelnik and Mary Hall", title = "Designing a Tunable Nested Data-Parallel Programming System", journal = j-TACO, volume = "13", number = "4", pages = "47:1--47:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3012011", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article describes Surge, a nested data-parallel programming system designed to simplify the porting and tuning of parallel applications to multiple target architectures. Surge decouples high-level specification of computations, expressed using a C++ programming interface, from low-level implementation details using two first-class constructs: schedules and policies. 
Schedules describe the valid ways in which data-parallel operators may be implemented, while policies encapsulate a set of parameters that govern platform-specific code generation. These two mechanisms are used to implement a code generation system that analyzes computations and automatically generates a search space of valid platform-specific implementations. An input and architecture-adaptive autotuning system then explores this search space to find optimized implementations. We express in Surge five real-world benchmarks from domains such as machine learning and sparse linear algebra and from the high-level specifications, Surge automatically generates CPU and GPU implementations that perform on par with or better than manually optimized versions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Akturk:2016:ABN, author = "Ismail Akturk and Riad Akram and Mohammad Majharul Islam and Abdullah Muzahid and Ulya R. Karpuzcu", title = "Accuracy Bugs: a New Class of Concurrency Bugs to Exploit Algorithmic Noise Tolerance", journal = j-TACO, volume = "13", number = "4", pages = "48:1--48:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3017991", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Parallel programming introduces notoriously difficult bugs, usually referred to as concurrency bugs. This article investigates the potential for deviating from the conventional wisdom of writing concurrency bug-free, parallel programs. It explores the benefit of accepting buggy but approximately correct parallel programs by leveraging the inherent tolerance of emerging parallel applications to inaccuracy in computations. 
Under algorithmic noise tolerance, a new class of concurrency bugs, accuracy bugs, degrade the accuracy of computation (often at acceptable levels) rather than causing catastrophic termination. This study demonstrates how embracing accuracy bugs affects the application output quality and performance and analyzes the impact on execution semantics.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tomusk:2016:SHC, author = "Erik Tomusk and Christophe Dubach and Michael O'Boyle", title = "Selecting Heterogeneous Cores for Diversity", journal = j-TACO, volume = "13", number = "4", pages = "49:1--49:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3014165", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Mobile devices with heterogeneous processors are becoming mainstream. With a heterogeneous processor, the runtime scheduler can pick the best CPU core for a given task based on program characteristics, performance requirements, and power limitations. For a heterogeneous processor to be effective, it must contain a diverse set of cores to match a range of runtime requirements and program behaviors. Selecting a diverse set of cores is, however, a non-trivial problem. Power and performance are dependent on both program features and the microarchitectural features of cores, and a selection of cores must satisfy the competing demands of different types of programs. We present a method of core selection that chooses cores at a range of power-performance points. 
Our algorithm is based on the observation that it is not necessary for a core to consistently have high performance or low power; one type of core can fulfill different roles for different types of programs. Given a power budget, cores selected with our method provide an average speedup of 6\% on EEMBC mobile benchmarks and a 24\% speedup on SPEC 2006 integer benchmarks over the state-of-the-art core selection method.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Michaud:2016:SMF, author = "Pierre Michaud", title = "Some Mathematical Facts About Optimal Cache Replacement", journal = j-TACO, volume = "13", number = "4", pages = "50:1--50:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3017992", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article exposes and proves some mathematical facts about optimal cache replacement that were previously unknown or not proved rigorously. An explicit formula is obtained, giving OPT hits and misses as a function of past references. Several mathematical facts are derived from this formula, including a proof that OPT miss curves are always convex, and a new algorithm called OPT tokens, for reasoning about optimal replacement.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bao:2016:SDF, author = "Wenlei Bao and Changwan Hong and Sudheer Chunduri and Sriram Krishnamoorthy and Louis-No{\"e}l Pouchet and Fabrice Rastello and P. 
Sadayappan", title = "Static and Dynamic Frequency Scaling on Multicore {CPUs}", journal = j-TACO, volume = "13", number = "4", pages = "51:1--51:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3011017", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Dynamic Voltage and Frequency Scaling (DVFS) typically adapts CPU power consumption by modifying a processor's operating frequency (and the associated voltage). Typical DVFS approaches include using default strategies such as running at the lowest or the highest frequency or reacting to the CPU's runtime load to reduce or increase frequency based on the CPU usage. In this article, we argue that a compile-time approach to CPU frequency selection is achievable for affine program regions and can significantly outperform runtime-based approaches. We first propose a lightweight runtime approach that can exploit the properties of the power profile specific to a processor, outperforming classical Linux governors such as powersave or on-demand for computational kernels. We then demonstrate that, for affine kernels in the application, a purely compile-time approach to CPU frequency and core count selection is achievable, providing significant additional benefits over the runtime approach. Our framework relies on a one-time profiling of the target CPU, along with a compile-time categorization of loop-based code segments in the application. These are combined to determine at compile-time the frequency and the number of cores to use to execute each affine region to optimize energy or energy-delay product. Extensive evaluation on 60 benchmarks and 5 multi-core CPUs show that our approach systematically outperforms the powersave Linux governor while also improving overall performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vale:2016:PDT, author = "Tiago M. Vale and Jo{\~a}o A. Silva and Ricardo J. Dias and Jo{\~a}o M. Louren{\c{c}}o", title = "{Pot}: Deterministic Transactional Execution", journal = j-TACO, volume = "13", number = "4", pages = "52:1--52:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3017993", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article presents Pot, a system that leverages the concept of preordered transactions to achieve deterministic multithreaded execution of programs that use Transactional Memory. Preordered transactions eliminate the root cause of nondeterminism in transactional execution: they provide the illusion of executing in a deterministic serial order, unlike traditional transactions that appear to execute in a nondeterministic order that can change from execution to execution. Pot uses a new concurrency control protocol that exploits the serialization order to distinguish between fast and speculative transaction execution modes in order to mitigate the overhead of imposing a deterministic order. We build two Pot prototypes: one using STM and another using off-the-shelf HTM. To the best of our knowledge, Pot enables deterministic execution of programs using off-the-shelf HTM for the first time. An experimental evaluation shows that Pot achieves deterministic execution of TM programs with low overhead, sometimes even outperforming nondeterministic executions, and clearly outperforming the state of the art.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lu:2016:AFB, author = "Zhonghai Lu and Yuan Yao", title = "Aggregate Flow-Based Performance Fairness in {CMPs}", journal = j-TACO, volume = "13", number = "4", pages = "53:1--53:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3014429", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In CMPs, multiple co-executing applications create mutual interference when sharing the underlying network-on-chip architecture. Such interference causes different performance slowdowns to different applications. To mitigate the unfairness problem, we treat traffic initiated from the same thread as an aggregate flow such that causal request/reply packet sequences can be allocated to resources consistently and fairly according to online profiled traffic injection rates. Our solution comprises three coherent mechanisms from rate profiling, rate inheritance, and rate-proportional channel scheduling to facilitate and realize unbiased workload-adaptive resource allocation. Full-system evaluations in GEM5 demonstrate that, compared to classic packet-centric and latest application-prioritization approaches, our approach significantly improves weighted speed-up for all multi-application mixtures and achieves nearly ideal performance fairness.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Demir:2016:EPP, author = "Yigit Demir and Nikos Hardavellas", title = "Energy-Proportional Photonic Interconnects", journal = j-TACO, volume = "13", number = "4", pages = "54:1--54:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3018110", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Photonic interconnects have emerged as the prime candidate technology for efficient networks on chip at future process nodes. However, the high optical loss of many nanophotonic components coupled with the low efficiency of current laser sources results in exceedingly high total power requirements for the laser. As optical interconnects stay on even during periods of system inactivity, most of this power is wasted, which has prompted research on laser gating. Unfortunately, prior work has been complicated by the long laser turn-on delays and has failed to deliver the full savings. In this article, we propose ProLaser, a laser control mechanism that monitors the requests sent on the interconnect, the cache, and the coherence directory to detect highly correlated events and turn on proactively the lasers of a photonic interconnect. While ProLaser requires fast lasers with a turn-on delay of a few nanoseconds, a technology that is still experimental, several types of such lasers that are suitable for power gating have already been manufactured over the last decade. Overall, ProLaser saves 42\% to 85\% of the laser power, outperforms the current state of the art by 2$ \times $ on average, and closely tracks (within 2\%--6\%) a perfect prediction scheme with full knowledge of future interconnect requests. 
Moreover, the power savings of ProLaser allow the cores to exploit a higher-power budget and run faster, achieving speedups of 1.5 to 1.7$ \times $ (1.6$ \times $ on average).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kurt:2016:UAS, author = "Mehmet Can Kurt and Sriram Krishnamoorthy and Gagan Agrawal and Bin Ren", title = "User-Assisted Store Recycling for Dynamic Task Graph Schedulers", journal = j-TACO, volume = "13", number = "4", pages = "55:1--55:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3018111", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The emergence of the multi-core era has led to increased interest in designing effective yet practical parallel programming models. Models based on task graphs that operate on single-assignment data are attractive in several ways. Notably, they can support dynamic applications and precisely represent the available concurrency. However, for efficient execution, they also require nuanced algorithms for scheduling and memory management. In this article, we consider memory-efficient dynamic scheduling of task graphs. Specifically, we present a novel approach for dynamically recycling the memory locations assigned to data items as they are produced by tasks. We develop algorithms to identify memory-efficient store recycling functions by systematically evaluating the validity of a set of user-provided or automatically generated alternatives. Because recycling functions can be input data-dependent, we have also developed support for continued correct execution of a task graph in the presence of a potentially incorrect store recycling function. 
Experimental evaluation demonstrates that this approach to automatic store recycling incurs little to no overheads, achieves memory usage comparable to the best manually derived solutions, often produces recycling functions valid across problem sizes and input parameters, and efficiently recovers from an incorrect choice of store recycling functions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Haj-Yihia:2016:FGP, author = "Jawad Haj-Yihia and Ahmad Yasin and Yosi {Ben Asher} and Avi Mendelson", title = "Fine-Grain Power Breakdown of Modern Out-of-Order Cores and Its Implications on {Skylake}-Based Systems", journal = j-TACO, volume = "13", number = "4", pages = "56:1--56:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3018112", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A detailed analysis of power consumption at low system levels becomes important as a means for reducing the overall power consumption of a system and its thermal hot spots. This work presents a new power estimation method that allows understanding the power breakdown of an application when running on modern processor architecture such as the newly released Intel Skylake processor. This work also provides a detailed power and performance characterization report for the SPEC CPU2006 benchmarks, analysis of the data using side-by-side power and performance breakdowns, as well as few interesting case studies.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Scolari:2016:SCP, author = "Alberto Scolari and Davide Basilio Bartolini and Marco Domenico Santambrogio", title = "A Software Cache Partitioning System for Hash-Based Caches", journal = j-TACO, volume = "13", number = "4", pages = "57:1--57:??", month = dec, year = "2016", CODEN = "????", DOI = "https://doi.org/10.1145/3018113", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 28 16:24:46 MST 2016", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Contention on the shared Last-Level Cache (LLC) can have a fundamental negative impact on the performance of applications executed on modern multicores. An interesting software approach to address LLC contention issues is based on page coloring, which is a software technique that attempts to achieve performance isolation by partitioning a shared cache through careful memory management. The key assumption of traditional page coloring is that the cache is physically addressed. However, recent multicore architectures (e.g., Intel Sandy Bridge and later) switched from a physical addressing scheme to a more complex scheme that involves a hash function. Traditional page coloring is ineffective on these recent architectures. In this article, we extend page coloring to work on these recent architectures by proposing a mechanism able to handle their hash-based LLC addressing scheme. Just as for traditional page coloring, the goal of this new mechanism is to deliver performance isolation by avoiding contention on the LLC, thus enabling predictable performance. We implement this mechanism in the Linux kernel, and evaluate it using several benchmarks from the SPEC CPU2006 and PARSEC 3.0 suites. 
Our results show that our solution is able to deliver performance isolation to concurrently running applications by enforcing partitioning of a Sandy Bridge LLC, which traditional page coloring techniques are not able to handle.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mukhanov:2017:AFG, author = "Lev Mukhanov and Pavlos Petoumenos and Zheng Wang and Nikos Parasyris and Dimitrios S. Nikolopoulos and Bronis R. {de Supinski} and Hugh Leather", title = "{ALEA}: a Fine-Grained Energy Profiling Tool", journal = j-TACO, volume = "14", number = "1", pages = "1:1--1:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3050436", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Energy efficiency is becoming increasingly important, yet few developers understand how source code changes affect the energy and power consumption of their programs. To enable them to achieve energy savings, we must associate energy consumption with software structures, especially at the fine-grained level of functions and loops. Most research in the field relies on direct power/energy measurements taken from on-board sensors or performance counters. However, this coarse granularity does not directly provide the needed fine-grained measurements. This article presents ALEA, a novel fine-grained energy profiling tool based on probabilistic analysis for fine-grained energy accounting. ALEA overcomes the limitations of coarse-grained power-sensing instruments to associate energy information effectively with source code at a fine-grained level.
We demonstrate and validate that ALEA can perform accurate energy profiling at various granularity levels on two different architectures: Intel Sandy Bridge and ARM big.LITTLE. ALEA achieves a worst-case error of only 2\% for coarse-grained code structures and 6\% for fine-grained ones, with less than 1\% runtime overhead. Our use cases demonstrate that ALEA supports energy optimizations, with energy savings of up to 2.87 times for a latency-critical option pricing workload under a given power budget.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pathania:2017:DTM, author = "Anuj Pathania and Vanchinathan Venkataramani and Muhammad Shafique and Tulika Mitra and J{\"o}rg Henkel", title = "Defragmentation of Tasks in Many-Core Architecture", journal = j-TACO, volume = "14", number = "1", pages = "2:1--2:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3050437", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many-cores can execute multiple multithreaded tasks in parallel. A task performs most efficiently when it is executed over a spatially connected and compact subset of cores so that performance loss due to communication overhead imposed by the task's threads spread across the allocated cores is minimal. Over a span of time, unallocated cores can get scattered all over the many-core, creating fragments in the task mapping. These fragments can prevent efficient contiguous mapping of incoming new tasks leading to loss of performance. 
This problem can be alleviated by using a task defragmenter, which consolidates smaller fragments into larger fragments wherein the incoming tasks can be efficiently executed. Optimal defragmentation of a many-core is an NP-hard problem in the general case. Therefore, we simplify the original problem to a problem that can be solved optimally in polynomial time. In this work, we introduce a concept of exponentially separable mapping (ESM), which defines a set of task mapping constraints on a many-core. We prove that an ESM enforcing many-core can be defragmented optimally in polynomial time.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zivanovic:2017:MMH, author = "Darko Zivanovic and Milan Pavlovic and Milan Radulovic and Hyunsung Shin and Jongpil Son and Sally A. McKee and Paul M. Carpenter and Petar Radojkovi{\'c} and Eduard Ayguad{\'e}", title = "Main Memory in {HPC}: Do We Need More or Could We Live with Less?", journal = j-TACO, volume = "14", number = "1", pages = "3:1--3:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3023362", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "An important aspect of High-Performance Computing (HPC) system design is the choice of main memory capacity. This choice becomes increasingly important now that 3D-stacked memories are entering the market. Compared with conventional Dual In-line Memory Modules (DIMMs), 3D memory chiplets provide better performance and energy efficiency but lower memory capacities. Therefore, the adoption of 3D-stacked memories in the HPC domain depends on whether we can find use cases that require much less memory than is available now.
This study analyzes the memory capacity requirements of important HPC benchmarks and applications. We find that the High-Performance Conjugate Gradients (HPCG) benchmark could be an important success story for 3D-stacked memories in HPC, but High-Performance Linpack (HPL) is likely to be constrained by 3D memory capacity. The study also emphasizes that the analysis of memory footprints of production HPC applications is complex and that it requires an understanding of application scalability and target category, i.e., whether the users target capability or capacity computing. The results show that most of the HPC applications under study have per-core memory footprints in the range of hundreds of megabytes, but we also detect applications and use cases that require gigabytes per core. Overall, the study identifies the HPC applications and use cases with memory footprints that could be provided by 3D-stacked memory chiplets, making a first step toward adoption of this novel technology in the HPC domain.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zheng:2017:WAD, author = "Wenguang Zheng and Hui Wu and Qing Yang", title = "{WCET}-Aware Dynamic {I}-Cache Locking for a Single Task", journal = j-TACO, volume = "14", number = "1", pages = "4:1--4:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3046683", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Caches are widely used in embedded systems to bridge the increasing speed gap between processors and off-chip memory. However, caches make it significantly harder to compute the worst-case execution time (WCET) of a task. 
To alleviate this problem, cache locking has been proposed. We investigate the WCET-aware I-cache locking problem and propose a novel dynamic I-cache locking heuristic approach for reducing the WCET of a task. For a nonnested loop, our approach aims at selecting a minimum set of memory blocks of the loop as locked cache contents by using the min-cut algorithm. For a loop nest, our approach not only aims at selecting a minimum set of memory blocks of the loop nest as locked cache contents but also finds a good loading point for each selected memory block. We propose two algorithms for finding a good loading point for each selected memory block, a polynomial-time heuristic algorithm and an integer linear programming (ILP)-based algorithm, further reducing the WCET of each loop nest. We have implemented our approach and compared it to two state-of-the-art I-cache locking approaches by using a set of benchmarks from the MRTC benchmark suite. The experimental results show that the polynomial-time heuristic algorithm for finding a good loading point for each selected memory block performs almost equally as well as the ILP-based algorithm. Compared to the partial locking approach proposed in Ding et al. [2012], our approach using the heuristic algorithm achieves the average improvements of 33\%, 15\%, 9\%, 3\%, 8\%, and 11\% for the 256B, 512B, 1KB, 4KB, 8KB, and 16KB caches, respectively. Compared to the dynamic locking approach proposed in Puaut [2006], it achieves the average improvements of 9\%, 19\%, 18\%, 5\%, 11\%, and 16\% for the 256B, 512B, 1KB, 4KB, 8KB, and 16KB caches, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2017:EJV, author = "Byung-Sun Yang and Jae-Yun Kim and Soo-Mook Moon", title = "Exceptionization: a {Java} {VM} Optimization for Non-{Java} Languages", journal = j-TACO, volume = "14", number = "1", pages = "5:1--5:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3046681", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", abstract = "Java virtual machine (JVM) has recently evolved into a general-purpose language runtime environment to execute popular programming languages such as JavaScript, Ruby, Python, and Scala. These languages have complex non-Java features, including dynamic typing and first-class function, so additional language runtimes (engines) are provided on top of the JVM to support them with bytecode extensions. Although there are high-performance JVMs with powerful just-in-time (JIT) compilers, running these languages efficiently on the JVM is still a challenge. This article introduces a simple and novel technique for the JVM JIT compiler called exceptionization to improve the performance of JVM-based language runtimes. We observed that the JVM executing some non-Java languages encounters at least 2 times more branch bytecodes than Java, most of which are highly biased to take only one target. Exceptionization treats such a highly biased branch as some implicit exception-throwing instruction. This allows the JVM JIT compiler to prune the infrequent target of the branch from the frequent control flow, thus compiling the frequent control flow more aggressively with better optimization. 
If a pruned path were taken, then it would run like a Java exception handler, that is, a catch block. We also devised de-exceptionization, a mechanism to cope with the case when a pruned path is executed more often than expected. Since exceptionization is a generic JVM optimization, independent of any specific language runtime, it would be generally applicable to other language runtimes on the JVM. Our experimental result shows that exceptionization accelerates the performance of several non-Java languages. For example, JavaScript-on-JVM runs faster by as much as 60\% and by 6\% on average, when experimented with the Octane benchmark suite on Oracle's latest Nashorn JavaScript engine and HotSpot 1.9 JVM. Furthermore, the performance of Ruby-on-JVM shows an improvement by as much as 60\% and by 6\% on average, while Python-on-JVM improves by as much as 6\% and by 2\% on average. We found that exceptionization is more effective to apply to the branch bytecode of the language runtime itself than the bytecode corresponding to the application code or the bytecode of the Java class libraries. This implies that the performance benefit of exceptionization comes from better JIT compilation of the language runtime of non-Java languages.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sen:2017:PGE, author = "Rathijit Sen and David A. 
Wood", title = "{Pareto} Governors for Energy-Optimal Computing", journal = j-TACO, volume = "14", number = "1", pages = "6:1--6:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3046682", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The original definition of energy-proportional computing does not characterize the energy efficiency of recent reconfigurable computers, resulting in nonintuitive ``super-proportional'' behavior. This article introduces a new definition of ideal energy-proportional computing, new metrics to quantify computational energy waste, and new SLA-aware OS governors that seek Pareto optimality to achieve power-efficient performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chaudhuri:2017:MSC, author = "Mainak Chaudhuri and Mukesh Agrawal and Jayesh Gaur and Sreenivas Subramoney", title = "Micro-Sector Cache: Improving Space Utilization in Sectored {DRAM} Caches", journal = j-TACO, volume = "14", number = "1", pages = "7:1--7:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3046680", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent research proposals on DRAM caches with conventional allocation units (64 or 128 bytes) as well as large allocation units (512 bytes to 4KB) have explored ways to minimize the space/latency impact of the tag store and maximize the effective utilization of the bandwidth. 
In this article, we study sectored DRAM caches that exercise large allocation units called sectors, invest reasonably small storage to maintain tag/state, enable space- and bandwidth-efficient tag/state caching due to low tag working set size and large data coverage per tag element, and minimize main memory bandwidth wastage by fetching only the useful portions of an allocated sector. However, the sectored caches suffer from poor space utilization, since a large sector is always allocated even if the sector utilization is low. The recently proposed Unison cache addresses only a special case of this problem by not allocating the sectors that have only one active block. We propose Micro-sector cache, a locality-aware sectored DRAM cache architecture that features a flexible mechanism to allocate cache blocks within a sector and a locality-aware sector replacement algorithm. Simulation studies on a set of 30 16-way multi-programmed workloads show that our proposal, when incorporated in an optimized Unison cache baseline, improves performance (weighted speedup) by 8\%, 14\%, and 16\% on average, respectively, for 1KB, 2KB, and 4KB sectors at 128MB capacity. These performance improvements result from significantly better cache space utilization, leading to 18\%, 21\%, and 22\% average reduction in DRAM cache read misses, respectively, for 1KB, 2KB, and 4KB sectors at 128MB capacity. We evaluate our proposal for DRAM cache capacities ranging from 128MB to 1GB.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Georgiou:2017:ETD, author = "Kyriakos Georgiou and Steve Kerrison and Zbigniew Chamski and Kerstin Eder", title = "Energy Transparency for Deeply Embedded Programs", journal = j-TACO, volume = "14", number = "1", pages = "8:1--8:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3046679", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Energy transparency is a concept that makes a program's energy consumption visible, from hardware up to software, through the different system layers. Such transparency can enable energy optimizations at each layer and between layers, as well as help both programmers and operating systems make energy-aware decisions. In this article, we focus on deeply embedded devices, typically used for Internet of Things (IoT) applications, and demonstrate how to enable energy transparency through existing static resource analysis (SRA) techniques and a new target-agnostic profiling technique, without hardware energy measurements. Our novel mapping technique enables software energy consumption estimations at a higher level than the Instruction Set Architecture (ISA), namely the LLVM intermediate representation (IR) level, and therefore introduces energy transparency directly to the LLVM optimizer. We apply our energy estimation techniques to a comprehensive set of benchmarks, including single- and multithreaded embedded programs from two commonly used concurrency patterns: task farms and pipelines. Using SRA, our LLVM IR results demonstrate a high accuracy with a deviation in the range of 1\% from the ISA SRA. 
Our profiling technique captures the actual energy consumption at the LLVM IR level with an average error of 3\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2017:LLO, author = "Pengcheng Li and Xiaoyu Hu and Dong Chen and Jacob Brock and Hao Luo and Eddy Z. Zhang and Chen Ding", title = "{LD}: Low-Overhead {GPU} Race Detection Without Access Monitoring", journal = j-TACO, volume = "14", number = "1", pages = "9:1--9:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3046678", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Data race detection has become an important problem in GPU programming. Previous designs of CPU race-checking tools are mainly task parallel and incur high overhead on GPUs due to access instrumentation, especially when monitoring many thousands of threads routinely used by GPU programs. This article presents a novel data-parallel solution designed and optimized for the GPU architecture. It includes compiler support and a set of runtime techniques. It uses value-based checking, which detects the races reported in previous work, finds new races, and supports race-free deterministic GPU execution. More important, race checking is massively data parallel and does not introduce divergent branching or atomic synchronization. Its slowdown is less than $ 5 \times $ for over half of the tests and $ 10 \times $ on average, which is orders of magnitude more efficient than the cuda-memcheck tool by Nvidia and the methods that use fine-grained access instrumentation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Palangappa:2017:CCE, author = "Poovaiah M. Palangappa and Kartik Mohanram", title = "{CompEx++}: Compression-Expansion Coding for Energy, Latency, and Lifetime Improvements in {MLC\slash TLC NVMs}", journal = j-TACO, volume = "14", number = "1", pages = "10:1--10:??", month = apr, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3050440", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:58 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Multilevel/triple-level cell nonvolatile memories (MLC/TLC NVMs) such as phase-change memory (PCM) and resistive RAM (RRAM) are the subject of active research and development as replacement candidates for DRAM, which is limited by its high refresh power and poor scaling potential. In addition to the benefits of nonvolatility (low refresh power) and improved scalability, MLC/TLC NVMs offer high data density and memory capacity over DRAM. However, the viability of MLC/TLC NVMs is limited primarily due to the high programming energy and latency as well as the low endurance of NVM cells; these are primarily attributed to the iterative program-and-verify procedure necessary for programming the NVM cells. This article proposes compression-expansion (CompEx) coding, a low overhead scheme that synergistically integrates pattern-based compression with expansion coding to realize simultaneous energy, latency, and lifetime improvements in MLC/TLC NVMs. CompEx coding is agnostic to the choice of compression technique; in this work, we evaluate CompEx coding using both frequent pattern compression (FPC) and base-delta-immediate $ (B \Delta I) $ compression. 
CompEx coding integrates FPC/$ B \Delta I $ with $ (k, m)_q $ ``expansion'' coding; expansion codes are a class of $q$-ary linear block codes that encode data using only the low energy states of a $q$-ary NVM cell. CompEx coding simultaneously reduces energy and latency and improves lifetime for negligible-to-no memory overhead and negligible logic overhead ($ \approx $ 10k gates, which is $ < 0.1 \% $ per NVM module). Furthermore, we also propose CompEx++ coding, which extends CompEx coding by leveraging the variable compressibility of pattern-based compression techniques. CompEx++ coding integrates custom expansion codes to each of the compression patterns to exploit maximum energy/latency benefits of CompEx coding. Our full-system simulations using TLC RRAM show that CompEx/CompEx++ coding reduces total memory energy by 57\%/61\% and write latency by 23.5\%/26\%; these improvements translate to a 5.7\%/10.6\% improvement in IPC, a 11.8\%/19.9\% improvement in main memory bandwidth, and $ 1.8 \times $ improvement in lifetime over classical binary coding using data-comparison write. CompEx/CompEx++ coding thus addresses the programming energy/latency and lifetime challenges of MLC/TLC NVMs that pose a serious technological roadblock to their adoption in high-performance computing systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2017:DBT, author = "Dongwoo Lee and Sangheon Lee and Soojung Ryu and Kiyoung Choi", title = "Dirty-Block Tracking in a Direct-Mapped {DRAM} Cache with Self-Balancing Dispatch", journal = j-TACO, volume = "14", number = "2", pages = "11:1--11:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3068460", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recently, processors have begun integrating 3D stacked DRAMs with the cores on the same package, and there have been several approaches to effectively utilizing the on-package DRAMs as caches. This article presents an approach that combines the previous approaches in a synergistic way by devising a module called the dirty-block tracker to maintain the dirtiness of each block in a dirty region. The approach avoids unnecessary tag checking for a write operation if the corresponding block in the cache is not dirty. Our simulation results show that the proposed technique achieves a 10.3\% performance improvement on average over the state-of-the-art DRAM cache technique.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Parasyris:2017:SAP, author = "Konstantinos Parasyris and Vassilis Vassiliadis and Christos D. 
Antonopoulos and Spyros Lalis and Nikolaos Bellas", title = "Significance-Aware Program Execution on Unreliable Hardware", journal = j-TACO, volume = "14", number = "2", pages = "12:1--12:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3058980", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article introduces a significance-centric programming model and runtime support that sets the supply voltage in a multicore CPU to sub-nominal values to reduce the energy footprint and provide mechanisms to control output quality. The developers specify the significance of application tasks respecting their contribution to the output quality and provide check and repair functions for handling faults. On a multicore system, we evaluate five benchmarks using an energy model that quantifies the energy reduction. When executing the least-significant tasks unreliably, our approach leads to 20\% CPU energy reduction with respect to a reliable execution and has minimal quality degradation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mendonca:2017:DAA, author = "Gleison Mendon{\c{c}}a and Breno Guimar{\~a}es and P{\'e}ricles Alves and M{\'a}rcio Pereira and Guido Ara{\'u}jo and Fernando Magno Quint{\~a}o Pereira", title = "{DawnCC}: Automatic Annotation for Data Parallelism and Offloading", journal = j-TACO, volume = "14", number = "2", pages = "13:1--13:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084540", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Directive-based programming models, such as OpenACC and OpenMP, allow developers to convert a sequential program into a parallel one with minimum human intervention. However, inserting pragmas into production code is a difficult and error-prone task, often requiring familiarity with the target program. This difficulty restricts the ability of developers to annotate code that they have not written themselves. This article provides a suite of compiler-related methods to mitigate this problem. Such techniques rely on symbolic range analysis, a well-known static technique, to achieve two purposes: populate source code with data transfer primitives and to disambiguate pointers that could hinder automatic parallelization due to aliasing. We have materialized our ideas into a tool, DawnCC, which can be used stand-alone or through an online interface. To demonstrate its effectiveness, we show how DawnCC can annotate the programs available in PolyBench without any intervention from users. Such annotations lead to speedups of over $ 100 \times $ in an Nvidia architecture and over $ 50 \times $ in an ARM architecture.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Balasubramonian:2017:CNT, author = "Rajeev Balasubramonian and Andrew B. Kahng and Naveen Muralimanohar and Ali Shafiee and Vaishnav Srinivas", title = "{CACTI 7}: New Tools for Interconnect Exploration in Innovative Off-Chip Memories", journal = j-TACO, volume = "14", number = "2", pages = "14:1--14:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3085572", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Historically, server designers have opted for simple memory systems by picking one of a few commoditized DDR memory products. We are already witnessing a major upheaval in the off-chip memory hierarchy, with the introduction of many new memory products-buffer-on-board, LRDIMM, HMC, HBM, and NVMs, to name a few. Given the plethora of choices, it is expected that different vendors will adopt different strategies for their high-capacity memory systems, often deviating from DDR standards and/or integrating new functionality within memory systems. These strategies will likely differ in their choice of interconnect and topology, with a significant fraction of memory energy being dissipated in I/O and data movement. To make the case for memory interconnect specialization, this paper makes three contributions. First, we design a tool that carefully models I/O power in the memory system, explores the design space, and gives the user the ability to define new types of memory interconnects/topologies. The tool is validated against SPICE models, and is integrated into version 7 of the popular CACTI package. Our analysis with the tool shows that several design parameters have a significant impact on I/O power. 
We then use the tool to help craft novel specialized memory system channels. We introduce a new relay-on-board chip that partitions a DDR channel into multiple cascaded channels. We show that this simple change to the channel topology can improve performance by 22\% for DDR DRAM and lower cost by up to 65\% for DDR DRAM. This new architecture does not require any changes to DIMMs, and it efficiently supports hybrid DRAM/NVM systems. Finally, as an example of a more disruptive architecture, we design a custom DIMM and parallel bus that moves away from the DDR3/DDR4 standards. To reduce energy and improve performance, the baseline data channel is split into three narrow parallel channels and the on-DIMM interconnects are operated at a lower frequency. In addition, this allows us to design a two-tier error protection strategy that reduces data transfers on the interconnect. This architecture yields a performance improvement of 18\% and a memory power reduction of 23\%. The cascaded channel and narrow channel architectures serve as case studies for the new tool and show the potential for benefit from re-organizing basic memory interconnects.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jatala:2017:SSG, author = "Vishwesh Jatala and Jayvant Anantpur and Amey Karkare", title = "Scratchpad Sharing in {GPUs}", journal = j-TACO, volume = "14", number = "2", pages = "15:1--15:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3075619", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "General-Purpose Graphics Processing Unit (GPGPU) applications exploit on-chip scratchpad memory available in the Graphics Processing Units (GPUs) to improve performance. The amount of thread level parallelism (TLP) present in the GPU is limited by the number of resident threads, which in turn depends on the availability of scratchpad memory in its streaming multiprocessor (SM). Since the scratchpad memory is allocated at thread block granularity, part of the memory may remain unutilized. In this article, we propose architectural and compiler optimizations to improve the scratchpad memory utilization. Our approach, called Scratchpad Sharing, addresses scratchpad under-utilization by launching additional thread blocks in each SM. These thread blocks use unutilized scratchpad memory and also share scratchpad memory with other resident blocks. To improve the performance of scratchpad sharing, we propose Owner Warp First (OWF) scheduling that schedules warps from the additional thread blocks effectively. The performance of this approach, however, is limited by the availability of the part of scratchpad memory that is shared among thread blocks. We propose compiler optimizations to improve the availability of shared scratchpad memory. 
We describe an allocation scheme that helps in allocating scratchpad variables such that shared scratchpad is accessed for short duration. We introduce a new hardware instruction, relssp, that when executed releases the shared scratchpad memory. Finally, we describe an analysis for optimal placement of relssp instructions, such that shared scratchpad memory is released as early as possible, but only after its last use, along every execution path. We implemented the hardware changes required for scratchpad sharing and the relssp instruction using the GPGPU-Sim simulator and implemented the compiler optimizations in Ocelot framework. We evaluated the effectiveness of our approach on 19 kernels from 3 benchmarks suites: CUDA-SDK, GPGPU-Sim, and Rodinia. The kernels that under-utilize scratchpad memory show an average improvement of 19\% and maximum improvement of 92.17\% in terms of the number of instruction executed per cycle when compared to the baseline approach, without affecting the performance of the kernels that are not limited by scratchpad memory.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ham:2017:DDS, author = "Tae Jun Ham and Juan L. Arag{\'o}n and Margaret Martonosi", title = "Decoupling Data Supply from Computation for Latency-Tolerant Communication in Heterogeneous Architectures", journal = j-TACO, volume = "14", number = "2", pages = "16:1--16:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3075620", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In today's computers, heterogeneous processing is used to meet performance targets at manageable power. 
In adopting increased compute specialization, however, the relative amount of time spent on communication increases. System and software optimizations for communication often come at the costs of increased complexity and reduced portability. The Decoupled Supply-Compute (DeSC) approach offers a way to attack communication latency bottlenecks automatically, while maintaining good portability and low complexity. Our work expands prior Decoupled Access Execute techniques with hardware/software specialization. For a range of workloads, DeSC offers roughly 2 $ \times $ speedup, and additional specialized compression optimizations reduce traffic between decoupled units by 40\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Stanic:2017:IVS, author = "Milan Stanic and Oscar Palomar and Timothy Hayes and Ivan Ratkovic and Adrian Cristal and Osman Unsal and Mateo Valero", title = "An Integrated Vector-Scalar Design on an In-Order {ARM} Core", journal = j-TACO, volume = "14", number = "2", pages = "17:1--17:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3075618", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In the low-end mobile processor market, power, energy, and area budgets are significantly lower than in the server/desktop/laptop/high-end mobile markets. It has been shown that vector processors are a highly energy-efficient way to increase performance; however, adding support for them incurs area and power overheads that would not be acceptable for low-end mobile processors. 
In this work, we propose an integrated vector-scalar design for the ARM architecture that mostly reuses scalar hardware to support the execution of vector instructions. The key element of the design is our proposed block-based model of execution that groups vector computational instructions together to execute them in a coordinated manner. We implemented a classic vector unit and compare its results against our integrated design. Our integrated design improves the performance (more than $ 6 \times $) and energy consumption (up to $ 5 \times $) of a scalar in-order core with negligible area overhead (only 4.7\% when using a vector register with 32 elements). In contrast, the area overhead of the classic vector unit can be significant (around 44\%) if a dedicated vector floating-point unit is incorporated. Our block-based vector execution outperforms the classic vector unit for all kernels with floating-point data and also consumes less energy. We also complement the integrated design with three energy/performance-efficient techniques that further reduce power and increase performance. The first proposal covers the design and implementation of chaining logic that is optimized to work with the cache hierarchy through vector memory instructions, the second proposal reduces the number of reads/writes from/to the vector register file, and the third idea optimizes complex memory access patterns with the memory shape instruction and unified indexed vector load.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Endo:2017:IBV, author = "Fernando A. 
Endo and Arthur Perais and Andr{\'e} Seznec", title = "On the Interactions Between Value Prediction and Compiler Optimizations in the Context of {EOLE}", journal = j-TACO, volume = "14", number = "2", pages = "18:1--18:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3090634", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Increasing instruction-level parallelism is regaining attractiveness within the microprocessor industry. The {Early | Out-of-order | Late} Execution (EOLE) microarchitecture and Differential Value TAgged GEometric (D-VTAGE) value predictor were recently introduced to solve practical issues of Value Prediction (VP). In particular, they remove the most significant difficulties that forbade an effective VP hardware. In this study, we present a detailed evaluation of the potential of VP in the context of EOLE/D-VTAGE and different compiler options. Our study shows that if no single general rule always applies-more optimization might sometimes lead to more performance-unoptimized codes often get a large benefit from the prediction of redundant loads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sridharan:2017:BPP, author = "Aswinkumar Sridharan and Biswabandan Panda and Andr{\'e} Seznec", title = "Band-Pass Prefetching: An Effective Prefetch Management Mechanism Using Prefetch-Fraction Metric in Multi-Core Systems", journal = j-TACO, volume = "14", number = "2", pages = "19:1--19:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3090635", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In multi-core systems, an application's prefetcher can interfere with the memory requests of other applications using the shared resources, such as last level cache and memory bandwidth. In order to minimize prefetcher-caused interference, prior mechanisms have been proposed to dynamically control prefetcher aggressiveness at runtime. These mechanisms use several parameters to capture prefetch usefulness as well as prefetcher-caused interference, performing aggressive control decisions. However, these mechanisms do not capture the actual interference at the shared resources and most often lead to incorrect aggressiveness control decisions. Therefore, prior works leave scope for performance improvement. Toward this end, we propose a solution to manage prefetching in multicore systems. In particular, we make two fundamental observations: First, a positive correlation exists between the accuracy of a prefetcher and the amount of prefetch requests it generates relative to an application's total (demand and prefetch) requests. Second, a strong positive correlation exists between the ratio of total prefetch to demand requests and the ratio of average last level cache miss service times of demand to prefetch requests.
In this article, we propose Band-pass prefetching that builds on those two observations, a simple and low-overhead mechanism to effectively manage prefetchers in multicore systems. Our solution consists of local and global prefetcher aggressiveness control components, which altogether, control the flow of prefetch requests between a range of prefetch to demand requests ratios. From our experiments on 16-core multi-programmed workloads, on systems using stream prefetching, we observe that Band-pass prefetching achieves 12.4\% (geometric-mean) improvement on harmonic speedup over the baseline that implements no prefetching, while aggressive prefetching without prefetcher aggressiveness control and state-of-the-art HPAC, P-FST, and CAFFEINE achieve 8.2\%, 8.4\%, 1.4\%, and 9.7\%, respectively. Further evaluation of the proposed Band-pass prefetching mechanism on systems using AMPM prefetcher shows similar performance trends. For a 16-core system, Band-pass prefetching requires only a modest hardware cost of 239 bytes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Goens:2017:SSS, author = "Andr{\'e}s Goens and Sergio Siccha and Jeronimo Castrillon", title = "Symmetry in Software Synthesis", journal = j-TACO, volume = "14", number = "2", pages = "20:1--20:??", month = jul, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3095747", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jul 24 18:00:59 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the surge of multi- and many-core systems, much research has focused on algorithms for mapping and scheduling on these complex platforms. Large classes of these algorithms face scalability problems. 
This is why diverse methods are commonly used for reducing the search space. While most such approaches leverage the inherent symmetry of architectures and applications, they do it in a problem-specific and intuitive way. However, intuitive approaches become impractical with growing hardware complexity, like Network-on-Chip interconnect or heterogeneous cores. In this article, we present a formal framework that can determine the inherent local and global symmetry of architectures and applications algorithmically and leverage these for problems in software synthesis. Our approach is based on the mathematical theory of groups and a generalization called inverse semigroups. We evaluate our approach in two state-of-the-art mapping frameworks. Even for the platforms with a handful of cores of today and moderate-sized benchmarks, our approach consistently yields reductions of the overall execution time of algorithms. We obtain a speedup of more than $ 10 \times $ for one use-case and saved 10\% of time in another.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vocke:2017:EHI, author = "Sander Vocke and Henk Corporaal and Roel Jordans and Rosilde Corvino and Rick Nas", title = "Extending {Halide} to Improve Software Development for Imaging {DSPs}", journal = j-TACO, volume = "14", number = "3", pages = "21:1--21:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106343", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Specialized Digital Signal Processors (DSPs), which can be found in a wide range of modern devices, play an important role in power-efficient, high-performance image processing. 
Applications including camera sensor post-processing and computer vision benefit from being (partially) mapped onto such DSPs. However, due to their specialized instruction sets and dependence on low-level code optimization, developing applications for DSPs is more time-consuming and error-prone than for general-purpose processors. Halide is a domain-specific language (DSL) that enables low-effort development of portable, high-performance imaging pipelines-a combination of qualities that is hard, if not impossible, to find among DSP programming models. We propose a set of extensions and modifications to Halide to generate code for DSP C compilers, focusing specifically on diverse SIMD target instruction sets and heterogeneous scratchpad memory hierarchies. We implement said techniques for a commercial DSP found in an Intel Image Processing Unit (IPU), demonstrating that this solution can be used to achieve performance within 20\% of highly tuned, manually written C code, while leading to a reduction in code complexity. By comparing performance of Halide algorithms using our solution to results on CPU and GPU targets, we confirm the value of using DSP targets with Halide.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jensen:2017:ILD, author = "Nicklas Bo Jensen and Sven Karlsson", title = "Improving Loop Dependence Analysis", journal = j-TACO, volume = "14", number = "3", pages = "22:1--22:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3095754", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Programmers can no longer depend on new processors to have significantly improved single-thread performance. 
Instead, gains have to come from other sources such as the compiler and its optimization passes. Advanced passes make use of information on the dependencies related to loops. We improve the quality of that information by reusing the information given by the programmer for parallelization. We have implemented a prototype based on GCC into which we also add a new optimization pass. Our approach improves the amount of correctly classified dependencies resulting in 46\% average improvement in single-thread performance for kernel benchmarks compared to GCC 6.1.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ganser:2017:ISO, author = "Stefan Ganser and Armin Gr{\"o}sslinger and Norbert Siegmund and Sven Apel and Christian Lengauer", title = "Iterative Schedule Optimization for Parallelization in the Polyhedron Model", journal = j-TACO, volume = "14", number = "3", pages = "23:1--23:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3109482", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The polyhedron model is a powerful model to identify and apply systematically loop transformations that improve data locality (e.g., via tiling) and enable parallelization. In the polyhedron model, a loop transformation is, essentially, represented as an affine function. Well-established algorithms for the discovery of promising transformations are based on performance models. These algorithms have the drawback of not being easily adaptable to the characteristics of a specific program or target hardware. An iterative search for promising loop transformations is more easily adaptable and can help to learn better models. 
We present an iterative optimization method in the polyhedron model that targets tiling and parallelization. The method enables either a sampling of the search space of legal loop transformations at random or a more directed search via a genetic algorithm. For the latter, we propose a set of novel, tailored reproduction operators. We evaluate our approach against existing iterative and model-driven optimization strategies. We compare the convergence rate of our genetic algorithm to that of random exploration. Our approach of iterative optimization outperforms existing optimization techniques in that it finds loop transformations that yield significantly higher performance. If well configured, then random exploration turns out to be very effective and reduces the need for a genetic algorithm.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wei:2017:HHM, author = "Wei Wei and Dejun Jiang and Jin Xiong and Mingyu Chen", title = "{HAP}: Hybrid-Memory-Aware Partition in Shared Last-Level Cache", journal = j-TACO, volume = "14", number = "3", pages = "24:1--24:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106340", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Data-center servers benefit from large-capacity memory systems to run multiple processes simultaneously. Hybrid DRAM-NVM memory is attractive for increasing memory capacity by exploiting the scalability of Non-Volatile Memory (NVM). However, current LLC policies are unaware of hybrid memory. Cache misses to NVM introduce high cost due to long NVM latency. Moreover, evicting dirty NVM data suffer from long write latency. 
We propose hybrid memory aware cache partitioning to dynamically adjust cache spaces and give NVM dirty data more chances to reside in LLC. Experimental results show Hybrid-memory-Aware Partition (HAP) improves performance by 46.7\% and reduces energy consumption by 21.9\% on average against LRU management. Moreover, HAP averagely improves performance by 9.3\% and reduces energy consumption by 6.4\% against a state-of-the-art cache mechanism.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xiong:2017:PPP, author = "Dongliang Xiong and Kai Huang and Xiaowen Jiang and Xiaolang Yan", title = "Providing Predictable Performance via a Slowdown Estimation Model", journal = j-TACO, volume = "14", number = "3", pages = "25:1--25:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3124451", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Interapplication interference at shared main memory slows down different applications differently. A few slowdown estimation models have been proposed to provide predictable performance by quantifying memory interference, but they have relatively low accuracy. Thus, we propose a more accurate slowdown estimation model called SEM at main memory. First, SEM unifies the slowdown estimation model by measuring IPC directly. Second, SEM uses the per-bank structure to monitor memory interference and improves estimation accuracy by considering write interference, row-buffer interference, and data bus interference. The evaluation results show that SEM has significantly lower slowdown estimation error (4.06\%) compared to STFM (30.15\%) and MISE (10.1\%).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pu:2017:PHS, author = "Jing Pu and Steven Bell and Xuan Yang and Jeff Setter and Stephen Richardson and Jonathan Ragan-Kelley and Mark Horowitz", title = "Programming Heterogeneous Systems from an Image Processing {DSL}", journal = j-TACO, volume = "14", number = "3", pages = "26:1--26:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3107953", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Specialized image processing accelerators are necessary to deliver the performance and energy efficiency required by important applications in computer vision, computational photography, and augmented reality. But creating, ``programming,'' and integrating this hardware into a hardware/software system is difficult. We address this problem by extending the image processing language Halide so users can specify which portions of their applications should become hardware accelerators, and then we provide a compiler that uses this code to automatically create the accelerator along with the ``glue'' code needed for the user's application to access this hardware. Starting with Halide not only provides a very high-level functional description of the hardware but also allows our compiler to generate a complete software application, which accesses the hardware for acceleration when appropriate. Our system also provides high-level semantics to explore different mappings of applications to a heterogeneous system, including the flexibility of being able to change the throughput rate of the generated hardware. We demonstrate our approach by mapping applications to a commercial Xilinx Zynq system. 
Using its FPGA with two low-power ARM cores, our design achieves up to 6$ \times $ higher performance and 38$ \times $ lower energy compared to the quad-core ARM CPU on an NVIDIA Tegra K1, and 3.5$ \times $ higher performance with 12$ \times $ lower energy compared to the K1's 192-core GPU.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hroub:2017:EGC, author = "Ayman Hroub and M. E. S. Elrabaa and M. F. Mudawar and A. Khayyat", title = "Efficient Generation of Compact Execution Traces for Multicore Architectural Simulations", journal = j-TACO, volume = "14", number = "3", pages = "27:1--27:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106342", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Requiring no functional simulation, trace-driven simulation has the potential of achieving faster simulation speeds than execution-driven simulation of multicore architectures. An efficient, on-the-fly, high-fidelity trace generation method for multithreaded applications is reported. The generated trace is encoded in an instruction-like binary format that can be directly ``interpreted'' by a timing simulator to simulate a general load/store or x86-like architecture. A complete tool suite that has been developed and used for evaluation of the proposed method showed that it produces smaller traces over existing trace compression methods while retaining good fidelity including all threading- and synchronization-related events.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Weber:2017:MAL, author = "Nicolas Weber and Michael Goesele", title = "{MATOG}: Array Layout Auto-Tuning for {CUDA}", journal = j-TACO, volume = "14", number = "3", pages = "28:1--28:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3106341", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Optimal code performance is (besides correctness and accuracy) the most important objective in compute intensive applications. In many of these applications, Graphic Processing Units (GPUs) are used because of their high amount of compute power. However, caused by their massively parallel architecture, the code has to be specifically adjusted to the underlying hardware to achieve optimal performance and therefore has to be reoptimized for each new generation. In reality, this is usually not the case as productive code is normally at least several years old and nobody has the time to continuously adjust existing code to new hardware. In recent years more and more approaches have emerged that automatically tune the performance of applications toward the underlying hardware. In this article, we present the MATOG auto-tuner and its concepts. It abstracts the array memory access in CUDA applications and automatically optimizes the code according to the used GPUs. MATOG only requires few profiling runs to analyze even complex applications, while achieving significant speedups over non-optimized code, independent of the used GPU generation and without the need to manually tune the code.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ashouri:2017:MMC, author = "Amir H. Ashouri and Andrea Bignoli and Gianluca Palermo and Cristina Silvano and Sameer Kulkarni and John Cavazos", title = "{MiCOMP}: Mitigating the Compiler Phase-Ordering Problem Using Optimization Sub-Sequences and Machine Learning", journal = j-TACO, volume = "14", number = "3", pages = "29:1--29:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3124452", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent compilers offer a vast number of multilayered optimizations targeting different code segments of an application. Choosing among these optimizations can significantly impact the performance of the code being optimized. The selection of the right set of compiler optimizations for a particular code segment is a very hard problem, but finding the best ordering of these optimizations adds further complexity. Finding the best ordering represents a long standing problem in compilation research, named the phase-ordering problem. The traditional approach of constructing compiler heuristics to solve this problem simply cannot cope with the enormous complexity of choosing the right ordering of optimizations for every code segment in an application. This article proposes an automatic optimization framework we call MiCOMP, which Mitigates the COMpiler Phase-ordering problem. We perform phase ordering of the optimizations in LLVM's highest optimization level using optimization sub-sequences and machine learning. 
The idea is to cluster the optimization passes of LLVM's O3 setting into different clusters to predict the speedup of a complete sequence of all the optimization clusters instead of having to deal with the ordering of more than 60 different individual optimizations. The predictive model uses (1) dynamic features, (2) an encoded version of the compiler sequence, and (3) an exploration heuristic to tackle the problem. Experimental results using the LLVM compiler framework and the Cbench suite show the effectiveness of the proposed clustering and encoding techniques to application-based reordering of passes, while using a number of predictive models. We perform statistical analysis on the results and compare against (1) random iterative compilation, (2) standard optimization levels, and (3) two recent prediction approaches. We show that MiCOMP's iterative compilation using its sub-sequences can reach an average performance speedup of 1.31 (up to 1.51). Additionally, we demonstrate that MiCOMP's prediction model outperforms the -O1, -O2, and -O3 optimization levels within using just a few predictions and reduces the prediction error rate down to only 5\%. Overall, it achieves 90\% of the available speedup by exploring less than 0.001\% of the optimization space.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vermij:2017:AIN, author = "Erik Vermij and Leandro Fiorin and Rik Jongerius and Christoph Hagleitner and Jan {Van Lunteren} and Koen Bertels", title = "An Architecture for Integrated Near-Data Processors", journal = j-TACO, volume = "14", number = "3", pages = "30:1--30:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3127069", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To increase the performance of data-intensive applications, we present an extension to a CPU architecture that enables arbitrary near-data processing capabilities close to the main memory. This is realized by introducing a component attached to the CPU system-bus and a component at the memory side. Together they support hardware-managed coherence and virtual memory support to integrate the near-data processors in a shared-memory environment. We present an implementation of the components, as well as a system-simulator, providing detailed performance estimations. With a variety of synthetic workloads we demonstrate the performance of the memory accesses, the mixed fine- and coarse-grained coherence mechanisms, and the near-data processor communication mechanism. Furthermore, we quantify the inevitable start-up penalty regarding coherence and data writeback, and argue that near-data processing workloads should access data several times to offset this penalty. A case study based on the Graph500 benchmark confirms the small overhead for the proposed coherence mechanisms and shows the ability to outperform a real CPU by a factor of two.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Diavastos:2017:SLR, author = "Andreas Diavastos and Pedro Trancoso", title = "{SWITCHES}: a Lightweight Runtime for Dataflow Execution of Tasks on Many-Cores", journal = j-TACO, volume = "14", number = "3", pages = "31:1--31:??", month = sep, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3127068", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Sep 6 17:12:05 MDT 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "SWITCHES is a task-based dataflow runtime that implements a lightweight distributed triggering system for runtime dependence resolution and uses static scheduling and compile-time assignment policies to reduce runtime overheads. Unlike other systems, the granularity of loop-tasks can be increased to favor data-locality, even when having dependences across different loops. SWITCHES introduces explicit task resource allocation mechanisms for efficient allocation of resources and adopts the latest OpenMP Application Programming Interface (API), as to maintain high levels of programming productivity. It provides a source-to-source tool that automatically produces thread-based code. Performance on an Intel Xeon-Phi shows good scalability and surpasses OpenMP by an average of 32\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jain:2017:CMA, author = "Rahul Jain and Preeti Ranjan Panda and Sreenivas Subramoney", title = "Cooperative Multi-Agent Reinforcement Learning-Based Co-optimization of Cores, Caches, and On-chip Network", journal = j-TACO, volume = "14", number = "4", pages = "32:1--32:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3132170", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern multi-core systems provide huge computational capabilities, which can be used to run multiple processes concurrently. To achieve the best possible performance within limited power budgets, the various system resources need to be allocated effectively. Any mismatch between runtime resource requirement and allocation leads to a sub-optimal energy-delay product (EDP). Different optimization techniques exist for addressing the problem of mismatch between the dynamic requirement and runtime allocation of the system resources. Choosing between multiple optimizations at runtime is complex due to the non-additive effects, making the scenario suitable for the application of machine learning techniques. We present a novel method, Machine Learned Machines (MLM), by using online reinforcement learning (RL) to perform dynamic partitioning of the last level cache (LLC), along with dynamic voltage and frequency scaling (DVFS) of the core and uncore (interconnection network and LLC). We have proposed and evaluated three different MLM co-optimization techniques based on independent and cooperative multi-agent learners. We show that the co-optimization results in a much lower system EDP than any of the techniques applied individually. 
We explore various RL models targeted toward optimization of different system metrics and study their effects on a system EDP, system throughput (STP), and Fairness. The various proposed techniques have been extensively evaluated with a mix of 20 workloads on a 4-core system using Spec2006 benchmarks. We have further evaluated our cooperative MLM techniques on a 16-core system. The results show an average of 20.5\% and 19.1\% system EDP improvement on a 4-core and 16-core system, respectively, with limited degradation of STP and Fairness.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{DeSensi:2017:BPP, author = "Daniele {De Sensi} and Tiziano {De Matteis} and Massimo Torquati and Gabriele Mencagli and Marco Danelutto", title = "Bringing Parallel Patterns Out of the Corner: The {P$^3$ARSEC} Benchmark Suite", journal = j-TACO, volume = "14", number = "4", pages = "33:1--33:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3132710", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "High-level parallel programming is an active research topic aimed at promoting parallel programming methodologies that provide the programmer with high-level abstractions to develop complex parallel software with reduced time to solution. Pattern-based parallel programming is based on a set of composable and customizable parallel patterns used as basic building blocks in parallel applications. In recent years, a considerable effort has been made in empowering this programming model with features able to overcome shortcomings of early approaches concerning flexibility and performance. 
In this article, we demonstrate that the approach is flexible and efficient enough by applying it on 12 out of 13 PARSEC applications. Our analysis, conducted on three different multicore architectures, demonstrates that pattern-based parallel programming has reached a good level of maturity, providing comparable results in terms of performance with respect to both other parallel programming methodologies based on pragma-based annotations (i.e., OpenMP and OmpSs) and native implementations (i.e., Pthreads). Regarding the programming effort, we also demonstrate a considerable reduction in lines of code and code churn compared to Pthreads and comparable results with respect to other existing implementations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ye:2017:CES, author = "Chencheng Ye and Chen Ding and Hao Luo and Jacob Brock and Dong Chen and Hai Jin", title = "Cache Exclusivity and Sharing: Theory and Optimization", journal = j-TACO, volume = "14", number = "4", pages = "34:1--34:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3134437", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A problem on multicore systems is cache sharing, where the cache occupancy of a program depends on the cache usage of peer programs. Exclusive cache hierarchy as used on AMD processors is an effective solution to allow processor cores to have a large private cache while still benefitting from shared cache. The shared cache stores the ``victims'' (i.e., data evicted from private caches). The performance depends on how victims of co-run programs interact in shared cache. 
This article presents a new metric called the victim footprint (VFP). It is measured once per program in its solo execution and can then be combined to compute the performance of any exclusive cache hierarchy, replacing parallel testing with theoretical analysis. The work evaluates the VFP by using it to analyze cache sharing by parallel mixes of sequential programs, comparing the accuracy of the theory to hardware counter results, and measuring the benefit of exclusivity-aware analysis and optimization.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shrivastava:2017:EEC, author = "Rahul Shrivastava and V. Krishna Nandivada", title = "Energy-Efficient Compilation of Irregular Task-Parallel Loops", journal = j-TACO, volume = "14", number = "4", pages = "35:1--35:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3136063", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Energy-efficient compilation is an important problem for multi-core systems. In this context, irregular programs with task-parallel loops present interesting challenges: the threads with lesser work-loads (non-critical-threads) wait at the join-points for the thread with maximum work-load (critical-thread); this leads to significant energy wastage. This problem becomes more interesting in the context of multi-socket-multi-core (MSMC) systems, where different sockets may run at different frequencies, but all the cores connected to a socket run at a single frequency. 
In such a configuration, even though the load-imbalance among the cores may be significant, an MSMC-oblivious technique may miss the opportunities to reduce energy consumption, if the load-imbalance across the sockets is minimal. This problem becomes further challenging in the presence of mutual-exclusion, where scaling the frequencies of a socket executing the non-critical-threads can impact the execution time of the critical-threads. In this article, we propose a scheme (X10Ergy) to obtain energy gains with minimal impact on the execution time, for task-parallel languages, such as X10, HJ, and so on. X10Ergy takes as input a loop-chunked program (parallel-loop iterations divided into chunks and each chunk is executed by a unique thread). X10Ergy follows a mixed compile-time + runtime approach that (i) uses static analysis to efficiently compute the work-load of each chunk at runtime, (ii) computes the ``remaining'' work-load of the chunks running on the cores of each socket at regular intervals and tunes the frequency of the sockets accordingly, (iii) groups the threads into different sockets (based on the remaining work-load of their respective chunks), and (iv) in the presence of atomic-blocks, models the effect of frequency-scaling on the critical-thread. We implemented X10Ergy for X10 and have obtained encouraging results for the IMSuite kernels.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Proy:2017:CAL, author = "Julien Proy and Karine Heydemann and Alexandre Berzati and Albert Cohen", title = "Compiler-Assisted Loop Hardening Against Fault Attacks", journal = j-TACO, volume = "14", number = "4", pages = "36:1--36:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3141234", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Secure elements widely used in smartphones, digital consumer electronics, and payment systems are subject to fault attacks. To thwart such attacks, software protections are manually inserted requiring experts and time. The explosion of the Internet of Things (IoT) in home, business, and public spaces motivates the hardening of a wider class of applications and the need to offer security solutions to non-experts. This article addresses the automated protection of loops at compilation time, covering the widest range of control- and data-flow patterns, in both shape and complexity. The security property we consider is that a sensitive loop must always perform the expected number of iterations; otherwise, an attack must be reported. We propose a generic compile-time loop hardening scheme based on the duplication of termination conditions and of the computations involved in the evaluation of such conditions. We also investigate how to preserve the security property along the compilation flow while enabling aggressive optimizations. We implemented this algorithm in LLVM 4.0 at the Intermediate Representation (IR) level in the backend. On average, the compiler automatically hardens 95\% of the sensitive loops of typical security benchmarks, and 98\% of these loops are shown to be robust to simulated faults. 
Performance and code size overhead remain quite affordable, at 12.5\% and 14\%, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Peterson:2017:TCT, author = "Christina Peterson and Damian Dechev", title = "A Transactional Correctness Tool for Abstract Data Types", journal = j-TACO, volume = "14", number = "4", pages = "37:1--37:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3148964", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Transactional memory simplifies multiprocessor programming by providing the guarantee that a sequential block of code in the form of a transaction will exhibit atomicity and isolation. Transactional data structures offer the same guarantee to concurrent data structures by enabling the atomic execution of a composition of operations. The concurrency control of transactional memory systems preserves atomicity and isolation by detecting read/write conflicts among multiple concurrent transactions. State-of-the-art transactional data structures improve on this concurrency control protocol by providing explicit transaction-level synchronization for only non-commutative operations. Since read/write conflicts are handled by thread-level concurrency control, the correctness of transactional data structures cannot be evaluated according to the read/write histories. This presents a challenge for existing correctness verification techniques for transactional memory, because correctness is determined according to the transitions taken by the transactions in the presence of read/write conflicts. 
In this article, we present Transactional Correctness tool for Abstract Data Types (TxC-ADT), the first tool that can check the correctness of transactional data structures. TxC-ADT elevates the standard definitions of transactional correctness to be in terms of an abstract data type, an essential aspect for checking correctness of transactions that synchronize only for high-level semantic conflicts. To accommodate a diverse assortment of transactional correctness conditions, we present a technique for defining correctness as a happens-before relation. Defining a correctness condition in this manner enables an automated approach in which correctness is evaluated by generating and analyzing a transactional happens-before graph during model checking. A transactional happens-before graph is maintained on a per-thread basis, making our approach applicable to transactional correctness conditions that do not enforce a total order on a transactional execution. We demonstrate the practical applications of TxC-ADT by checking Lock Free Transactional Transformation and Transactional Data Structure Libraries for serializability, strict serializability, opacity, and causal consistency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ferroni:2017:PCM, author = "Matteo Ferroni and Andrea Corna and Andrea Damiani and Rolando Brondolin and Juan A. Colmenares and Steven Hofmeyr and John D. Kubiatowicz and Marco D. 
Santambrogio", title = "Power Consumption Models for Multi-Tenant Server Infrastructures", journal = j-TACO, volume = "14", number = "4", pages = "38:1--38:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3148965", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Multi-tenant virtualized infrastructures allow cloud providers to minimize costs through workload consolidation. One of the largest costs is power consumption, which is challenging to understand in heterogeneous environments. We propose a power modeling methodology that tackles this complexity using a divide-and-conquer approach. Our results outperform previous research work, achieving a relative error of 2\% on average and under 4\% in almost all cases. Models are portable across similar architectures, enabling predictions of power consumption before migrating a tenant to a different hardware platform. Moreover, we show the models allow us to evaluate colocations of tenants to reduce overall consumption.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mohammadi:2017:COE, author = "Milad Mohammadi and Tor M. Aamodt and William J. 
Dally", title = "{CG-OoO}: Energy-Efficient Coarse-Grain Out-of-Order Execution Near In-Order Energy with Near Out-of-Order Performance", journal = j-TACO, volume = "14", number = "4", pages = "39:1--39:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3151034", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We introduce the Coarse-Grain Out-of-Order (CG-OoO) general-purpose processor designed to achieve close to In-Order (InO) processor energy while maintaining Out-of-Order (OoO) performance. CG-OoO is an energy-performance-proportional architecture. Block-level code processing is at the heart of this architecture; CG-OoO speculates, fetches, schedules, and commits code at block-level granularity. It eliminates unnecessary accesses to energy-consuming tables and turns large tables into smaller, distributed tables that are cheaper to access. CG-OoO leverages compiler-level code optimizations to deliver efficient static code and exploits dynamic block-level and instruction-level parallelism. CG-OoO introduces Skipahead, a complexity effective, limited out-of-order instruction scheduling model. Through the energy efficiency techniques applied to the compiler and processor pipeline stages, CG-OoO closes 62\% of the average energy gap between the InO and OoO baseline processors at the same area and nearly the same performance as the OoO. This makes CG-OoO 1.8$ \times $ more efficient than the OoO on the energy-delay product inverse metric. CG-OoO meets the OoO nominal performance while trading off the peak scheduling performance for superior energy efficiency.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Swami:2017:EEC, author = "Shivam Swami and Poovaiah M. Palangappa and Kartik Mohanram", title = "{ECS}: Error-Correcting Strings for Lifetime Improvements in Nonvolatile Memories", journal = j-TACO, volume = "14", number = "4", pages = "40:1--40:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3151083", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Emerging nonvolatile memories (NVMs) suffer from low write endurance, resulting in early cell failures (hard errors), which reduce memory lifetime. It was recognized early on that conventional error-correcting codes (ECCs), which are designed for soft errors, are a poor choice for addressing hard errors in NVMs. This led to the evolution of hard error correction schemes like dynamically replicated memory (DRM), error-correcting pointers (ECPs), SAFER, FREE-p, PAYG, and Zombie memory to improve NVM lifetime. Whereas these approaches made significant inroads in addressing hard errors and low memory lifetime in NVMs, overcoming the challenges of underutilization of error-correcting resources and/or implementation overhead (e.g., codec latency, hardware support) remain areas of active research and development. This article proposes error-correcting strings (ECSs) as a high-utilization, low-latency solution for hard error correction in single-/multi-/triple-level cell (SLC/MLC/TLC) NVMs. At its core, ECS adopts a base-offset approach to store pointers to the failed memory cells; in this work, base is the address of the first failed cell in a memory block and offsets are the distances between successive failed cells in that memory block. 
Unlike ECP, which uses fixed-length pointers, ECS uses variable-length offsets to point to the failed cells, thereby realizing more pointers to tolerate more hard errors per memory block. Further, this article proposes eXtended-ECS (XECS), a page-level error correction architecture, which employs dynamic on-demand ECS allocation and opportunistic pattern-based data compression to improve NVM lifetime by 2$ \times $ over ECP-6 for comparable overhead and negligible impact to system performance. Finally, this article demonstrates that ECS is a drop-in replacement for ECP to extend the lifetime of state-of-the-art ECP-based techniques like PAYG and Zombie memory; ECS is also compatible with MLC/TLC NVMs, where it complements drift-induced soft error reduction techniques like ECC and incomplete data mapping to simultaneously extend NVM lifetime.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Azhar:2017:SQS, author = "M. Waqar Azhar and Per Stenstr{\"o}m and Vassilis Papaefstathiou", title = "{SLOOP}: {QoS}-Supervised Loop Execution to Reduce Energy on Heterogeneous Architectures", journal = j-TACO, volume = "14", number = "4", pages = "41:1--41:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3148053", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Most systems allocate computational resources to each executing task without any actual knowledge of the application's Quality-of-Service (QoS) requirements. Such best-effort policies lead to overprovisioning of the resources and increase energy loss. 
This work assumes applications with soft QoS requirements and exploits the inherent timing slack to minimize the allocated computational resources to reduce energy consumption. We propose a lightweight progress-tracking methodology based on the outer loops of application kernels. It builds on online history and uses it to estimate the total execution time. The prediction of the execution time and the QoS requirements are then used to schedule the application on a heterogeneous architecture with big out-of-order cores and small (LITTLE) in-order cores and select the minimum operating frequency, using DVFS, that meets the deadline. Our scheme is effective in exploiting the timing slack of each application. We show that it can reduce the energy consumption by more than 20\% without missing any computational deadlines.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kanakagiri:2017:MMD, author = "Raghavendra Kanakagiri and Biswabandan Panda and Madhu Mutyam", title = "{MBZip}: Multiblock Data Compression", journal = j-TACO, volume = "14", number = "4", pages = "42:1--42:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3151033", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compression techniques at the last-level cache and the DRAM play an important role in improving system performance by increasing their effective capacities. A compressed block in DRAM also reduces the transfer time over the memory bus to the caches, reducing the latency of a LLC cache miss. Usually, compression is achieved by exploiting data patterns present within a block. 
But applications can exhibit data locality that spread across multiple consecutive data blocks. We observe that there is significant opportunity available for compressing multiple consecutive data blocks into one single block, both at the LLC and DRAM. Our studies using 21 SPEC CPU applications show that, at the LLC, around 25\% (on average) of the cache blocks can be compressed into one single cache block when grouped together in groups of 2 to 8 blocks. In DRAM, more than 30\% of the columns residing in a single DRAM page can be compressed into one DRAM column, when grouped together in groups of 2 to 6. Motivated by these observations, we propose a mechanism, namely, MBZip, that compresses multiple data blocks into one single block (called a zipped block), both at the LLC and DRAM. At the cache, MBZip includes a simple tag structure to index into these zipped cache blocks and the indexing does not incur any redirectional delay. At the DRAM, MBZip does not need any changes to the address computation logic and works seamlessly with the conventional/existing logic. MBZip is a synergistic mechanism that coordinates these zipped blocks at the LLC and DRAM. Further, we also explore silent writes at the DRAM and show that certain writes need not access the memory when blocks are zipped. MBZip improves the system performance by 21.9\%, with a maximum of 90.3\% on a 4-core system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Neill:2017:FAM, author = "Richard Neill and Andi Drebes and Antoniu Pop", title = "{Fuse}: Accurate Multiplexing of Hardware Performance Counters Across Executions", journal = j-TACO, volume = "14", number = "4", pages = "43:1--43:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3148054", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Collecting hardware event counts is essential to understanding program execution behavior. Contemporary systems offer few Performance Monitoring Counters (PMCs), thus only a small fraction of hardware events can be monitored simultaneously. We present new techniques to acquire counts for all available hardware events with high accuracy by multiplexing PMCs across multiple executions of the same program, then carefully reconciling and merging the multiple profiles into a single, coherent profile. We present a new metric for assessing the similarity of statistical distributions of event counts and show that our execution profiling approach performs significantly better than Hardware Event Multiplexing.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sardashti:2017:CCG, author = "Somayeh Sardashti and David A. Wood", title = "Could Compression Be of General Use? 
{Evaluating} Memory Compression across Domains", journal = j-TACO, volume = "14", number = "4", pages = "44:1--44:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3138805", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent proposals present compression as a cost-effective technique to increase cache and memory capacity and bandwidth. While these proposals show potentials of compression, there are several open questions to adopt these proposals in real systems including the following: (1) Do these techniques work for real-world workloads running for long time? (2) Which application domains would potentially benefit the most from compression? (3) At which level of memory hierarchy should we apply compression: caches, main memory, or both? In this article, our goal is to shed light on some main questions on applicability of compression. We evaluate compression in the memory hierarchy for selected examples from different application classes. We analyze real applications with real data and complete runs of several benchmarks. While simulators provide a pretty accurate framework to study potential performance/energy impacts of ideas, they mostly limit us to a small range of workloads with short runtimes. To enable studying real workloads, we introduce a fast and simple methodology to get samples of memory and cache contents of a real machine (a desktop or a server). Compared to a cycle-accurate simulator, our methodology allows us to study real workloads as well as benchmarks. Our toolset is not a replacement for simulators but mostly complements them. 
While we can use a simulator to measure performance/energy impact of a particular compression proposal, here with our methodology we can study the potentials with long running workloads in early stages of the design. Using our toolset, we evaluate a collection of workloads from different domains, such as a web server of CS department of UW-Madison for 24h, Google Chrome (watching a 1h-long movie on YouTube), and Linux games (playing for about an hour). We also use several benchmarks from different domains, including SPEC, mobile, and big data. We run these benchmarks to completion. Using these workloads and our toolset, we analyze different compression properties for both real applications and benchmarks. We focus on eight main hypotheses on compression, derived from previous work on compression. These properties (Table 2) act as foundation of several proposals on compression, so performance of those proposals depends very much on these basic properties. Overall, our results suggest that compression could be of general use both in main memory and caches. On average, the compression ratio is {$\geq 2$} for 64\% and 54\% of workloads, respectively, for memory and cache data. Our evaluation indicates significant potential for both cache and memory compression, with higher compressibility in memory due to abundance of zero blocks. Among application domains we studied, servers show on average the highest compressibility, while our mobile benchmarks show the lowest compressibility. For comparing benchmarks with real workloads, we show that (1) it is critical to run benchmarks to completion or considerably long runtimes to avoid biased conclusions, and (2) SPEC benchmarks are good representative of real Desktop applications in terms of compressibility of their datasets. However, this does not hold for all compression properties. For example, SPEC benchmarks have much better compression locality (i.e., neighboring blocks have similar compressibility) than real workloads.
Thus, it is critical for designers to consider wider range of workloads, including real applications, to evaluate their compression techniques.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Huang:2017:IEG, author = "Libo Huang and Yashuai L{\"u} and Li Shen and Zhiying Wang", title = "Improving the Efficiency of {GPGPU} Work-Queue Through Data Awareness", journal = j-TACO, volume = "14", number = "4", pages = "45:1--45:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3151035", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The architecture and programming model of current GPGPUs are best suited for applications that are dominated by structured control and data flows across large regular datasets. Parallel workloads with irregular control and data structures cannot easily harness the processing power of the GPGPU. One approach for mapping these irregular-parallel workloads to GPGPUs is using work-queues. The work-queue approach improves the utilization of SIMD units by only processing useful works that are dynamically generated during execution. As current GPGPUs lack necessary supports for work-queues, a software-based work-queue implementation often suffers from memory contention and load balancing issues. In this article, we present a novel hardware work-queue design named DaQueue, which incorporates three data-aware features to improve the efficiency of work-queues on GPGPUs. We evaluate our proposal on the irregular-parallel workloads and carry out a case study on a path tracing pipeline with a cycle-level simulator. 
Experimental results show that for the tested workloads, DaQueue improves performance by 1.53$ \times $ on average and up to 1.91$ \times $. Compared to a hardware worklist approach that is the state-of-the-art prior work, DaQueue can achieve an average of 33.92\% extra speedup with less hardware area cost.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Angerd:2017:FAC, author = "Alexandra Angerd and Erik Sintorn and Per Stenstr{\"o}m", title = "A Framework for Automated and Controlled Floating-Point Accuracy Reduction in Graphics Applications on {GPUs}", journal = j-TACO, volume = "14", number = "4", pages = "46:1--46:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3151032", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Reducing the precision of floating-point values can improve performance and/or reduce energy expenditure in computer graphics, among other, applications. However, reducing the precision level of floating-point values in a controlled fashion needs support both at the compiler and at the microarchitecture level. At the compiler level, a method is needed to automate the reduction of precision of each floating-point value. At the microarchitecture level, a lower precision of each floating-point register can allow more floating-point values to be packed into a register file. This, however, calls for new register file organizations. This article proposes an automated precision-selection method and a novel GPU register file organization that can store floating-point register values at arbitrary precisions densely. 
The automated precision-selection method uses a data-driven approach for setting the precision level of floating-point values, given a quality threshold and a representative set of input data. By allowing a small, but acceptable, degradation in output quality, our method can remove a significant amount of the bits needed to represent floating-point values in the investigated kernels (between 28\% and 60\%). Our proposed register file organization exploits these lower-precision floating-point values by packing several of them into the same physical register. This reduces the register pressure per thread by up to 48\%, and by 27\% on average, for a negligible output-quality degradation. This can enable GPUs to keep up to twice as many threads in flight simultaneously.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Arteaga:2017:GFG, author = "Jaime Arteaga and St{\'e}phane Zuckerman and Guang R. Gao", title = "Generating Fine-Grain Multithreaded Applications Using a Multigrain Approach", journal = j-TACO, volume = "14", number = "4", pages = "47:1--47:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3155288", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The recent evolution in hardware landscape, aimed at producing high-performance computing systems capable of reaching extreme-scale performance, has reignited the interest in fine-grain multithreading, particularly at the intranode level. 
Indeed, popular parallel programming environments, such as OpenMP, which features a simple interface for the parallelization of programs, are now incorporating fine-grain constructs. However, since coarse-grain directives are still heavily used, the OpenMP runtime is forced to support both coarse- and fine-grain models of execution, potentially reducing the advantages obtained when executing an application in a fully fine-grain environment. To evaluate the type of applications that benefit from executing in a unified fine-grain program execution model, this article presents a multigrain parallel programming environment for the generation of fine-grain multithreaded applications from programs featuring OpenMP's API, allowing OpenMP programs to be run on top of a fine-grain event-driven program execution model. Experimental results with five scientific benchmarks show that fine-grain applications, generated by and run on our environment with two runtimes implementing a fine-grain event-driven program execution model, are competitive and can outperform their OpenMP counterparts, especially for data-intensive workloads with irregular and dynamic parallelism, reaching speedups as high as 2.6$ \times $ for Graph500 and 51$ \times $ for NAS Data Cube.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hadidi:2017:CCA, author = "Ramyad Hadidi and Lifeng Nai and Hyojong Kim and Hyesoon Kim", title = "{CAIRO}: a Compiler-Assisted Technique for Enabling Instruction-Level Offloading of Processing-In-Memory", journal = j-TACO, volume = "14", number = "4", pages = "48:1--48:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3155287", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Three-dimensional (3D)-stacking technology and the memory-wall problem have popularized processing-in-memory (PIM) concepts again, which offers the benefits of bandwidth and energy savings by offloading computations to functional units inside the memory. Several memory vendors have also started to integrate computation logics into the memory, such as Hybrid Memory Cube (HMC), the latest version of which supports up to 18 in-memory atomic instructions. Although industry prototypes have motivated studies for investigating efficient methods and architectures for PIM, researchers have not proposed a systematic way for identifying the benefits of instruction-level PIM offloading. As a result, compiler support for recognizing offloading candidates and utilizing instruction-level PIM offloading is unavailable. In this article, we analyze the advantages of instruction-level PIM offloading in the context of HMC-atomic instructions for graph-computing applications and propose CAIRO, a compiler-assisted technique and decision model for enabling instruction-level offloading of PIM without any burden on programmers. To develop CAIRO, we analyzed how instruction offloading enables performance gain in both CPU and GPU workloads. 
Our studies show that performance gain from bandwidth savings, the ratio of number of cache misses to total cache accesses, and the overhead of host atomic instructions are the key factors in selecting an offloading candidate. Based on our analytical models, we characterize the properties of beneficial and nonbeneficial candidates for offloading. We evaluate CAIRO with 27 multithreaded CPU and 36 GPU benchmarks. In our evaluation, CAIRO not only doubles the speedup for a set of PIM-beneficial workloads by exploiting HMC-atomic instructions but also prevents slowdown caused by incorrect offloading decisions for other workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lim:2017:TEP, author = "Hongyeol Lim and Giho Park", title = "{Triple Engine Processor (TEP)}: a Heterogeneous Near-Memory Processor for Diverse Kernel Operations", journal = j-TACO, volume = "14", number = "4", pages = "49:1--49:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3155920", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The advent of 3D memory stacking technology, which integrates a logic layer and stacked memories, is expected to be one of the most promising memory technologies to mitigate the memory wall problem by leveraging the concept of near-memory processing (NMP). With the ability to process data locally within the logic layer of stacked memory, a variety of emerging big data applications can achieve significant performance and energy-efficiency benefits. Various approaches to the NMP logic layer architecture have been studied to utilize the advantage of stacked memory. 
While significant acceleration of specific kernel operations has been derived from previous NMP studies, an NMP-based system using an NMP logic architecture capable of handling some specific kernel operations can suffer from performance and energy efficiency degradation caused by a significant communication overhead between the host processor and NMP stack. In this article, we first analyze the kernel operations that can greatly improve the performance of NMP-based systems in diverse emerging applications, and then we analyze the architecture to efficiently process the extracted kernel operations. This analysis confirms that three categories of processing engines for NMP logic are required for efficient processing of a variety of emerging applications, and thus we propose a Triple Engine Processor (TEP), a heterogeneous near-memory processor with three types of computing engines. These three types of engines are an in-order core, a coarse-grain reconfigurable processor (CGRA), and dedicated hardware. The proposed TEP provides about 3.4 times higher performance and 33\% greater energy savings than the baseline 3D memory system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Patsilaras:2017:RRD, author = "George Patsilaras and James Tuck", title = "{ReDirect}: Reconfigurable Directories for Multicore Architectures", journal = j-TACO, volume = "14", number = "4", pages = "50:1--50:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3162015", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As we enter the dark silicon era, architects should not envision designs in which every transistor remains turned on permanently but rather ones in which portions of the chip are judiciously turned on/off depending on the characteristics of a workload. At the same time, due to the increasing cost per transistor, architects should also consider new ways to re-purpose transistors to increase their architectural value. In this work, we consider the design of directory-based cache coherence in light of the dark silicon era and the need to re-purpose transistors. We point out that directories are not needed all of the time, and we argue that directories (and coherence) should be off unless it is actually needed for correctness. In our design, directories will be disabled and powered off for workloads with no sharing. Then only when parallel workloads need cache coherence will directories be enabled in proportion to the sharing that is present. At the same time, we exploit the structural similarities of directories and cache. If a directory is idle, then we reconfigure it to be used as extra capacity in the last-level cache. 
Since our novel approach can keep most directories off, we are free to select sparse overprovisioned directory designs that are reconfigurable to large amounts of cache that can significantly boost performance when the directory is idle. We call these combined features Reconfigured Dark Directories, since directories are usually dark (off) and can be reconfigured. Our results for Reconfigurable Dark Directories running SPEC 2006 applications show a performance benefit, on average, of 17\% for an 8$ \times $ overprovisioned fully mapped directory on a 64-tile system under low system concurrency (10\% under heavy concurrency), or a 29\% average speedup for a 2$ \times $ overprovisioned directory on 256-tile system (10\% under heavy concurrency) to systems with a conventional sparse directory design using the same overprovisioning factor.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Patil:2017:HHA, author = "Adarsh Patil and Ramaswamy Govindarajan", title = "{HAShCache}: Heterogeneity-Aware Shared {DRAMCache} for Integrated Heterogeneous Systems", journal = j-TACO, volume = "14", number = "4", pages = "51:1--51:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3158641", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Integrated Heterogeneous System (IHS) processors pack throughput-oriented General-Purpose Graphics Processing Units (GPGPUs) alongside latency-oriented Central Processing Units (CPUs) on the same die sharing certain resources, e.g., shared last-level cache, Network-on-Chip (NoC), and the main memory. 
The demands for memory accesses and other shared resources from GPU cores can exceed that of CPU cores by two to three orders of magnitude. This disparity poses significant problems in exploiting the full potential of these architectures. In this article, we propose adding a large-capacity stacked DRAM, used as a shared last-level cache, for the IHS processors. However, adding the DRAMCache naively, leaves significant performance on the table due to the disparate demands from CPU and GPU cores for DRAMCache and memory accesses. In particular, the imbalance can significantly reduce the performance benefits that the CPU cores would have otherwise enjoyed with the introduction of the DRAMCache, necessitating a heterogeneity-aware management of this shared resource for improved performance. In this article, we propose three simple techniques to enhance the performance of CPU application while ensuring very little to no performance impact to the GPU. Specifically, we propose (i) PrIS, a prioritization scheme for scheduling CPU requests at the DRAMCache controller; (ii) ByE, a selective and temporal bypassing scheme for CPU requests at the DRAMCache; and (iii) Chaining, an occupancy controlling mechanism for GPU lines in the DRAMCache through pseudo-associativity. The resulting cache, Heterogeneity-Aware Shared DRAMCache (HAShCache), is heterogeneity-aware and can adapt dynamically to address the inherent disparity of demands in an IHS architecture. Experimental evaluation of the proposed HAShCache results in an average system performance improvement of 41\% over a naive DRAMCache and over 200\% improvement over a baseline system with no stacked DRAMCache.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Alias:2017:OAC, author = "Christophe Alias and Alexandru Plesco", title = "Optimizing Affine Control With Semantic Factorizations", journal = j-TACO, volume = "14", number = "4", pages = "52:1--52:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3162017", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Hardware accelerators generated by polyhedral synthesis techniques make extensive use of affine expressions (affine functions and convex polyhedra) in control and steering logic. Since the control is pipelined, these affine objects must be evaluated at the same time for different values, which forbids aggressive reuse of operators. In this article, we propose a method to factorize a collection of affine expressions without preventing pipelining. Our key contributions are (i) to use semantic factorizations exploiting arithmetic properties of addition and multiplication and (ii) to rely on a cost function whose minimization ensures correct usage of FPGA resources. Our algorithm is totally parameterized by the cost function, which can be customized to fit a target FPGA. Experimental results on a large pool of linear algebra kernels show a significant improvement compared to traditional low-level RTL optimizations. In particular, we show how our method reduces resource consumption by revealing hidden strength reductions.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Matheou:2017:DDC, author = "George Matheou and Paraskevas Evripidou", title = "Data-Driven Concurrency for High Performance Computing", journal = j-TACO, volume = "14", number = "4", pages = "53:1--53:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3162014", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this work, we utilize dynamic dataflow/data-driven techniques to improve the performance of high performance computing (HPC) systems. The proposed techniques are implemented and evaluated through an efficient, portable, and robust programming framework that enables data-driven concurrency on HPC systems. The proposed framework is based on data-driven multithreading (DDM), a hybrid control-flow/dataflow model that schedules threads based on data availability on sequential processors. The proposed framework was evaluated using several benchmarks, with different characteristics, on two different systems: a 4-node AMD system with a total of 128 cores and a 64-node Intel HPC system with a total of 768 cores. The performance evaluation shows that the proposed framework scales well and tolerates scheduling overheads and memory latencies effectively. We also compare our framework to MPI, DDM-VM, and OmpSs@Cluster. The comparison results show that the proposed framework obtains comparable or better performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Georgakoudis:2017:SSA, author = "Giorgis Georgakoudis and Hans Vandierendonck and Peter Thoman and Bronis R. {De Supinski} and Thomas Fahringer and Dimitrios S. Nikolopoulos", title = "{SCALO}: Scalability-Aware Parallelism Orchestration for Multi-Threaded Workloads", journal = j-TACO, volume = "14", number = "4", pages = "54:1--54:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3158643", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Shared memory machines continue to increase in scale by adding more parallelism through additional cores and complex memory hierarchies. Often, executing multiple applications concurrently, dividing among them hardware threads, provides greater efficiency rather than executing a single application with large thread counts. However, contention for shared resources can limit the improvement of concurrent application execution: orchestrating the number of threads used by each application is essential. In this article, we contribute SCALO, a solution to orchestrate concurrent application execution to increase throughput. SCALO monitors co-executing applications at runtime to evaluate their scalability. Its optimizing thread allocator analyzes these scalability estimates to adapt the parallelism of each program. Unlike previous approaches, SCALO differs by including dynamic contention effects on scalability and by controlling the parallelism during the execution of parallel regions. Thus, it improves throughput when other state-of-the-art approaches fail and outperforms them by up to 40\% when they succeed.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Baroudi:2017:OTB, author = "Toufik Baroudi and Rachid Seghir and Vincent Loechner", title = "Optimization of Triangular and Banded Matrix Operations Using $2$d-Packed Layouts", journal = j-TACO, volume = "14", number = "4", pages = "55:1--55:??", month = dec, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3162016", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Dec 22 18:25:55 MST 2017", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Over the past few years, multicore systems have become increasingly powerful and thereby very useful in high-performance computing. However, many applications, such as some linear algebra algorithms, still cannot take full advantage of these systems. This is mainly due to the shortage of optimization techniques dealing with irregular control structures. In particular, the well-known polyhedral model fails to optimize loop nests whose bounds and/or array references are not affine functions. This is more likely to occur when handling sparse matrices in their packed formats. In this article, we propose using 2d-packed layouts and simple affine transformations to enable optimization of triangular and banded matrix operations. The benefit of our proposal is shown through an experimental study over a set of linear algebra benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2018:IEE, author = "Hochan Lee and Mansureh S.
Moghaddam and Dongkwan Suh and Bernhard Egger", title = "Improving Energy Efficiency of Coarse-Grain Reconfigurable Arrays Through Modulo Schedule Compression\slash Decompression", journal = j-TACO, volume = "15", number = "1", pages = "1:1--1:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3162018", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modulo-scheduled coarse-grain reconfigurable array (CGRA) processors excel at exploiting loop-level parallelism at a high performance per watt ratio. The frequent reconfiguration of the array, however, causes between 25\% and 45\% of the consumed chip energy to be spent on the instruction memory and fetches therefrom. This article presents a hardware/software codesign methodology for such architectures that is able to reduce both the size required to store the modulo-scheduled loops and the energy consumed by the instruction decode logic. The hardware modifications improve the spatial organization of a CGRA's execution plan by reorganizing the configuration memory into separate partitions based on a statistical analysis of code. A compiler technique optimizes the generated code in the temporal dimension by minimizing the number of signal changes. The optimizations achieve, on average, a reduction in code size of more than 63\% and in energy consumed by the instruction decode logic by 70\% for a wide variety of application domains. Decompression of the compressed loops can be performed in hardware with no additional latency, rendering the presented method ideal for low-power CGRAs running at high frequencies.
The presented technique is orthogonal to dictionary-based compression schemes and can be combined to achieve a further reduction in code size.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sangaiah:2018:SSA, author = "Karthik Sangaiah and Michael Lui and Radhika Jagtap and Stephan Diestelhorst and Siddharth Nilakantan and Ankit More and Baris Taskin and Mark Hempstead", title = "{SynchroTrace}: Synchronization-Aware Architecture-Agnostic Traces for Lightweight Multicore Simulation of {CMP} and {HPC} Workloads", journal = j-TACO, volume = "15", number = "1", pages = "2:1--2:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3158642", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Trace-driven simulation of chip multiprocessor (CMP) systems offers many advantages over execution-driven simulation, such as reducing simulation time and complexity, allowing portability, and scalability. However, trace-based simulation approaches have difficulty capturing and accurately replaying multithreaded traces due to the inherent nondeterminism in the execution of multithreaded programs. In this work, we present SynchroTrace, a scalable, flexible, and accurate trace-based multithreaded simulation methodology. By recording synchronization events relevant to modern threading libraries (e.g., Pthreads and OpenMP) and dependencies in the traces, independent of the host architecture, the methodology is able to accurately model the nondeterminism of multithreaded programs for different hardware platforms and threading paradigms. 
Through capturing high-level instruction categories, the SynchroTrace average CPI trace Replay timing model offers fast and accurate simulation of many-core in-order CMPs. We perform two case studies to validate the SynchroTrace simulation flow against the gem5 full-system simulator: (1) a constraint-based design space exploration with traditional CMP benchmarks and (2) a thread-scalability study with HPC-representative applications. The results from these case studies show that (1) our trace-based approach with trace filtering has a peak speedup of up to 18.7$ \times $ over simulation in gem5 full-system with an average of 9.6$ \times $ speedup, (2) SynchroTrace maintains the thread-scaling accuracy of gem5 and can efficiently scale up to 64 threads, and (3) SynchroTrace can trace in one platform and model any platform in early stages of design.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zheng:2018:ESG, author = "Long Zheng and Xiaofei Liao and Hai Jin", title = "Efficient and Scalable Graph Parallel Processing With Symbolic Execution", journal = j-TACO, volume = "15", number = "1", pages = "3:1--3:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3170434", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pagerank.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Existing graph processing essentially relies on the underlying iterative execution with synchronous (Sync) and/or asynchronous (Async) engine. Nevertheless, they both suffer from a wide class of inherent serialization arising from data interdependencies within a graph. 
In this article, we present SymGraph, a judicious graph engine with symbolic iteration that enables the parallelism of dependent computation on vertices. SymGraph allows using abstract symbolic value (instead of the concrete value) for the computation if the desired data is unavailable. To maximize the potential of symbolic iteration, we propose a chain of tailored sophisticated techniques, enabling SymGraph to scale out with a new milestone of efficiency for large-scale graph processing. We evaluate SymGraph in comparison to Sync, Async, and a hybrid of Sync and Async engines. Our results on 12 nodes show that SymGraph outperforms all three graph engines by 1.93$ \times $ (vs. Sync), 1.98$ \times $ (vs. Async), and 1.57$ \times $ (vs. Hybrid) on average. In particular, the performance for PageRank on 32 nodes can be dramatically improved by 16.5$ \times $ (vs. Sync), 23.3$ \times $ (vs. Async), and 12.1$ \times $ (vs. Hybrid), respectively. The efficiency of SymGraph is also validated with at least one order of magnitude improvement in contrast to three specialized graph systems (Naiad, GraphX, and PGX.D).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jo:2018:DSD, author = "Jae-Eon Jo and Gyu-Hyeon Lee and Hanhwi Jang and Jaewon Lee and Mohammadamin Ajdari and Jangwoo Kim", title = "{DiagSim}: Systematically Diagnosing Simulators for Healthy Simulations", journal = j-TACO, volume = "15", number = "1", pages = "4:1--4:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177959", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Simulators are the most popular and useful tool to study computer architecture and examine new ideas. 
However, modern simulators have become prohibitively complex (e.g., 200K+ lines of code) to fully understand and utilize. Users therefore end up analyzing and modifying only the modules of interest (e.g., branch predictor, register file) when performing simulations. Unfortunately, hidden details and inter-module interactions of simulators create discrepancies between the expected and actual module behaviors. Consequently, the effect of modifying the target module may be amplified or masked and the users get inaccurate insights from expensive simulations. In this article, we propose DiagSim, an efficient and systematic method to diagnose simulators. It ensures the target modules behave as expected to perform simulation in a healthy (i.e., accurate and correct) way. DiagSim is efficient in that it quickly pinpoints the modules showing discrepancies and guides the users to inspect the behavior without investigating the whole simulator. DiagSim is systematic in that it hierarchically tests the modules to guarantee the integrity of individual diagnosis and always provide reliable results. We construct DiagSim based on generic category-based diagnosis ideas to encourage easy expansion of the diagnosis. We diagnose three popular open source simulators and discover hidden details including implicitly reserved resources, un-documented latency factors, and hard-coded module parameter values. We observe that these factors have large performance impacts (up to 156\%) and illustrate that our diagnosis can correctly detect and eliminate them.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kondguli:2018:CME, author = "Sushant Kondguli and Michael Huang", title = "A Case for a More Effective, Power-Efficient Turbo Boosting", journal = j-TACO, volume = "15", number = "1", pages = "5:1--5:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3170433", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Single-thread performance and throughput often pose different design constraints and require compromises. Mainstream CPUs today incorporate a non-trivial number of cores, even for mobile devices. For power and thermal considerations, by default, a single core does not operate at the maximum performance level. When operating conditions allow, however, commercial products often rely on turbo boosting, which temporarily increases the clock frequency to increase single-thread performance. However, increasing clock speed may result in a poor performance return for invested energy. In this article, we make a case for a more effective boosting strategy, which invests energy in activities with the best estimated return. In addition to running faster clocks, we can also use a look-ahead thread to overlap the penalties of cache misses and branch mispredicts. Overall, for similar power consumptions, the proposed adaptive turbo boosting strategy can achieve about twice the performance benefits while halving the energy overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2018:ESE, author = "Kuan-Chung Chen and Chung-Ho Chen", title = "Enabling {SIMT} Execution Model on Homogeneous Multi-Core System", journal = j-TACO, volume = "15", number = "1", pages = "6:1--6:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177960", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Single-instruction multiple-thread (SIMT) machine emerges as a primary computing device in high-performance computing, since the SIMT execution paradigm can exploit data-level parallelism effectively. This article explores the SIMT execution potential on homogeneous multi-core processors, which generally run in multiple-instruction multiple-data (MIMD) mode when utilizing the multi-core resources. We address three architecture issues in enabling SIMT execution model on multi-core processor, including multithreading execution model, kernel thread context placement, and thread divergence. For the SIMT execution model, we propose a fine-grained multithreading mechanism on an ARM-based multi-core system. Each of the processor cores stores the kernel thread contexts in its L1 data cache for per-cycle thread-switching requirement. For divergence-intensive kernels, an Inner Conditional Statement First (ICS-First) mechanism helps early re-convergence to occur and significantly improves the performance. 
The experiment results show that effectiveness in data-parallel processing reduces on average 36\% dynamic instructions, and boosts the SIMT executions to achieve on average 1.52$ \times $ and up to 5$ \times $ speedups over the MIMD counterpart for OpenCL benchmarks for single issue in-order processor cores. By using the explicit vectorization optimization on the kernels, the SIMT model gains further benefits from the SIMD extension and achieves 1.71$ \times $ speedup over the MIMD approach. The SIMT model using in-order superscalar processor cores outperforms the MIMD model that uses superscalar out-of-order processor cores by 40\%. The results show that, to exploit data-level parallelism, enabling the SIMT model on homogeneous multi-core processors is important.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2018:SSM, author = "Mingzhe Zhang and King Tin Lam and Xin Yao and Cho-Li Wang", title = "{SIMPO}: a Scalable In-Memory Persistent Object Framework Using {NVRAM} for Reliable Big Data Computing", journal = j-TACO, volume = "15", number = "1", pages = "7:1--7:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3167972", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "While CPU architectures are incorporating many more cores to meet ever-bigger workloads, advance in fault-tolerance support is indispensable for sustaining system performance under reliability constraints. Emerging non-volatile memory technologies are yielding fast, dense, and energy-efficient NVRAM that can dethrone SSD drives for persisting data. Research on using NVRAM to enable fast in-memory data persistence is ongoing. 
In this work, we design and implement a persistent object framework, dubbed scalable in-memory persistent object (SIMPO), which exploits NVRAM, alongside DRAM, to support efficient object persistence in highly threaded big data applications. Based on operation logging, we propose a new programming model that classifies functions into instant and deferrable groups. SIMPO features a streamlined execution model, which allows lazy evaluation of deferrable functions and is well suited to big data computing workloads that would see improved data locality and concurrency. Our log recording and checkpointing scheme is effectively optimized towards NVRAM, mitigating its long write latency through write-combining and consolidated flushing techniques. Efficient persistent object management with features including safe references and memory leak prevention is also implemented and tailored to NVRAM. We evaluate a wide range of SIMPO-enabled applications with machine learning, high-performance computing, and database workloads on an emulated hybrid memory architecture and a real hybrid memory machine with NVDIMM. Compared with native applications without persistence, experimental results show that SIMPO incurs less than 5\% runtime overhead on both platforms and even gains up to 2.5$ \times $ speedup and 84\% increase in throughput in highly threaded situations on the two platforms, respectively, thanks to the streamlined execution model.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Deng:2018:EML, author = "Bobin Deng and Sriseshan Srikanth and Eric R. Hein and Thomas M. Conte and Erik DeBenedictis and Jeanine Cook and Michael P. 
Frank", title = "Extending {Moore's Law} via Computationally Error-Tolerant Computing", journal = j-TACO, volume = "15", number = "1", pages = "8:1--8:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177837", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Dennard scaling has ended. Lowering the voltage supply (V$_{dd}$) to sub-volt levels causes intermittent losses in signal integrity, rendering further scaling (down) no longer acceptable as a means to lower the power required by a processor core. However, it is possible to correct the occasional errors caused due to lower V$_{dd}$ in an efficient manner and effectively lower power. By deploying the right amount and kind of redundancy, we can strike a balance between overhead incurred in achieving reliability and energy savings realized by permitting lower V$_{dd}$. One promising approach is the Redundant Residue Number System (RRNS) representation. Unlike other error correcting codes, RRNS has the important property of being closed under addition, subtraction and multiplication, thus enabling computational error correction at a fraction of an overhead compared to conventional approaches. We use the RRNS scheme to design a Computationally-Redundant, Energy-Efficient core, including the microarchitecture, Instruction Set Architecture (ISA) and RRNS centered algorithms. From the simulation results, this RRNS system can reduce the energy-delay-product by about 3$ \times $ for multiplication intensive workloads and by about 2$ \times $ in general, when compared to a non-error-correcting binary core.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dice:2018:IPH, author = "Dave Dice and Maurice Herlihy and Alex Kogan", title = "Improving Parallelism in Hardware Transactional Memory", journal = j-TACO, volume = "15", number = "1", pages = "9:1--9:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177962", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Today's hardware transactional memory (HTM) systems rely on existing coherence protocols, which implement a requester-wins strategy. This, in turn, leads to poor performance when transactions frequently conflict, causing them to resort to a non-speculative fallback path. Often, such a path severely limits parallelism. In this article, we propose very simple architectural changes to the existing requester-wins HTM implementations that enhance conflict resolution between hardware transactions and thus improve their parallelism. Our idea is compatible with existing HTM systems, requires no changes to target applications that employ traditional lock synchronization, and is shown to provide robust performance benefits.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kim:2018:BEE, author = "Namhyung Kim and Junwhan Ahn and Kiyoung Choi and Daniel Sanchez and Donghoon Yoo and Soojung Ryu", title = "{Benzene}: an Energy-Efficient Distributed Hybrid Cache Architecture for Manycore Systems", journal = j-TACO, volume = "15", number = "1", pages = "10:1--10:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177963", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article proposes Benzene, an energy-efficient distributed SRAM/STT-RAM hybrid cache for manycore systems running multiple applications. It is based on the observation that a na{\"\i}ve application of hybrid cache techniques to distributed caches in a manycore architecture suffers from limited energy reduction due to uneven utilization of scarce SRAM. We propose two-level optimization techniques: intra-bank and inter-bank. Intra-bank optimization leverages highly associative cache design, achieving more uniform distribution of writes within a bank. Inter-bank optimization evenly balances the amount of write-intensive data across the banks. Our evaluation results show that Benzene significantly reduces energy consumption of distributed hybrid caches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ao:2018:POH, author = "Yulong Ao and Chao Yang and Fangfang Liu and Wanwang Yin and Lijuan Jiang and Qiao Sun", title = "Performance Optimization of the {HPCG} Benchmark on the {Sunway TaihuLight Supercomputer}", journal = j-TACO, volume = "15", number = "1", pages = "11:1--11:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3182177", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/super.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we present some key techniques for optimizing HPCG on Sunway TaihuLight and demonstrate how to achieve high performance in memory-bound applications by exploiting specific characteristics of the hardware architecture. In particular, we utilize a block multicoloring approach for parallelization and propose methods such as requirement-based data mapping and customized gather collective to enhance the effective memory bandwidth. Experiments indicate that the optimized HPCG code can sustain 77\% of the theoretical memory bandwidth and scale to the full system of more than 10 million cores, with an aggregated performance of 480.8 Tflop/s and a weak scaling efficiency of 87.3\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rashidi:2018:IMP, author = "Saeed Rashidi and Majid Jalili and Hamid Sarbazi-Azad", title = "Improving {MLC PCM} Performance through Relaxed Write and Read for Intermediate Resistance Levels", journal = j-TACO, volume = "15", number = "1", pages = "12:1--12:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177965", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Phase Change Memory (PCM) is one of the most promising candidates to be used at the main memory level of the memory hierarchy due to poor scalability, considerable leakage power, and high cost/bit of DRAM. PCM is a new resistive memory that is capable of storing data based on resistance values. The wide resistance range of PCM allows for storing multiple bits per cell (MLC) rather than a single bit per cell (SLC). Unfortunately, higher density of MLC PCM comes at the expense of longer read/write latency, higher soft error rate, higher energy consumption, and earlier wearout compared to the SLC PCM. Some studies suggest removing the most error-prone level to mitigate soft error and write latency of MLC PCM, hence introducing a less dense memory called Tri-Level memory. Another scheme, called M-Metric, proposes a new read metric to address the soft error problem in MLC PCM. In order to deal with the limited lifetime of PCM, some extra storage per memory line is required to correct permanent hard errors (stuck-at faults). Since the extra storage is used only when permanent faults occur, it has a low utilization for a long time before hard errors start to occur. 
In this article, we utilize the extra storage to improve the read/write latency in a 2-bit MLC PCM using a relaxation scheme for reading and writing the cells for intermediate resistance levels. More specifically, we combine the most time-consuming levels (intermediate resistance levels) to reduce the number of resistance levels (making a Tri-Level PCM) and therefore improve write latency. We then store some error correction metadata in the extra storage section to successfully retrieve the exact data values in the read operation. We also modify the Tri-Level PCM cell to increase its read latency when the M-Metric scheme is used. Evaluation results show that the proposed scheme improves read latency by 57.2\%, write latency by 56.1\%, and overall system performance (IPC) by 26.9\% over the baseline. It is noteworthy that combining the proposed scheme and FPC compression method improves read latency by 75.2\%, write latency by 67\%, and overall system performance (IPC) by 37.4\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2018:OCN, author = "Wenlai Zhao and Haohuan Fu and Jiarui Fang and Weijie Zheng and Lin Gan and Guangwen Yang", title = "Optimizing Convolutional Neural Networks on the {Sunway TaihuLight Supercomputer}", journal = j-TACO, volume = "15", number = "1", pages = "13:1--13:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177885", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/super.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The Sunway TaihuLight supercomputer is powered by SW26010, a new 260-core processor designed with on-chip fusion of heterogeneous cores. 
In this article, we present our work on optimizing the training process of convolutional neural networks (CNNs) on the Sunway TaihuLight supercomputer. Specifically, a highly efficient library (swDNN) and a customized Caffe framework (swCaffe) are proposed. Architecture-oriented optimization methods targeting the many-core architecture of SW26010 are introduced and are able to achieve 48$ \times $ speedup for the convolution routine in swDNN and 4$ \times $ speedup for the complete training process of the VGG-16 network using swCaffe, compared to the unoptimized algorithm and framework. Compared to the cuDNN library and the Caffe framework based on the NVIDIA K40m GPU, the proposed swDNN library and swCaffe framework on SW26010 have nearly half the performance of K40m in single-precision and have 3.6$ \times $ and 1.8$ \times $ speedup over K40m in double precision, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mbakoyiannis:2018:EPC, author = "Dimitrios Mbakoyiannis and Othon Tomoutzoglou and George Kornaros", title = "Energy-Performance Considerations for Data Offloading to {FPGA}-Based Accelerators Over {PCIe}", journal = j-TACO, volume = "15", number = "1", pages = "14:1--14:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3180263", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern data centers increasingly employ FPGA-based heterogeneous acceleration platforms as a result of their great potential for continued performance and energy efficiency. 
Today, FPGAs provide more hardware parallelism than is possible with GPUs or CPUs, whereas C-like programming environments facilitate shorter development time, even close to software cycles. In this work, we address limitations and overheads in access and transfer of data to accelerators over common CPU-accelerator interconnects such as PCIe. We present three different FPGA accelerator dispatching methods for streaming applications (e.g., multimedia, vision computing). The first uses zero-copy data transfers and on-chip scratchpad memory (SPM) for energy efficiency, and the second uses also zero-copy but shared copy engines among different accelerator instances and local external memory. The third uses the processor's memory management unit to acquire the physical address of user pages and uses scatter-gather data transfers with SPM. Even though all techniques exhibit advantages in terms of scalability and relieve the processor from control overheads through using integrated schedulers, the first method presents the best energy-efficient acceleration in streaming applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lin:2018:GPV, author = "Zhen Lin and Michael Mantor and Huiyang Zhou", title = "{GPU} Performance vs. Thread-Level Parallelism: Scalability Analysis and a Novel Way to Improve {TLP}", journal = j-TACO, volume = "15", number = "1", pages = "15:1--15:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177964", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graphics Processing Units (GPUs) leverage massive thread-level parallelism (TLP) to achieve high computation throughput and hide long memory latency. 
However, recent studies have shown that the GPU performance does not scale with the GPU occupancy or the degrees of TLP that a GPU supports, especially for memory-intensive workloads. The current understanding points to L1 D-cache contention or off-chip memory bandwidth. In this article, we perform a novel scalability analysis from the perspective of throughput utilization of various GPU components, including off-chip DRAM, multiple levels of caches, and the interconnect between L1 D-caches and L2 partitions. We show that the interconnect bandwidth is a critical bound for GPU performance scalability. For the applications that do not have saturated throughput utilization on a particular resource, their performance scales well with increased TLP. To improve TLP for such applications efficiently, we propose a fast context switching approach. When a warp/thread block (TB) is stalled by a long latency operation, the context of the warp/TB is spilled to spare on-chip resource so that a new warp/TB can be launched. The switched-out warp/TB is switched back when another warp/TB is completed or switched out. With this fine-grain fast context switching, higher TLP can be supported without increasing the sizes of critical resources like the register file. Our experiment shows that the performance can be improved by up to 47\% and a geometric mean of 22\% for a set of applications with unsaturated throughput utilization. Compared to the state-of-the-art TLP improvement scheme, our proposed scheme achieves 12\% higher performance on average and 16\% for unsaturated benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zinenko:2018:VPM, author = "Oleksandr Zinenko and St{\'e}phane Huot and C{\'e}dric Bastoul", title = "Visual Program Manipulation in the Polyhedral Model", journal = j-TACO, volume = "15", number = "1", pages = "16:1--16:??", month = apr, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3177961", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Parallelism is one of the key performance sources in modern computer systems. When heuristics-based automatic parallelization fails to improve performance, a cumbersome and error-prone manual transformation is often required. As a solution, we propose an interactive visual approach building on the polyhedral model that visualizes exact dependencies and parallelism; decomposes and replays a complex automatically computed transformation step by step; and allows for directly manipulating the visual representation as a means of transforming the program with immediate feedback. User studies suggest that our visualization is understood by experts and nonexperts alike, and that it may favor an exploratory approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shihab:2018:RFD, author = "Mustafa M. 
Shihab and Jie Zhang and Myoungsoo Jung and Mahmut Kandemir", title = "{ReveNAND}: a Fast-Drift-Aware Resilient {$3$D} {NAND} Flash Design", journal = j-TACO, volume = "15", number = "2", pages = "17:1--17:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3184744", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The paradigm shift from planar (two dimensional (2D)) to vertical (three-dimensional (3D)) models has placed the NAND flash technology on the verge of a design evolution that can handle the demands of next-generation storage applications. However, it also introduces challenges that may obstruct the realization of such 3D NAND flash. Specifically, we observed that the fast threshold drift (fast-drift) in a charge-trap flash-based 3D NAND cell can make it lose a critical fraction of the stored charge relatively soon after programming and generate errors. In this work, we first present an elastic read reference (V$_{Ref}$) scheme (ERR) for reducing such errors in ReveNAND-our fast-drift aware 3D NAND design. To address the inherent limitation of the adaptive V$_{Ref}$, we introduce a new intra-block page organization (hitch-hike) that can enable stronger error correction for the error-prone pages. In addition, we propose a novel reinforcement-learning-based smart data refill scheme (iRefill) to counter the impact of fast-drift with minimum performance and hardware overhead. Finally, we present the first analytic model to characterize fast-drift and evaluate its system-level impact. Our results show that, compared to conventional 3D NAND design, our ReveNAND can reduce fast-drift errors by 87\%, on average, and can lower the ECC latency and energy overheads by 13$ \times $ and 10$ \times $, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zahedi:2018:MHD, author = "Seyed Majid Zahedi and Songchun Fan and Benjamin C. Lee", title = "Managing Heterogeneous Datacenters with Tokens", journal = j-TACO, volume = "15", number = "2", pages = "18:1--18:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3191821", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Ensuring fairness in a system with scarce, preferred resources requires time sharing. We consider a heterogeneous system with a few ``big'' and many ``small'' processors. We allocate heterogeneous processors using a novel token mechanism, which frames the allocation problem as a repeated game. At each round, users request big processors and spend a token if their request is granted. We analyze the game and optimize users' strategies to produce an equilibrium. In equilibrium, allocations balance performance and fairness. Our mechanism outperforms classical, fair mechanisms by 1.7$ \times $, on average, in performance gains, and is competitive with a performance maximizing mechanism.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pericas:2018:EPA, author = "Miquel Peric{\`a}s", title = "{Elastic Places}: an Adaptive Resource Manager for Scalable and Portable Performance", journal = j-TACO, volume = "15", number = "2", pages = "19:1--19:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3185458", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The diversity and complexity of modern computing platforms makes the development of high-performance software challenging. Designing scalable software requires tuning for a large set of resources, including cores (parallelism), memory bandwidths, and various levels of private and shared caches, as well as developing strategies for optimizing locality. But highly optimized implementations are often inefficient when executed on a different platform. This is the performance portability problem. One approach to scalability and portability is to tune the amount of work per task based on runtime overheads and concurrency. This results in a better balance between parallelism and scheduling overheads, but it can neither tune data reuse nor avoid inter-task interference. We propose a complementary approach that consists in tuning the amount of resources allocated to tasks and combine it with software-defined task topologies to provide portable locality. These ideas are combined into a low-overhead resource management scheme called Elastic Places. Elastic Places is implemented in the XiTAO software framework but the core ideas are equally applicable to other languages and runtimes. 
Experimental results on an AMD-based NUMA machine and an Intel Knights Landing system show that elastic places provides both high scalability and performance portability, with speed-ups of up to 2.3$ \times $ on both platforms compared to state-of-the-art runtimes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Olson:2018:CLM, author = "Matthew Benjamin Olson and Joseph T. Teague and Divyani Rao and Michael R. Jantz and Kshitij A. Doshi and Prasad A. Kulkarni", title = "Cross-Layer Memory Management to Improve {DRAM} Energy Efficiency", journal = j-TACO, volume = "15", number = "2", pages = "20:1--20:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3196886", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Controlling the distribution and usage of memory power is often difficult, because these effects typically depend on activity across multiple layers of the vertical execution stack. To address this challenge, we construct a novel and collaborative framework that employs object placement, cross-layer communication, and page-level management to effectively distribute application objects in the DRAM hardware to achieve desired power/performance goals. This work describes the design and implementation of our framework, which is the first to integrate automatic object profiling and analysis at the application layer with fine-grained management of memory hardware resources in the operating system. We demonstrate the utility of this framework by employing it to control memory power consumption more effectively. 
First, we design a custom memory-intensive workload to show the potential of this approach to reduce DRAM energy consumption. Next, we develop sampling and profiling-based analyses and modify the code generator in the HotSpot VM to understand object usage patterns and automatically control the placement of hot and cold objects in a partitioned VM heap. This information is communicated to the operating system, which uses it to map the logical application pages to the appropriate DRAM modules according to user-defined provisioning goals. The evaluation shows that our Java VM-based framework achieves our goal of significant DRAM energy savings across a variety of workloads, without any source code modifications or recompilations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zoni:2018:DEP, author = "Davide Zoni and Luca Colombo and William Fornaciari", title = "{DarkCache}: Energy-Performance Optimization of Tiled Multi-Cores by Adaptively Power-Gating {LLC} Banks", journal = j-TACO, volume = "15", number = "2", pages = "21:1--21:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3186895", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The Last Level Cache (LLC) is a key element to improve application performance in multi-cores. To handle the worst case, the main design trend employs tiled architectures with a large LLC organized in banks, which goes underutilized in several realistic scenarios. Our proposal, named DarkCache, aims at properly powering off such unused banks to optimize the Energy-Delay Product (EDP) through an adaptive cache reconfiguration, thus aggressively reducing the leakage energy. 
The implemented solution is general and it can recognize and skip the activation of the DarkCache policy for the few strong memory intensive applications that actually require the use of the entire LLC. The validation has been carried out on 16- and 64-core architectures also accounting for two state-of-the-art methodologies. Compared to the baseline solution, DarkCache exhibits a performance overhead within 2\% and an average EDP improvement of 32.58\% and 36.41\% considering 16 and 64 cores, respectively. Moreover, DarkCache shows an average EDP gain between 16.15\% (16 cores) and 21.05\% (64 cores) compared to the best state-of-the-art we evaluated, and it confirms a good scalability since the gain improves with the size of the architecture.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2018:CNC, author = "Yang Zhang and Dan Feng and Wei Tong and Yu Hua and Jingning Liu and Zhipeng Tan and Chengning Wang and Bing Wu and Zheng Li and Gaoxiang Xu", title = "{CACF}: a Novel Circuit Architecture Co-optimization Framework for Improving Performance, Reliability and Energy of {ReRAM}-based Main Memory System", journal = j-TACO, volume = "15", number = "2", pages = "22:1--22:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3195799", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Emerging Resistive Random Access Memory (ReRAM) is a promising candidate as the replacement for DRAM due to its low standby power, high density, high scalability, and nonvolatility. By employing the unique crossbar structure, ReRAM can be constructed with extremely high density. 
However, the crossbar ReRAM faces some serious challenges in terms of performance, reliability, and energy consumption. First, ReRAM's crossbar structure causes an IR drop problem due to wire resistance and sneak currents, which results in nonuniform access latency in ReRAM banks and reduces its reliability. Second, without access transistors in the crossbar structure, write disturbance results in serious data reliability problem. Third, the access latency, reliability, and energy use of ReRAM arrays are significantly influenced by the data patterns involved in a write operation. To overcome the challenges of the crossbar ReRAM, we propose a novel circuit architecture co-optimization framework for improving the performance, reliability, and energy use of ReRAM-based main memory system, called CACF. The proposed CACF consists of three levels, including the circuit level, circuit architecture level, and architecture level. At the circuit level, to reduce the IR drops along bitlines, we propose a double-sided write driver design by applying write drivers along both sides of bitlines and selectively activating the write drivers. At the circuit architecture level, to address the write disturbance with low overheads, we propose a RESET disturbance detection scheme by adding disturbance reference cells and conditionally performing refresh operations. At the architecture level, a region partition with address remapping method is proposed to leverage the nonuniform access latency in ReRAM banks, and two flip schemes are proposed in different regions to optimize the data patterns involved in a write operation. The experimental results show that CACF improves system performance by 26.1\%, decreases memory access latency by 22.4\%, shortens running time by 20.1\%, and reduces energy consumption by 21.6\% on average over an aggressive baseline. 
Meanwhile, CACF significantly improves the reliability of ReRAM-based memory systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Stawinoga:2018:PTC, author = "Nicolai Stawinoga and Tony Field", title = "Predictable Thread Coarsening", journal = j-TACO, volume = "15", number = "2", pages = "23:1--23:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3194242", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Thread coarsening on GPUs combines the work of several threads into one. We show how thread coarsening can be implemented as a fully automated compile-time optimisation that estimates the optimal coarsening factor based on a low-cost, approximate static analysis of cache line re-use and an occupancy prediction model. We evaluate two coarsening strategies on three different NVidia GPU architectures. For NVidia reduction kernels we achieve a maximum speedup of 5.08x, and for the Rodinia benchmarks we achieve a mean speedup of 1.30x over 8 of 19 kernels that were determined safe to coarsen.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Roy:2018:NCN, author = "Probir Roy and Shuaiwen Leon Song and Sriram Krishnamoorthy and Abhinav Vishnu and Dipanjan Sengupta and Xu Liu", title = "{NUMA-Caffe}: {NUMA}-Aware Deep Learning Neural Networks", journal = j-TACO, volume = "15", number = "2", pages = "24:1--24:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3199605", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Convolution Neural Networks (CNNs), a special subcategory of Deep Learning Neural Networks (DNNs), have become increasingly popular in industry and academia for their powerful capability in pattern classification, image processing, and speech recognition. Recently, they have been widely adopted in High Performance Computing (HPC) environments for solving complex problems related to modeling, runtime prediction, and big data analysis. Current state-of-the-art designs for DNNs on modern multi- and many-core CPU architectures, such as variants of Caffe, have reported promising performance in speedup and scalability, comparable with the GPU implementations. However, modern CPU architectures employ Non-Uniform Memory Access (NUMA) technique to integrate multiple sockets, which incurs unique challenges for designing highly efficient CNN frameworks. Without a careful design, DNN frameworks can easily suffer from long memory latency due to a large number of memory accesses to remote NUMA domains, resulting in poor scalability. To address this challenge, we propose NUMA-aware multi-solver-based CNN design, named NUMA-Caffe, for accelerating deep learning neural networks on multi- and many-core CPU architectures. 
NUMA-Caffe is independent of DNN topology, does not impact network convergence rates, and provides superior scalability to the existing Caffe variants. Through a thorough empirical study on four contemporary NUMA-based multi- and many-core architectures, our experimental results demonstrate that NUMA-Caffe significantly outperforms the state-of-the-art Caffe designs in terms of both throughput and scalability.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ejaz:2018:DDD, author = "Ahsen Ejaz and Vassilios Papaefstathiou and Ioannis Sourdis", title = "{DDRNoC}: Dual Data-Rate Network-on-Chip", journal = j-TACO, volume = "15", number = "2", pages = "25:1--25:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3200201", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article introduces DDRNoC, an on-chip interconnection network capable of routing packets at Dual Data Rate. The cycle time of current 2D-mesh Network-on-Chip routers is limited by their control as opposed to the datapath (switch and link traversal), which exhibits significant slack. DDRNoC capitalizes on this observation, allowing two flits per cycle to share the same datapath. Thereby, DDRNoC achieves higher throughput than a Single Data Rate (SDR) network. Alternatively, using lower voltage circuits, the above slack can be exploited to reduce power consumption while matching the SDR network throughput. In addition, DDRNoC exhibits reduced clock distribution power, improving energy efficiency, as it needs a slower clock than a SDR network that routes packets at the same rate. 
Post place and route results in 28nm technology show that, compared to an iso-voltage (1.1V) SDR network, DDRNoC improves throughput proportionally to the SDR datapath slack. Moreover, a low-voltage (0.95V) DDRNoC implementation converts that slack to power reduction offering the 1.1V SDR throughput at a substantially lower energy cost.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cai:2018:ESH, author = "Ying Cai and Yulong Ao and Chao Yang and Wenjing Ma and Haitao Zhao", title = "Extreme-Scale High-Order {WENO} Simulations of {$3$-D} Detonation Wave with 10 Million Cores", journal = j-TACO, volume = "15", number = "2", pages = "26:1--26:??", month = jun, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3209208", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "High-order stencil computations, frequently found in many applications, pose severe challenges to emerging many-core platforms due to the complexities of hardware architectures as well as the sophisticated computing and data movement patterns. In this article, we tackle the challenges of high-order WENO computations in extreme-scale simulations of 3D gaseous waves on Sunway TaihuLight. We design efficient parallelization algorithms and present effective optimization techniques to fully exploit various parallelisms with reduced memory footprints, enhanced data reuse, and balanced computation load. Test results show the optimized code can scale to 9.98 million cores, solving 12.74 trillion unknowns with 23.12 Pflops double-precision performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sfakianakis:2018:QPB, author = "Yannis Sfakianakis and Christos Kozanitis and Christos Kozyrakis and Angelos Bilas", title = "{QuMan}: Profile-based Improvement of Cluster Utilization", journal = j-TACO, volume = "15", number = "3", pages = "27:1--27:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3210560", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern data centers consolidate workloads to increase server utilization and reduce total cost of ownership, and cope with scaling limitations. However, server resource sharing introduces performance interference across applications and, consequently, increases performance volatility, which negatively affects user experience. Thus, a challenging problem is to increase server utilization while maintaining application QoS. In this article, we present QuMan, a server resource manager that uses application isolation and profiling to increase server utilization while controlling degradation of application QoS. Previous solutions, either estimate interference across applications and then restrict colocation to ``compatible'' applications, or assume that application requirements are known. Instead, QuMan estimates the required resources of applications. It uses an isolation mechanism to create properly-sized resource slices for applications, and arbitrarily colocates applications. 
QuMan's mechanisms can be used with a variety of admission control policies, and we explore the potential of two such policies: (1) A policy that allows users to specify a minimum performance threshold and (2) an automated policy, which operates without user input and is based on a new combined QoS-utilization metric. We implement QuMan on top of Linux servers, and we evaluate its effectiveness using containers and real applications. Our single-node results show that QuMan balances highly effectively the tradeoff between server utilization and application performance, as it achieves 80\% server utilization while the performance of each application does not drop below 80\% the respective standalone performance. We also deploy QuMan on a cluster of 100 AWS instances that are managed by a modified version of the Sparrow scheduler [37] and, we observe a 48\% increase in application performance on a highly utilized cluster, compared to the performance of the same cluster under the same load when it is managed by native Sparrow or Apache Mesos.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kayraklioglu:2018:LLA, author = "Engin Kayraklioglu and Michael P. Ferguson and Tarek El-Ghazawi", title = "{LAPPS}: Locality-Aware Productive Prefetching Support for {PGAS}", journal = j-TACO, volume = "15", number = "3", pages = "28:1--28:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3233299", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Prefetching is a well-known technique to mitigate scalability challenges in the Partitioned Global Address Space (PGAS) model. 
It has been studied as either an automated compiler optimization or a manual programmer optimization. Using the PGAS locality awareness, we define a hybrid tradeoff. Specifically, we introduce locality-aware productive prefetching support for PGAS. Our novel, user-driven approach strikes a balance between the ease-of-use of compiler-based automated prefetching and the high performance of the laborious manual prefetching. Our prototype implementation in Chapel shows that significant scalability and performance improvements can be achieved with minimal effort in common applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Benatia:2018:BSM, author = "Akrem Benatia and Weixing Ji and Yizhuo Wang and Feng Shi", title = "{BestSF}: a Sparse Meta-Format for Optimizing {SpMV} on {GPU}", journal = j-TACO, volume = "15", number = "3", pages = "29:1--29:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3226228", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The Sparse Matrix-Vector Multiplication (SpMV) kernel dominates the computing cost in numerous scientific applications. Many implementations based on different sparse formats were proposed to improve this kernel on the recent GPU architectures. However, it has been widely observed that there is no ``best-for-all'' sparse format for the SpMV kernel on GPU. Indeed, serious performance degradation of an order of magnitude can be observed without a careful selection of the sparse format to use. 
To address this problem, we propose in this article BestSF (Best Sparse Format), a new learning-based sparse meta-format that automatically selects the most appropriate sparse format for a given input matrix. To do so, BestSF relies on a cost-sensitive classification system trained using Weighted Support Vector Machines (WSVMs) to predict the best sparse format for each input sparse matrix. Our experimental results on two different NVIDIA GPU architectures using a large number of real-world sparse matrices show that BestSF achieved a noticeable overall performance improvement over using a single sparse format. While BestSF is trained to select the best sparse format in terms of performance (GFLOPS), our further experimental investigations revealed that using BestSF also led, in most of the test cases, to the best energy efficiency (MFLOPS/W). To prove its practical effectiveness, we also evaluate the performance and energy efficiency improvement achieved when using BestSF as a building block in a GPU-based Preconditioned Conjugate Gradient (PCG) iterative solver.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Michaud:2018:ATL, author = "Pierre Michaud", title = "An Alternative {TAGE}-like Conditional Branch Predictor", journal = j-TACO, volume = "15", number = "3", pages = "30:1--30:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3226098", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "TAGE is one of the most accurate conditional branch predictors known today. 
However, TAGE does not exploit its input information perfectly, as it is possible to obtain significant prediction accuracy improvements by complementing TAGE with a statistical corrector using the same input information. This article proposes an alternative TAGE-like predictor making statistical correction practically superfluous.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Garland:2018:LCM, author = "James Garland and David Gregg", title = "Low Complexity Multiply-Accumulate Units for Convolutional Neural Networks with Weight-Sharing", journal = j-TACO, volume = "15", number = "3", pages = "31:1--31:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3233300", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Convolutional neural networks (CNNs) are one of the most successful machine-learning techniques for image, voice, and video processing. CNNs require large amounts of processing capacity and memory bandwidth. Hardware accelerators have been proposed for CNNs that typically contain large numbers of multiply-accumulate (MAC) units, the multipliers of which are large in integrated circuit (IC) gate count and power consumption. ``Weight-sharing'' accelerators have been proposed where the full range of weight values in a trained CNN are compressed and put into bins, and the bin index is used to access the weight-shared value. We reduce power and area of the CNN by implementing parallel accumulate shared MAC (PASM) in a weight-shared CNN. PASM re-architects the MAC to instead count the frequency of each weight and place it in a bin. 
The accumulated value is computed in a subsequent multiply phase, significantly reducing gate count and power consumption of the CNN. In this article, we implement PASM in a weight-shared CNN convolution hardware accelerator and analyze its effectiveness. Experiments show that for a clock speed 1GHz implemented on a 45nm ASIC process our approach results in fewer gates, smaller logic, and reduced power with only a slight increase in latency. We also show that the same weight-shared-with-PASM CNN accelerator can be implemented in resource-constrained FPGAs, where the FPGA has limited numbers of digital signal processor (DSP) units to accelerate the MAC operations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kim:2018:CEC, author = "Hyojong Kim and Ramyad Hadidi and Lifeng Nai and Hyesoon Kim and Nuwan Jayasena and Yasuko Eckert and Onur Kayiran and Gabriel Loh", title = "{CODA}: Enabling Co-location of Computation and Data for Multiple {GPU} Systems", journal = j-TACO, volume = "15", number = "3", pages = "32:1--32:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3232521", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To exploit parallelism and scalability of multiple GPUs in a system, it is critical to place compute and data together. However, two key techniques that have been used to hide memory latency and improve thread-level parallelism (TLP), memory interleaving, and thread block scheduling, in traditional GPU systems are at odds with efficient use of multiple GPUs. 
Distributing data across multiple GPUs to improve overall memory bandwidth utilization incurs high remote traffic when the data and compute are misaligned. Nondeterministic thread block scheduling to improve compute resource utilization impedes co-placement of compute and data. Our goal in this work is to enable co-placement of compute and data in the presence of fine-grained interleaved memory with a low-cost approach. To this end, we propose a mechanism that identifies exclusively accessed data and place the data along with the thread block that accesses it in the same GPU. The key ideas are (1) the amount of data exclusively used by a thread block can be estimated, and that exclusive data (of any size) can be localized to one GPU with coarse-grained interleaved pages; (2) using the affinity-based thread block scheduling policy, we can co-place compute and data together; and (3) by using dual address mode with lightweight changes to virtual to physical page mappings, we can selectively choose different interleaved memory pages for each data structure. Our evaluations across a wide range of workloads show that the proposed mechanism improves performance by 31\% and reduces 38\% remote traffic over a baseline system.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Manivannan:2018:GDB, author = "Madhavan Manivannan and Miquel Peric{\`a}s and Vassilis Papaefstathiou and Per Stenstr{\"o}m", title = "Global Dead-Block Management for Task-Parallel Programs", journal = j-TACO, volume = "15", number = "3", pages = "33:1--33:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3234337", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Task-parallel programs inefficiently utilize the cache hierarchy due to the presence of dead blocks in caches. Dead blocks may occupy cache space in multiple cache levels for a long time without providing any utility until they are finally evicted. Existing dead-block prediction schemes take decisions locally for each cache level and do not efficiently manage the entire cache hierarchy. This article introduces runtime-orchestrated global dead-block management, in which static and dynamic information about tasks available to the runtime system is used to effectively detect and manage dead blocks across the cache hierarchy. In the proposed global management schemes, static information (e.g., when tasks start/finish, and what data regions tasks produce/consume) is combined with dynamic information to detect when/where blocks become dead. When memory regions are deemed dead at some cache level(s), all the associated cache blocks are evicted from the corresponding level(s). We extend the cache controllers at both private and shared cache levels to use the aforementioned information to evict dead blocks. 
The article does an extensive evaluation of both inclusive and non-inclusive cache hierarchies and shows that the proposed global schemes outperform existing local dead-block management schemes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gareev:2018:HPG, author = "Roman Gareev and Tobias Grosser and Michael Kruse", title = "High-Performance Generalized Tensor Operations: a Compiler-Oriented Approach", journal = j-TACO, volume = "15", number = "3", pages = "34:1--34:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3235029", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The efficiency of tensor contraction is of great importance. Compilers cannot optimize it well enough to come close to the performance of expert-tuned implementations. All existing approaches that provide competitive performance require optimized external code. We introduce a compiler optimization that reaches the performance of optimized BLAS libraries without the need for an external implementation or automatic tuning. Our approach provides competitive performance across hardware architectures and can be generalized to deliver the same benefits for algebraic path problems. By making fast linear algebra kernels available to everyone, we expect productivity increases when optimized libraries are not available.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yviquel:2018:CPU, author = "Herv{\'e} Yviquel and Lauro Cruz and Guido Araujo", title = "Cluster Programming using the {OpenMP} Accelerator Model", journal = j-TACO, volume = "15", number = "3", pages = "35:1--35:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3226112", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Computation offloading is a programming model in which program fragments (e.g., hot loops) are annotated so that their execution is performed in dedicated hardware or accelerator devices. Although offloading has been extensively used to move computation to GPUs, through directive-based annotation standards like OpenMP, offloading computation to very large computer clusters can become a complex and cumbersome task. It typically requires mixing programming models (e.g., OpenMP and MPI) and languages (e.g., C/C++ and Scala), dealing with various access control mechanisms from different cloud providers (e.g., AWS and Azure), and integrating all this into a single application. This article introduces computer cluster nodes as simple OpenMP offloading devices that can be used either from a local computer or from the cluster head-node. It proposes a methodology that transforms OpenMP directives to Spark runtime calls with fully integrated communication management, in a way that a cluster appears to the programmer as yet another accelerator device. 
Experiments using LLVM 3.8, OpenMP 4.5 on well known cloud infrastructures (Microsoft Azure and Amazon EC2) show the viability of the proposed approach, enable a thorough analysis of its performance, and make a comparison with an MPI implementation. The results show that although data transfers can impose overheads, cloud offloading from a local machine can still achieve promising speedups for larger granularity: up to 115$ \times $ in 256 cores for the 2MM benchmark using 1GB sparse matrices. In addition, the parallel implementation of a complex and relevant scientific application reveals a 80$ \times $ speedup on a 320 core machine when executed directly from the headnode of the cluster.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tavana:2018:BCA, author = "Mohammad Khavari Tavana and Amir Kavyan Ziabari and David Kaeli", title = "Block Cooperation: Advancing Lifetime of Resistive Memories by Increasing Utilization of Error Correcting Codes", journal = j-TACO, volume = "15", number = "3", pages = "36:1--36:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3243906", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Block-level cooperation is an endurance management technique that operates on top of error correction mechanisms to extend memory lifetimes. Once an error recovery scheme fails to recover from faults in a data block, the entire physical page associated with that block is disabled and becomes unavailable to the physical address space. 
To reduce the page waste caused by early block failures, other blocks can be used to support the failed block, working cooperatively to keep it alive and extend the faulty page's lifetime. We combine the proposed technique with existing error recovery schemes, such as Error Correction Pointers (ECP) and Aegis, to increase memory lifetimes. Block cooperation is realized through metadata sharing in ECP, where one data block shares its unused metadata with another data block. When combined with Aegis, block cooperation is realized through reorganizing data layout, where blocks possessing few faults come to the aid of failed blocks, bringing them back from the dead. Our evaluation using Monte Carlo simulation shows that block cooperation at a single level (or multiple levels) on top of ECP and Aegis, boosts memory lifetimes by 28\% (37\%) and 8\% (14\%) on average, respectively. Furthermore, using trace-driven benchmark evaluation shows that lifetime boost can reach to 68\% (30\%) exploiting metadata sharing (or data layout reorganization).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jin:2018:LCM, author = "Hai Jin and Bo Liu and Wenbin Jiang and Yang Ma and Xuanhua Shi and Bingsheng He and Shaofeng Zhao", title = "Layer-Centric Memory Reuse and Data Migration for Extreme-Scale Deep Learning on Many-Core Architectures", journal = j-TACO, volume = "15", number = "3", pages = "37:1--37:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3243904", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Due to the popularity of Deep Neural Network (DNN) models, we have witnessed extreme-scale DNN models with the continued increase of the scale in terms of depth and width. However, the extremely high memory requirements for them make it difficult to run the training processes on single many-core architectures such as a Graphic Processing Unit (GPU), which compels researchers to use model parallelism over multiple GPUs to make it work. However, model parallelism always brings very heavy additional overhead. Therefore, running an extreme-scale model in a single GPU is urgently required. There still exist several challenges to reduce the memory footprint for extreme-scale deep learning. To address this tough problem, we first identify the memory usage characteristics for deep and wide convolutional networks, and demonstrate the opportunities for memory reuse at both the intra-layer and inter-layer levels. We then present Layrub, a runtime data placement strategy that orchestrates the execution of the training process. It achieves layer-centric reuse to reduce memory consumption for extreme-scale deep learning that could not previously be run on a single GPU. 
Experiments show that, compared to the original Caffe, Layrub can cut down the memory usage rate by an average of 58.2\% and by up to 98.9\%, at the moderate cost of 24.1\% higher training execution time on average. Results also show that Layrub outperforms some popular deep learning systems such as GeePS, vDNN, MXNet, and Tensorflow. More importantly, Layrub can tackle extreme-scale deep learning tasks. For example, it makes an extra-deep ResNet with 1,517 layers that can be trained successfully in one GPU with 12GB memory, while other existing deep learning systems cannot.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Voitsechov:2018:SDT, author = "Dani Voitsechov and Arslan Zulfiqar and Mark Stephenson and Mark Gebhart and Stephen W. Keckler", title = "Software-Directed Techniques for Improved {GPU} Register File Utilization", journal = j-TACO, volume = "15", number = "3", pages = "38:1--38:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3243905", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Throughput architectures such as GPUs require substantial hardware resources to hold the state of a massive number of simultaneously executing threads. While GPU register files are already enormous, reaching capacities of 256KB per streaming multiprocessor (SM), we find that nearly half of real-world applications we examined are register-bound and would benefit from a larger register file to enable more concurrent threads. This article seeks to increase the thread occupancy and improve performance of these register-bound applications by making more efficient use of the existing register file capacity. 
Our first technique eagerly deallocates register resources during execution. We show that releasing register resources based on value liveness as proposed in prior states of the art leads to unreliable performance and undue design complexity. To address these deficiencies, our article presents a novel compiler-driven approach that identifies and exploits last use of a register name (instead of the value contained within) to eagerly release register resources. Furthermore, while previous works have leveraged ``scalar'' and ``narrow'' operand properties of a program for various optimizations, their impact on thread occupancy has been relatively unexplored. Our article evaluates the effectiveness of these techniques in improving thread occupancy and demonstrates that while any one approach may fail to free very many registers, together they synergistically free enough registers to launch additional parallel work. An in-depth evaluation on a large suite of applications shows that just our early register technique outperforms previous work on dynamic register allocation, and together these approaches, on average, provide 12\% performance speedup (23\% higher thread occupancy) on register bound applications not already saturating other GPU resources.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lin:2018:GTD, author = "Huanxin Lin and Cho-Li Wang and Hongyuan Liu", title = "{On-GPU} Thread-Data Remapping for Branch Divergence Reduction", journal = j-TACO, volume = "15", number = "3", pages = "39:1--39:??", month = oct, year = "2018", CODEN = "????", DOI = "https://doi.org/10.1145/3242089", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:19:59 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "General Purpose GPU computing (GPGPU) plays an increasingly vital role in high performance computing and other areas like deep learning. However, arising from the SIMD execution model, the branch divergence issue lowers efficiency of conditional branching on GPUs, and hinders the development of GPGPU. To achieve runtime on-the-spot branch divergence reduction, we propose the first on-GPU thread-data remapping scheme. Before kernel launching, our solution inserts codes into GPU kernels immediately before each target branch so as to acquire actual runtime divergence information. GPU software threads can be remapped to datasets multiple times during single kernel execution. We propose two thread-data remapping algorithms that are tailored to the GPU architecture. Effective on two generations of GPUs from both NVIDIA and AMD, our solution achieves speedups up to 2.718 with third-party benchmarks. We also implement three GPGPU frontier benchmarks from areas including computer vision, algorithmic trading and data analytics. They are hindered by more complex divergence coupled with different memory access patterns, and our solution works better than the traditional thread-data remapping scheme in all cases. 
As a compiler-assisted runtime solution, it can better reduce divergence for divergent applications that gain little acceleration on GPUs for the time being.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kronawitter:2019:PSS, author = "Stefan Kronawitter and Christian Lengauer", title = "Polyhedral Search Space Exploration in the {ExaStencils} Code Generator", journal = j-TACO, volume = "15", number = "4", pages = "40:1--40:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3274653", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Performance optimization of stencil codes requires data locality improvements. The polyhedron model for loop transformation is well suited for such optimizations with established techniques, such as the PLuTo algorithm and diamond tiling. However, in the domain of our project ExaStencils, stencil codes, it fails to yield optimal results. As an alternative, we propose a new, optimized, multi-dimensional polyhedral search space exploration and demonstrate its effectiveness: we obtain better results than existing approaches in several cases. We also propose how to specialize the search for the domain of stencil codes, which dramatically reduces the exploration effort without significantly impairing performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2019:PTA, author = "Jingheng Xu and Haohuan Fu and Wen Shi and Lin Gan and Yuxuan Li and Wayne Luk and Guangwen Yang", title = "Performance Tuning and Analysis for Stencil-Based Applications on {POWER8} Processor", journal = j-TACO, volume = "15", number = "4", pages = "41:1--41:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3264422", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article demonstrates an approach for combining general tuning techniques with the POWER8 hardware architecture through optimizing three representative stencil benchmarks. Two typical real-world applications, with kernels similar to those of the winning programs of the Gordon Bell Prize 2016 and 2017, are employed to illustrate algorithm modifications and a combination of hardware-oriented tuning strategies with the application algorithms. This work fills the gap between hardware capability and software performance of the POWER8 processor, and provides useful guidance for optimizing stencil-based scientific applications on POWER systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2019:SSS, author = "Jiajun Wang and Reena Panda and Lizy K. 
John", title = "{SelSMaP}: a Selective Stride Masking Prefetching Scheme", journal = j-TACO, volume = "15", number = "4", pages = "42:1--42:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3274650", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Data prefetching, which intelligently loads data closer to the processor before demands, is a popular cache performance optimization technique to address the increasing processor-memory performance gap. Although prefetching concepts have been proposed for decades, sophisticated system architecture and emerging applications introduce new challenges. Large instruction windows coupled with out-of-order execution makes the program data access sequence distorted from a cache perspective. Furthermore, big data applications stress memory subsystems heavily with their large working set sizes and complex data access patterns. To address such challenges, this work proposes a high-performance hardware prefetching scheme, SelSMaP. SelSMaP is able to detect both regular and nonuniform stride patterns by taking the minimum observed address offset (called a reference stride) as a heuristic. A stride masking is generated according to the reference stride and is to filter out history accesses whose pattern can be rephrased as uniform stride accesses. Prefetching decision and prefetch degree are determined based on the masking outcome. As SelSMaP prediction logic does not rely on the chronological order of data accesses or program counter information, it is able to unveil the effect of out-of-order execution and compiler optimization. We evaluated SelSMaP with CloudSuite workloads and SPEC CPU2006 benchmarks. SelSMaP achieves an average CloudSuite performance improvement of 30\% over nonprefetching systems. 
With one to two orders of magnitude less storage and much less functional logic, SelSMaP outperforms the highest-performing prefetcher by 8.6\% in CloudSuite workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Su:2019:SSC, author = "Xing Su and Xiangke Liao and Hao Jiang and Canqun Yang and Jingling Xue", title = "{SCP}: Shared Cache Partitioning for High-Performance {GEMM}", journal = j-TACO, volume = "15", number = "4", pages = "43:1--43:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3274654", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "GEneral Matrix Multiply (GEMM) is the most fundamental computational kernel routine in the BLAS library. To achieve high performance, in-memory data must be prefetched into fast on-chip caches before they are used. Two techniques, software prefetching and data packing, have been used to effectively exploit the capability of on-chip least recent used (LRU) caches, which are popular in traditional high-performance processors used in high-end servers and supercomputers. However, the market has recently witnessed a new diversity in processor design, resulting in high-performance processors equipped with shared caches with non-LRU replacement policies. This poses a challenge to the development of high-performance GEMM in a multithreaded context. As several threads try to load data into a shared cache simultaneously, interthread cache conflicts will increase significantly. 
We present a Shared Cache Partitioning (SCP) method to eliminate interthread cache conflicts in the GEMM routines, by partitioning a shared cache into physically disjoint sets and assigning different sets to different threads. We have implemented SCP in the OpenBLAS library and evaluated it on Phytium 2000+, a 64-core AArch64 processor with private LRU L1 caches and shared pseudo-random L2 caches (per four-core cluster). Our evaluation shows that SCP has effectively reduced the conflict misses in both L1 and L2 caches in a highly optimized GEMM implementation, resulting in an improvement of its performance by 2.75\% to 6.91\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pereira:2019:SPS, author = "Fernando Magno Quint{\~a}o Pereira and Guilherme Vieira Leobas and Abdoulaye Gamati{\'e}", title = "Static Prediction of Silent Stores", journal = j-TACO, volume = "15", number = "4", pages = "44:1--44:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3280848", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A store operation is called ``silent'' if it writes in memory a value that is already there. The ability to detect silent stores is important, because they might indicate performance bugs, might enable code optimizations, and might reveal opportunities of automatic parallelization, for instance. Silent stores are traditionally detected via profiling tools. In this article, we depart from this methodology and instead explore the following question: is it possible to predict silentness by analyzing the syntax of programs? 
The process of building an answer to this question is interesting in itself, given the stochastic nature of silent stores, which depend on data and coding style. To build such an answer, we have developed a methodology to classify store operations in terms of syntactic features of programs. Based on such features, we develop different kinds of predictors, some of which go much beyond what any trivial approach could achieve. To illustrate how static prediction can be employed in practice, we use it to optimize programs running on nonvolatile memory systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Crago:2019:EMA, author = "Neal C. Crago and Mark Stephenson and Stephen W. Keckler", title = "Exposing Memory Access Patterns to Improve Instruction and Memory Efficiency in {GPUs}", journal = j-TACO, volume = "15", number = "4", pages = "45:1--45:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3280851", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern computing workloads often have high memory intensity, requiring high bandwidth access to memory. The memory request patterns of these workloads vary and include regular strided accesses and indirect (pointer-based) accesses. Such applications require a large number of address generation instructions and a high degree of memory-level parallelism. This article proposes new memory instructions that exploit strided and indirect memory request patterns and improve efficiency in GPU architectures. 
The new instructions reduce address calculation instructions by offloading addressing to dedicated hardware, and reduce destructive memory request interference by grouping related requests together. Our results show that we can eliminate 33\% of dynamic instructions across 16 GPU benchmarks. These improvements result in an overall runtime improvement of 26\%, an energy reduction of 18\%, and a reduction in energy-delay product of 32\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2019:PPB, author = "Feng Zhang and Jingling Xue", title = "{Poker}: Permutation-Based {SIMD} Execution of Intensive Tree Search by Path Encoding", journal = j-TACO, volume = "15", number = "4", pages = "46:1--46:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3280850", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We introduce Poker, a permutation-based approach for vectorizing multiple queries over B$^+$-trees. Our key insight is to combine vector loads and path-encoding-based permutations to alleviate memory latency while keeping the number of key comparisons needed for a query to a minimum. Implemented as a C++ template library, Poker represents a general-purpose solution for vectorizing the queries over indexing trees on multi-core processors equipped with SIMD units. For a set of five representative benchmarks evaluated with 24 configurations each, Poker outperforms the state of the art by 2.11x with one single thread and 2.28x with eight threads on an Intel Broadwell processor that supports 256-bit AVX2, on average. 
In addition, strip-mining queries will further improve Poker's performance by 1.21x (with one single thread) and 1.31x (with eight threads), on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Belleville:2019:ASP, author = "Nicolas Belleville and Damien Courouss{\'e} and Karine Heydemann and Henri-Pierre Charles", title = "Automated Software Protection for the Masses Against Side-Channel Attacks", journal = j-TACO, volume = "15", number = "4", pages = "47:1--47:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3281662", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We present an approach and a tool to answer the need for effective, generic, and easily applicable protections against side-channel attacks. The protection mechanism is based on code polymorphism, so that the observable behaviour of the protected component is variable and unpredictable to the attacker. Our approach combines lightweight specialized runtime code generation with the optimization capabilities of static compilation. It is extensively configurable. Experimental results show that programs secured by our approach present strong security levels and meet the performance requirements of constrained systems.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yu:2019:ITL, author = "Chao Yu and Yuebin Bai and Qingxiao Sun and Hailong Yang", title = "Improving Thread-level Parallelism in {GPUs} Through Expanding Register File to Scratchpad Memory", journal = j-TACO, volume = "15", number = "4", pages = "48:1--48:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3280849", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern Graphic Processing Units (GPUs) have become pervasive computing devices in datacenters due to their high performance with massive thread level parallelism (TLP). GPUs are equipped with large register files (RF) to support fast context switch between massive threads and scratchpad memory (SPM) to support inter-thread communication within the cooperative thread array (CTA). However, the TLP of GPUs is usually limited by the inefficient resource management of register file and scratchpad memory. This inefficiency also leads to register file and scratchpad memory underutilization. To overcome the above inefficiency, we propose a new resource management approach EXPARS for GPUs. EXPARS provides a larger register file logically by expanding the register file to scratchpad memory. When the available register file becomes limited, our approach leverages the underutilized scratchpad memory to support additional register allocation. Therefore, more CTAs can be dispatched to SMs, which improves the GPU utilization. Our experiments on representative benchmark suites show that the number of CTAs dispatched to each SM increases by 1.28$ \times $ on average. 
In addition, our approach improves the GPU resource utilization significantly, with the register file utilization improved by 11.64\% and the scratchpad memory utilization improved by 48.20\% on average. With better TLP, our approach achieves 20.01\% performance improvement on average with negligible energy overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Orosa:2019:AAF, author = "Lois Orosa and Rodolfo Azevedo and Onur Mutlu", title = "{AVPP}: Address-first Value-next Predictor with Value Prefetching for Improving the Efficiency of Load Value Prediction", journal = j-TACO, volume = "15", number = "4", pages = "49:1--49:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3239567", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Value prediction improves instruction level parallelism in superscalar processors by breaking true data dependencies. Although this technique can significantly improve overall performance, most of the state-of-the-art value prediction approaches require high hardware cost, which is the main obstacle for its wide adoption in current processors. To tackle this issue, we revisit load value prediction as an efficient alternative to the classical approaches that predict all instructions. By speculating only on loads, the pressure over shared resources (e.g., the Physical Register File) and the predictor size can be substantially reduced (e.g., more than 90\% reduction compared to recent works). We observe that existing value predictors cannot achieve very high performance when speculating only on load instructions. 
To solve this problem, we propose a new, accurate and low-cost mechanism for predicting the values of load instructions: the Address-first Value-next Predictor with Value Prefetching (AVPP). The key idea of our predictor is to predict the load address first (which, we find, is much more predictable than the value) and to use a small non-speculative Value Table (VT)-indexed by the predicted address-to predict the value next. To increase the coverage of AVPP, we aim to increase the hit rate of the VT by predicting also the load address of a future instance of the same load instruction and prefetching its value in the VT. We show that AVPP is relatively easy to implement, requiring only 2.5\% of the area of a 32KB L1 data cache. We compare our mechanism with five state-of-the-art value prediction techniques, evaluated within the context of load value prediction, in a relatively narrow out-of-order processor. On average, our AVPP predictor achieves 11.2\% speedup and 3.7\% of energy savings over the baseline processor, outperforming all the state-of-the-art predictors in 16 of the 23 benchmarks we evaluate. We evaluate AVPP implemented together with different prefetching techniques, showing additive performance gains (20\% average speedup). In addition, we propose a new taxonomy to classify different value predictor policies regarding predictor update, predictor availability, and in-flight pending updates. We evaluate these policies in detail.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2019:REU, author = "Jun Zhang and Rui Hou and Wei Song and Sally A. 
McKee and Zhen Jia and Chen Zheng and Mingyu Chen and Lixin Zhang and Dan Meng", title = "{RAGuard}: an Efficient and User-Transparent Hardware Mechanism against {ROP} Attacks", journal = j-TACO, volume = "15", number = "4", pages = "50:1--50:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3280852", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/prng.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Control-flow integrity (CFI) is a general method for preventing code-reuse attacks, which utilize benign code sequences to achieve arbitrary code execution. CFI ensures that the execution of a program follows the edges of its predefined static Control-Flow Graph: any deviation that constitutes a CFI violation terminates the application. Despite decades of research effort, there are still several implementation challenges in efficiently protecting the control flow of function returns (Return-Oriented Programming attacks). The set of valid return addresses of frequently called functions can be large and thus an attacker could bend the backward-edge CFI by modifying an indirect branch target to another within the valid return set. This article proposes RAGuard, an efficient and user-transparent hardware-based approach to prevent Return-Oriented Programming attacks. RAGuard binds a message authentication code (MAC) to each return address to protect its integrity. To guarantee the security of the MAC and reduce runtime overhead: RAGuard (1) computes the MAC by encrypting the signature of a return address with AES-128, (2) develops a key management module based on a Physical Unclonable Function (PUF) and a True Random Number Generator (TRNG), and (3) uses a dedicated register to reduce MACs' load and store operations of leaf functions.
We have evaluated our mechanism based on the open-source LEON3 processor and the results show that RAGuard incurs acceptable performance overhead and occupies reasonable area.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2019:GGC, author = "Ping Wang and Luke McHale and Paul V. Gratz and Alex Sprintson", title = "{GenMatcher}: a Generic Clustering-Based Arbitrary Matching Framework", journal = j-TACO, volume = "15", number = "4", pages = "51:1--51:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3281663", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Packet classification methods rely upon packet content/header matching against rules. Thus, throughput of matching operations is critical in many networking applications. Further, with the advent of Software Defined Networking (SDN), efficient implementation of software approaches to matching are critical for the overall system performance. This article presents GenMatcher, a generic, software-only, arbitrary matching framework for fast, efficient searches. The key idea of our approach is to represent arbitrary rules with efficient prefix-based tries. To support arbitrary wildcards, we rearrange bits within the rules such that wildcards accumulate to one side of the bitstring. Since many non-contiguous wildcards often remain, we use multiple prefix-based tries. The main challenge in this context is to generate efficient trie groupings and expansions to support all arbitrary rules. Finding an optimal mix of grouping and expansion is an NP-complete problem.
Our contribution includes a novel, clustering-based grouping algorithm to group rules based upon their bit-level similarities. Our algorithm generates near-optimal trie groupings with low configuration times and provides significantly higher match throughput compared to prior techniques. Experiments with synthetic traffic show that our method can achieve a 58.9X speedup compared to the baseline on a single core processor under a given memory constraint.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hong:2019:PTG, author = "Ding-Yong Hong and Jan-Jan Wu and Yu-Ping Liu and Sheng-Yu Fu and Wei-Chung Hsu", title = "Processor-Tracing Guided Region Formation in Dynamic Binary Translation", journal = j-TACO, volume = "15", number = "4", pages = "52:1--52:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3281664", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Region formation is an important step in dynamic binary translation to select hot code regions for translation and optimization. The quality of the formed regions determines the extent of optimizations and thus determines the final execution performance. Moreover, the overall performance is very sensitive to the formation overhead, because region formation can have a non-trivial cost. For addressing the dual issues of region quality and region formation overhead, this article presents a lightweight region formation method guided by processor tracing, e.g., Intel PT. We leverage the branch history information stored in the processor to reconstruct the program execution profile and effectively form high-quality regions with low cost. 
Furthermore, we present the designs of lightweight hardware performance monitoring sampling and the branch instruction decode cache to minimize region formation overhead. Using ARM64 to x86-64 translations, the experiment results show that our method achieves a performance speedup of up to 1.53$ \times $ (1.16$ \times $ on average) for SPEC CPU2006 benchmarks with reference inputs, compared to the well-known software-based trace formation method, Next Executing Tail (NET). The performance results of x86-64 to ARM64 translations also show a speedup of up to 1.25$ \times $ over NET for CINT2006 benchmarks with reference inputs. The comparison with a relaxed NETPlus region formation method further demonstrates that our method achieves the best performance and lowest compilation overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2019:PNW, author = "Yu Wang and Victor Lee and Gu-Yeon Wei and David Brooks", title = "Predicting New Workload or {CPU} Performance by Analyzing Public Datasets", journal = j-TACO, volume = "15", number = "4", pages = "53:1--53:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3284127", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The marketplace for general-purpose microprocessors offers hundreds of functionally similar models, differing by traits like frequency, core count, cache size, memory bandwidth, and power consumption. Their performance depends not only on microarchitecture, but also on the nature of the workloads being executed. Given a set of intended workloads, the consumer needs both performance and price information to make rational buying decisions. 
Many benchmark suites have been developed to measure processor performance, and their results for large collections of CPUs are often publicly available. However, repositories of benchmark results are not always helpful when consumers need performance data for new processors or new workloads. Moreover, the aggregate scores for benchmark suites designed to cover a broad spectrum of workload types can be misleading. To address these problems, we have developed a deep neural network (DNN) model, and we have used it to learn the relationship between the specifications of Intel CPUs and their performance on the SPEC CPU2006 and Geekbench 3 benchmark suites. We show that we can generate useful predictions for new processors and new workloads. We also cross-predict the two benchmark suites and compare their performance scores. The results quantify the self-similarity of these suites for the first time in the literature. This work should discourage consumers from basing purchasing decisions exclusively on Geekbench 3, and it should encourage academics to evaluate research using more diverse workloads than the SPEC CPU suites alone.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Park:2019:ROC, author = "Hyukwoo Park and Sungkook Kim and Jung-Geun Park and Soo-Mook Moon", title = "Reusing the Optimized Code for {JavaScript} Ahead-of-Time Compilation", journal = j-TACO, volume = "15", number = "4", pages = "54:1--54:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291056", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As web pages and web apps increasingly include heavy JavaScript code, JavaScript performance has been a critical issue. Modern JavaScript engines achieve a remarkable performance by employing tiered-execution architecture based on interpreter, baseline just-in-time compiler (JITC), and optimizing JITC. Unfortunately, they suffer from a substantial compilation overhead, which can take more than 50\% of the whole running time. A simple idea to reduce the compilation overhead is ahead-of-time compilation (AOTC), which reuses the code generated in the previous run. In fact, existing studies that reuse the bytecode generated by the interpreter or the machine code generated by the baseline JITC have shown tangible performance benefits [12, 31, 41]. However, there has been no study to reuse the machine code generated by the optimizing JITC, which heavily uses profile-based optimizations, thus not easily reusable. We propose a novel AOTC that can reuse the optimized machine code for high-performance JavaScript engines. Unlike previous AOTCs, we need to resolve a few challenging issues related to reusing profile-based optimized code and relocating dynamic addresses. 
Our AOTC improves the performance of a commercial JavaScript engine by 6.36 times (max) and 1.99 times (average) for Octane benchmarks, by reducing the compilation overhead and by running the optimized code from the first invocation of functions. It also improves the loading time of six web apps by 1.28 times, on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2019:BLA, author = "Han Zhao and Quan Chen and Yuxian Qiu and Ming Wu and Yao Shen and Jingwen Leng and Chao Li and Minyi Guo", title = "Bandwidth and Locality Aware Task-stealing for Manycore Architectures with Bandwidth-Asymmetric Memory", journal = j-TACO, volume = "15", number = "4", pages = "55:1--55:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291058", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Parallel computers now start to adopt Bandwidth-Asymmetric Memory architecture that consists of traditional DRAM memory and new High Bandwidth Memory (HBM) for high memory bandwidth. However, existing task schedulers suffer from low bandwidth usage and poor data locality problems in bandwidth-asymmetric memory architectures. To solve the two problems, we propose a Bandwidth and Locality Aware Task-stealing (BATS) system, which consists of an HBM-aware data allocator, a bandwidth-aware traffic balancer, and a hierarchical task-stealing scheduler. Leveraging compile-time code transformation and run-time data distribution, the data allocator enables HBM usage automatically without user interference. According to data access hotness, the traffic balancer migrates data to balance memory traffic across memory nodes proportional to their bandwidth. 
The hierarchical scheduler improves data locality at runtime without a priori program knowledge. Experiments on an Intel Knights Landing server that adopts bandwidth-asymmetric memory show that BATS reduces the execution time of memory-bound programs up to 83.5\% compared with traditional task-stealing schedulers.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ganser:2019:SIP, author = "Stefan Ganser and Armin Gr{\"o}{\ss}linger and Norbert Siegmund and Sven Apel and Christian Lengauer", title = "Speeding up Iterative Polyhedral Schedule Optimization with Surrogate Performance Models", journal = j-TACO, volume = "15", number = "4", pages = "56:1--56:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291773", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Iterative program optimization is known to be able to adapt more easily to particular programs and target hardware than model-based approaches. An approach is to generate random program transformations and evaluate their profitability by applying them and benchmarking the transformed program on the target hardware. This procedure's large computational effort impairs its practicality tremendously, though. To address this limitation, we pursue the guidance of a genetic algorithm for program optimization via feedback from surrogate performance models. We train the models on program transformations that were evaluated during previous iterative optimizations. Our representation of programs and program transformations refers to the polyhedron model. 
The representation is particularly meaningful for an optimization of loop programs that profit a from coarse-grained parallelization for execution on modern multicore-CPUs. Our evaluation reveals that surrogate performance models can be used to speed up the optimization of loop programs. We demonstrate that we can reduce the benchmarking effort required for an iterative optimization and degrade the resulting speedups by an average of 15\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2019:DPC, author = "Song Wu and Fang Zhou and Xiang Gao and Hai Jin and Jinglei Ren", title = "Dual-Page Checkpointing: an Architectural Approach to Efficient Data Persistence for In-Memory Applications", journal = j-TACO, volume = "15", number = "4", pages = "57:1--57:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291057", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Data persistence is necessary for many in-memory applications. However, the disk-based data persistence largely slows down in-memory applications. Emerging non-volatile memory (NVM) offers an opportunity to achieve in-memory data persistence at the DRAM-level performance. Nevertheless, NVM typically requires a software library to operate NVM data, which brings significant overhead. This article demonstrates that a hardware-based high-frequency checkpointing mechanism can be used to achieve efficient in-memory data persistence on NVM. 
To maintain checkpoint consistency, traditional logging and copy-on-write techniques incur excessive NVM writes that impair both performance and endurance of NVM; recent work attempts to solve the issue but requires a large amount of metadata in the memory controller. Hence, we design a new dual-page checkpointing system, which achieves low metadata cost and eliminates most excessive NVM writes at the same time. It breaks the traditional trade-off between metadata space cost and extra data writes. Our solution outperforms the state-of-the-art NVM software libraries by 13.6$ \times $ in throughput, and leads to 34\% less NVM wear-out and 1.28$ \times $ higher throughput than state-of-the-art hardware checkpointing solutions, according to our evaluation with OLTP, graph computing, and machine-learning workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kiani:2019:ECP, author = "Mohsen Kiani and Amir Rajabzadeh", title = "Efficient Cache Performance Modeling in {GPUs} Using Reuse Distance Analysis", journal = j-TACO, volume = "15", number = "4", pages = "58:1--58:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291051", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Reuse distance analysis (RDA) is a popular method for calculating locality profiles and modeling cache performance. The present article proposes a framework to apply the RDA algorithm to obtain reuse distance profiles in graphics processing unit (GPU) kernels. 
To study the implications of hardware-related parameters in RDA, two RDA algorithms were employed, including a high-level cache-independent RDA algorithm, called HLRDA, and a detailed RDA algorithm, called DRDA. DRDA models the effects of reservation fails in cache blocks and miss status holding registers to provide accurate cache-related performance metrics. In this case, the reuse profiles are cache-specific. In a selection of GPU kernels, DRDA obtained the L1 miss-rate breakdowns with an average error of 3.86\% and outperformed the state-of-the-art RDA in terms of accuracy. In terms of performance, DRDA is 246,000$ \times $ slower than the real GPU executions and 11$ \times $ faster than GPGPU-Sim. HLRDA ignores the cache-related parameters and its obtained reuse profiles are general, which can be used to calculate miss rates in all cache sizes. Moreover, the average error incurred by HLRDA was 16.9\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Debrunner:2019:AAK, author = "Thomas Debrunner and Sajad Saeedi and Paul H. J. Kelly", title = "{AUKE}: Automatic Kernel Code Generation for an Analogue {SIMD} Focal-Plane Sensor-Processor Array", journal = j-TACO, volume = "15", number = "4", pages = "59:1--59:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291055", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Focal-plane Sensor-Processor Arrays (FPSPs) are new imaging devices with parallel Single Instruction Multiple Data (SIMD) computational capabilities built into every pixel. Compared to traditional imaging devices, FPSPs allow for massive pixel-parallel execution of image processing algorithms. 
This enables the application of certain algorithms at extreme frame rates ({$>$10},000 frames per second). By performing some early-stage processing in-situ, systems incorporating FPSPs can consume less power compared to conventional approaches using standard digital cameras. In this article, we explore code generation for an FPSP whose 256 $ \times $ 256 processors operate on analogue signal data, leading to further opportunities for power reduction-and additional code synthesis challenges. While rudimentary image processing algorithms have been demonstrated on FPSPs before, progress with higher-level computer vision algorithms has been sparse due to the unique architecture and limits of the devices. This article presents a code generator for convolution filters for the SCAMP-5 FPSP, with applications in many high-level tasks such as convolutional neural networks, pose estimation, and so on. The SCAMP-5 FPSP has no effective multiply operator. Convolutions have to be implemented through sequences of more primitive operations such as additions, subtractions, and multiplications/divisions by two. We present a code generation algorithm to optimise convolutions by identifying common factors in the different weights and by determining an optimised pattern of pixel-to-pixel data movements to exploit them. We present evaluation in terms of both speed and energy consumption for a suite of well-known convolution filters. Furthermore, an application of the method is shown by the implementation of a Viola-Jones face detection algorithm.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2019:SNS, author = "You Zhou and Fei Wu and Zhonghai Lu and Xubin He and Ping Huang and Changsheng Xie", title = "{SCORE}: a Novel Scheme to Efficiently Cache Overlong {ECCs} in {NAND} Flash Memory", journal = j-TACO, volume = "15", number = "4", pages = "60:1--60:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291052", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Technology scaling and program/erase cycling result in an increasing bit error rate in NAND flash storage. Some solid state drives (SSDs) adopt overlong error correction codes (ECCs), whose redundancy size exceeds the spare area limit of flash pages, to protect user data for improved reliability and lifetime. However, the read performance is significantly degraded, because a logical data page and its ECC redundancy are stored in two flash pages. In this article, we find that caching ECCs has a large potential to reduce flash reads by achieving higher hit rates, compared to caching data. Then, we propose a novel {$<$ underline$>$ s$<$}/{underline$>$ cheme} to efficiently {$<$ underline$>$ c$<$}/{underline$>$ ache} {$<$ underline$>$ o$<$}/{underline$>$ ve$<$ underline$>$ r$<$}/{underline$>$ long} {$<$ underline$>$E$<$}/{underline$>$CCs}, called SCORE, to improve the SSD performance. Exceeding ECC redundancy (called ECC residues ) of logically consecutive data pages are grouped into ECC pages. SCORE partitions RAM to cache both data pages and ECC pages in a workload-adaptive manner. Finally, we verify SCORE using extensive trace-driven simulations. 
The results show that SCORE obtains high ECC hit rates without sacrificing data hit rates, thus improving the read performance by an average of 22\% under various workloads, compared to the state-of-the-art schemes.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Andujar:2019:PPA, author = "Franciso J. And{\'u}jar and Salvador Coll and Marina Alonso and Pedro L{\'o}pez and Juan-Miguel Mart{\'\i}nez", title = "{POWAR}: Power-Aware Routing in {HPC} Networks with On\slash Off Links", journal = j-TACO, volume = "15", number = "4", pages = "61:1--61:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3293445", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In order to save energy in HPC interconnection networks, one usual proposal is to switch idle links into a low-power mode after a certain time without any transmission, as IEEE Energy Efficient Ethernet standard proposes. Extending the low-power mode mechanism, we propose POWer-Aware Routing (POWAR), a simple power-aware routing and selection function for fat-tree and torus networks. POWAR adapts the amount of network links that can be used, taking into account the network load, and obtaining great energy savings in the network (55\%--65\%) and the entire system (9\%--10\%) with negligible performance overhead.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mammadli:2019:AGD, author = "Rahim Mammadli and Felix Wolf and Ali Jannesari", title = "The Art of Getting Deep Neural Networks in Shape", journal = j-TACO, volume = "15", number = "4", pages = "62:1--62:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291053", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Training a deep neural network (DNN) involves selecting a set of hyperparameters that define the network topology and influence the accuracy of the resulting network. Often, the goal is to maximize prediction accuracy on a given dataset. However, non-functional requirements of the trained network --- such as inference speed, size, and energy consumption --- can be very important as well. In this article, we aim to automate the process of selecting an appropriate DNN topology that fulfills both functional and non-functional requirements of the application. Specifically, we focus on tuning two important hyperparameters, depth and width, which together define the shape of the resulting network and directly affect its accuracy, speed, size, and energy consumption. To reduce the time needed to search the design space, we train a fraction of DNNs and build a model to predict the performances of the remaining ones. We are able to produce tuned ResNets, which are up to 4.22 times faster than original depth-scaled ResNets on a batch of 128 images while matching their accuracy.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "62", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tzilis:2019:EER, author = "Stavros Tzilis and Pedro Trancoso and Ioannis Sourdis", title = "Energy-Efficient Runtime Management of Heterogeneous Multicores using Online Projection", journal = j-TACO, volume = "15", number = "4", pages = "63:1--63:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3293446", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Heterogeneous multicores offer flexibility in the form of different core types and Dynamic Voltage and Frequency Scaling (DVFS), defining a vast configuration space. The optimal configuration choice is not always straightforward, even for single applications, and becomes a very difficult problem for dynamically changing scenarios of concurrent applications with unpredictable spawn and termination times and individual performance requirements. This article proposes an integrated approach for runtime decision making for energy efficiency on such systems. The approach consists of a model that predicts performance and power for any possible decision and low-complexity heuristics that use this model to evaluate a subset of possible decisions to choose the best. The model predicts performance by projecting standalone application profiling data to the current status of the system and power by using a set of platform-specific parameters that are determined only once for a given system and are independent of the application mix. Our approach is evaluated with a plethora of dynamic, multi-application scenarios. 
When considering best effort performance to be adequate, our runtime achieves on average 3\% higher energy efficiency compared to the powersave governor and 2$ \times $ better compared to the other Linux governors. Moreover, when also considering individual applications' performance requirements, our runtime is able to satisfy them, giving away 18\% of the system's energy efficiency compared to the powersave, which, however, misses the performance targets by 23\%; at the same time, our runtime maintains an efficiency advantage of about 55\% compared to the other governors, which also satisfy the performance constraints.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "63", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2019:SLS, author = "Matthew Kay Fei Lee and Yingnan Cui and Thannirmalai Somu and Tao Luo and Jun Zhou and Wai Teng Tang and Weng-Fai Wong and Rick Siow Mong Goh", title = "A System-Level Simulator for {RRAM}-Based Neuromorphic Computing Chips", journal = j-TACO, volume = "15", number = "4", pages = "64:1--64:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291054", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Advances in non-volatile resistive switching random access memory (RRAM) have made it a promising memory technology with potential applications in low-power and embedded in-memory computing devices owing to a number of advantages such as low-energy consumption, low area cost and good scaling. There have been proposals to employ RRAM in architecting chips for neuromorphic computing and artificial neural networks where matrix-vector multiplication can be computed in the analog domain in a single timestep. 
However, it is challenging to employ RRAM devices in neuromorphic chips owing to the non-ideal behavior of RRAM. In this article, we propose a cycle-accurate and scalable system-level simulator that can be used to study the effects of using RRAM devices in neuromorphic computing chips. The simulator models a spatial neuromorphic chip architecture containing many neural cores with RRAM crossbars connected via a Network-on-Chip (NoC). We focus on system-level simulation and demonstrate the effectiveness of our simulator in understanding how non-linear RRAM effects such as stuck-at-faults (SAFs), write variability, and random telegraph noise (RTN) can impact an application's behavior. By using our simulator, we show that RTN and write variability can have adverse effects on an application. Nevertheless, we show that these effects can be mitigated through proper design choices and the implementation of a write-verify scheme.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "64", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vasilakis:2019:DFC, author = "Evangelos Vasilakis and Vassilis Papaefstathiou and Pedro Trancoso and Ioannis Sourdis", title = "Decoupled Fused Cache: Fusing a Decoupled {LLC} with a {DRAM} Cache", journal = j-TACO, volume = "15", number = "4", pages = "65:1--65:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3293447", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "DRAM caches have shown excellent potential in capturing the spatial and temporal data locality of applications capitalizing on advances of 3D-stacking technology; however, they are still far from their ideal performance. 
Besides the unavoidable DRAM access to fetch the requested data, tag access is in the critical path, adding significant latency and energy costs. Existing approaches are not able to remove these overheads and in some cases limit DRAM cache design options. For instance, caching DRAM cache tags adds constant latency to every access; accessing the DRAM cache using the TLB calls for OS support and DRAM cachelines as large as a page; reusing the last-level cache (LLC) tags to access the DRAM cache limits LLC performance as it requires indexing the LLC using higher-order address bits. In this article, we introduce Decoupled Fused Cache, a DRAM cache design that alleviates the cost of tag accesses by fusing DRAM cache tags with the tags of the on-chip LLC without affecting LLC performance. In essence, the Decoupled Fused Cache relies in most cases on the LLC tag access to retrieve the required information for accessing the DRAM cache while avoiding additional overheads. Compared to current DRAM cache designs of the same cacheline size, Decoupled Fused Cache improves system performance by 6\% on average and by 16\% to 18\% for large cacheline sizes. Finally, Decoupled Fused Cache reduces DRAM cache traffic by 18\% and DRAM cache energy consumption by 7\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "65", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pirkelbauer:2019:BTF, author = "Peter Pirkelbauer and Amalee Wilson and Christina Peterson and Damian Dechev", title = "{Blaze-Tasks}: a Framework for Computing Parallel Reductions over Tasks", journal = j-TACO, volume = "15", number = "4", pages = "66:1--66:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3293448", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compared to threads, tasks are a more fine-grained alternative. The task parallel programming model offers benefits in terms of better performance portability and better load-balancing for problems that exhibit nonuniform workloads. A common scenario of task parallel programming is that a task is recursively decomposed into smaller sub-tasks. Depending on the problem domain, the number of created sub-tasks may be nonuniform, thereby creating potential for significant load imbalances in the system. Dynamic load-balancing mechanisms will distribute the tasks across available threads. The final result of a computation may be modeled as a reduction over the results of all sub-tasks. This article describes a simple, yet effective prototype framework, Blaze-Tasks, for task scheduling and task reductions on shared memory architectures. The framework has been designed with lock-free techniques and generic programming principles in mind. Blaze-Tasks is implemented entirely in C++17 and is thus portable. To load-balance the computation, Blaze-Tasks uses task stealing. 
To manage contention on a task pool, the number of lock-free attempts to steal a task depends on the distance between thief and pool owner and the estimated number of tasks in a victim's pool. This article evaluates the Blaze framework on Intel and IBM dual-socket systems using nine benchmarks and compares its performance with other task parallel frameworks. While Cilk outperforms Blaze on Intel on most benchmarks, the evaluation shows that Blaze is competitive with OpenMP and other library-based implementations. On IBM, the experiments show that Blaze outperforms other approaches on most benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "66", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sato:2019:AFS, author = "Yukinori Sato and Tomoya Yuki and Toshio Endo", title = "An Autotuning Framework for Scalable Execution of Tiled Code via Iterative Polyhedral Compilation", journal = j-TACO, volume = "15", number = "4", pages = "67:1--67:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3293449", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "On modern many-core CPUs, performance tuning against complex memory subsystems and scalability for parallelism is mandatory to achieve their potential. In this article, we focus on loop tiling, which plays an important role in performance tuning, and develop a novel framework that analytically models the load balance and empirically autotunes unpredictable cache behaviors through iterative polyhedral compilation using LLVM/Polly. From an evaluation on many-core CPUs, we demonstrate that our autotuner achieves a performance superior to those that use conventional static approaches and well-known autotuning heuristics. 
Moreover, our autotuner achieves almost the same performance as a brute-force search-based approach.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "67", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shekofteh:2019:MSG, author = "S.-Kazem Shekofteh and Hamid Noori and Mahmoud Naghibzadeh and Hadi Sadoghi Yazdi and Holger Fr{\"o}ning", title = "Metric Selection for {GPU} Kernel Classification", journal = j-TACO, volume = "15", number = "4", pages = "68:1--68:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3295690", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graphics Processing Units (GPUs) are vastly used for running massively parallel programs. GPU kernels exhibit different behavior at runtime and can usually be classified in a simple form as either ``compute-bound'' or ``memory-bound.'' Recent GPUs are capable of concurrently running multiple kernels, which raises the question of how to most appropriately schedule kernels to achieve higher performance. In particular, co-scheduling of compute-bound and memory-bound kernels seems promising. However, its benefits as well as drawbacks must be determined along with which kernels should be selected for a concurrent execution. Classifying kernels can be performed online by instrumentation based on performance counters. This work conducts a thorough analysis of the metrics collected from various benchmarks from Rodinia and CUDA SDK. The goal is to find the minimum number of effective metrics that enables online classification of kernels with a low overhead. This study employs a wrapper-based feature selection method based on the Fisher feature selection criterion. 
The results of experiments show that to classify kernels with a high accuracy, only three and five metrics are sufficient on a Kepler and a Pascal GPU, respectively. The proposed method is then utilized for a runtime scheduler. The results show an average speedup of 1.18$ \times $ and 1.1$ \times $ compared with a serial and a random scheduler, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "68", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bilas:2019:LDR, author = "Angelos Bilas", title = "List of 2018 Distinguished Reviewers {ACM TACO}", journal = j-TACO, volume = "15", number = "4", pages = "69:1--69:??", month = jan, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3293444", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jan 8 17:20:00 MST 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "69", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shobaki:2019:EAC, author = "Ghassan Shobaki and Austin Kerbow and Christopher Pulido and William Dobson", title = "Exploring an Alternative Cost Function for Combinatorial Register-Pressure-Aware Instruction Scheduling", journal = j-TACO, volume = "16", number = "1", pages = "1:1--1:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3301489", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 11 19:00:20 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Multiple combinatorial algorithms have been proposed for doing pre-allocation instruction scheduling with the objective of minimizing register pressure or balancing register pressure and instruction-level parallelism. The cost function that is minimized in most of these algorithms is the peak register pressure (or the peak excess register pressure). In this work, we explore an alternative register-pressure cost function, which is the Sum of Live Interval Lengths (SLIL). Unlike the peak cost function, which captures register pressure only at the highest pressure point in the schedule, the proposed SLIL cost function captures register pressure at all points in the schedule. Minimizing register pressure at all points is desirable in larger scheduling regions with multiple high-pressure points. This article describes a Branch-and-Bound (B\&B) algorithm for minimizing the SLIL cost function. The algorithm is based on two SLIL-specific dynamic lower bounds as well as the history utilization technique proposed in our previous work. The proposed algorithm is implemented into the LLVM Compiler and evaluated experimentally relative to our previously proposed B\&B algorithm for minimizing the peak excess register pressure. 
The experimental results show that the proposed algorithm for minimizing the SLIL cost function produces substantially less spilling than the previous algorithm that minimizes the peak cost function. Execution-time results on various processors show that the proposed B\&B algorithm significantly improves the performance of many CPU2006 benchmarks by up to 49\% relative to LLVM's default scheduler. The geometric-mean improvement for FP2006 on Intel Core i7 is 4.22\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2019:ESA, author = "Yu-Ping Liu and Ding-Yong Hong and Jan-Jan Wu and Sheng-Yu Fu and Wei-Chung Hsu", title = "Exploiting {SIMD} Asymmetry in {ARM}-to-x86 Dynamic Binary Translation", journal = j-TACO, volume = "16", number = "1", pages = "2:1--2:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3301488", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 11 19:00:20 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Single instruction multiple data (SIMD) has been adopted for decades because of its superior performance and power efficiency. The SIMD capability (i.e., width, number of registers, and advanced instructions) has diverged rapidly on different SIMD instruction-set architectures (ISAs). Therefore, migrating existing applications to another host ISA that has fewer but longer SIMD registers and more advanced instructions raises the issues of asymmetric SIMD capability. To date, this issue has been overlooked and the host SIMD capability is underutilized, resulting in suboptimal performance. 
In this article, we present a novel binary translation technique called spill-aware superword level parallelism (saSLP), which combines short ARMv8 instructions and registers in the guest binaries to exploit the x86 AVX2 host's parallelism, register capacity, and gather instructions. Our experiment results show that saSLP improves the performance by 1.6$ \times $ (2.3$ \times $) across a number of benchmarks and reduces spilling by 97\% (99\%) for ARMv8 to x86 AVX2 (AVX-512) translation. Furthermore, with AVX2 (AVX-512) gather instructions, saSLP speeds up several data-irregular applications that cannot be vectorized on ARMv8 NEON by up to 3.9$ \times $ (4.2$ \times $).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sadrosadati:2019:IIT, author = "Mohammad Sadrosadati and Seyed Borna Ehsani and Hajar Falahati and Rachata Ausavarungnirun and Arash Tavakkol and Mojtaba Abaee and Lois Orosa and Yaohua Wang and Hamid Sarbazi-Azad and Onur Mutlu", title = "{ITAP}: Idle-Time-Aware Power Management for {GPU} Execution Units", journal = j-TACO, volume = "16", number = "1", pages = "3:1--3:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291606", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 11 19:00:20 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graphics Processing Units (GPUs) are widely used as the accelerator of choice for applications with massively data-parallel tasks. However, recent studies show that GPUs suffer heavily from resource underutilization, which, combined with their large static power consumption, imposes a significant power overhead. 
One of the most power-hungry components of a GPU --- the execution units --- frequently experience idleness when (1) an underutilized warp is issued to the execution units, leading to partial lane idleness, and (2) there is no active warp to be issued for the execution due to warp stalls (e.g., waiting for memory access and synchronization). Although large in total, the idle time of execution units actually comes from short but frequent stalls, leaving little potential for common power saving techniques, such as power-gating. In this article, we propose ITAP, a novel idle-time-aware power management technique, which aims to effectively reduce the static energy consumption of GPU execution units. By taking advantage of different power management techniques (i.e., power-gating and different levels of voltage scaling), ITAP employs three static power reduction modes with different overheads and capabilities of static power reduction. ITAP estimates the idle period length of execution units using prediction and peek-ahead techniques in a synergistic way and then applies the most appropriate static power reduction mode based on the estimated idle period length. We design ITAP to be power-aggressive or performance-aggressive, not both at the same time. Our experimental results on several workloads show that the power-aggressive design of ITAP outperforms the state-of-the-art solution by an average of 27.6\% in terms of static energy savings, with less than 2.1\% performance overhead. However, the performance-aggressive design of ITAP improves the static energy savings by an average of 16.9\%, while keeping the GPU performance almost unaffected (i.e., up to 0.4\% performance overhead) compared to the state-of-the-art static energy savings mechanism.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dogan:2019:ASU, author = "Halit Dogan and Masab Ahmad and Brian Kahne and Omer Khan", title = "Accelerating Synchronization Using Moving Compute to Data Model at 1,000-core Multicore Scale", journal = j-TACO, volume = "16", number = "1", pages = "4:1--4:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3300208", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 11 19:00:20 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Thread synchronization using shared memory hardware cache coherence paradigm is prevalent in multicore processors. However, as the number of cores increase on a chip, cache line ping-pong prevents performance scaling for algorithms that deploy fine-grain synchronization. This article proposes an in-hardware moving computation to data model (MC) that pins shared data at dedicated cores. The critical code sections are serialized and executed at these cores in a spatial setting to enable data locality optimizations. In-hardware messages enable non-blocking and blocking communication between cores, without involving the cache coherence protocol. The in-hardware MC model is implemented on Tilera Tile-Gx72 multicore platform to evaluate 8- to 64-core count scale. A simulated RISC-V multicore environment is built to further evaluate the performance scaling advantages of the MC model at 1,024-cores scale. The evaluation using graph and machine-learning benchmarks illustrates that atomic instructions based synchronization scales up to 512 cores, and the MC model at the same core count outperforms by 27\% in completion time and 39\% in dynamic energy consumption.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Azriel:2019:MSP, author = "Leonid Azriel and Lukas Humbel and Reto Achermann and Alex Richardson and Moritz Hoffmann and Avi Mendelson and Timothy Roscoe and Robert N. M. Watson and Paolo Faraboschi and Dejan Milojicic", title = "Memory-Side Protection With a Capability Enforcement Co-Processor", journal = j-TACO, volume = "16", number = "1", pages = "5:1--5:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3302257", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 11 19:00:20 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Byte-addressable nonvolatile memory (NVM) blends the concepts of storage and memory and can radically improve data-centric applications, from in-memory databases to graph processing. By enabling large-capacity devices to be shared across multiple computing elements, fabric-attached NVM changes the nature of rack-scale systems and enables short-latency direct memory access while retaining data persistence properties and simplifying the software stack. An adequate protection scheme is paramount when addressing shared and persistent memory, but mechanisms that rely on virtual memory paging suffer from the tension between performance (pushing toward large pages) and protection granularity (pushing toward small pages). To address this tension, capabilities are worth revisiting as a more powerful protection mechanism, but the long time needed to introduce new CPU features hampers the adoption of schemes that rely on instruction-set architecture support. 
This article proposes the Capability Enforcement Co-Processor (CEP), a programmable memory controller that implements fine-grain protection through the capability model without requiring instruction-set support in the application CPU. CEP decouples capabilities from the application CPU instruction-set architecture, shortens time to adoption, and can rapidly evolve to embrace new persistent memory technologies, from NVDIMMs to native NVM devices, either locally connected or fabric attached in rack-scale configurations. CEP exposes an application interface based on memory handles that get internally converted to extended-pointer capabilities. This article presents a proof of concept implementation of a distributed object store (Redis) with CEP. It also demonstrates a capability-enhanced file system (FUSE) implementation using CEP. Our proof of concept shows that CEP provides fine-grain protection while enabling direct memory access from application clients to the NVM, and that by doing so opens up important performance optimization opportunities (up to 4$ \times $ reduction in latency in comparison to software-based security enforcement) without compromising security. Finally, we also sketch how a future hybrid model could improve the initial implementation by delegating some CEP functionality to a CHERI-enabled processor.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jaleel:2019:DHP, author = "Aamer Jaleel and Eiman Ebrahimi and Sam Duncan", title = "{DUCATI}: High-performance Address Translation by Extending {TLB} Reach of {GPU}-accelerated Systems", journal = j-TACO, volume = "16", number = "1", pages = "6:1--6:??", month = mar, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3309710", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Mar 11 19:00:20 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Conventional on-chip TLB hierarchies are unable to fully cover the growing application working-set sizes. To make things worse, Last-Level TLB (LLT) misses require multiple accesses to the page table even with the use of page walk caches. Consequently, LLT misses incur long address translation latency and hurt performance. This article proposes two low-overhead hardware mechanisms for reducing the frequency and penalty of on-die LLT misses. The first, Unified CAche and TLB (UCAT), enables the conventional on-die Last-Level Cache to store cache lines and TLB entries in a single unified structure and increases on-die TLB capacity significantly. The second, DRAM-TLB, memoizes virtual to physical address translations in DRAM and reduces LLT miss penalty when UCAT is unable to fully cover total application working-set. DRAM-TLB serves as the next larger level in the TLB hierarchy that significantly increases TLB coverage relative to on-chip TLBs. The combination of these two mechanisms, DUCATI, is an address translation architecture that improves GPU performance by 81\% (up to 4.5$ \times $) while requiring minimal changes to the existing system design. 
We show that DUCATI is within 20\%, 5\%, and 2\% of the performance of a perfect LLT system when using 4KB, 64KB, and 2MB pages, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2019:SSD, author = "Yemao Xu and Dezun Dong and Weixia Xu and Xiangke Liao", title = "{SketchDLC}: a Sketch on Distributed Deep Learning Communication via Trace Capturing", journal = j-TACO, volume = "16", number = "2", pages = "7:1--7:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3312570", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the fast development of deep learning (DL), the communication is increasingly a bottleneck for distributed workloads, and a series of optimization works have been done to scale out successfully. Nevertheless, the network behavior has not been investigated much yet. We intend to analyze the network behavior and then carry out some research through network simulation. Under this circumstance, an accurate communication measurement is necessary, as it is an effective way to study the network behavior and the basis for accurate simulation. Therefore, we propose to capture the deep learning communication (DLC) trace to achieve the measurement. To the best of our knowledge, we make the first attempt to capture the communication trace for DL training. In this article, we first provide detailed analyses about the communication mechanism of MXNet, which is a representative framework for distributed DL. Second, we define the DLC trace format to describe and record the communication behaviors. Third, we present the implementation of the method for trace capturing. 
Finally, we make some statistics and analyses about the distributed DL training, including communication pattern, overlap ratio between computation and communication, computation overhead, synchronization overhead, update overhead, and so forth. Both the statistics and analyses are based on the trace files captured in a cluster with six machines. On the one hand, our trace files provide a sketch on the DLC, which contributes to understanding the communication details. On the other hand, the captured trace files can be used for figuring out various overheads, as they record the communication behaviors of each node.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mastoras:2019:ESE, author = "Aristeidis Mastoras and Thomas R. Gross", title = "Efficient and Scalable Execution of Fine-Grained Dynamic Linear Pipelines", journal = j-TACO, volume = "16", number = "2", pages = "8:1--8:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3307411", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We present Pipelite, a dynamic scheduler that exploits the properties of dynamic linear pipelines to achieve high performance for fine-grained workloads. The flexibility of Pipelite allows the stages and their data dependences to be determined at runtime. Pipelite unifies communication, scheduling, and synchronization algorithms with suitable data structures. This unified design introduces the local suspension mechanism and a wait-free enqueue operation, which allow efficient dynamic scheduling. 
The evaluation on a 44-core machine, using programs from three widely used benchmark suites, shows that Pipelite implies low overhead and significantly outperforms the state of the art in terms of speedup, scalability, and memory usage.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ham:2019:EDS, author = "Tae Jun Ham and Juan L. Arag{\'o}n and Margaret Martonosi", title = "Efficient Data Supply for Parallel Heterogeneous Architectures", journal = j-TACO, volume = "16", number = "2", pages = "9:1--9:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3310332", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Decoupling techniques have been proposed to reduce the amount of memory latency exposed to high-performance accelerators as they fetch data. Although decoupled access-execute (DAE) and more recent decoupled data supply approaches offer promising single-threaded performance improvements, little work has considered how to extend them into parallel scenarios. This article explores the opportunities and challenges of designing parallel, high-performance, resource-efficient decoupled data supply systems. We propose Mercury, a parallel decoupled data supply system that utilizes thread-level parallelism for high-throughput data supply with good portability attributes. Additionally, we introduce some microarchitectural improvements for data supply units to efficiently handle long-latency indirect loads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sioutas:2019:SSH, author = "Savvas Sioutas and Sander Stuijk and Luc Waeijen and Twan Basten and Henk Corporaal and Lou Somers", title = "Schedule Synthesis for {Halide} Pipelines through Reuse Analysis", journal = j-TACO, volume = "16", number = "2", pages = "10:1--10:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3310248", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Efficient code generation for image processing applications continues to pose a challenge in a domain where high performance is often necessary to meet real-time constraints. The inherently complex structure found in most image-processing pipelines, the plethora of transformations that can be applied to optimize the performance of an implementation, as well as the interaction of these optimizations with locality, redundant computation and parallelism, can be identified as the key reasons behind this issue. Recent domain-specific languages (DSL) such as the Halide DSL and compiler attempt to encourage high-level design-space exploration to facilitate the optimization process. We propose a novel optimization strategy that aims to maximize producer-consumer locality by exploiting reuse in image-processing pipelines. We implement our analysis as a tool that can be used alongside the Halide DSL to automatically generate schedules for pipelines implemented in Halide and test it on a variety of benchmarks. Experimental results on three different multi-core architectures show an average performance improvement of 40\% over the Halide Auto-Scheduler and 75\% over a state-of-the-art approach that targets the PolyMage DSL.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2019:SSL, author = "Xiaoyuan Wang and Haikun Liu and Xiaofei Liao and Ji Chen and Hai Jin and Yu Zhang and Long Zheng and Bingsheng He and Song Jiang", title = "Supporting Superpages and Lightweight Page Migration in Hybrid Memory Systems", journal = j-TACO, volume = "16", number = "2", pages = "11:1--11:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3310133", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Superpages have long been used to mitigate address translation overhead in large-memory systems. However, superpages often preclude lightweight page migration, which is crucial for performance and energy efficiency in hybrid memory systems composed of DRAM and non-volatile memory (NVM). In this article, we propose a novel memory management mechanism called Rainbow to bridge this fundamental conflict between superpages and lightweight page migration. Rainbow manages NVM at the superpage granularity, and uses DRAM to cache frequently accessed (hot) small pages within each superpage. Correspondingly, Rainbow utilizes split TLBs to support different page sizes. By introducing an efficient hot page identification mechanism and a novel NVM-to-DRAM address remapping mechanism, Rainbow supports lightweight page migration without splintering superpages. Experiment results show that Rainbow can significantly reduce applications' TLB misses by 99.9\%, and improve application performance (in terms of IPC) by up to $ 2.9 \times $ (45.3\% on average) when compared to a state-of-the-art memory migration policy without a superpage support.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sargaran:2019:SSA, author = "Sahar Sargaran and Naser Mohammadzadeh", title = "{SAQIP}: a Scalable Architecture for Quantum Information Processors", journal = j-TACO, volume = "16", number = "2", pages = "12:1--12:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3311879", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Proposing an architecture that efficiently compensates for the inefficiencies of physical hardware with extra resources is one of the key issues in quantum computer design. Although the demonstration of quantum systems has been limited to some dozen qubits, scaling the current small-sized lab quantum systems to large-scale quantum systems that are capable of solving meaningful practical problems can be the main goal of much research. Focusing on this issue, in this article a scalable architecture for quantum information processors, called SAQIP, is proposed. Moreover, a flow is presented to map and schedule a quantum circuit on this architecture. Experimental results show that the proposed architecture and design flow decrease the average latency and the average area of quantum circuits by about 81\% and 11\%, respectively, for the attempted benchmarks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Budhkar:2019:AMD, author = "Prerna Budhkar and Ildar Absalyamov and Vasileios Zois and Skyler Windh and Walid A. Najjar and Vassilis J. 
Tsotras", title = "Accelerating In-Memory Database Selections Using Latency Masking Hardware Threads", journal = j-TACO, volume = "16", number = "2", pages = "13:1--13:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3310229", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Inexpensive DRAMs have created new opportunities for in-memory data analytics. However, the major bottleneck in such systems is high memory access latency. Traditionally, this problem is solved with large cache hierarchies that only benefit regular applications. Alternatively, many data-intensive applications exhibit irregular behavior. Hardware multithreading can better cope with high latency seen in such applications. This article implements a multithreaded prototype (MTP) on FPGAs for the relational selection operator that exhibits control flow irregularity. On a standard TPC-H query evaluation, MTP achieves a bandwidth utilization of 83\%, while the CPU and the GPU implementations achieve 61\% and 64\%, respectively. Besides being bandwidth efficient, MTP is also $ 14.2 \times $ and $ 4.2 \times $ more power efficient than CPU and GPU, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Riebler:2019:TAH, author = "Heinrich Riebler and Gavin Vaz and Tobias Kenter and Christian Plessl", title = "Transparent Acceleration for Heterogeneous Platforms With Compilation to {OpenCL}", journal = j-TACO, volume = "16", number = "2", pages = "14:1--14:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3319423", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Multi-accelerator platforms combine CPUs and different accelerator architectures within a single compute node. Such systems are capable of processing parallel workloads very efficiently while being more energy efficient than regular systems consisting of CPUs only. However, the architectures of such systems are diverse, forcing developers to port applications to each accelerator using different programming languages, models, tools, and compilers. Developers not only require domain-specific knowledge but also need to understand the low-level accelerator details, leading to an increase in the design effort and costs. To tackle this challenge, we propose a compilation approach and a practical realization called HTrOP that is completely transparent to the user. HTrOP is able to automatically analyze a sequential CPU application, detect computational hotspots, and generate parallel OpenCL host and kernel code. The potential of HTrOP is demonstrated by offloading hotspots to different OpenCL-enabled resources (currently the CPU, the general-purpose GPU, and the manycore Intel Xeon Phi) for a broad set of benchmark applications. 
We present an in-depth evaluation of our approach in terms of performance gains and energy savings, taking into account all static and dynamic overheads. We are able to achieve speedups and energy savings of up to two orders of magnitude, if an application has sufficient computational intensity, when compared to a natively compiled application.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gong:2019:HAG, author = "Xun Gong and Xiang Gong and Leiming Yu and David Kaeli", title = "{HAWS}: Accelerating {GPU} Wavefront Execution through Selective Out-of-order Execution", journal = j-TACO, volume = "16", number = "2", pages = "15:1--15:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3291050", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graphics Processing Units (GPUs) have become an attractive platform for accelerating challenging applications on a range of platforms, from High Performance Computing (HPC) to full-featured smartphones. They can overcome computational barriers in a wide range of data-parallel kernels. GPUs hide pipeline stalls and memory latency by utilizing efficient thread preemption. But given the demands on the memory hierarchy due to the growth in the number of computing cores on-chip, it has become increasingly difficult to hide all of these stalls. In this article, we propose a novel Hint-Assisted Wavefront Scheduler (HAWS) to bypass long-latency stalls. HAWS starts by enhancing a compiler infrastructure to identify potential opportunities that can bypass memory stalls. 
HAWS includes a wavefront scheduler that can continue to execute instructions in the shadow of a memory stall, executing instructions speculatively, guided by compiler-generated hints. HAWS increases utilization of GPU resources by aggressively fetching/executing speculatively. Based on our simulation results on the AMD Southern Islands GPU architecture, at an estimated cost of 0.4\% total chip area, HAWS can improve application performance by 14.6\% on average for memory intensive applications.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Song:2019:SAR, author = "Yang Song and Olivier Alavoine and Bill Lin", title = "A Self-aware Resource Management Framework for Heterogeneous Multicore {SoCs} with Diverse {QoS} Targets", journal = j-TACO, volume = "16", number = "2", pages = "16:1--16:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3319804", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In modern heterogeneous MPSoCs, the management of shared memory resources is crucial in delivering end-to-end QoS. Previous frameworks have either focused on singular QoS targets or the allocation of partitionable resources among CPU applications at relatively slow timescales. However, heterogeneous MPSoCs typically require instant response from the memory system where most resources cannot be partitioned. Moreover, the health of different cores in a heterogeneous MPSoC is often measured by diverse performance objectives. In this work, we propose the Self-Aware Resource Allocation framework for heterogeneous MPSoCs. 
Priority-based adaptation allows cores to use different target performance and self-monitor their own intrinsic health. In response, the system allocates non-partitionable resources based on priorities. The proposed framework meets a diverse range of QoS demands from heterogeneous cores. Moreover, we present a runtime scheme to configure priority-based adaptation so that distinct sensitivities of heterogeneous QoS targets with respect to memory allocation can be accommodated. In addition, the priority of best-effort cores can also be regulated.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yebenes:2019:CSA, author = "Pedro Yebenes and Jose Rocher-Gonzalez and Jesus Escudero-Sahuquillo and Pedro Javier Garcia and Francisco J. Alfaro and Francisco J. Quiles and Crisp{\'\i}n G{\'o}mez and Jose Duato", title = "Combining Source-adaptive and Oblivious Routing with Congestion Control in High-performance Interconnects using Hybrid and Direct Topologies", journal = j-TACO, volume = "16", number = "2", pages = "17:1--17:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3319805", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Hybrid and direct topologies are cost-efficient and scalable options to interconnect thousands of end nodes in high-performance computing (HPC) systems. They offer a rich path diversity, high bisection bandwidth, and a reduced diameter guaranteeing low latency. In these topologies, efficient deterministic routing algorithms can be used to balance smartly the traffic flows among the available routes. 
Unfortunately, congestion leads these networks to saturation, where the HoL blocking effect degrades their performance dramatically. Among the proposed solutions to deal with HoL blocking, the routing algorithms selecting alternative routes, such as adaptive and oblivious, can mitigate the congestion effects. Other techniques use queues to separate congested flows from non-congested ones, thus reducing the HoL blocking. In this article, we propose a new approach that reduces HoL blocking in hybrid and direct topologies using source-adaptive and oblivious routing. This approach also guarantees deadlock-freedom as it uses virtual networks to break potential cycles generated by the routing policy in the topology. Specifically, we propose two techniques, called Source-Adaptive Solution for Head-of-Line Blocking Avoidance (SASHA) and Oblivious Solution for Head-of-Line Blocking Avoidance (OSHA). Experiment results, carried out through simulations under different traffic scenarios, show that SASHA and OSHA can significantly reduce the HoL blocking.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Alshboul:2019:ECR, author = "Mohammad Alshboul and Hussein Elnawawy and Reem Elkhouly and Keiji Kimura and James Tuck and Yan Solihin", title = "Efficient Checkpointing with Recompute Scheme for Non-volatile Main Memory", journal = j-TACO, volume = "16", number = "2", pages = "18:1--18:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3323091", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Future main memory will likely include Non-Volatile Memory. 
Non-Volatile Main Memory (NVMM) provides an opportunity to rethink checkpointing strategies for providing failure safety to applications. While there are many checkpointing and logging schemes in the literature, their use must be revisited as they incur high execution time overheads as well as a large number of additional writes to NVMM, which may significantly impact write endurance. In this article, we propose a novel recompute-based failure safety approach and demonstrate its applicability to loop-based code. Rather than keeping a fully consistent logging state, we only log enough state to enable recomputation. Upon a failure, our approach recovers to a consistent state by determining which parts of the computation were not completed and recomputing them. Effectively, our approach removes the need to keep checkpoints or logs, thus reducing execution time overheads and improving NVMM write endurance at the expense of more complex recovery. We compare our new approach against logging and checkpointing on five scientific workloads, including tiled matrix multiplication, on a computer system model that was built on gem5 and supports Intel PMEM instruction extensions. For tiled matrix multiplication, our recompute approach incurs an execution time overhead of only 5\%, in contrast to 8\% overhead with logging and 207\% overhead with checkpointing. Furthermore, recompute only adds 7\% additional NVMM writes, compared to 111\% with logging and 330\% with checkpointing. We also conduct experiments on real hardware, allowing us to run our workloads to completion while varying the number of threads used for computation. These experiments substantiate our simulation-based observations and provide a sensitivity study and performance comparison between the Recompute Scheme and Naive Checkpointing.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hadjilambrou:2019:CCO, author = "Zacharias Hadjilambrou and Marios Kleanthous and Georgia Antoniou and Antoni Portero and Yiannakis Sazeides", title = "Comprehensive Characterization of an Open Source Document Search Engine", journal = j-TACO, volume = "16", number = "2", pages = "19:1--19:??", month = may, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3320346", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This work performs a thorough characterization and analysis of the open source Lucene search library. The article describes in detail the architecture, functionality, and micro-architectural behavior of the search engine, and investigates prominent online document search research issues. In particular, we study how intra-server index partitioning affects the response time and throughput, explore the potential use of low power servers for document search, and examine the sources of performance degradation and the causes of tail latencies. Some of our main conclusions are the following: (a) intra-server index partitioning can reduce tail latencies but with diminishing benefits as incoming query traffic increases, (b) low power servers given enough partitioning can provide same average and tail response times as conventional high performance servers, (c) index search is a CPU-intensive cache-friendly application, and (d) C-states are the main culprits for performance degradation in document search.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2019:EGC, author = "Bingchao Li and Jizeng Wei and Jizhou Sun and Murali Annavaram and Nam Sung Kim", title = "An Efficient {GPU} Cache Architecture for Applications with Irregular Memory Access Patterns", journal = j-TACO, volume = "16", number = "3", pages = "20:1--20:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3322127", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "GPUs provide high-bandwidth/low-latency on-chip shared memory and L1 cache to efficiently service a large number of concurrent memory requests. Specifically, concurrent memory requests accessing contiguous memory space are coalesced into warp-wide accesses. To support such large accesses to L1 cache with low latency, the size of L1 cache line is no smaller than that of warp-wide accesses. However, such L1 cache architecture cannot always be efficiently utilized when applications generate many memory requests with irregular access patterns especially due to branch and memory divergences that make requests uncoalesced and small. Furthermore, unlike L1 cache, the shared memory of GPUs is not often used in many applications, which essentially depends on programmers. In this article, we propose Elastic-Cache, which can efficiently support both fine- and coarse-grained L1 cache line management for applications with both regular and irregular memory access patterns to improve the L1 cache efficiency. Specifically, it can store 32- or 64-byte words in non-contiguous memory space to a single 128-byte cache line. 
Furthermore, it neither requires an extra memory structure nor reduces the capacity of L1 cache for tag storage, since it stores auxiliary tags for fine-grained L1 cache line managements in the shared memory space that is not fully used in many applications. To improve the bandwidth utilization of L1 cache with Elastic-Cache for fine-grained accesses, we further propose Elastic-Plus to issue 32-byte memory requests in parallel, which can reduce the processing latency of memory instructions and improve the throughput of GPUs. Our experiment result shows that Elastic-Cache improves the geometric-mean performance of applications with irregular memory access patterns by 104\% without degrading the performance of applications with regular memory access patterns. Elastic-Plus outperforms Elastic-Cache and improves the performance of applications with irregular memory access patterns by 131\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Roberts:2019:POS, author = "Stephen I. Roberts and Steven A. Wright and Suhaib A. Fahmy and Stephen A. Jarvis", title = "The Power-optimised Software Envelope", journal = j-TACO, volume = "16", number = "3", pages = "21:1--21:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3321551", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Advances in processor design have delivered performance improvements for decades. As physical limits are reached, refinements to the same basic technologies are beginning to yield diminishing returns. Unsustainable increases in energy consumption are forcing hardware manufacturers to prioritise energy efficiency in their designs. 
Research suggests that software modifications may be needed to exploit the resulting improvements in current and future hardware. New tools are required to capitalise on this new class of optimisation. In this article, we present the Power Optimised Software Envelope (POSE) model, which allows developers to assess the potential benefits of power optimisation for their applications. The POSE model is metric agnostic and in this article, we provide derivations using the established Energy-Delay Product metric and the novel Energy-Delay Sum and Energy-Delay Distance metrics that we believe are more appropriate for energy-aware optimisation efforts. We demonstrate POSE on three platforms by studying the optimisation characteristics of applications from the Mantevo benchmark suite. Our results show that the Pathfinder application has very little scope for power optimisation while TeaLeaf has the most, with all other applications in the benchmark suite falling between the two. Finally, we extend our POSE model with a formulation known as System Summary POSE-a meta-heuristic that allows developers to assess the scope a system has for energy-aware software optimisation independent of the code being run.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kannan:2019:CIE, author = "Ram Srivatsa Kannan and Michael Laurenzano and Jeongseob Ahn and Jason Mars and Lingjia Tang", title = "{Caliper}: Interference Estimator for Multi-tenant Environments Sharing Architectural Resources", journal = j-TACO, volume = "16", number = "3", pages = "22:1--22:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3323090", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We introduce Caliper, a technique for accurately estimating performance interference occurring in shared servers. Caliper overcomes the limitations of prior approaches by leveraging a micro-experiment-based technique. In contrast to state-of-the-art approaches that focus on periodically pausing co-running applications to estimate slowdown, Caliper utilizes a strategic phase-triggered technique to capture interference due to co-location. This enables Caliper to orchestrate an accurate and low-overhead interference estimation technique that can be readily deployed in existing production systems. We evaluate Caliper for a broad spectrum of workload scenarios, demonstrating its ability to seamlessly support up to 16 applications running simultaneously and outperform the state-of-the-art approaches.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lin:2019:CCC, author = "Zhen Lin and Hongwen Dai and Michael Mantor and Huiyang Zhou", title = "Coordinated {CTA} Combination and Bandwidth Partitioning for {GPU} Concurrent Kernel Execution", journal = j-TACO, volume = "16", number = "3", pages = "23:1--23:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3326124", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Contemporary GPUs support multiple kernels to run concurrently on the same streaming multiprocessors (SMs). Recent studies have demonstrated that such concurrent kernel execution (CKE) improves both resource utilization and computational throughput. Most of the prior works focus on partitioning the GPU resources at the cooperative thread array (CTA) level or the warp scheduler level to improve CKE. However, significant performance slowdown and unfairness are observed when latency-sensitive kernels co-run with bandwidth-intensive ones. The reason is that bandwidth over-subscription from bandwidth-intensive kernels leads to much aggravated memory access latency, which is highly detrimental to latency-sensitive kernels. Even among bandwidth-intensive kernels, more intensive kernels may unfairly consume much higher bandwidth than less-intensive ones. In this article, we first make a case that such problems cannot be sufficiently solved by managing CTA combinations alone and reveal the fundamental reasons. Then, we propose a coordinated approach for CTA combination and bandwidth partitioning. Our approach dynamically detects co-running kernels as latency sensitive or bandwidth intensive. 
As both the DRAM bandwidth and L2-to-L1 Network-on-Chip (NoC) bandwidth can be the critical resource, our approach partitions both bandwidth resources coordinately along with selecting proper CTA combinations. The key objective is to allocate more CTA resources for latency-sensitive kernels and more NoC/DRAM bandwidth resources to NoC-/DRAM-intensive kernels. We achieve it using a variation of dominant resource fairness (DRF). Compared with two state-of-the-art CKE optimization schemes, SMK [52] and WS [55], our approach improves the average harmonic speedup by 78\% and 39\%, respectively. Even compared to the best possible CTA combinations, which are obtained from an exhaustive search among all possible CTA combinations, our approach improves the harmonic speedup by up to 51\% and 11\% on average.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Didier:2019:CCP, author = "Keryan Didier and Dumitru Potop-Butucaru and Guillaume Iooss and Albert Cohen and Jean Souyris and Philippe Baufreton and Amaury Graillat", title = "Correct-by-Construction Parallelization of Hard Real-Time Avionics Applications on Off-the-Shelf Predictable Hardware", journal = j-TACO, volume = "16", number = "3", pages = "24:1--24:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3328799", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We present the first end-to-end modeling and compilation flow to parallelize hard real-time control applications while fully guaranteeing the respect of real-time requirements on off-the-shelf hardware. It scales to thousands of dataflow nodes and has been validated on two production avionics applications. 
Unlike classical optimizing compilation, it takes as input non-functional requirements (real time, resource limits). To enforce these requirements, the compiler follows a static resource allocation strategy, from coarse-grain tasks communicating over an interconnection network all the way to individual variables and memory accesses. It controls timing interferences resulting from mapping decisions in a precise, safe, and scalable way.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zardoshti:2019:STM, author = "Pantea Zardoshti and Tingzhe Zhou and Pavithra Balaji and Michael L. Scott and Michael Spear", title = "Simplifying Transactional Memory Support in {C++}", journal = j-TACO, volume = "16", number = "3", pages = "25:1--25:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3328796", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "C++ has supported a provisional version of Transactional Memory (TM) since 2015, via a technical specification. However, TM has not seen widespread adoption, and compiler vendors have been slow to implement the technical specification. We conjecture that the proposed TM support is too difficult for programmers to use, too complex for compiler designers to implement and verify, and not industry-proven enough to justify final standardization in its current form. To address these problems, we present a different design for supporting TM in C++. By forbidding explicit self-abort, and by introducing an executor-based mechanism for running transactions, our approach makes it easier for developers to get code up and running with TM. 
Our proposal should also be appealing to compiler developers, as it allows a spectrum of levels of support for TM, with varying performance, and varying reliance on hardware TM support in order to provide scalability. While our design does not enable some of the optimizations admitted by the current technical specification, we show that it enables the implementation of robust support for TM in a small, orthogonal compiler extension. Our implementation is able to handle a wide range of transactional programs, delivering low instrumentation overhead and scalability and performance on par with the current state of the art. Based on this experience, we believe our approach to be a viable means of reinvigorating the standardization of TM in C++.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Park:2019:MCM, author = "Jungwoo Park and Myoungjun Lee and Soontae Kim and Minho Ju and Jeongkyu Hong", title = "{MH} Cache: a Multi-retention {STT-RAM}-based Low-power Last-level Cache for Mobile Hardware Rendering Systems", journal = j-TACO, volume = "16", number = "3", pages = "26:1--26:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3328520", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Mobile devices have become the most important devices in our life. However, they are limited in battery capacity. Therefore, low-power computing is crucial for their long lifetime. A spin-transfer torque RAM (STT-RAM) has become emerging memory technology because of its low leakage power consumption.
We herein propose MH cache, a multi-retention STT-RAM-based cache management scheme for last-level caches (LLC) to reduce their power consumption for mobile hardware rendering systems. We analyzed the memory access patterns of processes and observed how rendering methods affect process behaviors. We propose a cache management scheme that measures write-intensity of each process dynamically and exploits it to manage a power-efficient multi-retention STT-RAM-based cache. Our proposed scheme uses variable threshold for a process' write-intensity to determine cache line placement. We explain how to deal with the following issue to implement our proposed scheme. Our experimental results show that our techniques significantly reduce the LLC power consumption by 32\% and 32.2\% in single- and quad-core systems, respectively, compared to a full STT-RAM LLC.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Leben:2019:PCM, author = "Jakob Leben and George Tzanetakis", title = "Polyhedral Compilation for Multi-dimensional Stream Processing", journal = j-TACO, volume = "16", number = "3", pages = "27:1--27:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3330999", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "We present a method for compilation of multi-dimensional stream processing programs from affine recurrence equations with unbounded domains into imperative code with statically allocated memory. The method involves a novel polyhedral schedule transformation called periodic tiling. It accommodates existing polyhedral optimizations to improve memory access patterns and expose parallelism. 
This enables efficient execution of programming languages with unbounded recurrence equations, as well as optimization of existing languages from which this form can be derived. The method is experimentally evaluated on 5 DSP algorithms with large problem sizes. Results show potential for improved throughput compared to hand-optimized C++ (speedups on a 6-core Intel Xeon CPU up to $ 10 \times $ with a geometric mean $ 3.3 \times $).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sadeghi:2019:TCN, author = "Mohammad Sadegh Sadeghi and Siavash Bayat Sarmadi and Shaahin Hessabi", title = "Toward On-chip Network Security Using Runtime Isolation Mapping", journal = j-TACO, volume = "16", number = "3", pages = "28:1--28:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3337770", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many-cores execute a large number of diverse applications concurrently. Inter-application interference can lead to a security threat as timing channel attack in the on-chip network. A non-interference communication in the shared on-chip network is a dominant necessity for secure many-core platforms to leverage the concepts of the cloud and embedded system-on-chip. The current non-interference techniques are limited to static scheduling and need router modification at micro-architecture level. Mapping of applications can effectively determine the interference among applications in on-chip network. In this work, we explore non-interference approaches through run-time mapping at software and application level. We map the same group of applications in isolated domain(s) to meet non-interference flows. 
Through run-time mapping, we can maximize utilization of the system without leaking information. The proposed run-time mapping policy requires no router modification in contrast to the best known competing schemes, and the performance degradation is, on average, 16\% compared to the state-of-the-art baselines.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Louise:2019:FST, author = "St{\'e}phane Louise", title = "A First Step Toward Using Quantum Computing for Low-level {WCETs} Estimations", journal = j-TACO, volume = "16", number = "3", pages = "29:1--29:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3335549", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Low-Level analysis of Worst Case Execution Time (WCET) is an important field for real-time system validation. It stands between computer architecture and mathematics, as it relies strongly on variants of abstract interpretation. One of the features that causes the largest uncertainty regarding WCET evaluation for low-level analysis of sequential execution on a single processor is taking Cache Memory-related Delays (CMRD) and Cache-related Preemption Delays (CRPD) correctly into account. Research work from the 1990s provides a good basic framework for this problem as long as a task runs without preemption. But when preemption of tasks is allowed, although several formalisms exist, their predictive power is lower and the usual approach relies on analyses of NP-hard problems. In this article, we want to show some potential advantages of using a formalism inspired by Quantum Computing (QC) to evaluate CMRDs with preemptions while avoiding the NP-hard problem underneath.
The experimental results, with a classic (non-quantum) numerical approach, on a selection of Malardalen benchmark programs display very good accuracy, while the complexity of the evaluation is a low-order polynomial of the number of memory accesses. While it is not yet a fully parallel quantum algorithm, we provide a first roadmap on how to reach such an objective.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chikin:2019:MAA, author = "Artem Chikin and Taylor Lloyd and Jos{\'e} Nelson Amaral and Ettore Tiotto and Muhammad Usman", title = "Memory-access-aware Safety and Profitability Analysis for Transformation of Accelerator-bound {OpenMP} Loops", journal = j-TACO, volume = "16", number = "3", pages = "30:1--30:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3333060", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Iteration Point Difference Analysis is a new static analysis framework that can be used to determine the memory coalescing characteristics of parallel loops that target GPU offloading and to ascertain safety and profitability of loop transformations with the goal of improving their memory access characteristics. This analysis can propagate definitions through control flow, works for non-affine expressions, and is capable of analyzing expressions that reference conditionally defined values. This analysis framework enables safe and profitable loop transformations. Experimental results demonstrate potential for dramatic performance improvements. 
GPU kernel execution time across the Polybench suite is improved by up to $ 25.5 \times $ on an Nvidia P100 with benchmark overall improvement of up to $ 3.2 \times $. An opportunity detected in a SPEC ACCEL benchmark yields kernel speedup of $ 86.5 \times $ with a benchmark improvement of $ 3.3 \times $. This work also demonstrates how architecture-aware compilers improve code portability and reduce programmer effort.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cha:2019:MDC, author = "Sanghoon Cha and Bokyeong Kim and Chang Hyun Park and Jaehyuk Huh", title = "Morphable {DRAM} Cache Design for Hybrid Memory Systems", journal = j-TACO, volume = "16", number = "3", pages = "31:1--31:??", month = jul, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3338505", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Jul 26 14:25:54 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "DRAM caches have emerged as an efficient new layer in the memory hierarchy to address the increasing diversity of memory components. When a small amount of fast memory is combined with slow but large memory, the cache-based organization of the fast memory can provide a SW-transparent solution for the hybrid memory systems. In such DRAM cache designs, their effectiveness is affected by the bandwidth and latency of both fast and slow memory. To quantitatively assess the effect of memory configurations and application patterns on the DRAM cache designs, this article first investigates how three prior approaches perform with six hybrid memory scenarios. From the investigation, we observe no single DRAM cache organization always outperforms the other organizations across the diverse hybrid memory configurations and memory access patterns. 
Based on this observation, this article proposes a reconfigurable DRAM cache design that can adapt to different hybrid memory combinations and workload patterns. Unlike the fixed tag and data arrays of conventional on-chip SRAM caches, this study advocates to exploit the flexibility of DRAM caches, which can store tags and data to DRAM in any arbitrary way. Using a sample-based mechanism, the proposed DRAM cache controller dynamically finds the best organization from three candidates and applies the best one by reconfiguring the tags and data layout in the DRAM cache. Our evaluation shows that the proposed morphable DRAM cache can outperform the fixed DRAM configurations across six hybrid memory configurations.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luo:2019:SCT, author = "Chao Luo and Yunsi Fei and David Kaeli", title = "Side-channel Timing Attack of {RSA} on a {GPU}", journal = j-TACO, volume = "16", number = "3", pages = "32:1--32:??", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3341729", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:27:40 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3341729", abstract = "To increase computation throughput, general purpose Graphics Processing Units (GPUs) have been leveraged to accelerate computationally intensive workloads. GPUs have been used as cryptographic engines, improving encryption/decryption throughput and leveraging the GPU's Single Instruction Multiple Thread (SIMT) model. RSA is a widely used public-key cipher and has been ported onto GPUs for signing and decrypting large files. 
Although performance has been significantly improved, the security of RSA on GPUs is vulnerable to side-channel timing attacks and is an exposure overlooked in previous studies. GPUs tend to be naturally resilient to side-channel attacks, given that they execute a large number of concurrent threads, performing many RSA operations on different data in parallel. Given the degree of parallel execution on a GPU, there will be a significant amount of noise introduced into the timing channel given the thousands of concurrent threads executing concurrently. In this work, we build a timing model to capture the parallel characteristics of an RSA public-key cipher implemented on a GPU. We consider optimizations that include using Montgomery multiplication and sliding-window exponentiation to implement cryptographic operations. Our timing model considers the challenges of parallel execution, complications that do not occur in single-threaded computing platforms. Based on our timing model, we launch successful timing attacks on RSA running on a GPU, extracting the private key of RSA. We also present an effective error detection and correction mechanism. Our results demonstrate that GPU acceleration of RSA is vulnerable to side-channel timing attacks. We propose several countermeasures to defend against this class of attacks.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yuan:2019:RTL, author = "Liang Yuan and Chen Ding and Wesley Smith and Peter Denning and Yunquan Zhang", title = "A Relational Theory of Locality", journal = j-TACO, volume = "16", number = "3", pages = "33:1--33:??", month = aug, year = "2019", CODEN = "????", DOI = "https://doi.org/10.1145/3341109", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:27:40 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/ft_gateway.cfm?id=3341109", abstract = "In many areas of program and system analysis and optimization, locality is a common concept and has been defined and measured in many ways. This article aims to formally establish relations between these previously disparate types of locality. It categorizes locality definitions in three groups and shows whether and how they can be interconverted. For the footprint, a recent metric, it gives a new measurement algorithm that is asymptotically more time/space efficient than previous approaches. Using the conversion relations, the new algorithm derives with the same efficiency different locality metrics developed and used in program analysis, memory management, and cache design.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Thangamani:2020:ORC, author = "Arun Thangamani and V. 
Krishna Nandivada", title = "Optimizing Remote Communication in {X10}", journal = j-TACO, volume = "16", number = "4", pages = "34:1--34:26", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3345558", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:31:26 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "X10 is a partitioned global address space programming language that supports the notion of places; a place consists of some data and some lightweight tasks called activities. Each activity runs at a place and may invoke a place-change operation (using the at-construct) to synchronously perform some computation at another place. These place-change operations can be very expensive, as they need to copy all the required data from the current place to the remote place. However, identifying the necessary number of place-change operations and the required data during each place-change operation are non-trivial tasks, especially in the context of irregular applications (like graph applications) that contain complex code with large amounts of cross-referencing objects-not all of those objects may be actually required, at the remote place. In this article, we present AT-Com, a scheme to optimize X10 code with place-change operations. AT-Com consists of two inter-related new optimizations: (i) AT-Opt, which minimizes the amount of data serialized and communicated during place-change operations, and (ii) AT-Pruning, which identifies/elides redundant place-change operations and does parallel execution of place-change operations. AT-Opt uses a novel abstraction, called abstract-place-tree, to capture place-change operations in the program. For each place-change operation, AT-Opt uses a novel inter-procedural analysis to precisely identify the data required at the remote place in terms of the variables in the current scope. 
AT-Opt then emits the appropriate code to copy the identified data-items to the remote place. AT-Pruning introduces a set of program transformation techniques to emit optimized code such that it avoids the redundant place-change operations. We have implemented AT-Com in the x10v2.6.0 compiler and tested it over the IMSuite benchmark kernels. Compared to the current X10 compiler, the AT-Com optimized code achieved a geometric mean speedup of 18.72$ \times $ and 17.83$ \times $ on a four-node (32 cores per node) Intel and two-node (16 cores per node) AMD system, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Srikanth:2020:MAS, author = "Sriseshan Srikanth and Anirudh Jain and Joseph M. Lennon and Thomas M. Conte and Erik Debenedictis and Jeanine Cook", title = "{MetaStrider}: Architectures for Scalable Memory-centric Reduction of Sparse Data Streams", journal = j-TACO, volume = "16", number = "4", pages = "35:1--35:26", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3355396", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:31:26 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Reduction is an operation performed on the values of two or more key-value pairs that share the same key. Reduction of sparse data streams finds application in a wide variety of domains such as data and graph analytics, cybersecurity, machine learning, and HPC applications. However, these applications exhibit low locality of reference, rendering traditional architectures and data representations inefficient. This article presents MetaStrider, a significant algorithmic and architectural enhancement to the state-of-the-art, SuperStrider. 
Furthermore, these enhancements enable a variety of parallel, memory-centric architectures that we propose, resulting in demonstrated performance that scales near-linearly with available memory-level parallelism.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Koraei:2020:DSS, author = "Mostafa Koraei and Omid Fatemi and Magnus Jahre", title = "{DCMI}: a Scalable Strategy for Accelerating Iterative Stencil Loops on {FPGAs}", journal = j-TACO, volume = "16", number = "4", pages = "36:1--36:24", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3352813", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:31:26 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Iterative Stencil Loops (ISLs) are the key kernel within a range of compute-intensive applications. To accelerate ISLs with Field Programmable Gate Arrays, it is critical to exploit parallelism (1) among elements within the same iteration and (2) across loop iterations. We propose a novel ISL acceleration scheme called Direct Computation of Multiple Iterations (DCMI) that improves upon prior work by pre-computing the effective stencil coefficients after a number of iterations at design time-resulting in accelerators that use minimal on-chip memory and avoid redundant computation. This enables DCMI to improve throughput by up to 7.7$ \times $ compared to the state-of-the-art cone-based architecture.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Peled:2020:NNP, author = "Leeor Peled and Uri Weiser and Yoav Etsion", title = "A Neural Network Prefetcher for Arbitrary Memory Access Patterns", journal = j-TACO, volume = "16", number = "4", pages = "37:1--37:27", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3345000", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3345000", abstract = "Memory prefetchers are designed to identify and prefetch specific access patterns, including spatiotemporal locality (e.g., strides, streams), recurring patterns (e.g., varying strides, temporal correlation), and specific irregular patterns (e.g., \ldots{})", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vasilache:2020:NAL, author = "Nicolas Vasilache and Oleksandr Zinenko and Theodoros Theodoridis and Priya Goyal and Zachary Devito and William S. 
Moses and Sven Verdoolaege and Andrew Adams and Albert Cohen", title = "The Next 700 Accelerated Layers: From Mathematical Expressions of Network Computation Graphs to Accelerated {GPU} Kernels, Automatically", journal = j-TACO, volume = "16", number = "4", pages = "38:1--38:26", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3355606", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:31:26 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Deep learning frameworks automate the deployment, distribution, synchronization, memory allocation, and hardware acceleration of models represented as graphs of computational operators. These operators wrap high-performance libraries such as cuDNN or NNPACK. When the computation does not match any predefined library call, custom operators must be implemented, often at high engineering cost and performance penalty, limiting the pace of innovation. To address this productivity gap, we propose and evaluate: (1) a domain-specific language with a tensor notation close to the mathematics of deep learning; (2) a Just-In-Time optimizing compiler based on the polyhedral framework; (3) carefully coordinated linear optimization and evolutionary algorithms to synthesize high-performance CUDA kernels; (4) the transparent integration of our flow into PyTorch and Caffe2, providing the fully automatic synthesis of high-performance GPU kernels from simple tensor algebra. The performance is comparable to, and often exceeds the performance of, highly tuned libraries.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2020:LLA, author = "Wenbin Jiang and Yang Ma and Bo Liu and Haikun Liu and Bing Bing Zhou and Jian Zhu and Song Wu and Hai Jin", title = "{Layup}: Layer-adaptive and Multi-type Intermediate-oriented Memory Optimization for {GPU}-based {CNNs}", journal = j-TACO, volume = "16", number = "4", pages = "39:1--39:23", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3357238", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:31:26 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Although GPUs have emerged as the mainstream for the acceleration of convolutional neural network (CNN) training processes, they usually have limited physical memory, meaning that it is hard to train large-scale CNN models. Many methods for memory optimization have been proposed to decrease the memory consumption of CNNs and to mitigate the increasing scale of these networks; however, this optimization comes at the cost of an obvious drop in time performance. We propose a new memory optimization strategy named Layup that realizes both better memory efficiency and better time performance. First, a fast layer-type-specific method for memory optimization is presented, based on the new finding that a single memory optimization often shows dramatic differences in time performance for different types of layers. Second, a new memory reuse method is presented in which greater attention is paid to multi-type intermediate data such as convolutional workspaces and cuDNN handle data. Experiments show that Layup can significantly increase the scale of extra-deep network models on a single GPU with lower performance loss. 
It even can train ResNet with 2,504 layers using 12GB memory, outperforming the state-of-the-art work of SuperNeurons with 1,920 layers (batch size = 16).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Siso:2020:EAV, author = "Sergi Siso and Wes Armour and Jeyarajan Thiyagalingam", title = "Evaluating Auto-Vectorizing Compilers through Objective Withdrawal of Useful Information", journal = j-TACO, volume = "16", number = "4", pages = "40:1--40:23", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3356842", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:31:26 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The need for compilers to generate highly vectorized code is at an all-time high with the increasing vectorization capabilities of modern processors. To this end, the information that compilers have at their disposal, either through code analysis or via user annotations, is instrumental for auto-vectorization, and hence for the overall performance. However, the information that is available to compilers at compile time and its accuracy varies greatly, as does the resulting performance of vectorizing compilers. Benchmarks like the Test Suite for Vectorizing Compilers (TSVC) have been developed to evaluate the vectorization capability of such compilers. The overarching approach of TSVC and similar benchmarks is to evaluate the compilers under the best possible scenario (i.e., assuming that compilers have access to all useful contextual information at compile time). Although this idealistic view is useful to observe the capability of compilers for auto-vectorization, it is not a true reflection of the conditions found in real-world applications. 
In this article, we propose a novel method for evaluating the auto-vectorization capability of compilers. Instead of assuming that compilers have access to a wealth of information at compile time, we formulate a method to objectively supply or withdraw information that would otherwise aid the compiler in the auto-vectorization process. This method is orthogonal to the approach adopted by TSVC, and as such, it provides the means of assessing the capabilities of modern vectorizing compilers in a more detailed way. Using this new method, we exhaustively evaluated five industry-grade compilers (GNU, Intel, Clang, PGI, and IBM) on four representative vector platforms (AVX-2, AVX-512 (Skylake), AVX-512 (KNL), and AltiVec) using the modified version of TSVC and application-level proxy kernels. The results show the impact that withdrawing information has on the vectorization capabilities of each compiler and also prove the validity of the presented technique.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Resch:2020:PBN, author = "Salonik Resch and S. Karen Khatamifard and Zamshed Iqbal Chowdhury and Masoud Zabihi and Zhengyang Zhao and Jian-Ping Wang and Sachin S. Sapatnekar and Ulya R. Karpuzcu", title = "{PIMBALL}: Binary Neural Networks in Spintronic Memory", journal = j-TACO, volume = "16", number = "4", pages = "41:1--41:26", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3357250", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 12 15:31:26 MDT 2019", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Neural networks span a wide range of applications of industrial and commercial significance. 
Binary neural networks (BNN) are particularly effective in trading accuracy for performance, energy efficiency, or hardware/software complexity. Here, we introduce a spintronic, re-configurable in-memory BNN accelerator, PIMBALL: Processing In Memory BNN AccL(L)erator, which allows for massively parallel and energy efficient computation. PIMBALL is capable of being used as a standard spintronic memory (STT-MRAM) array and a computational substrate simultaneously. We evaluate PIMBALL using multiple image classifiers and a genomics kernel. Our simulation results show that PIMBALL is more energy efficient than alternative CPU-, GPU-, and FPGA-based implementations while delivering higher throughput.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2020:EBC, author = "Zhen Hang Jiang and Yunsi Fei and David Kaeli", title = "Exploiting Bank Conflict-based Side-channel Timing Leakage of {GPUs}", journal = j-TACO, volume = "16", number = "4", pages = "42:1--42:24", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3361870", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3361870", abstract = "To prevent information leakage during program execution, modern software cryptographic implementations target constant-time function, where the number of instructions executed remains the same when program inputs change. However, the underlying \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Daruwalla:2020:BVC, author = "Kyle Daruwalla and Heng Zhuo and Rohit Shukla and Mikko Lipasti", title = "{BitSAD v2}: Compiler Optimization and Analysis for Bitstream Computing", journal = j-TACO, volume = "16", number = "4", pages = "43:1--43:25", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3364999", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3364999", abstract = "Computer vision and machine learning algorithms operating under a strict power budget require an alternate computing paradigm. While bitstream computing (BC) satisfies these constraints, creating BC systems is difficult. To address the design challenges, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mastoras:2020:CDL, author = "Aristeidis Mastoras and Thomas R. Gross", title = "Chunking for Dynamic Linear Pipelines", journal = j-TACO, volume = "16", number = "4", pages = "44:1--44:25", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3363815", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3363815", abstract = "Dynamic scheduling and dynamic creation of the pipeline structure are crucial for efficient execution of pipelined programs. Nevertheless, dynamic systems imply higher overhead than static systems. 
Therefore, chunking is the key to decrease the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Selva:2020:BPR, author = "Manuel Selva and Fabian Gruber and Diogo Sampaio and Christophe Guillon and Louis-No{\"e}l Pouchet and Fabrice Rastello", title = "Building a Polyhedral Representation from an Instrumented Execution: Making Dynamic Analyses of Nonaffine Programs Scalable", journal = j-TACO, volume = "16", number = "4", pages = "45:1--45:26", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3363785", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3363785", abstract = "The polyhedral model has been successfully used in production compilers. Nevertheless, only a very restricted class of applications can benefit from it. Recent proposals investigated how runtime information could be used to apply polyhedral optimization \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yasin:2020:MGM, author = "Ahmad Yasin and Jawad Haj-Yahya and Yosi Ben-Asher and Avi Mendelson", title = "A Metric-Guided Method for Discovering Impactful Features and Architectural Insights for {Skylake}-Based Processors", journal = j-TACO, volume = "16", number = "4", pages = "46:1--46:25", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3369383", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3369383", abstract = "The slowdown in technology scaling puts architectural features at the forefront of the innovation in modern processors. This article presents a Metric-Guided Method (MGM) that extends Top-Down analysis with carefully selected, dynamically adapted \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2020:FTF, author = "Jie Zhao and Albert Cohen", title = "Flextended Tiles: a Flexible Extension of Overlapped Tiles for Polyhedral Compilation", journal = j-TACO, volume = "16", number = "4", pages = "47:1--47:25", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3369382", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3369382", abstract = "Loop tiling to exploit data locality and parallelism plays an essential role in a variety of general-purpose and domain-specific compilers. 
Affine transformations in polyhedral frameworks implement classical forms of rectangular and parallelogram tiling, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gerzhoy:2020:NMS, author = "Daniel Gerzhoy and Xiaowu Sun and Michael Zuzak and Donald Yeung", title = "Nested {MIMD--SIMD} Parallelization for Heterogeneous Microprocessors", journal = j-TACO, volume = "16", number = "4", pages = "48:1--48:27", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3368304", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3368304", abstract = "Heterogeneous microprocessors integrate a CPU and GPU on the same chip, providing fast CPU-GPU communication and enabling cores to compute on data {``in place.''} This permits exploiting a finer granularity of parallelism on the integrated GPUs, and enables \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xia:2020:DAB, author = "Chunwei Xia and Jiacheng Zhao and Huimin Cui and Xiaobing Feng and Jingling Xue", title = "{DNNTune}: Automatic Benchmarking {DNN} Models for Mobile-cloud Computing", journal = j-TACO, volume = "16", number = "4", pages = "49:1--49:26", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3368305", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/super.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3368305", abstract = "Deep Neural Networks (DNNs) are now increasingly adopted in a variety of Artificial Intelligence (AI) applications. Meantime, more and more DNNs are moving from cloud to the mobile devices, as emerging AI chips are integrated into mobiles. Therefore, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Briggs:2020:FRT, author = "Ian Briggs and Arnab Das and Mark Baranowski and Vishal Sharma and Sriram Krishnamoorthy and Zvonimir Rakamari{\'c} and Ganesh Gopalakrishnan", title = "{FailAmp}: Relativization Transformation for Soft Error Detection in Structured Address Generation", journal = j-TACO, volume = "16", number = "4", pages = "50:1--50:21", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3369381", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3369381", abstract = "We present FailAmp, a novel LLVM program transformation algorithm that makes programs employing structured index calculations more robust against soft errors. Without FailAmp, an offset error can go undetected; with FailAmp, all subsequent offsets are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ahmad:2020:DDM, author = "Khalid Ahmad and Hari Sundar and Mary Hall", title = "Data-driven Mixed Precision Sparse Matrix Vector Multiplication for {GPUs}", journal = j-TACO, volume = "16", number = "4", pages = "51:1--51:24", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3371275", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3371275", abstract = "We optimize Sparse Matrix Vector multiplication (SpMV) using a mixed precision strategy (MpSpMV) for Nvidia V100 GPUs. The approach has three benefits: (1) It reduces computation time, (2) it reduces the size of the input matrix and therefore reduces data movement, and (3) it provides an opportunity for increased parallelism. MpSpMV's decision to lower to single precision is data driven, based on individual nonzero values of the sparse matrix. On all real-valued matrices from the Sparse Matrix Collection, we obtain a maximum speedup of $ 2.61 \times $ and average speedup of $ 1.06 \times $ over double precision, while maintaining higher accuracy compared to single precision.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Stoltzfus:2020:TOS, author = "Larisa Stoltzfus and Bastian Hagedorn and Michel Steuwer and Sergei Gorlatch and Christophe Dubach", title = "Tiling Optimizations for Stencil Computations Using Rewrite Rules in {Lift}", journal = j-TACO, volume = "16", number = "4", pages = "52:1--52:25", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3368858", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3368858", abstract = "Stencil computations are a widely used type of algorithm, found in applications from physical simulations to machine learning. Stencils are embarrassingly parallel, therefore fit on modern hardware such as Graphic Processing Units perfectly. Although \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{vanderVlag:2020:ECB, author = "Michiel A. 
van der Vlag and Georgios Smaragdos and Zaid Al-Ars and Christos Strydis", title = "Exploring Complex Brain-Simulation Workloads on Multi-{GPU} Deployments", journal = j-TACO, volume = "16", number = "4", pages = "53:1--53:25", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3371235", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3371235", abstract = "In-silico brain simulations are the de-facto tools computational neuroscientists use to understand large-scale and complex brain-function dynamics. Current brain simulators do not scale efficiently enough to large-scale problem sizes (e.g., $ > 100, 000 $ \ldots{})", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Elkhouly:2020:CSC, author = "Reem Elkhouly and Mohammad Alshboul and Akihiro Hayashi and Yan Solihin and Keiji Kimura", title = "Compiler-support for Critical Data Persistence in {NVM}", journal = j-TACO, volume = "16", number = "4", pages = "54:1--54:25", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3371236", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3371236", abstract = "Non-volatile Main Memories (NVMs) offer a promising way to preserve data persistence and enable computation recovery in case of failure. While the use of NVMs can significantly reduce the overhead of failure recovery, which is the case with High-\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chelini:2020:DLT, author = "Lorenzo Chelini and Oleksandr Zinenko and Tobias Grosser and Henk Corporaal", title = "Declarative Loop Tactics for Domain-specific Optimization", journal = j-TACO, volume = "16", number = "4", pages = "55:1--55:25", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3372266", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3372266", abstract = "Increasingly complex hardware makes the design of effective compilers difficult. To reduce this problem, we introduce Declarative Loop Tactics, which is a novel framework of composable program transformations based on an internal tree-like program \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Khan:2020:SMS, author = "Asif Ali Khan and Fazal Hameed and Robin Bl{\"a}sing and Stuart S. P. Parkin and Jeronimo Castrillon", title = "{ShiftsReduce}: Minimizing Shifts in {Racetrack Memory 4.0}", journal = j-TACO, volume = "16", number = "4", pages = "56:1--56:23", month = jan, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3372489", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 11 07:11:45 MST 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3372489", abstract = "Racetrack memories (RMs) have significantly evolved since their conception in 2008, making them a serious contender in the field of emerging memory technologies. 
Despite key technological advancements, the access latency and energy consumption of an RM-\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2020:DCP, author = "Yuhao Li and Dan Sun and Benjamin C. Lee", title = "Dynamic Colocation Policies with Reinforcement Learning", journal = j-TACO, volume = "17", number = "1", pages = "1:1--1:25", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3375714", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:30:23 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3375714", abstract = "We draw on reinforcement learning frameworks to design and implement an adaptive controller for managing resource contention. During runtime, the controller observes the dynamic system conditions and optimizes control policies that satisfy latency \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tampouratzis:2020:NHI, author = "Nikolaos Tampouratzis and Ioannis Papaefstathiou and Antonios Nikitakis and Andreas Brokalakis and Stamatis Andrianakis and Apostolos Dollas and Marco Marcon and Emanuele Plebani", title = "A Novel, Highly Integrated Simulator for Parallel and Distributed Systems", journal = j-TACO, volume = "17", number = "1", pages = "2:1--2:28", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3378934", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:30:23 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3378934", abstract = "In an era of complex networked parallel heterogeneous systems, simulating independently only parts, components, or attributes of a system-under-design is a cumbersome, inaccurate, and inefficient approach. Moreover, by considering each part of a system \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2020:EHE, author = "Lijuan Jiang and Chao Yang and Wenjing Ma", title = "Enabling Highly Efficient Batched Matrix Multiplications on {SW26010} Many-core Processor", journal = j-TACO, volume = "17", number = "1", pages = "3:1--3:23", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3378176", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:30:23 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3378176", abstract = "We present a systematic methodology for optimizing batched matrix multiplications on SW26010 many-core processor of the Sunway TaihuLight supercomputer. Five surrogate algorithms and a machine learning-based algorithm selector are proposed to fully \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cavus:2020:IPI, author = "Mustafa Cavus and Resit Sendag and Joshua J. Yi", title = "Informed Prefetching for Indirect Memory Accesses", journal = j-TACO, volume = "17", number = "1", pages = "4:1--4:29", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3374216", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:30:23 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3374216", abstract = "Indirect memory accesses have irregular access patterns that limit the performance of conventional software and hardware-based prefetchers. 
To address this problem, we propose the Array Tracking Prefetcher (ATP), which tracks array-based indirect memory \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Uguen:2020:ASA, author = "Yohann Uguen and Florent {De Dinechin} and Victor Lezaud and Steven Derrien", title = "Application-Specific Arithmetic in High-Level Synthesis Tools", journal = j-TACO, volume = "17", number = "1", pages = "5:1--5:23", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377403", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:30:23 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377403", abstract = "This work studies hardware-specific optimization opportunities currently unexploited by high-level synthesis compilers. Some of these optimizations are specializations of floating-point operations that respect the usual semantics of the input program without changing the numerical result. Some other optimizations, locally triggered by the programmer thanks to a pragma, assume a different semantics, where floating-point code is interpreted as the specification of computation with real numbers. The compiler is then in charge to ensure an application-level accuracy constraint expressed in the pragma and has the freedom to use non-standard arithmetic hardware when more efficient. These two classes of optimizations are prototyped in the GeCoS source-to-source compiler and evaluated on the Polybench and EEMBC benchmark suites. Latency is reduced by up to 93\%, and resource usage is reduced by up to 58\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Song:2020:IME, author = "Yang Song and Bill Lin", title = "Improving Memory Efficiency in Heterogeneous {MPSoCs} through Row-Buffer Locality-aware Forwarding", journal = j-TACO, volume = "17", number = "1", pages = "6:1--6:26", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377149", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:30:23 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377149", abstract = "In heterogeneous multicore systems, the memory subsystem plays a critical role, since most core-to-core communications are conducted through the main memory. Memory efficiency has a substantial impact on system performance. Although memory traffic from \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2020:MBS, author = "Hao Wu and Weizhi Liu and Huanxin Lin and Cho-Li Wang", title = "A Model-Based Software Solution for Simultaneous Multiple Kernels on {GPUs}", journal = j-TACO, volume = "17", number = "1", pages = "7:1--7:26", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377138", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:30:23 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377138", abstract = "As a critical computing resource in multiuser systems such as supercomputers, data centers, and cloud services, a GPU contains multiple compute units (CUs). 
GPU Multitasking is an intuitive solution to underutilization in GPGPU computing. Recently \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shi:2020:OSB, author = "Xuanhua Shi and Wei Liu and Ligang He and Hai Jin and Ming Li and Yong Chen", title = "Optimizing the {SSD} Burst Buffer by Traffic Detection", journal = j-TACO, volume = "17", number = "1", pages = "8:1--8:26", month = mar, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377705", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Mar 10 08:30:23 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377705", abstract = "Currently, HPC storage systems still use hard disk drive (HDD) as their dominant storage device. Solid state drive (SSD) is widely deployed as the buffer to HDDs. Burst buffer has also been proposed to manage the SSD buffering of bursty write requests.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kalra:2020:ACB, author = "Charu Kalra and Fritz Previlon and Norm Rubin and David Kaeli", title = "{ArmorAll}: Compiler-based Resilience Targeting {GPU} Applications", journal = j-TACO, volume = "17", number = "2", pages = "9:1--9:24", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3382132", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 27 12:06:50 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3382132", abstract = "The vulnerability of GPUs to soft errors has become a first-class design concern as they are increasingly being used in accuracy-sensitive and safety-critical domains. Existing solutions used to enhance the reliability of GPUs come with significant overhead in terms of area, power, and/or performance. In this article, we propose ArmorAll, a light-weight, adaptive, selective, and portable software solution to protect GPUs against soft errors. ArmorAll consists of a set of purely compiler-based redundancy schemes designed to optimize instruction duplication on GPUs, thereby enabling much more reliable execution. The choice of the scheme determines the subset of instructions that must be duplicated in an application, allowing adaptable fault coverage for different applications. ArmorAll can intelligently select a redundancy scheme that provides the best coverage to an application with an accuracy of 91.7\%. The high coverage provided by ArmorAll comes at an average improvement of 64.5\% in runtime \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cherubin:2020:DPA, author = "Stefano Cherubin and Daniele Cattaneo and Michele Chiari and Giovanni Agosta", title = "Dynamic Precision Autotuning with {TAFFO}", journal = j-TACO, volume = "17", number = "2", pages = "10:1--10:26", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3388785", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 27 12:06:50 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3388785", abstract = "Many classes of applications, both in the embedded and high performance domains, can trade off the accuracy of the computed results for computation performance. One way to achieve such a trade-off is precision tuning-that is, to modify the data types used for the computation by reducing the bit width, or by changing the representation from floating point to fixed point. We present a methodology for high-accuracy dynamic precision tuning based on the identification of input classes (i.e., classes of input datasets that benefit from similar optimizations). When a new input region is detected, the application kernels are re-compiled on the fly with the appropriate selection of parameters. In this way, we obtain a continuous optimization approach that enables the exploitation of the reduced precision computation while progressively exploring the solution space, thus reducing the time required by compilation overheads. We provide tools to support the automation of the runtime part of the solution, leaving to the user only the task of identifying the input classes. 
Our approach provides a significant performance boost (up to 320\%) on the typical approximate computing benchmarks, without meaningfully affecting the accuracy of the result, since the error remains always below 3\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Erdem:2020:RDS, author = "Ahmet Erdem and Cristina Silvano and Thomas Boesch and Andrea Carlo Ornstein and Surinder-Pal Singh and Giuseppe Desoli", title = "Runtime Design Space Exploration and Mapping of {DCNNs} for the Ultra-Low-Power {Orlando SoC}", journal = j-TACO, volume = "17", number = "2", pages = "11:1--11:25", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3379933", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 27 12:06:50 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3379933", abstract = "Recent trends in deep convolutional neural networks (DCNNs) impose hardware accelerators as a viable solution for computer vision and speech recognition. The Orlando SoC architecture from STMicroelectronics targets exactly this class of problems by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sabet:2020:RAU, author = "Amir Hossein Nodehi Sabet and Junqiao Qiu and Zhijia Zhao and Sriram Krishnamoorthy", title = "Reliability Analysis for Unreliable {FSM} Computations", journal = j-TACO, volume = "17", number = "2", pages = "12:1--12:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3377456", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 27 12:06:50 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3377456", abstract = "Finite State Machines (FSMs) are fundamental in both hardware design and software development. However, the reliability of FSM computations remains poorly understood. Existing reliability analyses are mainly designed for generic computations and are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xue:2020:NIA, author = "Jiachen Xue and T. N. Vijaykumar and Mithuna Thottethodi", title = "Network Interface Architecture for Remote Indirect Memory Access {(RIMA)} in Datacenters", journal = j-TACO, volume = "17", number = "2", pages = "13:1--13:22", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3374215", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 27 12:06:50 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3374215", abstract = "Remote Direct Memory Access (RDMA) fabrics such as InfiniBand and Converged Ethernet report latency shorter by a factor of 50 than TCP. 
As such, RDMA is a potential replacement for TCP in datacenters (DCs) running low-latency applications, such as Web \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2020:CFS, author = "Qinggang Wang and Long Zheng and Jieshan Zhao and Xiaofei Liao and Hai Jin and Jingling Xue", title = "A Conflict-free Scheduler for High-performance Graph Processing on Multi-pipeline {FPGAs}", journal = j-TACO, volume = "17", number = "2", pages = "14:1--14:26", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3390523", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 27 12:06:50 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3390523", abstract = "FPGA-based graph processing accelerators are nowadays equipped with multiple pipelines for hardware acceleration of graph computations. However, their multi-pipeline efficiency can suffer greatly from the considerable overheads caused by the read/write \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tino:2020:SXE, author = "Anita Tino and Caroline Collange and Andr{\'e} Seznec", title = "{SIMT-X}: Extending Single-Instruction Multi-Threading to Out-of-Order Cores", journal = j-TACO, volume = "17", number = "2", pages = "15:1--15:23", month = jun, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3392032", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 27 12:06:50 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/abs/10.1145/3392032", abstract = "This work introduces Single Instruction Multi-Thread Express (SIMT-X), a general-purpose Central Processing Unit (CPU) microarchitecture that enables Graphics Processing Units (GPUs)-style SIMT execution across multiple threads of the same program for high throughput, while retaining the latency benefits of out-of-order execution, and the programming convenience of homogeneous multi-thread processors. SIMT-X leverages the existing Single Instruction Multiple Data (SIMD) back-end to provide CPU/GPU-like processing on a single core with minimal overhead. We demonstrate that although SIMT-X invokes a restricted form of Out-of-Order (OoO), the microarchitecture successfully captures a majority of the benefits of aggressive OoO execution using at most two concurrent register mappings per architectural register, while addressing issues of partial dependencies and supporting a general-purpose Instruction Set Architecture (ISA).", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kaeli:2020:EME, author = "Dave Kaeli", title = "Editorial: a Message from the {Editor-in-Chief}", journal = j-TACO, volume = "17", number = "3", pages = "16:1--16:2", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3409369", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3409369", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rangan:2020:ZEZ, author = "Ram Rangan and Mark W. Stephenson and Aditya Ukarande and Shyam Murthy and Virat Agarwal and Marc Blackstein", title = "{Zeroploit}: Exploiting Zero Valued Operands in Interactive Gaming Applications", journal = j-TACO, volume = "17", number = "3", pages = "17:1--17:26", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3394284", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3394284", abstract = "In this article, we first characterize register operand value locality in shader programs of modern gaming applications and observe that there is a high likelihood of one of the register operands of several multiply, logical-and, and similar operations \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Adamek:2020:GFC, author = "Karel Ad{\'a}mek and Sofia Dimoudi and Mike Giles and Wesley Armour", title = "{GPU} Fast Convolution via the Overlap-and-Save Method in Shared Memory", journal = j-TACO, volume = "17", number = "3", pages = "18:1--18:20", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3394116", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3394116", abstract = "We present an implementation of the overlap-and-save method, a method for the convolution of very long signals with short response functions, which is tailored to GPUs. We have implemented several FFT algorithms (using the CUDA programming language), \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Das:2020:FER, author = "Arnab Das and Sriram Krishnamoorthy and Ian Briggs and Ganesh Gopalakrishnan and Ramakrishna Tipireddy", title = "{FPDetect}: Efficient Reasoning About Stencil Programs Using Selective Direct Evaluation", journal = j-TACO, volume = "17", number = "3", pages = "19:1--19:27", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3402451", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3402451", abstract = "We present FPDetect, a low-overhead approach for detecting logical errors and soft errors affecting stencil computations without generating false positives. We develop an offline analysis that tightly estimates the number of floating-point bits \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Abdelrahman:2020:CSH, author = "Tarek S. Abdelrahman", title = "Cooperative Software-hardware Acceleration of {$K$}-means on a Tightly Coupled {CPU--FPGA} System", journal = j-TACO, volume = "17", number = "3", pages = "20:1--20:24", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3406114", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3406114", abstract = "We consider software-hardware acceleration of K-means clustering on the Intel Xeon+FPGA platform. 
We design a pipelined accelerator for K-means and combine it with CPU threads to assess performance benefits of (1) acceleration when data are only \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2020:SBP, author = "Jaekyu Lee and Yasuo Ishii and Dam Sunwoo", title = "Securing Branch Predictors with Two-Level Encryption", journal = j-TACO, volume = "17", number = "3", pages = "21:1--21:25", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3404189", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3404189", abstract = "Modern processors rely on various speculative mechanisms to meet performance demand. Branch predictors are one of the most important micro-architecture components to deliver performance. However, they have been under heavy scrutiny because of recent \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cerina:2020:EDO, author = "L. Cerina and M. D. Santambrogio and G. Franco and C. Gallicchio and A. 
Micheli", title = "{EchoBay}: Design and Optimization of Echo State Networks under Memory and Time Constraints", journal = j-TACO, volume = "17", number = "3", pages = "22:1--22:24", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3404993", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3404993", abstract = "The increase in computational power of embedded devices and the latency demands of novel applications brought a paradigm shift on how and where the computation is performed. Although AI inference is slowly moving from the cloud to end-devices with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sioutas:2020:SSH, author = "Savvas Sioutas and Sander Stuijk and Twan Basten and Henk Corporaal and Lou Somers", title = "Schedule Synthesis for {Halide} Pipelines on {GPUs}", journal = j-TACO, volume = "17", number = "3", pages = "23:1--23:25", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3406117", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3406117", abstract = "The Halide DSL and compiler have enabled high-performance code generation for image processing pipelines targeting heterogeneous architectures through the separation of algorithmic description and optimization schedule. However, automatic schedule \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Huzaifa:2020:IKR, author = "Muhammad Huzaifa and Johnathan Alsop and Abdulrahman Mahmoud and Giordano Salvador and Matthew D. Sinclair and Sarita V. Adve", title = "Inter-kernel Reuse-aware Thread Block Scheduling", journal = j-TACO, volume = "17", number = "3", pages = "24:1--24:27", month = aug, year = "2020", CODEN = "????", DOI = "https://doi.org/10.1145/3406538", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Aug 28 12:02:00 MDT 2020", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3406538", abstract = "As GPUs have become more programmable, their performance and energy benefits have made them increasingly popular. However, while GPU compute units continue to improve in performance, on-chip memories lag behind and data accesses are becoming \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jafri:2021:RTC, author = "Syed M. A. H. 
Jafri and Hasan Hassan and Ahmed Hemani and Onur Mutlu", title = "Refresh Triggered Computation: Improving the Energy Efficiency of Convolutional Neural Network Accelerators", journal = j-TACO, volume = "18", number = "1", pages = "2:1--2:29", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3417708", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3417708", abstract = "To employ a Convolutional Neural Network (CNN) in an energy-constrained embedded system, it is critical for the CNN implementation to be highly energy efficient. Many recent studies propose CNN accelerator architectures with custom computation units \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Abera:2021:PET, author = "Solomon Abera and M. Balakrishnan and Anshul Kumar", title = "Performance-Energy Trade-off in Modern {CMPs}", journal = j-TACO, volume = "18", number = "1", pages = "3:1--3:26", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3427092", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3427092", abstract = "Chip multiprocessors (CMPs) are ubiquitous in all computing systems ranging from high-end servers to mobile devices. In these systems, energy consumption is a critical design constraint as it constitutes the most significant operating cost for computing \ldots{}.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mehrabi:2021:BOE, author = "Atefeh Mehrabi and Aninda Manocha and Benjamin C. Lee and Daniel J. Sorin", title = "{Bayesian} Optimization for Efficient Accelerator Synthesis", journal = j-TACO, volume = "18", number = "1", pages = "4:1--4:25", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3427377", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3427377", abstract = "Accelerator design is expensive due to the effort required to understand an algorithm and optimize the design. Architects have embraced two technologies to reduce costs. High-level synthesis automatically generates hardware from code. Reconfigurable \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kim:2021:IRA, author = "Minsu Kim and Jeong-Keun Park and Soo-Mook Moon", title = "Irregular Register Allocation for Translation of Test-pattern Programs", journal = j-TACO, volume = "18", number = "1", pages = "5:1--5:23", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3427378", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3427378", abstract = "Test-pattern programs are for testing DRAM memory chips. They run on a special embedded system called automated test equipment (ATE). 
Each ATE manufacturer provides its own programming language, which is mostly low level, thus accessing the registers in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nematollahi:2021:ENN, author = "Negin Nematollahi and Mohammad Sadrosadati and Hajar Falahati and Marzieh Barkhordar and Mario Paulo Drumond and Hamid Sarbazi-Azad and Babak Falsafi", title = "Efficient Nearest-Neighbor Data Sharing in {GPUs}", journal = j-TACO, volume = "18", number = "1", pages = "6:1--6:26", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3429981", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3429981", abstract = "Stencil codes (a.k.a. nearest-neighbor computations) are widely used in image processing, machine learning, and scientific applications. Stencil codes incur nearest-neighbor data exchange because the value of each point in the structured grid is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Braun:2021:SMP, author = "Lorenz Braun and Sotirios Nikas and Chen Song and Vincent Heuveline and Holger Fr{\"o}ning", title = "A Simple Model for Portable and Fast Prediction of Execution Time and Power Consumption of {GPU} Kernels", journal = j-TACO, volume = "18", number = "1", pages = "7:1--7:25", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3431731", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3431731", abstract = "Characterizing compute kernel execution behavior on GPUs for efficient task scheduling is a non-trivial task. We address this with a simple model enabling portable and fast predictions among different GPUs using only hardware-independent features. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mettler:2021:DHM, author = "Marcel Mettler and Daniel Mueller-Gritschneder and Ulf Schlichtmann", title = "A Distributed Hardware Monitoring System for Runtime Verification on Multi-Tile {MPSoCs}", journal = j-TACO, volume = "18", number = "1", pages = "8:1--8:25", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3430699", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3430699", abstract = "Exhaustive verification techniques do not scale with the complexity of today's multi-tile Multi-processor Systems-on-chip (MPSoCs). 
Hence, runtime verification (RV) has emerged as a complementary method, which verifies the correct behavior of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2021:EPO, author = "Yu Emma Wang and Carole-Jean Wu and Xiaodong Wang and Kim Hazelwood and David Brooks", title = "Exploiting Parallelism Opportunities with Deep Learning Frameworks", journal = j-TACO, volume = "18", number = "1", pages = "9:1--9:23", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3431388", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3431388", abstract = "State-of-the-art machine learning frameworks support a wide variety of design features to enable a flexible machine learning programming interface and to ease the programmability burden on machine learning developers. Identifying and using a performance-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tavarageri:2021:PPO, author = "Sanket Tavarageri and Alexander Heinecke and Sasikanth Avancha and Bharat Kaul and Gagandeep Goyal and Ramakrishna Upadrasta", title = "{PolyDL}: Polyhedral Optimizations for Creation of High-performance {DL} Primitives", journal = j-TACO, volume = "18", number = "1", pages = "11:1--11:27", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3433103", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3433103", abstract = "Deep Neural Networks (DNNs) have revolutionized many aspects of our lives. The use of DNNs is becoming ubiquitous, including in software for image recognition, speech recognition, speech synthesis, language translation, to name a few. The training of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yadalam:2021:SXS, author = "Sujay Yadalam and Vinod Ganapathy and Arkaprava Basu", title = "{SG XL}: Security and Performance for Enclaves Using Large Pages", journal = j-TACO, volume = "18", number = "1", pages = "12:1--12:25", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3433983", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3433983", abstract = "Intel's SGX architecture offers clients of public cloud computing platforms the ability to create hardware-protected enclaves whose contents are protected from privileged system software. However, SGX relies on system software for enclave memory \ldots{}.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kalaitzidis:2021:LVE, author = "Kleovoulos Kalaitzidis and Andr{\'e} Seznec", title = "Leveraging Value Equality Prediction for Value Speculation", journal = j-TACO, volume = "18", number = "1", pages = "13:1--13:20", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3436821", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3436821", abstract = "Value Prediction (VP) has recently been gaining interest in the research community, since prior work has established practical solutions for its implementation that provide meaningful performance gains. 
A constant challenge of contemporary context-based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Singh:2021:SSM, author = "Abhishek Singh and Shail Dave and Pantea Zardoshti and Robert Brotzman and Chao Zhang and Xiaochen Guo and Aviral Shrivastava and Gang Tan and Michael Spear", title = "{SPX64}: a Scratchpad Memory for General-purpose Microprocessors", journal = j-TACO, volume = "18", number = "1", pages = "14:1--14:26", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3436730", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3436730", abstract = "General-purpose computing systems employ memory hierarchies to provide the appearance of a single large, fast, coherent memory. In special-purpose CPUs, programmers manually manage distinct, non-coherent scratchpad memories. In this article, we combine \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Labini:2021:APM, author = "Paolo Sylos Labini and Marco Cianfriglia and Damiano Perri and Osvaldo Gervasi and Grigori Fursin and Anton Lokhmotov and Cedric Nugteren and Bruno Carpentieri and Fabiana Zollo and Flavio Vella", title = "On the Anatomy of Predictive Models for Accelerating {GPU} Convolution Kernels and Beyond", journal = j-TACO, volume = "18", number = "1", pages = "16:1--16:24", month = jan, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3434402", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 16 06:46:44 MST 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3434402", abstract = "Efficient HPC libraries often expose multiple tunable parameters, algorithmic implementations, or a combination of them, to provide optimized routines. The optimal parameters and algorithmic choices may depend on input properties such as the shapes of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Voss:2021:PRS, author = "Nils Voss and Bastiaan Kwaadgras and Oskar Mencer and Wayne Luk and Georgi Gaydadjiev", title = "On Predictable Reconfigurable System Design", journal = j-TACO, volume = "18", number = "2", pages = "17:1--17:28", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3436995", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 20 17:25:10 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3436995", abstract = "We propose a design methodology to facilitate rigorous development of complex applications targeting reconfigurable hardware. Our methodology relies on analytical estimation of system performance and area utilisation for a given specific application and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kaushik:2021:GHP, author = "Anirudh Mohan Kaushik and Gennady Pekhimenko and Hiren Patel", title = "{Gretch}: a Hardware Prefetcher for Graph Analytics", journal = j-TACO, volume = "18", number = "2", pages = "18:1--18:25", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3439803", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 20 17:25:10 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3439803", abstract = "Data-dependent memory accesses (DDAs) pose an important challenge for high-performance graph analytics (GA). This is because such memory accesses do not exhibit enough temporal and spatial locality resulting in low cache performance. 
Prior efforts that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ho:2021:GFD, author = "Nhut-Minh Ho and Himeshi {De Silva} and Weng-Fai Wong", title = "{GRAM}: a Framework for Dynamically Mixing Precisions in {GPU} Applications", journal = j-TACO, volume = "18", number = "2", pages = "19:1--19:24", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3441830", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 20 17:25:10 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3441830", abstract = "This article presents GRAM (GPU-based Runtime Adaption for Mixed-precision) a framework for the effective use of mixed precision arithmetic for CUDA programs. Our method provides a fine-grain tradeoff between output error and performance. It can create many variants that satisfy different accuracy requirements by assigning different groups of threads to different precision levels adaptively at runtime. To widen the range of applications that can benefit from its approximation, GRAM comes with an optional half-precision approximate math library. Using GRAM, we can trade off precision for any performance improvement of up to 540\%, depending on the application and accuracy requirement.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Biswas:2021:CSI, author = "Arnab Kumar Biswas", title = "Cryptographic Software {IP} Protection without Compromising Performance or Timing Side-channel Leakage", journal = j-TACO, volume = "18", number = "2", pages = "20:1--20:20", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3443707", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 20 17:25:10 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3443707", abstract = "Program obfuscation is a widely used cryptographic software intellectual property (IP) protection technique against reverse engineering attacks in embedded systems. However, very few works have studied the impact of combining various obfuscation techniques on the obscurity (difficulty of reverse engineering) and performance (execution time) of obfuscated programs. In this article, we propose a Genetic Algorithm (GA)-based framework that not only optimizes obscurity and performance of obfuscated cryptographic programs, but it also ensures very low timing side-channel leakage. Our proposed Timing Side Channel Sensitive Program Obfuscation Optimization Framework (TSC-SPOOF) determines the combination of obfuscation transformation functions that produce optimized obfuscated programs with preferred optimization parameters. In particular, TSC-SPOOF employs normalized compression distance (NCD) and channel capacity to measure obscurity and timing side-channel leakage, respectively. We also use RISC-V rocket core running on a Xilinx Zynq FPGA device as part of our framework to obtain realistic results. 
The experimental results clearly show that our proposed solution leads to cryptographic programs with lower execution time, higher obscurity, and lower timing side-channel leakage than unguided obfuscation.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{France-Pillois:2021:NIT, author = "Maxime France-Pillois and J{\'e}r{\^o}me Martin and Fr{\'e}d{\'e}ric Rousseau", title = "A Non-Intrusive Tool Chain to Optimize {MPSoC} End-to-End Systems", journal = j-TACO, volume = "18", number = "2", pages = "21:1--21:22", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3445030", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 20 17:25:10 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3445030", abstract = "Multi-core systems are now found in many electronic devices. But does current software design fully leverage their capabilities? The complexity of the hardware and software stacks in these platforms requires software optimization with end-to-end \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2021:GTU, author = "Pengyu Wang and Jing Wang and Chao Li and Jianzong Wang and Haojin Zhu and Minyi Guo", title = "{Grus}: Toward Unified-memory-efficient High-performance Graph Processing on {GPU}", journal = j-TACO, volume = "18", number = "2", pages = "22:1--22:25", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3444844", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 20 17:25:10 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3444844", abstract = "Today's GPU graph processing frameworks face scalability and efficiency issues as the graph size exceeds GPU-dedicated memory limit. Although recent GPUs can over-subscribe memory with Unified Memory (UM), they incur significant overhead when handling \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Izadpanah:2021:PPT, author = "Ramin Izadpanah and Christina Peterson and Yan Solihin and Damian Dechev", title = "{PETRA}: Persistent Transactional Non-blocking Linked Data Structures", journal = j-TACO, volume = "18", number = "2", pages = "23:1--23:26", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446391", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 20 17:25:10 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3446391", abstract = "Emerging byte-addressable Non-Volatile Memories (NVMs) enable persistent memory where process state can be recovered after crashes. 
To enable applications to rely on persistent data, durable data structures with failure-atomic operations have been \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hassan:2021:RCM, author = "Muhammad Hassan and Chang Hyun Park and David Black-Schaffer", title = "A Reusable Characterization of the Memory System Behavior of {SPEC2017} and {SPEC2006}", journal = j-TACO, volume = "18", number = "2", pages = "24:1--24:20", month = mar, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446200", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 20 17:25:10 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3446200", abstract = "The SPEC CPU Benchmarks are used extensively for evaluating and comparing improvements to computer systems. This ubiquity makes characterization critical for researchers to understand the bottlenecks the benchmarks do and do not expose and where new designs should and should not be expected to show impact. However, in characterization there is a tradeoff between accuracy and reusability: The more precisely we characterize a benchmark's performance on a given system, the less usable it is across different micro-architectures and varying memory configurations. For SPEC, most existing characterizations include system-specific effects (e.g., via performance counters) and/or only look at aggregate behavior (e.g., averages over the full application execution). 
While such approaches simplify characterization, they make it difficult to separate the applications' intrinsic behavior from the system-specific effects and/or lose the diverse phase-based behaviors.\par In this work we focus on characterizing the applications' intrinsic memory behaviour by isolating them from micro-architectural configuration specifics.
Kamakoti", title = "{PERI}: a Configurable Posit Enabled {RISC-V} Core", journal = j-TACO, volume = "18", number = "3", pages = "25:1--25:26", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3446210", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/risc-v.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3446210", abstract = "Owing to the failure of Dennard's scaling, the past decade has seen a steep growth of prominent new paradigms leveraging opportunities in computer architecture. Two technologies of interest are Posit and RISC-V. Posit was introduced in mid-2017 as a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Charitopoulos:2021:MDC, author = "George Charitopoulos and Dionisios N. Pnevmatikatos and Georgi Gaydadjiev", title = "{MC-DeF}: Creating Customized {CGRAs} for Dataflow Applications", journal = j-TACO, volume = "18", number = "3", pages = "26:1--26:25", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447970", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3447970", abstract = "Executing complex scientific applications on Coarse-Grain Reconfigurable Arrays (CGRAs) promises improvements in execution time and/or energy consumption compared to optimized software implementations or even fully customized hardware solutions. Typical \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Borbon:2021:APB, author = "Jose M. Rodriguez Borbon and Junjie Huang and Bryan M. Wong and Walid Najjar", title = "Acceleration of Parallel-Blocked {$ Q R $} Decomposition of Tall-and-Skinny Matrices on {FPGAs}", journal = j-TACO, volume = "18", number = "3", pages = "27:1--27:25", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3447775", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3447775", abstract = "$ Q R $ decomposition is one of the most useful factorization kernels in modern numerical linear algebra algorithms. In particular, the decomposition of tall-and-skinny matrices (TSMs) has major applications in areas including scientific computing, machine learning, image processing, wireless networks, and numerical methods. Traditionally, CPUs and GPUs have achieved better throughput on these applications by using large cache hierarchies and compute cores running at a high frequency, leading to high power consumption. With the advent of heterogeneous platforms, however, FPGAs are emerging as a promising viable alternative. In this work, we propose a high-throughput FPGA-based engine that has a very high computational efficiency (ratio of achieved to peak throughput) compared to similar $ Q R $ solvers running on FPGAs. Although comparable $ Q R $ solvers achieve an efficiency of 36\%, our design exhibits an efficiency of 54\%. For TSMs, our experimental results show that our design can outperform highly optimized $ Q R $ solvers running on CPUs and GPUs. For TSMs with more than 50K rows, our design outperforms the Intel MKL solver running on an Intel quad-core processor by a factor of $ 1.5 \times $. 
For TSMs containing 256 columns or less, our design outperforms the NVIDIA CUBLAS solver running on a K40 GPU by a factor of $ 3.0 \times $. In addition to being fast, our design is energy efficient: competing platforms execute up to 0.6 GFLOPS/Joule, whereas our design executes more than 1.0 GFLOPS/Joule.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Stokes:2021:DMR, author = "Michael Stokes and David Whalley and Soner Onder", title = "Decreasing the Miss Rate and Eliminating the Performance Penalty of a Data Filter Cache", journal = j-TACO, volume = "18", number = "3", pages = "28:1--28:22", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3449043", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3449043", abstract = "While data filter caches (DFCs) have been shown to be effective at reducing data access energy, they have not been adopted in processors due to the associated performance penalty caused by high DFC miss rates. In this article, we present a design that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Akram:2021:PEI, author = "Shoaib Akram", title = "Performance Evaluation of {Intel Optane} Memory for Managed Workloads", journal = j-TACO, volume = "18", number = "3", pages = "29:1--29:26", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3451342", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2020.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3451342", abstract = "Intel Optane memory offers non-volatility, byte addressability, and high capacity. It suits managed workloads that prefer large main memory heaps. We investigate Optane as the main memory for managed (Java) workloads, focusing on performance scalability. As the workload (core count) increases, we note Optane's performance relative to DRAM. A few workloads incur a slight slowdown on Optane memory, which helps conserve limited DRAM capacity. Unfortunately, other workloads scale poorly beyond a few core counts.\par This article investigates scaling bottlenecks for Java workloads on Optane memory, analyzing the application, runtime, and microarchitectural interactions. Poorly scaling workloads allocate objects rapidly and access objects in Optane memory frequently. These characteristics slow down the mutator and substantially slow down garbage collection (GC). At the microarchitecture level, load, store, and instruction miss penalties rise. To regain performance, we partition heaps across DRAM and Optane memory, a hybrid that scales considerably better than Optane alone. We exploit state-of-the-art GC approaches to partition heaps. 
Unfortunately, existing GC approaches needlessly waste DRAM capacity because they ignore runtime behavior.\par This article also introduces performance impact-guided memory allocation (PIMA) for hybrid memories. PIMA maximizes Optane utilization, allocating in DRAM only if it improves performance. It estimates the performance impact of allocating heaps in either memory type by sampling. We target PIMA at graph analytics workloads, offering a novel performance estimation method and detailed evaluation. PIMA identifies workload phases that benefit from DRAM with high (94.33\%) accuracy, incurring only a 2\% sampling overhead. PIMA operates stand-alone or combines with prior approaches to offer new performance versus DRAM capacity trade-offs. This work opens up Optane memory to a real-life role as the main memory for Java workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lu:2021:GAG, author = "Yashuai L{\"u} and Hui Guo and Libo Huang and Qi Yu and Li Shen and Nong Xiao and Zhiying Wang", title = "{GraphPEG}: Accelerating Graph Processing on {GPUs}", journal = j-TACO, volume = "18", number = "3", pages = "30:1--30:24", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3450440", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3450440", abstract = "Due to massive thread-level parallelism, GPUs have become an attractive platform for accelerating large-scale data parallel computations, such as graph processing. However, achieving high performance for graph processing with GPUs is non-trivial. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Omar:2021:PSH, author = "Hamza Omar and Omer Khan", title = "{PRISM}: Strong Hardware Isolation-based Soft-Error Resilient Multicore Architecture with High Performance and Availability at Low Hardware Overheads", journal = j-TACO, volume = "18", number = "3", pages = "31:1--31:25", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3450523", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3450523", abstract = "Multicores increasingly deploy safety-critical parallel applications that demand resiliency against soft-errors to satisfy the safety standards. However, protection against these errors is challenging due to complex communication and data access \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tripathy:2021:PLG, author = "Devashree Tripathy and Amirali Abdolrashidi and Laxmi Narayan Bhuyan and Liang Zhou and Daniel Wong", title = "{PAVER}: Locality Graph-Based Thread Block Scheduling for {GPUs}", journal = j-TACO, volume = "18", number = "3", pages = "32:1--32:26", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3451164", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3451164", abstract = "The massive parallelism present in GPUs comes at the cost of reduced L1 and L2 cache sizes per thread, leading to serious cache contention problems such as thrashing. Hence, the data access locality of an application should be considered during thread \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Heirman:2021:ASE, author = "Wim Heirman and Stijn Eyerman and Kristof {Du Bois} and Ibrahim Hur", title = "Automatic Sublining for Efficient Sparse Memory Accesses", journal = j-TACO, volume = "18", number = "3", pages = "33:1--33:23", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3452141", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3452141", abstract = "Sparse memory accesses, which are scattered accesses to single elements of a large data structure, are a challenge for current processor architectures. 
Their lack of spatial and temporal locality and their irregularity makes caches and traditional \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cavus:2021:FKV, author = "Mustafa Cavus and Mohammed Shatnawi and Resit Sendag and Augustus K. Uht", title = "Fast Key-Value Lookups with Node Tracker", journal = j-TACO, volume = "18", number = "3", pages = "34:1--34:26", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3452099", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3452099", abstract = "Lookup operations for in-memory databases are heavily memory bound, because they often rely on pointer-chasing linked data structure traversals. They also have many branches that are hard-to-predict due to random key lookups. In this study, we show that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Song:2021:CRE, author = "Weijia Song and Christina Delimitrou and Zhiming Shen and Robbert {Van Renesse} and Hakim Weatherspoon and Lotfi Benmohamed and Frederic {De Vaulx} and Charif Mahmoudi", title = "{CacheInspector}: Reverse Engineering Cache Resources in Public Clouds", journal = j-TACO, volume = "18", number = "3", pages = "35:1--35:25", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3457373", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib; https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", URL = "https://dl.acm.org/doi/10.1145/3457373", abstract = "Infrastructure-as-a-Service cloud providers sell virtual machines that are only specified in terms of number of CPU cores, amount of memory, and I/O throughput. Performance-critical aspects such as cache sizes and memory latency are missing or reported in ways that make them hard to compare across cloud providers. It is difficult for users to adapt their application's behavior to the available resources. In this work, we aim to increase the visibility that cloud users have into shared resources on public clouds. Specifically, we present CacheInspector, a lightweight runtime that determines the performance and allocated capacity of shared caches on multi-tenant public clouds. We validate CacheInspector's accuracy in a controlled environment, and use it to study the characteristics and variability of cache resources in the cloud, across time, instances, availability regions, and cloud providers. 
We show that CacheInspector's output allows cloud users to tailor their application's behavior, including their output quality, to avoid suboptimal performance when resources are scarce.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{RodriguesCarvalho:2021:UCC, author = "Daniel {Rodrigues Carvalho} and Andr{\'e} Seznec", title = "Understanding Cache Compression", journal = j-TACO, volume = "18", number = "3", pages = "36:1--36:27", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3457207", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3457207", abstract = "Hardware cache compression derives from software-compression research; yet, its implementation is not a straightforward translation, since it must abide by multiple restrictions to comply with area, power, and latency constraints. This study sheds light on the challenges of adopting compression in cache design ---from the shrinking of the data until its physical placement. The goal of this article is not to summarize proposals but to put in evidence the solutions they employ to handle those challenges. An in-depth description of the main characteristics of multiple methods is provided, as well as criteria that can be used as a basis for the assessment of such schemes. It is expected that this article will ease the understanding of decisions to be taken for the design of compressed systems and provide directions for future work.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Thuerck:2021:FRA, author = "Daniel Thuerck and Nicolas Weber and Roberto Bifulco", title = "{Flynn}'s Reconciliation: Automating the Register Cache Idiom for Cross-accelerator Programming", journal = j-TACO, volume = "18", number = "3", pages = "37:1--37:26", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458357", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3458357", abstract = "A large portion of the recent performance increase in the High Performance Computing (HPC) and Machine Learning (ML) domains is fueled by accelerator cards. Many popular ML frameworks support accelerators by organizing computations as a computational \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Carvalho:2021:KRN, author = "Jo{\~a}o P. L. 
{De Carvalho} and Braedy Kuzma and Ivan Korostelev and Jos{\'e} Nelson Amaral and Christopher Barton and Jos{\'e} Moreira and Guido Araujo", title = "{KernelFaRer}: Replacing Native-Code Idioms with High-Performance Library Calls", journal = j-TACO, volume = "18", number = "3", pages = "38:1--38:22", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3459010", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3459010", abstract = "Well-crafted libraries deliver much higher performance than code generated by sophisticated application programmers using advanced optimizing compilers. When a code pattern for which a well-tuned library implementation exists is found in the source code \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Alves:2021:EAP, author = "Ricardo Alves and Stefanos Kaxiras and David Black-Schaffer", title = "Early Address Prediction: Efficient Pipeline Prefetch and Reuse", journal = j-TACO, volume = "18", number = "3", pages = "39:1--39:22", month = jun, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458883", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Jun 29 08:21:11 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3458883", abstract = "Achieving low load-to-use latency with low energy and storage overheads is critical for performance. Existing techniques either prefetch into the pipeline (via address prediction and validation) or provide data reuse in the pipeline (via register sharing or L0 caches). 
These techniques provide a range of tradeoffs between latency, reuse, and overhead.\par In this work, we present a pipeline prefetching technique that achieves state-of-the-art performance and data reuse without additional data storage, data movement, or validation overheads by adding address tags to the register file. Our addition of register file tags allows us to forward (reuse) load data from the register file with no additional data movement, keep the data alive in the register file beyond the instruction's lifetime to increase temporal reuse, and coalesce prefetch requests to achieve spatial reuse. Further, we show that we can use the existing memory order violation detection hardware to validate prefetches and data forwards without additional overhead.\par Our design achieves the performance of existing pipeline prefetching while also forwarding 32\% of the loads from the register file (compared to 15\% in state-of-the-art register sharing), delivering a 16\% reduction in L1 dynamic energy (1.6\% total processor energy), with an area overhead of less than 0.5\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Goswami:2021:TES, author = "Kaustav Goswami and Dip Sankar Banerjee and Shirshendu Das", title = "Towards Enhanced System Efficiency while Mitigating Row Hammer", journal = j-TACO, volume = "18", number = "4", pages = "40:1--40:26", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3458749", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3458749", abstract = "In recent years, DRAM-based main memories have become susceptible to the Row Hammer (RH) problem, which causes bits to flip in a row without accessing them directly. Frequent activation of a row, called an aggressor row, causes its adjacent rows' (victim) bits to flip. The state-of-the-art solution is to refresh the victim rows explicitly to prevent bit flipping. There have been several proposals made to detect RH attacks. These include both probabilistic as well as deterministic counter-based methods. The technique of handling RH attacks, however, remains the same. In this work, we propose an efficient technique for handling the RH problem. We show that the mechanism is agnostic of the detection mechanism. Our RH handling technique omits the necessity of refreshing the victim rows. Instead, we use a small non-volatile Spin-Transfer Torque Magnetic Random Access Memory (STTRAM) that ensures no unnecessary refreshes of the victim rows on the DRAM device and thus allowing more time for normal applications in the same DRAM device. Our model relies on the migration of the aggressor rows. This accounts for removing blocking of the DRAM operations due to the refreshing of victim rows incurred in the previous solution. 
After extensive evaluation, we found that, compared to the conventional RH mitigation techniques, our model minimizes the blocking time of the memory that is imposed due to explicit refreshing by an average of 80.72\% in the worst-case scenario and provides energy savings of about 15.82\% on average, across different types of RH-based workloads. A lookup table is necessary to pinpoint the location of a particular row, which, when combined with the STTRAM, limits the storage overhead to 0.39\% of a 2 GB DRAM. Our proposed model prevents repeated refreshing of the same victim rows in different refreshing windows on the DRAM device and leads to an efficient RH handling technique.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Proficz:2021:AGA, author = "Jerzy Proficz", title = "All-gather Algorithms Resilient to Imbalanced Process Arrival Patterns", journal = j-TACO, volume = "18", number = "4", pages = "41:1--41:22", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460122", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3460122", abstract = "Two novel algorithms for the all-gather operation resilient to imbalanced process arrival patterns (PATs) are presented. The first one, Background Disseminated Ring (BDR), is based on the regular parallel ring algorithm often supplied in MPI implementations and exploits an auxiliary background thread for early data exchange from faster processes to accelerate the performed all-gather operation. 
The other algorithm, Background Sorted Linear synchronized tree with Broadcast (BSLB), is built upon the already existing PAP-aware gather algorithm, that is, Background Sorted Linear Synchronized tree (BSLS), followed by a regular broadcast distributing gathered data to all participating processes. The background of the imbalanced PAP subject is described, along with the PAP monitoring and evaluation topics. An experimental evaluation of the algorithms based on a proposed mini-benchmark is presented. The mini-benchmark was performed over 2,000 times in a typical HPC cluster architecture with homogeneous compute nodes. The obtained results are analyzed according to different PATs, data sizes, and process numbers, showing that the proposed optimization works well for various configurations, is scalable, and can significantly reduce the all-gather elapsed times, in our case, up to factor 1.9 or 47\% in comparison with the best state-of-the-art solution.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2021:CMD, author = "Rui Xu and Sheng Ma and Yaohua Wang and Xinhai Chen and Yang Guo", title = "Configurable Multi-directional Systolic Array Architecture for Convolutional Neural Networks", journal = j-TACO, volume = "18", number = "4", pages = "42:1--42:24", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460776", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3460776", abstract = "The systolic array architecture is one of the most popular choices for convolutional neural network hardware accelerators. 
The biggest advantage of the systolic array architecture is its simple and efficient design principle. Without complicated control \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Seo:2021:SAI, author = "Wonik Seo and Sanghoon Cha and Yeonjae Kim and Jaehyuk Huh and Jongse Park", title = "{SLO}-Aware Inference Scheduler for Heterogeneous Processors in Edge Platforms", journal = j-TACO, volume = "18", number = "4", pages = "43:1--43:26", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460352", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3460352", abstract = "With the proliferation of applications with machine learning (ML), the importance of edge platforms has been growing to process streaming sensor data locally without resorting to remote servers. Such edge platforms are commonly equipped with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Qureshi:2021:GXM, author = "Yasir Mahmood Qureshi and William Andrew Simon and Marina Zapater and Katzalin Olcoz and David Atienza", title = "{Gem5-X}: a Many-core Heterogeneous Simulation Platform for Architectural Exploration and Optimization", journal = j-TACO, volume = "18", number = "4", pages = "44:1--44:27", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3461662", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3461662", abstract = "The increasing adoption of smart systems in our daily life has led to the development of new applications with varying performance and energy constraints, and suitable computing architectures need to be developed for these new applications. In this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jung:2021:PPB, author = "Tina Jung and Fabian Ritter and Sebastian Hack", title = "{PICO}: a {Presburger} In-bounds Check Optimization for Compiler-based Memory Safety Instrumentations", journal = j-TACO, volume = "18", number = "4", pages = "45:1--45:27", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460434", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3460434", abstract = "Memory safety violations such as buffer overflows are a threat to security to this day. 
A common solution to ensure memory safety for C is code instrumentation. However, this often causes high execution-time overhead and is therefore rarely used in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sha:2021:LIA, author = "Zhibing Sha and Jun Li and Lihao Song and Jiewen Tang and Min Huang and Zhigang Cai and Lianju Qian and Jianwei Liao and Zhiming Liu", title = "Low {I/O} Intensity-aware Partial {GC} Scheduling to Reduce Long-tail Latency in {SSDs}", journal = j-TACO, volume = "18", number = "4", pages = "46:1--46:25", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3460433", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3460433", abstract = "This article proposes a low I/O intensity-aware scheduling scheme on garbage collection (GC) in SSDs for minimizing the I/O long-tail latency to ensure I/O responsiveness. The basic idea is to assemble partial GC operations by referring to several \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Alam:2021:LPL, author = "Syed Asad Alam and James Garland and David Gregg", title = "Low-precision Logarithmic Number Systems: Beyond Base-2", journal = j-TACO, volume = "18", number = "4", pages = "47:1--47:25", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3461699", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3461699", abstract = "Logarithmic number systems (LNS) are used to represent real numbers in many applications using a constant base raised to a fixed-point exponent making its distribution exponential. This greatly simplifies hardware multiply, divide, and square root. LNS with base-2 is most common, but in this article, we show that for low-precision LNS the choice of base has a significant impact.\par We make four main contributions. First, LNS is not closed under addition and subtraction, so the result is approximate. We show that choosing a suitable base can manipulate the distribution to reduce the average error. Second, we show that low-precision LNS addition and subtraction can be implemented efficiently in logic rather than commonly used ROM lookup tables, the complexity of which can be reduced by an appropriate choice of base. A similar effect is shown where the result of arithmetic has greater precision than the input. Third, where input data from external sources is not expected to be in LNS, we can reduce the conversion error by selecting a LNS base to match the expected distribution of the input. Thus, there is no one base that gives the global optimum, and base selection is a trade-off between different factors. 
Fourth, we show that circuits realized in LNS require lower area and power consumption for short word lengths.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Walden:2021:MIN, author = "Candace Walden and Devesh Singh and Meenatchi Jagasivamani and Shang Li and Luyi Kang and Mehdi Asnaashari and Sylvain Dubois and Bruce Jacob and Donald Yeung", title = "Monolithically Integrating Non-Volatile Main Memory over the Last-Level Cache", journal = j-TACO, volume = "18", number = "4", pages = "48:1--48:26", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3462632", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3462632", abstract = "Many emerging non-volatile memories are compatible with CMOS logic, potentially enabling their integration into a CPU's die. This article investigates such monolithically integrated CPU-main memory chips. We exploit non-volatile memories employing 3D \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tomei:2021:BSC, author = "Matthew Tomei and Shomit Das and Mohammad Seyedzadeh and Philip Bedoukian and Bradford Beckmann and Rakesh Kumar and David Wood", title = "Byte-Select Compression", journal = j-TACO, volume = "18", number = "4", pages = "49:1--49:27", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3462209", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3462209", abstract = "Cache-block compression is a highly effective technique for both reducing accesses to lower levels in the memory hierarchy (cache compression) and minimizing data transfers (link compression). While many effective cache-block compression algorithms have been proposed, the design of these algorithms is largely ad hoc and manual and relies on human recognition of patterns. In this article, we take an entirely different approach. We introduce a class of ``byte-select'' compression algorithms, as well as an automated methodology for generating compression algorithms in this class. We argue that, based on upper bounds within the class, the study of this class of byte-select algorithms has potential to yield algorithms with better performance than existing cache-block compression algorithms. The upper bound we establish on the compression ratio is 2X that of any existing algorithm. We then offer a generalized representation of a subset of byte-select compression algorithms and search through the resulting space guided by a set of training data traces. Using this automated process, we find efficient and effective algorithms for various hardware applications. 
We find that the resulting algorithms exploit novel patterns that can inform future algorithm designs. The generated byte-select algorithms are evaluated against a separate set of traces and evaluations show that Byte-Select has a 23\% higher compression ratio on average. While no previous algorithm performs best for all our data sets which include CPU and GPU applications, our generated algorithms do. Using an automated hardware generator for these algorithms, we show that their decompression and compression latency is one and two cycles respectively, much lower than any existing algorithm with a competitive compression ratio.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2021:CHC, author = "Cunlu Li and Dezun Dong and Shazhou Yang and Xiangke Liao and Guangyu Sun and Yongheng Liu", title = "{CIB-HIER}: Centralized Input Buffer Design in Hierarchical High-radix Routers", journal = j-TACO, volume = "18", number = "4", pages = "50:1--50:21", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3468062", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3468062", abstract = "Hierarchical organization is widely used in high-radix routers to enable efficient scaling to higher switch port count. A general-purpose hierarchical router must be symmetrically designed with the same input buffer depth, resulting in a large amount of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gysi:2021:DSM, author = "Tobias Gysi and Christoph M{\"u}ller and Oleksandr Zinenko and Stephan Herhut and Eddie Davis and Tobias Wicky and Oliver Fuhrer and Torsten Hoefler and Tobias Grosser", title = "Domain-Specific Multi-Level {IR} Rewriting for {GPU}: The {Open Earth} Compiler for {GPU}-accelerated Climate Simulation", journal = j-TACO, volume = "18", number = "4", pages = "51:1--51:23", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3469030", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3469030", abstract = "Most compilers have a single core intermediate representation (IR) (e.g., LLVM) sometimes complemented with vaguely defined IR-like data structures. This IR is commonly low-level and close to machine instructions. As a result, optimizations relying on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zou:2021:SLE, author = "An Zou and Huifeng Zhu and Jingwen Leng and Xin He and Vijay Janapa Reddi and Christopher D. 
Gill and Xuan Zhang", title = "System-level Early-stage Modeling and Evaluation of {IVR}-assisted Processor Power Delivery System", journal = j-TACO, volume = "18", number = "4", pages = "52:1--52:27", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3468145", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3468145", abstract = "Despite being employed in numerous efforts to improve power delivery efficiency, the integrated voltage regulator (IVR) approach has yet to be evaluated rigorously and quantitatively in a full power delivery system (PDS) setting. To fulfill this need, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Manocha:2021:GOD, author = "Aninda Manocha and Tyler Sorensen and Esin Tureci and Opeoluwa Matthews and Juan L. Arag{\'o}n and Margaret Martonosi", title = "{GraphAttack}: Optimizing Data Supply for Graph Applications on In-Order Multicore Architectures", journal = j-TACO, volume = "18", number = "4", pages = "53:1--53:26", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3469846", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3469846", abstract = "Graph structures are a natural representation of important and pervasive data. While graph applications have significant parallelism, their characteristic pointer indirect loads to neighbor data hinder scalability to large datasets on multicore systems. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Benz:2021:SAP, author = "Joscha Benz and Oliver Bringmann", title = "Scenario-Aware Program Specialization for Timing Predictability", journal = j-TACO, volume = "18", number = "4", pages = "54:1--54:26", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3473333", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3473333", abstract = "The successful application of static program analysis strongly depends on flow facts of a program such as loop bounds, control-flow constraints, and operating modes. This problem heavily affects the design of real-time systems, since static program \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chakraborty:2021:WGC, author = "Shounak Chakraborty and Magnus Sj{\"a}lander", title = "{WaFFLe}: Gated Cache-Ways with Per-Core Fine-Grained {DVFS} for Reduced On-Chip Temperature and Leakage Consumption", journal = j-TACO, volume = "18", number = "4", pages = "55:1--55:25", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3471908", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3471908", abstract = "Managing thermal imbalance in contemporary chip multi-processors (CMPs) is crucial in assuring functional correctness of modern mobile as well as server systems. 
Localized regions with high activity, e.g., register files, ALUs, FPUs, and so on, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Srikanth:2021:SIC, author = "Sriseshan Srikanth and Anirudh Jain and Thomas M. Conte and Erik P. Debenedictis and Jeanine Cook", title = "{SortCache}: Intelligent Cache Management for Accelerating Sparse Data Workloads", journal = j-TACO, volume = "18", number = "4", pages = "56:1--56:24", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3473332", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3473332", abstract = "Sparse data applications have irregular access patterns that stymie modern memory architectures. Although hyper-sparse workloads have received considerable attention in the past, moderately-sparse workloads prevalent in machine learning applications, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Metzger:2021:DHT, author = "Paul Metzger and Volker Seeker and Christian Fensch and Murray Cole", title = "Device Hopping: Transparent Mid-Kernel Runtime Switching for Heterogeneous Systems", journal = j-TACO, volume = "18", number = "4", pages = "57:1--57:25", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3471909", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3471909", abstract = "Existing OS techniques for homogeneous many-core systems make it simple for single and multithreaded applications to migrate between cores. Heterogeneous systems do not benefit so fully from this flexibility, and applications that cannot migrate in mid-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2021:LED, author = "Yu Zhang and Da Peng and Xiaofei Liao and Hai Jin and Haikun Liu and Lin Gu and Bingsheng He", title = "{LargeGraph}: an Efficient Dependency-Aware {GPU}-Accelerated Large-Scale Graph Processing", journal = j-TACO, volume = "18", number = "4", pages = "58:1--58:24", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3477603", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3477603", abstract = "Many out-of-GPU-memory systems are recently designed to support iterative processing of large-scale graphs. 
However, these systems still suffer from long time to converge because of inefficient propagation of active vertices' new states along graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cilasun:2021:SNN, author = "H{\"u}srev Cilasun and Salonik Resch and Zamshed I. Chowdhury and Erin Olson and Masoud Zabihi and Zhengyang Zhao and Thomas Peterson and Keshab K. Parhi and Jian-Ping Wang and Sachin S. Sapatnekar and Ulya R. Karpuzcu", title = "Spiking Neural Networks in Spintronic Computational {RAM}", journal = j-TACO, volume = "18", number = "4", pages = "59:1--59:21", month = dec, year = "2021", CODEN = "????", DOI = "https://doi.org/10.1145/3475963", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Oct 4 07:14:07 MDT 2021", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3475963", abstract = "Spiking Neural Networks (SNNs) represent a biologically inspired computation model capable of emulating neural computation in human brain and brain-like structures. The main promise is very low energy consumption. Classic Von Neumann architecture based \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ukarande:2022:LAC, author = "Aditya Ukarande and Suryakant Patidar and Ram Rangan", title = "Locality-Aware {CTA} Scheduling for Gaming Applications", journal = j-TACO, volume = "19", number = "1", pages = "1:1--1:26", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3477497", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3477497", abstract = "The compute work rasterizer or the GigaThread Engine of a modern NVIDIA GPU focuses on maximizing compute work occupancy across all streaming multiprocessors in a GPU while retaining design simplicity. In this article, we identify the operational aspects \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2022:ICO, author = "Hongzhi Liu and Jie Luo and Ying Li and Zhonghai Wu", title = "Iterative Compilation Optimization Based on Metric Learning and Collaborative Filtering", journal = j-TACO, volume = "19", number = "1", pages = "2:1--2:25", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3480250", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3480250", abstract = "Pass selection and phase ordering are two critical compiler auto-tuning problems. 
Traditional heuristic methods cannot effectively address these NP-hard problems especially given the increasing number of compiler passes and diverse hardware architectures. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sasongko:2022:RFY, author = "Muhammad Aditya Sasongko and Milind Chabbi and Mandana Bagheri Marzijarani and Didem Unat", title = "{ReuseTracker}: Fast Yet Accurate Multicore Reuse Distance Analyzer", journal = j-TACO, volume = "19", number = "1", pages = "3:1--3:25", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3484199", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3484199", abstract = "One widely used metric that measures data locality is reuse distance -the number of unique memory locations that are accessed between two consecutive accesses to a particular memory location. State-of-the-art techniques that measure reuse distance in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fu:2022:GDS, author = "Yaosheng Fu and Evgeny Bolotin and Niladrish Chatterjee and David Nellans and Stephen W. 
Keckler", title = "{GPU} Domain Specialization via Composable On-Package Architecture", journal = j-TACO, volume = "19", number = "1", pages = "4:1--4:23", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3484505", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3484505", abstract = "As GPUs scale their low-precision matrix math throughput to boost deep learning (DL) performance, they upset the balance between math throughput and memory system capabilities. We demonstrate that a converged GPU design trying to address diverging \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2022:SBC, author = "Daeyeal Lee and Bill Lin and Chung-Kuan Cheng", title = "{SMT}-Based Contention-Free Task Mapping and Scheduling on {$2$D\slash $3$D SMART NoC} with Mixed Dimension-Order Routing", journal = j-TACO, volume = "19", number = "1", pages = "5:1--5:21", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3487018", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3487018", abstract = "SMART NoCs achieve ultra-low latency by enabling single-cycle multiple-hop transmission via bypass channels. However, contention along bypass channels can seriously degrade the performance of SMART NoCs by breaking the bypass paths. Therefore, contention-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chatarasi:2022:MDC, author = "Prasanth Chatarasi and Hyoukjun Kwon and Angshuman Parashar and Michael Pellauer and Tushar Krishna and Vivek Sarkar", title = "{Marvel}: a Data-Centric Approach for Mapping Deep Learning Operators on Spatial Accelerators", journal = j-TACO, volume = "19", number = "1", pages = "6:1--6:26", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3485137", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3485137", abstract = "A spatial accelerator's efficiency depends heavily on both its mapper and cost models to generate optimized mappings for various operators of DNN models. However, existing cost models lack a formal boundary over their input programs (operators) for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rieber:2022:JPL, author = "Dennis Rieber and Axel Acosta and Holger Fr{\"o}ning", title = "Joint Program and Layout Transformations to Enable Convolutional Operators on Specialized Hardware Based on Constraint Programming", journal = j-TACO, volume = "19", number = "1", pages = "7:1--7:26", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3487922", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3487922", abstract = "The success of Deep Artificial Neural Networks (DNNs) in many domains created a rich body of research concerned with hardware accelerators for compute-intensive DNN operators. However, implementing such operators efficiently with complex hardware \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lei:2022:SEW, author = "Mengya Lei and Fan Li and Fang Wang and Dan Feng and Xiaomin Zou and Renzhi Xiao", title = "{SecNVM}: an Efficient and Write-Friendly Metadata Crash Consistency Scheme for Secure {NVM}", journal = j-TACO, volume = "19", number = "1", pages = "8:1--8:26", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3488724", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3488724", abstract = "Data security is an indispensable part of non-volatile memory (NVM) systems. 
However, implementing data security efficiently on NVM is challenging, since we have to guarantee the consistency of user data and the related security metadata. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Di:2022:TPM, author = "Bang Di and Daokun Hu and Zhen Xie and Jianhua Sun and Hao Chen and Jinkui Ren and Dong Li", title = "{TLB}-pilot: Mitigating {TLB} Contention Attack on {GPUs} with Microarchitecture-Aware Scheduling", journal = j-TACO, volume = "19", number = "1", pages = "9:1--9:23", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3491218", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3491218", abstract = "Co-running GPU kernels on a single GPU can provide high system throughput and improve hardware utilization, but this raises concerns on application security. We reveal that translation lookaside buffer (TLB) attack, one of the common attacks on CPU, can \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Saileshwar:2022:HLC, author = "Gururaj Saileshwar and Rick Boivie and Tong Chen and Benjamin Segal and Alper Buyuktosunoglu", title = "{HeapCheck}: Low-cost Hardware Support for Memory Safety", journal = j-TACO, volume = "19", number = "1", pages = "10:1--10:24", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3495152", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3495152", abstract = "Programs written in C/C++ are vulnerable to memory-safety errors like buffer-overflows and use-after-free. While several mechanisms to detect such errors have been previously proposed, they suffer from a variety of drawbacks, including poor performance, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Azhar:2022:TRR, author = "M. Waqar Azhar and Miquel Peric{\`a}s and Per Stenstr{\"o}m", title = "{Task-RM}: a Resource Manager for Energy Reduction in Task-Parallel Applications under Quality of Service Constraints", journal = j-TACO, volume = "19", number = "1", pages = "11:1--11:26", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3494537", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3494537", abstract = "Improving energy efficiency is an important goal of computer system design. 
This article focuses on a general model of task-parallel applications under quality-of-service requirements on the completion time. Our technique, called Task-RM, exploits the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gomes:2022:CCA, author = "Cesar Gomes and Maziar Amiraski and Mark Hempstead", title = "{CASHT}: Contention Analysis in Shared Hierarchies with Thefts", journal = j-TACO, volume = "19", number = "1", pages = "12:1--12:27", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3494538", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3494538", abstract = "Cache management policies should consider workloads' contention behavior when managing a shared cache. Prior art makes estimates about shared cache behavior by adding extra logic or time to isolate per workload cache statistics. These approaches provide \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2022:OSS, author = "Yufei Wang and Xiaoshe Dong and Longxiang Wang and Weiduo Chen and Xingjun Zhang", title = "Optimizing Small-Sample Disk Fault Detection Based on {LSTM-GAN} Model", journal = j-TACO, volume = "19", number = "1", pages = "13:1--13:24", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3500917", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3500917", abstract = "In recent years, researches on disk fault detection based on SMART data combined with different machine learning algorithms have been proven to be effective. However, these methods require a large amount of data. In the early stages of the establishment \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Silfa:2022:BEE, author = "Franyell Silfa and Jose Maria Arnau and Antonio Gonz{\'a}lez", title = "{E-BATCH}: Energy-Efficient and High-Throughput {RNN} Batching", journal = j-TACO, volume = "19", number = "1", pages = "14:1--14:23", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3499757", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3499757", abstract = "Recurrent Neural Network (RNN) inference exhibits low hardware utilization due to the strict data dependencies across time-steps. Batching multiple requests can increase throughput. 
However, RNN batching requires a large amount of padding since the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ding:2022:CCA, author = "Chen Ding and Dong Chen and Fangzhou Liu and Benjamin Reber and Wesley Smith", title = "{CARL}: Compiler Assigned Reference Leasing", journal = j-TACO, volume = "19", number = "1", pages = "15:1--15:28", month = mar, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3498730", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 18 06:51:06 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3498730", abstract = "Data movement is a common performance bottleneck, and its chief remedy is caching. Traditional cache management is transparent to the workload: data that should be kept in cache are determined by the recency information only, while the program information,. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Schlaak:2022:MAF, author = "Christof Schlaak and Tzung-Han Juang and Christophe Dubach", title = "Memory-Aware Functional {IR} for Higher-Level Synthesis of Accelerators", journal = j-TACO, volume = "19", number = "2", pages = "16:1--16:26", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3501768", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3501768", abstract = "Specialized accelerators deliver orders of a magnitude of higher performance than general-purpose processors. The ever-changing nature of modern workloads is pushing the adoption of Field Programmable Gate Arrays (FPGAs) as the substrate of choice. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lakshminarasimhan:2022:FSC, author = "Kartik Lakshminarasimhan and Ajeya Naithani and Josu{\'e} Feliu and Lieven Eeckhout", title = "The Forward Slice Core: a High-Performance, Yet Low-Complexity Microarchitecture", journal = j-TACO, volume = "19", number = "2", pages = "17:1--17:25", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3499424", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3499424", abstract = "Superscalar out-of-order cores deliver high performance at the cost of increased complexity and power budget. 
In-order cores, in contrast, are less complex and have a smaller power budget, but offer low performance. A processor architecture should ideally \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Srikanthan:2022:MMA, author = "Sharanyan Srikanthan and Sayak Chakraborti and Princeton Ferro and Sandhya Dwarkadas", title = "{MAPPER}: Managing Application Performance via Parallel Efficiency Regulation", journal = j-TACO, volume = "19", number = "2", pages = "18:1--18:26", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3501767", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3501767", abstract = "State-of-the-art systems, whether in servers or desktops, provide ample computational and storage resources to allow multiple simultaneously executing potentially parallel applications. However, performance tends to be unpredictable, being a function of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Athanasios:2022:LPN, author = "Tziouvaras, Athanasios and Dimitriou, Georgios and Stamoulis, Georgios", title = "Low-power Near-data Instruction Execution Leveraging Opcode-based Timing Analysis", journal = j-TACO, volume = "19", number = "2", pages = "19:1--19:26", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3504005", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3504005", abstract = "Traditional processor architectures utilize an external DRAM for data storage, while they also operate under worst-case timing constraints. Such designs are heavily constrained by the delay costs of the data transfer between the core pipeline and the DRAM, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jia:2022:GND, author = "Xingguo Jia and Jin Zhang and Boshi Yu and Xingyue Qian and Zhengwei Qi and Haibing Guan", title = "{GiantVM}: a Novel Distributed Hypervisor for Resource Aggregation with {DSM-aware} Optimizations", journal = j-TACO, volume = "19", number = "2", pages = "20:1--20:27", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3505251", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3505251", abstract = "We present GiantVM, an open-source distributed hypervisor that provides the many-to-one virtualization to aggregate resources from multiple physical machines. We propose techniques to enable distributed CPU and I/O virtualization and distributed shared \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Nejat:2022:CSM, author = "Mehrzad Nejat and Madhavan Manivannan and Miquel Peric{\`a}s and Per Stenstr{\"o}m", title = "Cooperative Slack Management: Saving Energy of Multicore Processors by Trading Performance Slack Between {QoS}-Constrained Applications", journal = j-TACO, volume = "19", number = "2", pages = "21:1--21:27", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3505559", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3505559", abstract = "Processor resources can be adapted at runtime according to the dynamic behavior of applications to reduce the energy consumption of multicore processors without affecting the Quality-of-Service (QoS). To achieve this, an online resource management scheme \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pompougnac:2022:WSR, author = "Hugo Pompougnac and Ulysse Beaugnon and Albert Cohen and Dumitru Potop Butucaru", title = "Weaving Synchronous Reactions into the Fabric of {SSA}-form Compilers", journal = j-TACO, volume = "19", number = "2", pages = "22:1--22:25", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506706", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3506706", abstract = "We investigate the programming of reactive systems combining closed-loop control with performance-intensive components such as Machine Learning (ML). Reactive control systems are often safety-critical and associated with real-time execution requirements, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shobaki:2022:RPA, author = "Ghassan Shobaki and Vahl Scott Gordon and Paul McHugh and Theodore Dubois and Austin Kerbow", title = "Register-Pressure-Aware Instruction Scheduling Using Ant Colony Optimization", journal = j-TACO, volume = "19", number = "2", pages = "23:1--23:23", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3505558", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3505558", abstract = "This paper describes a new approach to register-pressure-aware instruction scheduling, using Ant Colony Optimization (ACO). 
ACO is a nature-inspired optimization technique that researchers have successfully applied to NP-hard sequencing problems like the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2022:MOG, author = "Qihan Wang and Zhen Peng and Bin Ren and Jie Chen and Robert G. Edwards", title = "{MemHC}: an Optimized {GPU} Memory Management Framework for Accelerating Many-body Correlation", journal = j-TACO, volume = "19", number = "2", pages = "24:1--24:26", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506705", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3506705", abstract = "The many-body correlation function is a fundamental computation kernel in modern physics computing applications, e.g., Hadron Contractions in Lattice quantum chromodynamics (QCD). This kernel is both computation and memory intensive, involving a series of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kumar:2022:DAS, author = "Rakesh Kumar and Mehdi Alipour and David Black-Schaffer", title = "Dependence-aware Slice Execution to Boost {MLP} in Slice-out-of-order Cores", journal = j-TACO, volume = "19", number = "2", pages = "25:1--25:28", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3506704", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3506704", abstract = "Exploiting memory-level parallelism (MLP) is crucial to hide long memory and last-level cache access latencies. While out-of-order (OoO) cores, and techniques building on them, are effective at exploiting MLP, they deliver poor energy efficiency due to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Vijaykumar:2022:MPO, author = "Nandita Vijaykumar and Ataberk Olgun and Konstantinos Kanellopoulos and F. Nisa Bostanci and Hasan Hassan and Mehrshad Lotfi and Phillip B. 
Gibbons and Onur Mutlu", title = "\pkg{MetaSys}: a Practical Open-source Metadata Management System to Implement and Evaluate Cross-layer Optimizations", journal = j-TACO, volume = "19", number = "2", pages = "26:1--26:29", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3505250", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3505250", abstract = "This article introduces the first open-source FPGA-based infrastructure, MetaSys, with a prototype in a RISC-V system, to enable the rapid implementation and evaluation of a wide range of cross-layer techniques in real hardware. Hardware-software \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2022:EEE, author = "Jing Chen and Madhavan Manivannan and Mustafa Abduljabbar and Miquel Peric{\`a}s", title = "\pkg{ERASE}: Energy Efficient Task Mapping and Resource Management for Work Stealing Runtimes", journal = j-TACO, volume = "19", number = "2", pages = "27:1--27:29", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3510422", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3510422", abstract = "Parallel applications often rely on work stealing schedulers in combination with fine-grained tasking to achieve high performance and scalability. However, reducing the total energy consumption in the context of work stealing runtimes is still challenging,. 
\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ye:2022:PAU, author = "Chencheng Ye and Yuanchao Xu and Xipeng Shen and Hai Jin and Xiaofei Liao and Yan Solihin", title = "Preserving Addressability Upon {GC}-Triggered Data Movements on Non-Volatile Memory", journal = j-TACO, volume = "19", number = "2", pages = "28:1--28:26", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3511706", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3511706", abstract = "This article points out an important threat that application-level Garbage Collection (GC) creates to the use of non-volatile memory (NVM). Data movements incurred by GC may invalidate the pointers to objects on NVM and, hence, harm the reusability of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Michelogiannakis:2022:CIR, author = "George Michelogiannakis and Benjamin Klenk and Brandon Cook and Min Yee Teh and Madeleine Glick and Larry Dennison and Keren Bergman and John Shalf", title = "A Case For Intra-rack Resource Disaggregation in {HPC}", journal = j-TACO, volume = "19", number = "2", pages = "29:1--29:26", month = jun, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3514245", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 25 07:03:00 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3514245", abstract = "The expected halt of traditional technology scaling is motivating increased heterogeneity in high-performance computing (HPC) systems with the emergence of numerous specialized accelerators. As heterogeneity increases, so does the risk of underutilizing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2022:SMS, author = "Ping Wang and Fei Wen and Paul V. 
Gratz and Alex Sprintson", title = "{SIMD-Matcher}: a {SIMD}-based Arbitrary Matching Framework", journal = j-TACO, volume = "19", number = "3", pages = "30:1--30:20", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3514246", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3514246", abstract = "Packet classification methods rely upon matching packet content/header against pre-defined rules, which are generated by network applications and their configurations. With the rapid development of network technology and the fast-growing network \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mettler:2022:FBA, author = "Marcel Mettler and Martin Rapp and Heba Khdr and Daniel Mueller-Gritschneder and J{\"o}rg Henkel and Ulf Schlichtmann", title = "An {FPGA}-based Approach to Evaluate Thermal and Resource Management Strategies of Many-core Processors", journal = j-TACO, volume = "19", number = "3", pages = "31:1--31:24", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3516825", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3516825", abstract = "The continuous technology scaling of integrated circuits results in increasingly higher power densities and operating temperatures. Hence, modern many-core processors require sophisticated thermal and resource management strategies to mitigate these \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mpeis:2022:OIC, author = "Paschalis Mpeis and Pavlos Petoumenos and Kim Hazelwood and Hugh Leather", title = "Object Intersection Captures on Interactive Apps to Drive a Crowd-sourced Replay-based Compiler Optimization", journal = j-TACO, volume = "19", number = "3", pages = "32:1--32:25", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3517338", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3517338", abstract = "Traditional offline optimization frameworks rely on representative hardware, software, and inputs to compare different optimizations on. With application-specific optimization for mobile systems though, the idea of a representative testbench is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2022:MRM, author = "Cunlu Li and Dezun Dong and Xiangke Liao", title = "{MUA-Router}: Maximizing the Utility-of-Allocation for On-chip Pipelining Routers", journal = j-TACO, volume = "19", number = "3", pages = "33:1--33:23", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3519027", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3519027", abstract = "As an important pipeline stage in the router of Network-on-Chips, switch allocation assigns output ports to input ports and allows flits to transit through the switch without conflicts. 
Previous work designed efficient switch allocation strategies by \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Choudhury:2022:FOC, author = "Ziaul Choudhury and Shashwat Shrivastava and Lavanya Ramapantulu and Suresh Purini", title = "An {FPGA} Overlay for {CNN} Inference with Fine-grained Flexible Parallelism", journal = j-TACO, volume = "19", number = "3", pages = "34:1--34:26", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3519598", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3519598", abstract = "Increasingly, pre-trained convolutional neural networks (CNNs) are being deployed for inference in various computer vision applications, both on the server-side in the data centers and at the edge. CNN inference is a very compute-intensive task. It is a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Moolchandani:2022:PPP, author = "Diksha Moolchandani and Anshul Kumar and Smruti R. 
Sarangi", title = "Performance and Power Prediction for Concurrent Execution on {GPUs}", journal = j-TACO, volume = "19", number = "3", pages = "35:1--35:27", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3522712", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3522712", abstract = "The unprecedented growth of edge computing and 5G has led to an increased offloading of mobile applications to cloud servers or edge cloudlets. The most prominent workloads comprise computer vision applications. Conventional wisdom suggests that computer \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jahanshahi:2022:PQA, author = "Ali Jahanshahi and Nanpeng Yu and Daniel Wong", title = "{PowerMorph}: {QoS}-Aware Server Power Reshaping for Data Center Regulation Service", journal = j-TACO, volume = "19", number = "3", pages = "36:1--36:27", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3524129", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3524129", abstract = "Adoption of renewable energy in power grids introduces stability challenges in regulating the operation frequency of the electricity grid. Thus, electrical grid operators call for provisioning of frequency regulation services from end-user customers, such \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2022:BFE, author = "Peng Xu and Nannan Zhao and Jiguang Wan and Wei Liu and Shuning Chen and Yuanhui Zhou and Hadeel Albahar and Hanyang Liu and Liu Tang and Zhihu Tan", title = "Building a Fast and Efficient {LSM}-tree Store by Integrating Local Storage with Cloud Storage", journal = j-TACO, volume = "19", number = "3", pages = "37:1--37:26", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3527452", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3527452", abstract = "The explosive growth of modern web-scale applications has made cost-effectiveness a primary design goal for their underlying databases. As a backbone of modern databases, LSM-tree based key-value stores (LSM store) face limited storage options. They are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Huang:2022:AVC, author = "Horng-Ruey Huang and Ding-Yong Hong and Jan-Jan Wu and Kung-Fu Chen and Pangfeng Liu and Wei-Chung Hsu", title = "Accelerating Video Captioning on Heterogeneous System Architectures", journal = j-TACO, volume = "19", number = "3", pages = "38:1--38:25", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3527609", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3527609", abstract = "Video captioning is a core technology to many important applications, such as AI-assisted medical diagnosis, video question answering, storytelling through videos, and lip-reading. Video captioning employs a hybrid CNN + RNN model. Accelerating such a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Corbalan-Navarro:2022:TDO, author = "David Corbal{\'a}n-Navarro and Juan L. 
Arag{\'o}n and Mart{\'\i} Anglada and Joan-Manuel Parcerisa and Antonio Gonz{\'a}lez", title = "Triangle Dropping: an Occluded-geometry Predictor for Energy-efficient Mobile {GPUs}", journal = j-TACO, volume = "19", number = "3", pages = "39:1--39:20", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3527861", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3527861", abstract = "This article proposes a novel micro-architecture approach for mobile GPUs aimed at early removing the occluded geometry in a scene by leveraging frame-to-frame coherence, thus reducing the overall energy consumption. Mobile GPUs commonly implement a Tile-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kundan:2022:PAP, author = "Shivam Kundan and Theodoros Marinakis and Iraklis Anagnostopoulos and Dimitri Kagaris", title = "A Pressure-Aware Policy for Contention Minimization on Multicore Systems", journal = j-TACO, volume = "19", number = "3", pages = "40:1--40:26", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3524616", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3524616", abstract = "Modern Chip Multiprocessors (CMPs) are integrating an increasing amount of cores to address the continually growing demand for high-application performance. The cores of a CMP share several components of the memory hierarchy, such as Last-Level Cache (LLC). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Alsop:2022:CFG, author = "Johnathan Alsop and Weon Taek Na and Matthew D. Sinclair and Samuel Grayson and Sarita Adve", title = "A Case for Fine-grain Coherence Specialization in Heterogeneous Systems", journal = j-TACO, volume = "19", number = "3", pages = "41:1--41:26", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3530819", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3530819", abstract = "Hardware specialization is becoming a key enabler of energy-efficient performance. Future systems will be increasingly heterogeneous, integrating multiple specialized and programmable accelerators, each with different memory demands. Traditionally, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Soltaniyeh:2022:ASC, author = "Mohammadreza Soltaniyeh and Richard P. 
Martin and Santosh Nagarakatte", title = "An Accelerator for Sparse Convolutional Neural Networks Leveraging Systolic General Matrix--matrix Multiplication", journal = j-TACO, volume = "19", number = "3", pages = "42:1--42:26", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3532863", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3532863", abstract = "This article proposes a novel hardware accelerator for the inference task with sparse convolutional neural networks (CNNs) by building a hardware unit to perform Image to Column (Im2Col) transformation of the input feature map coupled with a systolic-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dang:2022:LAP, author = "Dharanidhar Dang and Bill Lin and Debashis Sahoo", title = "{LiteCON}: an All-photonic Neuromorphic Accelerator for Energy-efficient Deep Learning", journal = j-TACO, volume = "19", number = "3", pages = "43:1--43:22", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3531226", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3531226", abstract = "Deep learning is highly pervasive in today's data-intensive era. In particular, convolutional neural networks (CNNs) are being widely adopted in a variety of fields for superior accuracy. However, computing deep CNNs on traditional CPUs and GPUs brings \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Siddhu:2022:CII, author = "Lokesh Siddhu and Rajesh Kedia and Shailja Pandey and Martin Rapp and Anuj Pathania and J{\"o}rg Henkel and Preeti Ranjan Panda", title = "{CoMeT}: an Integrated Interval Thermal Simulation Toolchain for {$2$D}, {2.5D}, and {$3$D} Processor-Memory Systems", journal = j-TACO, volume = "19", number = "3", pages = "44:1--44:25", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3532185", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3532185", abstract = "Processing cores and the accompanying main memory working in tandem enable modern processors. Dissipating heat produced from computation remains a significant problem for processors. Therefore, the thermal management of processors continues to be an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Olson:2022:OAG, author = "M. Ben Olson and Brandon Kammerdiener and Michael R. Jantz and Kshitij A. 
Doshi and Terry Jones", title = "Online Application Guidance for Heterogeneous Memory Systems", journal = j-TACO, volume = "19", number = "3", pages = "45:1--45:27", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3533855", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3533855", abstract = "As scaling of conventional memory devices has stalled, many high-end computing systems have begun to incorporate alternative memory technologies to meet performance goals. Since these technologies present distinct advantages and tradeoffs compared to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Honorio:2022:UBE, author = "Bruno {Chinelato Honorio} and Jo{\~a}o P. L. {De Carvalho} and Catalina {Munoz Morales} and Alexandro Baldassin and Guido Araujo", title = "Using Barrier Elision to Improve Transactional Code Generation", journal = j-TACO, volume = "19", number = "3", pages = "46:1--46:23", month = sep, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3533318", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Sep 2 10:07:01 MDT 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3533318", abstract = "With chip manufacturers such as Intel, IBM, and ARM offering native support for transactional memory in their instruction set architectures, memory transactions are on the verge of being considered a genuine application tool rather than just an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2022:AOM, author = "Jiansong Li and Xueying Wang and Xiaobing Chen and Guangli Li and Xiao Dong and Peng Zhao and Xianzhi Yu and Yongxin Yang and Wei Cao and Lei Liu and Xiaobing Feng", title = "An Application-oblivious Memory Scheduling System for {DNN} Accelerators", journal = j-TACO, volume = "19", number = "4", pages = "47:1--47:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3535355", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3535355", abstract = "Deep Neural Networks (DNNs) tend to go deeper and wider, which poses a significant challenge to the training of DNNs, due to the limited memory capacity of DNN \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Narayan:2022:AOC, author = "Aditya Narayan and Yvain Thonnart and Pascal Vivet and Ayse Coskun and Ajay Joshi", title = "Architecting Optically Controlled Phase Change Memory", journal = j-TACO, volume = "19", number = "4", pages = "48:1--48:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3533252", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3533252", abstract = "Phase Change Memory (PCM) is an attractive candidate for main memory, as it offers non-volatility and zero leakage power while providing higher cell densities, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2022:AAS, author = "Chao Zhang and Maximilian Bremer and Cy Chan and John Shalf and Xiaochen Guo", title = "{ASA}: Accelerating Sparse Accumulation in Column-wise {SpGEMM}", journal = j-TACO, volume = "19", number = "4", pages = "49:1--49:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3543068", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3543068", abstract = "Sparse linear algebra is an important kernel in many different applications. 
Among various sparse general matrix-matrix multiplication (SpGEMM) algorithms, Gustavson's column-wise SpGEMM has good locality when reading input matrix and can be easily parallelized by distributing the computation of different columns of an output matrix to different processors. However, the sparse accumulation (SPA) step in column-wise SpGEMM, which merges partial sums from each of the multiplications by the row indices, is still a performance bottleneck. The state-of-the-art software implementation uses a hash table for partial sum search in the SPA, which makes SPA the largest contributor to the execution time of SpGEMM. There are three reasons that cause the SPA to become the bottleneck: (1) hash probing requires data-dependent branches that are difficult for a branch predictor to predict correctly; (2) the accumulation of partial sum is dependent on the results of the hash probing, which makes it difficult to hide the hash probing latency; and (3) hash collision requires time-consuming linear search and optimizations to reduce these collisions require an accurate estimation of the number of non-zeros in each column of the output matrix. This work proposes ASA architecture to accelerate the SPA. ASA overcomes the challenges of SPA by (1) executing the partial sum search and accumulate with a single instruction through ISA extension to eliminate data-dependent branches in hash probing, (2) using a dedicated on-chip cache to perform the search and accumulation in a pipelined fashion, (3) relying on the parallel search capability of a set-associative cache to reduce search latency, and (4) delaying the merging of overflowed entries. As a result, ASA achieves an average of 2.25$ \times $ and 5.05$ \times $ speedup as compared to the state-of-the-art software implementation of a Markov clustering application and its SpGEMM kernel, respectively. 
As compared to a state-of-the-art hashing accelerator design, ASA achieves an average of 1.95$ \times $ speedup in the SpGEMM kernel.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bik:2022:CSS, author = "Aart Bik and Penporn Koanantakool and Tatiana Shpeisman and Nicolas Vasilache and Bixia Zheng and Fredrik Kjolstad", title = "Compiler Support for Sparse Tensor Computations in {MLIR}", journal = j-TACO, volume = "19", number = "4", pages = "50:1--50:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3544559", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3544559", abstract = "Sparse tensors arise in problems in science, engineering, machine learning, and data analytics. Programs that operate on such tensors can exploit sparsity to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Michaud:2022:HHA, author = "Pierre Michaud and Anis Peysieux", title = "{HAIR}: Halving the Area of the Integer Register File with Odd\slash Even Banking", journal = j-TACO, volume = "19", number = "4", pages = "51:1--51:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3544838", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3544838", abstract = "This article proposes a new microarchitectural scheme for reducing the hardware complexity of the integer register file of a superscalar processor. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yousefzadeh:2022:EEM, author = "Amirreza Yousefzadeh and Jan Stuijt and Martijn Hijdra and Hsiao-Hsuan Liu and Anteneh Gebregiorgis and Abhairaj Singh and Said Hamdioui and Francky Catthoor", title = "Energy-efficient In-Memory Address Calculation", journal = j-TACO, volume = "19", number = "4", pages = "52:1--52:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546071", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3546071", abstract = "Computation-in-Memory (CIM) is an emerging computing paradigm to address memory bottleneck challenges in computer architecture. A CIM unit cannot \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{So:2022:EES, author = "Hwisoo So and Moslem Didehban and Yohan Ko and Aviral Shrivastava and Kyoungwoo Lee", title = "{EXPERTISE}: an Effective Software-level Redundant Multithreading Scheme against Hardware Faults", journal = j-TACO, volume = "19", number = "4", pages = "53:1--53:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546073", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3546073", abstract = "Error resilience is the primary design concern for safety- and mission-critical applications. Redundant MultiThreading (RMT) is one of the most promising soft and hard \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hartley:2022:JTC, author = "Tim Hartley and Foivos S. 
Zakkak and Andy Nisbet and Christos Kotselidis and Mikel Luj{\'a}n", title = "Just-In-Time Compilation on {ARM} --- a Closer Look at Call-Site Code Consistency", journal = j-TACO, volume = "19", number = "4", pages = "54:1--54:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546568", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3546568", abstract = "The increase in computational capability of low-power Arm architectures has seen them diversify from their more traditional domain of portable battery powered devices into data center servers, personal computers, and even Supercomputers. Thus, managed languages (Java, Javascript, etc.) that require a managed runtime environment (MRE) need to be ported to the Arm architecture, requiring an understanding of different design tradeoffs. This article studies how the lack of strong hardware support for Self Modifying Code (SMC) in low-power architectures (e.g., absence of cache coherence between instruction cache and data caches), affects Just-In-Time (JIT) compilation and runtime behavior in MREs. Specifically, we focus on the implementation and treatment of call-sites, that must maintain code consistency in the face of concurrent execution and modification to redirect control (patching) by the MRE. The lack of coherence, is compounded with the maximum distance (reach of) a call-site can jump to as the reach is more constrained (smaller distance) in Arm when compared with Intel/AMD. We present four different robust implementations for call-sites and discuss their advantages and disadvantages in the absence of strong hardware support for SMC. 
Finally, we evaluate each approach using a microbenchmark, further evaluating the best three techniques using three JVM benchmark suites and the open source MaxineVM showcasing performance differences up to 12\%. Based on these observations, we propose extending code-cache partitioning strategies for JIT compiled code to encourage more efficient local branching for architectures with limited direct branch ranges.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jellum:2022:SSA, author = "Erling Jellum and Milica Orlandi{\'c} and Edmund Brekke and Tor Johansen and Torleiv Bryne", title = "Solving Sparse Assignment Problems on {FPGAs}", journal = j-TACO, volume = "19", number = "4", pages = "55:1--55:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546072", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3546072", abstract = "The assignment problem is a fundamental optimization problem and a crucial part of many systems. For example, in multiple object tracking, the assignment \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2022:PEP, author = "Yuhao Li and Benjamin C. 
Lee", title = "{Phronesis}: Efficient Performance Modeling for High-dimensional Configuration Tuning", journal = j-TACO, volume = "19", number = "4", pages = "56:1--56:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3546868", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3546868", abstract = "We present Phronesis, a learning framework for efficiently modeling the performance of data analytic workloads as a function of their high-dimensional software \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tirumalasetty:2022:RMP, author = "Chandrahas Tirumalasetty and Chih Chieh Chou and Narasimha Reddy and Paul Gratz and Ayman Abouelwafa", title = "Reducing Minor Page Fault Overheads through Enhanced Page Walker", journal = j-TACO, volume = "19", number = "4", pages = "57:1--57:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3547142", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3547142", abstract = "Application virtual memory footprints are growing rapidly in all systems from servers down to smartphones. To address this growing demand, system \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gao:2022:ACM, author = "Lan Gao and Jing Wang and Weigong Zhang", title = "Adaptive Contention Management for Fine-Grained Synchronization on Commodity {GPUs}", journal = j-TACO, volume = "19", number = "4", pages = "58:1--58:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3547301", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3547301", abstract = "As more emerging applications are moving to GPUs, fine-grained synchronization has become imperative. However, their performance can be severely \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Han:2022:CEC, author = "Ruobing Han and Jaewon Lee and Jaewoong Sim and Hyesoon Kim", title = "{COX} : Exposing {CUDA} Warp-level Functions to {CPUs}", journal = j-TACO, volume = "19", number = "4", pages = "59:1--59:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3554736", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3554736", abstract = "As CUDA becomes the de facto programming language among data parallel applications such as high-performance computing or machine learning applications, running CUDA on other platforms becomes a compelling option. 
Although several efforts have attempted to support CUDA on devices other than NVIDIA GPUs, due to extra steps in the translation, the support is always a few years behind CUDA's latest features. In particular, the new CUDA programming model exposes the warp concept in the programming language, which greatly changes the way the CUDA code should be mapped to CPU programs. In this article, hierarchical collapsing that correctly supports CUDA warp-level functions on CPUs is proposed. To verify hierarchical collapsing, we build a framework, COX, that supports executing CUDA source code on the CPU backend. With hierarchical collapsing, 90\% of kernels in CUDA SDK samples can be executed on CPUs, much higher than previous works (68\%). We also evaluate the performance with benchmarks for real applications and show that hierarchical collapsing can generate CPU programs with comparable or even higher performance than previous projects in general.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2022:DAS, author = "Yiding Liu and Xingyao Zhang and Donglin Zhuang and Xin Fu and Shuaiwen Song", title = "{DynamAP}: Architectural Support for Dynamic Graph Traversal on the Automata Processor", journal = j-TACO, volume = "19", number = "4", pages = "60:1--60:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3556976", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3556976", abstract = "Dynamic graph traversals (DGTs) currently are widely used in many important application domains, especially in this big-data era that urgently demands \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zou:2022:PSB, author = "Changwei Zou and Yaoqing Gao and Jingling Xue", title = "Practical Software-Based Shadow Stacks on x86-64", journal = j-TACO, volume = "19", number = "4", pages = "61:1--61:??", month = dec, year = "2022", CODEN = "????", DOI = "https://doi.org/10.1145/3556977", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 8 06:39:05 MST 2022", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3556977", abstract = "Control-Flow Integrity (CFI) techniques focus often on protecting forward edges and assume that backward edges are protected by shadow stacks. However, software-based shadow stacks that can provide performance, security, and compatibility are still hard to obtain, leaving an important security gap on x86-64. In this article, we introduce a simple, efficient, and effective parallel shadow stack design (based on LLVM), FlashStack, for protecting return addresses in single- and multi-threaded programs running under 64-bit Linux on x86-64, with three distinctive features. First, we introduce a novel dual-prologue approach to enable a protected function to thwart the TOCTTOU attacks, which are constructed by Microsoft's red team and lead to the deprecation of Microsoft's RFG. Second, we design a new mapping mechanism, Segment+Rsp-S, to allow the parallel shadow stack to be accessed efficiently while satisfying the constraints of arch\_prctl() and ASLR in 64-bit Linux. Finally, we introduce a lightweight inspection mechanism, SideChannel-K, to harden FlashStack further by detecting entropy-reduction attacks efficiently and protecting the parallel shadow stack effectively with a 10-ms shuffling policy. 
Our evaluation on SPEC CPU2006, Nginx, and Firefox shows that FlashStack can provide high performance, meaningful security, and reasonable compatibility for server- and client-side programs on x86-64.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luinaud:2023:SAD, author = "Thomas Luinaud and J. M. Pierre Langlois and Yvon Savaria", title = "Symbolic Analysis for Data Plane Programs Specialization", journal = j-TACO, volume = "20", number = "1", pages = "1:1--1:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3557727", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3557727", abstract = "Programmable network data planes have extended the capabilities of packet processing in network devices by allowing custom processing pipelines and agnostic packet processing. While a variety of applications can be implemented on current programmable data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shah:2023:BSA, author = "Nilesh Rajendra Shah and Ashitabh Misra and Antoine Min{\'e} and Rakesh Venkat and Ramakrishna Upadrasta", title = "{BullsEye}: Scalable and Accurate Approximation Framework for Cache Miss Calculation", journal = j-TACO, volume = "20", number = "1", pages = "2:1--2:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3558003", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3558003", abstract = "For Affine Control Programs or Static Control Programs (SCoP), symbolic counting of reuse distances could induce polynomials for each reuse pair. These polynomials along with cache capacity constraints lead to non-affine (semi-algebraic) sets; and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Soni:2023:AC, author = "Mitali Soni and Asmita Pal and Joshua {San Miguel}", title = "As-Is Approximate Computing", journal = j-TACO, volume = "20", number = "1", pages = "3:1--3:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3559761", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3559761", abstract = "Although approximate computing promises better performance for applications allowing marginal errors, dearth of hardware support and lack of run-time accuracy guarantees makes it difficult to adopt. 
We present As-Is, an Anytime Speculative Interruptible \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shah:2023:TDS, author = "Parth Shah and Ranjal Gautham Shenoy and Vaidyanathan Srinivasan and Pradip Bose and Alper Buyuktosunoglu", title = "{TokenSmart}: Distributed, Scalable Power Management in the Many-core Era", journal = j-TACO, volume = "20", number = "1", pages = "4:1--4:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3559762", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3559762", abstract = "Centralized power management control systems are hitting a scalability limit. In particular, enforcing a power cap in a many-core system in a performance-friendly manner is quite challenging. Today's on-chip controller reduces the clock speed of compute \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2023:LFH, author = "Zhangyu Chen and Yu Hua and Luochangqi Ding and Bo Ding and Pengfei Zuo and Xue Liu", title = "Lock-Free High-performance Hashing for Persistent Memory via {PM}-aware Holistic Optimization", journal = j-TACO, volume = "20", number = "1", pages = "5:1--5:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561651", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3561651", abstract = "Persistent memory (PM) provides large-scale non-volatile memory (NVM) with DRAM-comparable performance. The non-volatility and other unique characteristics of PM architecture bring new opportunities and challenges for the efficient storage system design. For example, some recent crash-consistent and write-friendly hashing schemes are proposed to provide fast queries for PM systems. However, existing PM hashing indexes suffer from the concurrency bottleneck due to the blocking resizing and expensive lock-based concurrency control for queries. Moreover, the lack of PM awareness and systematical design further increases the query latency. To address the concurrency bottleneck of lock contention in PM hashing, we propose clevel hashing, a lock-free concurrent level hashing scheme that provides non-blocking resizing via background threads and lock-free search/insertion/update/deletion using atomic primitives to enable high concurrency for PM hashing. By exploiting the PM characteristics, we present a holistic approach to building clevel hashing for high throughput and low tail latency via the PM-aware index/allocator co-design. 
The proposed volatile announcement array with a helping mechanism coordinates lock-free insertions and guarantees a strong consistency model. Our experiments using real-world YCSB workloads on Intel Optane DC PMM show that clevel hashing, respectively, achieves up to 5.7x and 1.6x higher throughput than state-of-the-art P-CLHT and Dash while guaranteeing low tail latency, e.g., 1.9x--7.2x speedup for the p99 latency with the insert-only workload.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mastoras:2023:DIN, author = "Aristeidis Mastoras and Sotiris Anagnostidis and Albert-Jan N. Yzelman", title = "Design and Implementation for Nonblocking Execution in {GraphBLAS}: Tradeoffs and Performance", journal = j-TACO, volume = "20", number = "1", pages = "6:1--6:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3561652", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3561652", abstract = "GraphBLAS is a recent standard that allows the expression of graph algorithms in the language of linear algebra and enables automatic code parallelization and optimization. GraphBLAS operations are memory bound and may benefit from data locality \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2023:SSC, author = "Yemao Xu and Dezun Dong and Dongsheng Wang and Shi Xu and Enda Yu and Weixia Xu and Xiangke Liao", title = "{SSD-SGD}: Communication Sparsification for Distributed Deep Learning Training", journal = j-TACO, volume = "20", number = "1", pages = "7:1--7:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3563038", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3563038", abstract = "Intensive communication and synchronization cost for gradients and parameters is the well-known bottleneck of distributed deep learning training. Based on the observations that Synchronous SGD (SSGD) obtains good convergence accuracy while asynchronous \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Olgun:2023:PHE, author = "Ataberk Olgun and Juan G{\'o}mez Luna and Konstantinos Kanellopoulos and Behzad Salami and Hasan Hassan and Oguz Ergin and Onur Mutlu", title = "{PiDRAM}: a Holistic End-to-end {FPGA}-based Framework for Processing-in-{DRAM}", journal = j-TACO, volume = "20", number = "1", pages = "8:1--8:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3563697", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3563697", abstract = "Commodity DRAM-based processing-using-memory (PuM) techniques that are supported by off-the-shelf DRAM chips present an opportunity for alleviating the data movement bottleneck at low cost. However, system integration of these techniques imposes non-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sakalis:2023:DSS, author = "Christos Sakalis and Stefanos Kaxiras and Magnus Sj{\"a}lander", title = "Delay-on-Squash: Stopping Microarchitectural Replay Attacks in Their Tracks", journal = j-TACO, volume = "20", number = "1", pages = "9:1--9:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3563695", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3563695", abstract = "MicroScope and other similar microarchitectural replay attacks take advantage of the characteristics of speculative execution to trap the execution of the victim application in a loop, enabling the attacker to amplify a side-channel attack by executing it \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liang:2023:QRC, author = "Yi Liang and Shaokang Zeng and Lei Wang", title = "Quantifying Resource Contention of Co-located Workloads with the System-level Entropy", journal = j-TACO, volume = "20", number = "1", pages = "10:1--10:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3563696", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3563696", abstract = "The workload co-location, such as deploying offline analysis workloads with online service workloads on the same node, has become common for modern data centers. 
Workload co-location deployment improves data center resource utilization significantly. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Suyeon:2023:FFF, author = "Suyeon Hur and Seongmin Na and Dongup Kwon and Joonsung Kim and Andrew Boutros and Eriko Nurvitadhi and Jangwoo Kim", title = "A Fast and Flexible {FPGA-based} Accelerator for Natural Language Processing Neural Networks", journal = j-TACO, volume = "20", number = "1", pages = "11:1--11:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3564606", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3564606", abstract = "Deep neural networks (DNNs) have become key solutions in the natural language processing (NLP) domain. However, the existing accelerators customized for their narrow target models cannot support diverse NLP models. Therefore, naively running complex NLP \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gondimalla:2023:OOD, author = "Ashish Gondimalla and Jianqiao Liu and Mithuna Thottethodi and T. N. 
Vijaykumar", title = "{Occam}: Optimal Data Reuse for Convolutional Neural Networks", journal = j-TACO, volume = "20", number = "1", pages = "12:1--12:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3566052", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3566052", abstract = "Convolutional neural networks (CNNs) are emerging as powerful tools for image processing in important commercial applications. We focus on the important problem of improving the latency of image recognition. While CNNs are highly amenable to prefetching \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Peng:2023:FPS, author = "Bo Peng and Yaozu Dong and Jianguo Yao and Fengguang Wu and Haibing Guan", title = "{FlexHM}: a Practical System for Heterogeneous Memory with Flexible and Efficient Performance Optimizations", journal = j-TACO, volume = "20", number = "1", pages = "13:1--13:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3565885", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3565885", abstract = "With the rapid development of cloud computing, numerous cloud services, containers, and virtual machines have been bringing tremendous demands on high-performance memory resources to modern data centers. Heterogeneous memory, especially the newly released \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2023:RRB, author = "Qiang Zhang and Lei Xu and Baowen Xu", title = "{RegCPython}: a Register-based {Python} Interpreter for Better Performance", journal = j-TACO, volume = "20", number = "1", pages = "14:1--14:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3568973", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3568973", abstract = "Interpreters are widely used in the implementation of many programming languages, such as Python, Perl, and Java. Even though various JIT compilers emerge in an endless stream, interpretation efficiency still plays a critical role in program performance. Does a stack-based interpreter or a register-based interpreter perform better? The pros and cons of the pair of architectures have long been discussed. The stack architecture is attractive for its concise model and compact bytecode, but our study finds that the register-based interpreter can also be implemented easily and that its bytecode size only grows by a small margin. Moreover, the latter turns out to be appreciably faster. Specifically, we implemented an open source Python interpreter named RegCPython based on CPython v3.10.1. The former is register based, while the latter is stack based. Without changes in syntax, Application Programming Interface, and Application Binary Interface, RegCPython is excellently compatible with CPython, as it does not break existing syntax or interfaces. It achieves a speedup of 1.287 on the most favorable benchmark and 0.977 even on the most unfavorable benchmark. 
For all Python-intensive benchmarks, the average speedup reaches 1.120 on x86 and 1.130 on ARM. Our evaluation work, which also serves as an empirical study, provides a detailed performance survey of both interpreters on modern hardware. It points out that the register-based interpreters are more efficient mainly due to the elimination of machine instructions needed, while changes in branch mispredictions and cache misses have a limited impact on performance. Additionally, it confirms that the register-based implementation is also satisfactory in terms of memory footprint, compilation cost, and implementation complexity.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jin:2023:SBS, author = "Hai Jin and Zhuo He and Weizhong Qiang", title = "{SpecTerminator}: Blocking Speculative Side Channels Based on Instruction Classes on {RISC-V}", journal = j-TACO, volume = "20", number = "1", pages = "15:1--15:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3566053", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/risc-v.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3566053", abstract = "In modern processors, speculative execution has significantly improved the performance of processors, but it has also introduced speculative execution vulnerabilities. 
Recent defenses are based on the delayed execution to block various speculative side channels, but we show that several of the current state-of-the-art defenses fail to block some of the available speculative side channels, and the current most secure defense introduces a performance overhead of up to 24.5\%.\par We propose SpecTerminator, the first defense framework based on instruction classes that can comprehensively and precisely block all existing speculative side channels. In SpecTerminator, a novel speculative side channel classification scheme based on the features of secret transmission is proposed, and the sensitive instructions in the speculative window are classified and identified using optimized hardware taint tracking and instruction masking techniques to accurately determine the scope of leakage. Then, according to the execution characteristics of these instructions, dedicated delayed execution strategies, such as TLB request ignoring, selective issue, and extended delay-on-miss, are designed for each type of sensitive instruction to precisely control that these instructions are delayed only in pipeline stages that are at risk of leakage. In contrast to previous defenses based on the Gem5 simulator, we have innovatively implemented defenses against Spectre attacks based on the open-source instruction set RISC-V on an FPGA-accelerated simulation platform that is more similar to real hardware. To evaluate the security of SpecTerminator, we have replicated various existing x86-based Spectre variants on RISC-V. On SPEC 2006, SpecTerminator defends against Spectre attacks based on memory hierarchy side channels with a performance overhead of 2.6\% and against all existing Spectre attacks with a performance overhead of 6.0\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2023:PSC, author = "Tuowen Zhao and Tobi Popoola and Mary Hall and Catherine Olschanowsky and Michelle Strout", title = "Polyhedral Specification and Code Generation of Sparse Tensor Contraction with Co-iteration", journal = j-TACO, volume = "20", number = "1", pages = "16:1--16:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3566054", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3566054", abstract = "This article presents a code generator for sparse tensor contraction computations. It leverages a mathematical representation of loop nest computations in the sparse polyhedral framework (SPF), which extends the polyhedral model to support non-affine \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Schuler:2023:XOT, author = "Manuela Schuler and Richard Membarth and Philipp Slusallek", title = "{XEngine}: Optimal Tensor Rematerialization for Neural Networks in Heterogeneous Environments", journal = j-TACO, volume = "20", number = "1", pages = "17:1--17:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3568956", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3568956", abstract = "Memory efficiency is crucial in training deep learning networks on resource-restricted devices. 
During backpropagation, forward tensors are used to calculate gradients. Despite the option of keeping those dependencies in memory until they are reused in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Korostelev:2023:YCL, author = "Ivan Korostelev and Jo{\~a}o P. L. {De Carvalho} and Jos{\'e} Moreira and Jos{\'e} Nelson Amaral", title = "{YaConv}: Convolution with Low Cache Footprint", journal = j-TACO, volume = "20", number = "1", pages = "18:1--18:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570305", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3570305", abstract = "This article introduces YaConv, a new algorithm to compute convolution using GEMM microkernels from a Basic Linear Algebra Subprograms library that is efficient for multiple CPU architectures. Previous approaches either create a copy of each image element \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Eris:2023:PRF, author = "Furkan Eris and Marcia Louis and Kubra Eris and Jos{\'e} Abell{\'a}n and Ajay Joshi", title = "{Puppeteer}: a Random Forest Based Manager for Hardware Prefetchers Across the Memory Hierarchy", journal = j-TACO, volume = "20", number = "1", pages = "19:1--19:??", month = mar, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570304", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 17 06:54:21 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3570304", abstract = "Over the years, processor throughput has steadily increased. However, the memory throughput has not increased at the same rate, which has led to the memory wall problem in turn increasing the gap between effective and theoretical peak processor \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tollenaere:2023:ACE, author = "Nicolas Tollenaere and Guillaume Iooss and St{\'e}phane Pouget and Hugo Brunie and Christophe Guillon and Albert Cohen and P. 
Sadayappan and Fabrice Rastello", title = "Autotuning Convolutions Is Easier Than You Think", journal = j-TACO, volume = "20", number = "2", pages = "20:1--20:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3570641", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3570641", abstract = "A wide range of scientific and machine learning applications depend on highly optimized implementations of tensor computations. Exploiting the full capacity of a given processor architecture remains a challenging task, due to the complexity of the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Perez:2023:UDO, author = "V{\'\i}ctor P{\'e}rez and Lukas Sommer and Victor Lom{\"u}ller and Kumudha Narasimhan and Mehdi Goli", title = "User-driven Online Kernel Fusion for {SYCL}", journal = j-TACO, volume = "20", number = "2", pages = "21:1--21:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571284", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3571284", abstract = "Heterogeneous programming models are becoming increasingly popular to support the ever-evolving hardware architectures, especially for new and emerging specialized accelerators optimizing specific tasks. While such programs provide performance portability \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Espindola:2023:SMR, author = "Vinicius Espindola and Luciano Zago and Herv{\'e} Yviquel and Guido Araujo", title = "Source Matching and Rewriting for {MLIR} Using String-Based Automata", journal = j-TACO, volume = "20", number = "2", pages = "22:1--22:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571283", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3571283", abstract = "A typical compiler flow relies on a uni-directional sequence of translation/optimization steps that lower the program abstract representation, making it hard to preserve higher-level program information across each transformation step. On the other hand, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ma:2023:OFM, author = "Wenjing Ma and Fangfang Liu and Daokun Chen and Qinglin Lu and Yi Hu and Hongsen Wang and Xinhui Yuan", title = "An Optimized Framework for Matrix Factorization on the New {Sunway} Many-core Platform", journal = j-TACO, volume = "20", number = "2", pages = "23:1--23:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3571856", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3571856", abstract = "Matrix factorization functions are used in many areas and often play an important role in the overall performance of the applications. In the LAPACK library, matrix factorization functions are implemented with blocked factorization algorithm, shifting \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Singh:2023:HHP, author = "Sarabjeet Singh and Neelam Surana and Kailash Prasad and Pranjali Jain and Joycee Mekie and Manu Awasthi", title = "{HyGain}: High-performance, Energy-efficient Hybrid Gain Cell-based Cache Hierarchy", journal = j-TACO, volume = "20", number = "2", pages = "24:1--24:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572839", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3572839", abstract = "In this article, we propose a ``full-stack'' solution to designing high-capacity and low-latency on-chip cache hierarchies by starting at the circuit level of the hardware design stack. We propose a novel half V $_{DD}$ precharge 2T Gain Cell (GC) design for the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mummidi:2023:AAC, author = "Chandra Sekhar Mummidi and Sandip Kundu", title = "{ACTION}: Adaptive Cache Block Migration in Distributed Cache Architectures", journal = j-TACO, volume = "20", number = "2", pages = "25:1--25:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572911", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3572911", abstract = "Chip multiprocessors (CMP) with more cores have more traffic to the last-level cache (LLC).
Without a corresponding increase in LLC bandwidth, such traffic cannot be sustained, resulting in performance degradation. Previous research focused on data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2023:UBC, author = "Qiaoyi Liu and Jeff Setter and Dillon Huff and Maxwell Strange and Kathleen Feng and Mark Horowitz and Priyanka Raina and Fredrik Kjolstad", title = "Unified Buffer: Compiling Image Processing and Machine Learning Applications to Push-Memory Accelerators", journal = j-TACO, volume = "20", number = "2", pages = "26:1--26:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572908", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3572908", abstract = "Image processing and machine learning applications benefit tremendously from hardware acceleration. Existing compilers target either FPGAs, which sacrifice power and performance for programmability, or ASICs, which become obsolete as applications change. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yuzuguler:2023:SSA, author = "Ahmet Caner Y{\"u}z{\"u}g{\"u}ler and Canberk S{\"o}nmez and Mario Drumond and Yunho Oh and Babak Falsafi and Pascal Frossard", title = "Scale-out Systolic Arrays", journal = j-TACO, volume = "20", number = "2", pages = "27:1--27:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3572917", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3572917", abstract = "Multi-pod systolic arrays are emerging as the architecture of choice in DNN inference accelerators. Despite their potential, designing multi-pod systolic arrays to maximize effective throughput/Watt --- i.e., throughput/Watt adjusted when accounting for array \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Minervini:2023:VAE, author = "Francesco Minervini and Oscar Palomar and Osman Unsal and Enrico Reggiani and Josue Quiroga and Joan Marimon and Carlos Rojas and Roger Figueras and Abraham Ruiz and Alberto Gonzalez and Jonnatan Mendoza and Ivan Vargas and C{\'e}sar Hernandez and Joan Cabre and Lina Khoirunisya and Mustapha Bouhali and Julian Pavon and Francesc Moll and Mauro Olivieri and Mario Kovac and Mate Kovac and Leon Dragic and Mateo Valero and Adrian Cristal", title = "{Vitruvius+}: an Area-Efficient {RISC-V} Decoupled Vector Coprocessor for High Performance Computing Applications", journal = j-TACO, volume = "20", number = "2", pages = "28:1--28:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3575861", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/risc-v.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3575861", abstract = "The maturity level of RISC-V and the availability of domain-specific instruction set extensions, like vector processing, make RISC-V a good candidate for supporting the integration of specialized hardware in processor cores for the High Performance \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Benmeziane:2023:MOH, author = "Hadjer Benmeziane and Hamza Ouarnoughi and Kaoutar {El Maghraoui} and Smail Niar", title = "Multi-objective Hardware-aware Neural Architecture Search with {Pareto} Rank-preserving Surrogate Models", journal = j-TACO, volume = "20", number = "2", pages = "29:1--29:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579853", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3579853", abstract = "Deep learning (DL) models such as convolutional neural networks (ConvNets) are being deployed to solve various computer vision and natural language processing tasks at the edge. It is a challenge to find the right DL architecture that simultaneously meets \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2023:FFA, author = "Dongwei Chen and Dong Tong and Chun Yang and Jiangfang Yi and Xu Cheng", title = "{FlexPointer}: Fast Address Translation Based on Range {TLB} and Tagged Pointers", journal = j-TACO, volume = "20", number = "2", pages = "30:1--30:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3579854", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3579854", abstract = "Page-based virtual memory relies on TLBs to accelerate the address translation. 
Nowadays, the gap between application workloads and the capacity of TLB continues to grow, bringing many costly TLB misses and making the TLB a performance bottleneck. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Du:2023:FOS, author = "Jingwen Du and Fang Wang and Dan Feng and Changchen Gan and Yuchao Cao and Xiaomin Zou and Fan Li", title = "Fast One-Sided {RDMA}-Based State Machine Replication for Disaggregated Memory", journal = j-TACO, volume = "20", number = "2", pages = "31:1--31:??", month = jun, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3587096", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jun 10 08:08:06 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3587096", abstract = "Disaggregated memory architecture has risen in popularity for large datacenters with the advantage of improved resource utilization, failure isolation, and elasticity. Replicated state machines (RSMs) have been extensively used for reliability and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sahni:2023:AAS, author = "Abdul Rasheed Sahni and Hamza Omar and Usman Ali and Omer Khan", title = "{ASM}: an Adaptive Secure Multicore for Co-located Mutually Distrusting Processes", journal = j-TACO, volume = "20", number = "3", pages = "32:1--32:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3587480", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3587480", abstract = "With the ever-increasing virtualization of software and hardware, the privacy of user-sensitive data is a fundamental concern in computation outsourcing. Secure processors enable a trusted execution environment to guarantee security properties based on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Puthoor:2023:TBS, author = "Sooraj Puthoor and Mikko H. Lipasti", title = "Turn-based Spatiotemporal Coherence for {GPUs}", journal = j-TACO, volume = "20", number = "3", pages = "33:1--33:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3593054", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3593054", abstract = "This article introduces turn-based spatiotemporal coherence. Spatiotemporal coherence is a novel coherence implementation that assigns write permission to epochs (or turns) as opposed to a processor core. 
This paradigm shift in the assignment of write \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2023:JOJ, author = "Ruobing Chen and Haosen Shi and Jinping Wu and Yusen Li and Xiaoguang Liu and Gang Wang", title = "Jointly Optimizing Job Assignment and Resource Partitioning for Improving System Throughput in Cloud Datacenters", journal = j-TACO, volume = "20", number = "3", pages = "34:1--34:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3593055", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3593055", abstract = "Colocating multiple jobs on the same server has been widely applied for improving resource utilization in cloud datacenters. However, the colocated jobs would contend for the shared resources, which could lead to significant performance degradation. An \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ravi:2023:TMA, author = "Gokul Subramanian Ravi and Tushar Krishna and Mikko Lipasti", title = "{TNT}: a Modular Approach to Traversing Physically Heterogeneous {NOCs} at Bare-wire Latency", journal = j-TACO, volume = "20", number = "3", pages = "35:1--35:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3597611", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3597611", abstract = "The ideal latency for on-chip network traversal would be the delay incurred from wire traversal alone. Unfortunately, in a realistic modular network, the latency for a packet to traverse the network is significantly higher than this wire delay. The main \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2023:ACN, author = "Weizhi Xu and Yintai Sun and Shengyu Fan and Hui Yu and Xin Fu", title = "Accelerating Convolutional Neural Network by Exploiting Sparsity on {GPUs}", journal = j-TACO, volume = "20", number = "3", pages = "36:1--36:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3600092", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3600092", abstract = "The convolutional neural network (CNN) is an important deep learning method, which is widely used in many fields. 
However, it is very time consuming to implement the CNN where convolution usually takes most of the time. There are many zero values in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2023:GED, author = "Jin Zhao and Yu Zhang and Ligang He and Qikun Li and Xiang Zhang and Xinyu Jiang and Hui Yu and Xiaofei Liao and Hai Jin and Lin Gu and Haikun Liu and Bingsheng He and Ji Zhang and Xianzheng Song and Lin Wang and Jun Zhou", title = "{GraphTune}: an Efficient Dependency-Aware Substrate to Alleviate Irregularity in Concurrent Graph Processing", journal = j-TACO, volume = "20", number = "3", pages = "37:1--37:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3600091", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3600091", abstract = "With the increasing need for graph analysis, massive Concurrent iterative Graph Processing (CGP) jobs are usually performed on the common large-scale real-world graph. Although several solutions have been proposed, these CGP jobs are not coordinated with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2023:IPS, author = "Yufeng Zhou and Alan L. 
Cox and Sandhya Dwarkadas and Xiaowan Dong", title = "The Impact of Page Size and Microarchitecture on Instruction Address Translation Overhead", journal = j-TACO, volume = "20", number = "3", pages = "38:1--38:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3600089", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3600089", abstract = "As the volume of data processed by applications has increased, considerable attention has been paid to data address translation overheads, leading to the widespread use of larger page sizes (``superpages'') and multi-level translation lookaside buffers \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Reber:2023:CPS, author = "Benjamin Reber and Matthew Gould and Alexander H. Kneipp and Fangzhou Liu and Ian Prechtl and Chen Ding and Linlin Chen and Dorin Patru", title = "Cache Programming for Scientific Loops Using Leases", journal = j-TACO, volume = "20", number = "3", pages = "39:1--39:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3600090", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3600090", abstract = "Cache management is important in exploiting locality and reducing data movement. This article studies a new type of programmable cache called the lease cache. By assigning leases, software exerts the primary control on when and how long data stays in the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xie:2023:MMC, author = "Xinfeng Xie and Peng Gu and Yufei Ding and Dimin Niu and Hongzhong Zheng and Yuan Xie", title = "{MPU}: Memory-centric {SIMT} Processor via In-{DRAM} Near-bank Computing", journal = j-TACO, volume = "20", number = "3", pages = "40:1--40:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603113", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3603113", abstract = "With the growing number of data-intensive workloads, GPU, which is the state-of-the-art single-instruction-multiple-thread (SIMT) processor, is hindered by the memory bandwidth wall. To alleviate this bottleneck, previously proposed 3D-stacking near-bank \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Krolik:2023:RFQ, author = "Alexander Krolik and Clark Verbrugge and Laurie Hendren", title = "{rNdN}: Fast Query Compilation for {NVIDIA GPUs}", journal = j-TACO, volume = "20", number = "3", pages = "41:1--41:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3603503", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3603503", abstract = "GPU database systems are an effective solution to query optimization, particularly with compilation and data caching. 
They fall short, however, in end-to-end workloads, as existing compiler toolchains are too expensive for use with short-running queries. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2023:HMP, author = "Jiazhi Jiang and Zijian Huang and Dan Huang and Jiangsu Du and Lin Chen and Ziguan Chen and Yutong Lu", title = "Hierarchical Model Parallelism for Optimizing Inference on Many-core Processor via Decoupled {$3$D-CNN} Structure", journal = j-TACO, volume = "20", number = "3", pages = "42:1--42:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3605149", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3605149", abstract = "The tremendous success of convolutional neural network (CNN) has made it ubiquitous in many fields of human endeavor. Many applications such as biomedical analysis and scientific data analysis involve analyzing volumetric data. This spawns huge demand for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2023:MGA, author = "Yuwen Zhao and Fangfang Liu and Wenjing Ma and Huiyuan Li and Yuanchi Peng and Cui Wang", title = "{MFFT}: a {GPU} Accelerated Highly Efficient Mixed-Precision Large-Scale {FFT} Framework", journal = j-TACO, volume = "20", number = "3", pages = "43:1--43:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3605148", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3605148", abstract = "Fast Fourier transform (FFT) is widely used in computing applications in large-scale parallel programs, and data communication is the main performance bottleneck of FFT and seriously affects its parallel efficiency. To tackle this problem, we propose a new large-scale FFT framework, MFFT, which optimizes parallel FFT with a new mixed-precision optimization technique, adopting the ``high precision computation, low precision communication'' strategy. To enable ``low precision communication'', we propose a shared-exponent floating-point number compression technique, which reduces the volume of data communication, while maintaining higher accuracy. In addition, we apply a two-phase normalization technique to further reduce the round-off error. Based on the mixed-precision MFFT framework, we apply several optimization techniques to improve the performance, such as streaming of GPU kernels, MPI message combination, kernel optimization, and memory optimization. We evaluate MFFT on a system with 4,096 GPUs. 
The results show that shared-exponent MFFT is $ 1.23 \times $ faster than that of double-precision MFFT on average, and double-precision MFFT achieves performance $ 3.53 \times $ and $ 9.48 \times $ on average higher than open source library 2Decomp\&FFT (CPU-based version) and heFFTe (AMD GPU-based version), respectively. The parallel efficiency of double-precision MFFT increased from 53.2\% to 78.1\% compared with 2Decomp\&FFT, and shared-exponent MFFT further increases the parallel efficiency to 83.8\%.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Azhar:2023:ARR, author = "Muhammad Waqar Azhar and Madhavan Manivannan and Per Stenstr{\"o}m", title = "{Approx-RM}: Reducing Energy on Heterogeneous Multicore Processors under Accuracy and Timing Constraints", journal = j-TACO, volume = "20", number = "3", pages = "44:1--44:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3605214", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3605214", abstract = "Reducing energy consumption while providing performance and quality guarantees is crucial for computing systems ranging from battery-powered embedded systems to data centers. This article considers approximate iterative applications executing on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Huang:2023:STE, author = "Dong Huang and Dan Feng and Qiankun Liu and Bo Ding and Wei Zhao and Xueliang Wei and Wei Tong", title = "{SplitZNS}: Towards an Efficient {LSM}-Tree on Zoned Namespace {SSDs}", journal = j-TACO, volume = "20", number = "3", pages = "45:1--45:??", month = sep, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3608476", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 10 07:14:56 MDT 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3608476", abstract = "The Zoned Namespace (ZNS) Solid State Drive (SSD) is a nascent form of storage device that offers novel prospects for the Log Structured Merge Tree (LSM-tree). ZNS exposes erase blocks in SSD as append-only zones, enabling the LSM-tree to gain awareness \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Du:2023:ICM, author = "Jiangsu Du and Jiazhi Jiang and Jiang Zheng and Hongbin Zhang and Dan Huang and Yutong Lu", title = "Improving Computation and Memory Efficiency for Real-world {Transformer} Inference on {GPUs}", journal = j-TACO, volume = "20", number = "4", pages = "46:1--46:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3617689", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3617689", abstract = "Transformer models have emerged as a leading approach in the field of natural language processing (NLP) and are increasingly being deployed in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jin:2023:CTC, author = "Hai Jin and Bo Lei and Haikun Liu and Xiaofei Liao and Zhuohui Duan and Chencheng Ye and Yu Zhang", title = "A Compilation Tool for Computation Offloading in {ReRAM}-based {CIM} Architectures", journal = j-TACO, volume = "20", number = "4", pages = "47:1--47:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3617686", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3617686", abstract = "Computing-in-Memory (CIM) architectures using Non-volatile Memories (NVMs) have emerged as a promising way to address the ``memory wall'' problem in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Menard:2023:HPD, author = "Christian Menard and Marten Lohstroh and Soroush Bateni and Matthew Chorlian and Arthur Deng and Peter Donovan and Cl{\'e}ment Fournier and Shaokai Lin and Felix Suchert and Tassilo Tanneberger and Hokeun Kim and Jeronimo Castrillon and Edward A. Lee", title = "High-performance Deterministic Concurrency Using {Lingua Franca}", journal = j-TACO, volume = "20", number = "4", pages = "48:1--48:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3617687", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3617687", abstract = "Actor frameworks and similar reactive programming techniques are widely used for building concurrent systems. They promise to be efficient and scale well to a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2023:SDM, author = "Donglei Wu and Weihao Yang and Xiangyu Zou and Wen Xia and Shiyi Li and Zhenbo Hu and Weizhe Zhang and Binxing Fang", title = "{Smart-DNN+}: a Memory-efficient Neural Networks Compression Framework for the Model Inference", journal = j-TACO, volume = "20", number = "4", pages = "49:1--49:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3617688", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3617688", abstract = "Deep Neural Networks (DNNs) have achieved remarkable success in various real-world applications. However, running a Deep Neural Network (DNN) typically \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tariq:2023:MCA, author = "Syed Salauddin Mohammad Tariq and Lance Menard and Pengfei Su and Probir Roy", title = "{MicroProf}: Code-level Attribution of Unnecessary Data Transfer in Microservice Applications", journal = j-TACO, volume = "20", number = "4", pages = "50:1--50:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3622787", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3622787", abstract = "The microservice architecture style has gained popularity due to its ability to fault isolation, ease of scaling applications, and developer's agility. 
However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2023:GGM, author = "Shiyi Li and Qiang Cao and Shenggang Wan and Wen Xia and Changsheng Xie", title = "{gPPM}: a Generalized Matrix Operation and Parallel Algorithm to Accelerate the Encoding\slash Decoding Process of Erasure Codes", journal = j-TACO, volume = "20", number = "4", pages = "51:1--51:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3625005", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3625005", abstract = "Erasure codes are widely deployed in modern storage systems, leading to frequent usage of their encoding/decoding operations. The encoding/decoding process for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Anastasiadis:2023:PPA, author = "Petros Anastasiadis and Nikela Papadopoulou and Georgios Goumas and Nectarios Koziris and Dennis Hoppe and Li Zhong", title = "{PARALiA}: a Performance Aware Runtime for Auto-tuning Linear Algebra on Heterogeneous Systems", journal = j-TACO, volume = "20", number = "4", pages = "52:1--52:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3624569", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3624569", abstract = "Dense linear algebra operations appear very frequently in high-performance computing (HPC) applications, rendering their performance crucial to achieve \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yu:2023:RER, author = "Hui Yu and Yu Zhang and Jin Zhao and Yujian Liao and Zhiying Huang and Donghao He and Lin Gu and Hai Jin and Xiaofei Liao and Haikun Liu and Bingsheng He and Jianhui Yue", title = "{RACE}: an Efficient Redundancy-aware Accelerator for Dynamic Graph Neural Network", journal = j-TACO, volume = "20", number = "4", pages = "53:1--53:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3617685", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3617685", abstract = "Dynamic Graph Neural Network (DGNN) has recently attracted a significant amount of research attention from various domains, because most real-world graphs \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ferrari:2023:ADC, author = "Victor Ferrari and Rafael Sousa and Marcio Pereira and Jo{\~a}o P. L. 
{De Carvalho} and Jos{\'e} Nelson Amaral and Jos{\'e} Moreira and Guido Araujo", title = "Advancing Direct Convolution Using Convolution Slicing Optimization and {ISA} Extensions", journal = j-TACO, volume = "20", number = "4", pages = "54:1--54:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3625004", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3625004", abstract = "Convolution is one of the most computationally intensive operations that must be performed for machine learning model inference. A traditional \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{He:2023:DLS, author = "Bowen He and Xiao Zheng and Yuan Chen and Weinan Li and Yajin Zhou and Xin Long and Pengcheng Zhang and Xiaowei Lu and Linquan Jiang and Qiang Liu and Dennis Cai and Xiantao Zhang", title = "{DxPU}: Large-scale Disaggregated {GPU} Pools in the Datacenter", journal = j-TACO, volume = "20", number = "4", pages = "55:1--55:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3617995", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3617995", abstract = "The rapid adoption of AI and convenience offered by cloud services have resulted in the growing demands for GPUs in the cloud. Generally, GPUs are physically \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2023:CMC, author = "Shiqing Zhang and Mahmood Naderan-Tahan and Magnus Jahre and Lieven Eeckhout", title = "Characterizing Multi-Chip {GPU} Data Sharing", journal = j-TACO, volume = "20", number = "4", pages = "56:1--56:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3629521", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3629521", abstract = "Multi-chip Graphics Processing Unit (GPU) systems are critical to scale performance beyond a single GPU chip for a wide variety of important emerging \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Domke:2023:LPQ, author = "Jens Domke and Emil Vatai and Balazs Gerofi and Yuetsu Kodama and Mohamed Wahib and Artur Podobas and Sparsh Mittal and Miquel Peric{\`a}s and Lingqi Zhang and Peng Chen and Aleksandr Drozd and Satoshi Matsuoka", title = "At the Locus of Performance: Quantifying the Effects of Copious {$3$D}-Stacked Cache on {HPC} Workloads", journal = j-TACO, volume = "20", number = "4", pages = "57:1--57:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3629520", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3629520", abstract = "Over the last three decades, innovations in the memory subsystem were primarily targeted at overcoming the data movement bottleneck. 
In this paper, we focus \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Badri:2023:MPE, author = "Satya Jaswanth Badri and Mukesh Saini and Neeraj Goel", title = "{Mapi-Pro}: an Energy Efficient Memory Mapping Technique for Intermittent Computing", journal = j-TACO, volume = "20", number = "4", pages = "58:1--58:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3629524", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3629524", abstract = "Battery-less technology evolved to replace battery usage in space, deep mines, and other environments to reduce cost and pollution. Non-volatile memory \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yu:2023:MFE, author = "Miao Yu and Tingting Xiang and Venkata Pavan Kumar Miriyala and Trevor E. Carlson", title = "{Multiply-and-Fire}: an Event-Driven Sparse Neural Network Accelerator", journal = j-TACO, volume = "20", number = "4", pages = "59:1--59:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3630255", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3630255", abstract = "Deep neural network inference has become a vital workload for many systems from edge-based computing to data centers. 
To reduce the performance and power \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Choudhury:2023:FAI, author = "Ziaul Choudhury and Anish Gulati and Suresh Purini", title = "{FlowPix}: Accelerating Image Processing Pipelines on an {FPGA} Overlay using a Domain Specific Compiler", journal = j-TACO, volume = "20", number = "4", pages = "60:1--60:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3629523", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3629523", abstract = "The exponential performance growth guaranteed by Moore's law has started to taper in recent years. At the same time, emerging applications like image \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Susskind:2023:UNA, author = "Zachary Susskind and Aman Arora and Igor D. S. Miranda and Alan T. L. Bacellar and Luis A. Q. Villon and Rafael F. Katopodis and Leandro S. de Ara{\'u}jo and Diego L. C. Dutra and Priscila M. V. Lima and Felipe M. G. Fran{\c{c}}a and Mauricio {Breternitz Jr.} and Lizy K. 
John", title = "{ULEEN}: a Novel Architecture for Ultra-low-energy Edge Neural Networks", journal = j-TACO, volume = "20", number = "4", pages = "61:1--61:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3629522", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3629522", abstract = "``Extreme edge'' devices, such as smart sensors, are a uniquely challenging environment for the deployment of machine learning. The tiny energy budgets \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wei:2023:FOT, author = "Jia Wei and Xingjun Zhang and Longxiang Wang and Zheng Wei", title = "{Fastensor}: Optimise the {Tensor} {I/O} Path from {SSD} to {GPU} for Deep Learning Training", journal = j-TACO, volume = "20", number = "4", pages = "62:1--62:??", month = dec, year = "2023", CODEN = "????", DOI = "https://doi.org/10.1145/3630108", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Dec 21 10:29:36 MST 2023", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3630108", abstract = "In recent years, benefiting from the increase in model size and complexity, deep learning has achieved tremendous success in computer vision (CV) and natural language processing (NLP). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "62", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luo:2024:CDB, author = "Longfei Luo and Dingcui Yu and Yina Lv and Liang Shi", title = "Critical Data Backup with Hybrid Flash-Based Consumer Devices", journal = j-TACO, volume = "21", number = "1", pages = "1:1--1:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631529", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3631529", abstract = "Hybrid flash-based storage constructed with high-density and low-cost flash memory has become increasingly popular in consumer devices in the last decade due to its low cost. However, its poor reliability is one of the major concerns. To protect critical \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2024:DOO, author = "Peng Chen and Hui Chen and Weichen Liu and Linbo Long and Wanli Chang and Nan Guan", title = "{DAG-Order}: an Order-Based Dynamic {DAG} Scheduling for Real-Time Networks-on-Chip", journal = j-TACO, volume = "21", number = "1", pages = "2:1--2:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631527", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3631527", abstract = "With the high-performance requirement of safety-critical real-time tasks, the platforms of many-core processors with high parallelism are widely utilized, where network-on-chip (NoC) is generally employed for inter-core communication due to its \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2024:JRG, author = "Zhang Jiang and Ying Chen and Xiaoli Gong and Jin Zhang and Wenwen Wang and Pen-Chung Yew", title = "{JiuJITsu}: Removing Gadgets with Safe Register Allocation for {JIT} Code Generation", journal = j-TACO, volume = "21", number = "1", pages = "3:1--3:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631526", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3631526", abstract = "Code-reuse attacks have the capability to craft malicious instructions from small code fragments, commonly referred to as ``gadgets.'' These gadgets are generated by JIT (Just-In-Time) engines as integral components of native instructions, with the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tayeb:2024:AAV, author = "Hayfa Tayeb and Ludovic Paillat and B{\'e}renger Bramas", title = "{Autovesk}: Automatic Vectorized Code Generation from Unstructured Static Kernels Using Graph Transformations", journal = j-TACO, volume = "21", number = "1", pages = "4:1--4:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631709", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3631709", abstract = "Leveraging the SIMD capability of modern CPU architectures is mandatory to take full advantage of their increased performance. 
To exploit this capability, binary executables must be vectorized, either manually by developers or automatically by a tool. For \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2024:FCM, author = "Xueying Wang and Guangli Li and Zhen Jia and Xiaobing Feng and Yida Wang", title = "Fast Convolution Meets Low Precision: Exploring Efficient Quantized {Winograd} Convolution on Modern {CPUs}", journal = j-TACO, volume = "21", number = "1", pages = "5:1--5:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632956", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3632956", abstract = "Low-precision computation has emerged as one of the most effective techniques for accelerating convolutional neural networks and has garnered widespread support on modern hardware. Despite its effectiveness in accelerating convolutional neural networks, low-precision computation has not been commonly applied to fast convolutions, such as the Winograd algorithm, due to numerical issues. In this article, we propose an effective quantized Winograd convolution, named LoWino, which employs an in-side quantization method in the Winograd domain to reduce the precision loss caused by transformations. Meanwhile, we present an efficient implementation that integrates well-designed optimization techniques, allowing us to fully exploit the capabilities of low-precision computation on modern CPUs. We evaluate LoWino on two Intel Xeon Scalable Processor platforms with representative convolutional layers and neural network models. 
The experimental results demonstrate that our approach can achieve an average of $ 1.84 \times $ and $ 1.91 \times $ operator speedups over state-of-the-art implementations in the vendor library while preserving accuracy loss at a reasonable level.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fan:2024:QPQ, author = "Hao Fan and Yiliang Ye and Shadi Ibrahim and Zhuo Huang and Xingru Li and Weibin Xue and Song Wu and Chen Yu and Xuanhua Shi and Hai Jin", title = "{QoS-pro}: a {QoS}-enhanced Transaction Processing Framework for Shared {SSDs}", journal = j-TACO, volume = "21", number = "1", pages = "6:1--6:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632955", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3632955", abstract = "Solid State Drives (SSDs) are widely used in data-intensive scenarios due to their high performance and decreasing cost. However, in shared environments, concurrent workloads can interfere with each other, leading to a violation of Quality of Service (QoS). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2024:SUE, author = "Yunping Zhao and Sheng Ma and Heng Liu and Libo Huang and Yi Dai", title = "{SAC}: an Ultra-Efficient Spin-based Architecture for Compressed {DNNs}", journal = j-TACO, volume = "21", number = "1", pages = "7:1--7:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632957", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3632957", abstract = "Deep Neural Networks (DNNs) have achieved great progress in academia and industry. But they have become computational and memory intensive with the increase of network depth. Previous designs seek breakthroughs in software and hardware levels to mitigate \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2024:ECP, author = "Tong-Yu Liu and Jianmei Guo and Bo Huang", title = "Efficient Cross-platform Multiplexing of Hardware Performance Counters via Adaptive Grouping", journal = j-TACO, volume = "21", number = "1", pages = "8:1--8:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3629525", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3629525", abstract = "Collecting sufficient microarchitecture performance data is essential for performance evaluation and workload characterization. 
There are many events to be monitored in a modern processor while only a few hardware performance monitoring counters (PMCs) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2024:QHQ, author = "Lei Liu and Xinglei Dou", title = "{QuCloud+}: a Holistic Qubit Mapping Scheme for Single\slash Multi-programming on {$2$D\slash $3$D NISQ} Quantum Computers", journal = j-TACO, volume = "21", number = "1", pages = "9:1--9:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631525", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3631525", abstract = "Qubit mapping for NISQ superconducting quantum computers is essential to fidelity and resource utilization. The existing qubit mapping schemes meet challenges, e.g., crosstalk, SWAP overheads, diverse device topologies, etc., leading to qubit resource \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2024:AAM, author = "Lingxi Wu and Minxuan Zhou and Weihong Xu and Ashish Venkat and Tajana Rosing and Kevin Skadron", title = "{Abakus}: Accelerating $k$-mer Counting with Storage Technology", journal = j-TACO, volume = "21", number = "1", pages = "10:1--10:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632952", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3632952", abstract = "This work seeks to leverage Processing-with-storage-technology (PWST) to accelerate a key bioinformatics kernel called $k$-mer counting, which involves processing large files of sequence data on the disk to build a histogram of fixed-size genome sequence \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kang:2024:IAG, author = "Seokwon Kang and Jongbin Kim and Gyeongyong Lee and Jeongmyung Lee and Jiwon Seo and Hyungsoo Jung and Yong Ho Song and Yongjun Park", title = "{ISP Agent}: a Generalized In-storage-processing Workload Offloading Framework by Providing Multiple Optimization Opportunities", journal = j-TACO, volume = "21", number = "1", pages = "11:1--11:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632951", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3632951", abstract = "As solid-state drives (SSDs) with sufficient computing power have recently become the dominant devices in modern computer systems, in-storage processing (ISP), which processes data within the storage without transferring it to the host memory, is being \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mishra:2024:CHP, author = "Prasoon Mishra and V. 
Krishna Nandivada", title = "{COWS} for High Performance: Cost Aware Work Stealing for Irregular Parallel Loop", journal = j-TACO, volume = "21", number = "1", pages = "12:1--12:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633331", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3633331", abstract = "Parallel libraries such as OpenMP distribute the iterations of parallel-for-loops among the threads, using a programmer-specified scheduling policy. While the existing scheduling policies perform reasonably well in the context of balanced workloads, in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Park:2024:HHS, author = "Joongun Park and Seunghyo Kang and Sanghyeon Lee and Taehoon Kim and Jongse Park and Youngjin Kwon and Jaehyuk Huh", title = "Hardware-hardened Sandbox Enclaves for Trusted Serverless Computing", journal = j-TACO, volume = "21", number = "1", pages = "13:1--13:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632954", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3632954", abstract = "In cloud-based serverless computing, an application consists of multiple functions provided by mutually distrusting parties. For secure serverless computing, the hardware-based trusted execution environment (TEE) can provide strong isolation among \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Allen:2024:FGQ, author = "Tyler Allen and Bennett Cooper and Rong Ge", title = "Fine-grain Quantitative Analysis of Demand Paging in Unified Virtual Memory", journal = j-TACO, volume = "21", number = "1", pages = "14:1--14:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632953", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3632953", abstract = "The abstraction of a shared memory space over separate CPU and GPU memory domains has eased the burden of portability for many HPC codebases. However, users pay for ease of use provided by system-managed memory with a moderate-to-high performance \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2024:RRR, author = "Zhonghua Wang and Yixing Guo and Kai Lu and Jiguang Wan and Daohui Wang and Ting Yao and Huatao Wu", title = "{Rcmp}: Reconstructing {RDMA-Based} Memory Disaggregation via {CXL}", journal = j-TACO, volume = "21", number = "1", pages = "15:1--15:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634916", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3634916", abstract = "Memory disaggregation is a promising architecture for modern datacenters that separates compute and memory resources into independent pools connected by ultra-fast networks, which can improve memory utilization, reduce cost, and enable elastic scaling of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Long:2024:WZW, author = "Linbo Long and Shuiyong He and Jingcheng Shen and Renping Liu and Zhenhua Tan and Congming Gao and Duo Liu and Kan Zhong and Yi Jiang", title = "{WA-Zone}: Wear-Aware Zone Management Optimization for {LSM}-Tree on {ZNS SSDs}", journal = j-TACO, volume = "21", number = "1", pages = "16:1--16:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3637488", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3637488", abstract = "ZNS SSDs divide the storage space into sequential-write zones, reducing costs of DRAM utilization, garbage collection, and over-provisioning. The sequential-write feature of zones is well-suited for LSM-based databases, where random writes are organized \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Fan:2024:IUD, author = "Zhihua Fan and Wenming Li and Zhen Wang and Yu Yang and Xiaochun Ye and Dongrui Fan and Ninghui Sun and Xuejun An", title = "Improving Utilization of Dataflow Unit for Multi-Batch Processing", journal = j-TACO, volume = "21", number = "1", pages = "17:1--17:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3637906", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3637906", abstract = "Dataflow architectures can achieve much better performance and higher efficiency than general-purpose core, approaching the performance of a specialized design while retaining programmability. However, advanced application scenarios place higher demands \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2024:EVI, author = "Dunbo Zhang and Qingjie Lang and Ruoxi Wang and Li Shen", title = "Extension {VM}: Interleaved Data Layout in Vector Memory", journal = j-TACO, volume = "21", number = "1", pages = "18:1--18:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3631528", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3631528", abstract = "While vector architecture is widely employed in processors for neural networks, signal processing, and high-performance computing; however, its performance is limited by inefficient column-major memory access. The column-major access limitation originates \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Firtina:2024:AAP, author = "Can Firtina and Kamlesh Pillai and Gurpreet S. Kalsi and Bharathwaj Suresh and Damla Senol Cali and Jeremie S. 
Kim and Taha Shahroodi and Meryem Banu Cavlak and Jo{\"e}l Lindegger and Mohammed Alser and Juan G{\'o}mez Luna and Sreenivas Subramoney and Onur Mutlu", title = "{ApHMM}: Accelerating Profile Hidden {Markov} Models for Fast and Energy-efficient Genome Analysis", journal = j-TACO, volume = "21", number = "1", pages = "19:1--19:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3632950", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3632950", abstract = "Profile hidden Markov models (pHMMs) are widely employed in various bioinformatics applications to identify similarities between biological sequences, such as DNA or protein sequences. In pHMMs, sequences are represented as graph structures, where states \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ahmad:2024:EDL, author = "Khalid Ahmad and Cris Cecka and Michael Garland and Mary Hall", title = "Exploring Data Layout for Sparse Tensor Times Dense Matrix on {GPUs}", journal = j-TACO, volume = "21", number = "1", pages = "20:1--20:??", month = mar, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633462", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Feb 23 16:28:09 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3633462", abstract = "An important sparse tensor computation is sparse-tensor-dense-matrix multiplication (SpTM), which is used in tensor decomposition and applications. SpTM is a multi-dimensional analog to sparse-matrix-dense-matrix multiplication (SpMM). 
In this article, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mummidi:2024:HES, author = "Chandra Sekhar Mummidi and Victor C. Ferreira and Sudarshan Srinivasan and Sandip Kundu", title = "Highly Efficient Self-checking Matrix Multiplication on Tiled {AMX} Accelerators", journal = j-TACO, volume = "21", number = "2", pages = "21:1--21:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633332", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3633332", abstract = "General Matrix Multiplication (GEMM) is a computationally expensive operation that is used in many applications such as machine learning. Hardware accelerators are increasingly popular for speeding up GEMM computation, with Tiled Matrix Multiplication (TMUL) in recent Intel processors being an example. Unfortunately, the TMUL hardware is susceptible to errors, necessitating online error detection. The Algorithm-based Error Detection (ABED) technique is a powerful technique to detect errors in matrix multiplications. In this article, we consider implementation of an ABED technique that integrates seamlessly with the TMUL hardware to minimize performance overhead. Unfortunately, rounding errors introduced by floating-point operations do not allow a straightforward implementation of ABED in TMUL. Previously an error bound was considered for addressing rounding errors in ABED. If the error detection threshold is set too low, it will trigger a false alarm, while a loose bound will allow errors to escape detection.
In this article, we propose an adaptive error threshold that takes into account the TMUL input values to address the problem of false triggers and error escapes and provide a taxonomy of various error classes. This threshold is obtained from theoretical error analysis but is not easy to implement in hardware. Consequently, we relax the threshold such that it can be easily computed in hardware. While ABED ensures error-free computation, it does not guarantee full coverage of all hardware faults. To address this problem, we propose an algorithmic pattern generation technique to ensure full coverage for all hardware faults. To evaluate the benefits of our proposed solution, we conducted fault injection experiments and show that our approach does not produce any false alarms or detection escapes for observable errors. We conducted additional fault injection experiments on a Deep Neural Network (DNN) model and find that if a fault is not detected, it does not cause any misclassification.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2024:WWO, author = "Zhonghua Wang and Chen Ding and Fengguang Song and Kai Lu and Jiguang Wan and Zhihu Tan and Changsheng Xie and Guokuan Li", title = "{WIPE}: a Write-Optimized Learned Index for Persistent Memory", journal = j-TACO, volume = "21", number = "2", pages = "22:1--22:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3634915", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3634915", abstract = "Learned Index, which utilizes effective machine learning models to accelerate locating sorted data positions, has gained increasing attention in many big data scenarios. Using efficient learned models, the learned indexes build large nodes and flat \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chacon:2024:CAC, author = "Gino A. Chacon and Charles Williams and Johann Knechtel and Ozgur Sinanoglu and Paul V. 
Gratz and Vassos Soteriou", title = "Coherence Attacks and Countermeasures in Interposer-based Chiplet Systems", journal = j-TACO, volume = "21", number = "2", pages = "23:1--23:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3633461", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3633461", abstract = "Industry is moving towards large-scale hardware systems that bundle processor cores, memories, accelerators, and so on, via 2.5D integration. These components are fabricated separately as chiplets and then integrated using an interposer as an interconnect \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wei:2024:CCB, author = "Yan Wei and Zhang Xingjun", title = "A Concise Concurrent {B+}-Tree for Persistent Memory", journal = j-TACO, volume = "21", number = "2", pages = "24:1--24:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3638717", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3638717", abstract = "Persistent memory (PM) presents a unique opportunity for designing data management systems that offer improved performance, scalability, and instant restart capability. As a widely used data structure for managing data in such systems, B$^+$-Tree must \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Qararyah:2024:EHD, author = "Fareed Qararyah and Muhammad Waqar Azhar and Pedro Trancoso", title = "An Efficient Hybrid Deep Learning Accelerator for Compact and Heterogeneous {CNNs}", journal = j-TACO, volume = "21", number = "2", pages = "25:1--25:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3639823", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3639823", abstract = "Resource-efficient Convolutional Neural Networks (CNNs) are gaining more attention. These CNNs have relatively low computational and memory requirements. A common denominator among such CNNs is having more heterogeneity than traditional CNNs. This \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Santos:2024:AIC, author = "Fernando {Fernandes Dos Santos} and Luigi Carro and Flavio Vella and Paolo Rech", title = "Assessing the Impact of Compiler Optimizations on {GPUs} Reliability", journal = j-TACO, volume = "21", number = "2", pages = "26:1--26:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3638249", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3638249", abstract = "Graphics Processing Units (GPUs) compilers have evolved in order to support general-purpose programming languages for multiple architectures. 
NVIDIA CUDA Compiler (NVCC) has many compilation levels before generating the machine code and applies complex \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Isaac-Chassande:2024:DHA, author = "Valentin Isaac-Chassande and Adrian Evans and Yves Durand and Fr{\'e}d{\'e}ric Rousseau", title = "Dedicated Hardware Accelerators for Processing of Sparse Matrices and Vectors: a Survey", journal = j-TACO, volume = "21", number = "2", pages = "27:1--27:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3640542", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3640542", abstract = "Performance in scientific and engineering applications such as computational physics, algebraic graph problems or Convolutional Neural Networks (CNN), is dominated by the manipulation of large sparse matrices --- matrices with a large number of zero elements. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xie:2024:IIA, author = "Benyi Xie and Yue Yan and Chenghao Yan and Sicheng Tao and Zhuangzhuang Zhang and Xinyu Li and Yanzhi Lan and Xiang Wu and Tianyi Liu and Tingting Zhang and Fuxin Zhang", title = "An Instruction Inflation Analyzing Framework for Dynamic Binary Translators", journal = j-TACO, volume = "21", number = "2", pages = "28:1--28:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3640813", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3640813", abstract = "Dynamic binary translators (DBTs) are widely used to migrate applications between different instruction set architectures (ISAs). Despite extensive research to improve DBT performance, noticeable overhead remains, preventing near-native performance, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Rac:2024:CAS, author = "Samuel Rac and Mats Brorsson", title = "Cost-aware Service Placement and Scheduling in the Edge-Cloud Continuum", journal = j-TACO, volume = "21", number = "2", pages = "29:1--29:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3640823", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3640823", abstract = "The edge to data center computing continuum is the aggregation of computing resources located anywhere between the network edge (e.g., close to 5G antennas), and servers in traditional data centers. Kubernetes is the de facto standard for the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xue:2024:TEG, author = "Feng Xue and Chenji Han and Xinyu Li and Junliang Wu and Tingting Zhang and Tianyi Liu and Yifan Hao and Zidong Du and Qi Guo and Fuxin Zhang", title = "{Tyche}: an Efficient and General Prefetcher for Indirect Memory Accesses", journal = j-TACO, volume = "21", number = "2", pages = "30:1--30:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3641853", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3641853", abstract = "Indirect memory accesses (IMAs, i.e., A [ f ( B [ i ])]) are typical memory access patterns in applications such as graph analysis, machine learning, and database. 
IMAs are composed of producer-consumer pairs, where the consumers' memory addresses are derived \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xie:2024:WLT, author = "Kunpeng Xie and Ye Lu and Xinyu He and Dezhi Yi and Huijuan Dong and Yao Chen", title = "{Winols}: a Large-Tiling Sparse {Winograd} {CNN} Accelerator on {FPGAs}", journal = j-TACO, volume = "21", number = "2", pages = "31:1--31:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3643682", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3643682", abstract = "Convolutional Neural Networks (CNNs) can benefit from the computational reductions provided by the Winograd minimal filtering algorithm and weight pruning. However, harnessing the potential of both methods simultaneously introduces complexity in designing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2024:SSR, author = "Ke Liu and Kan Wu and Hua Wang and Ke Zhou and Peng Wang and Ji Zhang and Cong Li", title = "{SLAP}: Segmented Reuse-Time-Label Based Admission Policy for Content Delivery Network Caching", journal = j-TACO, volume = "21", number = "2", pages = "32:1--32:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3646550", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3646550", abstract = "``Learned'' admission policies have shown promise in improving Content Delivery Network (CDN) cache performance and lowering operational costs. Unfortunately, existing learned policies are optimized with a few fixed cache sizes while in reality, cache \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Miliadis:2024:ASS, author = "Panagiotis Miliadis and Dimitris Theodoropoulos and Dionisios Pnevmatikatos and Nectarios Koziris", title = "Architectural Support for Sharing, Isolating and Virtualizing {FPGA} Resources", journal = j-TACO, volume = "21", number = "2", pages = "33:1--33:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3648475", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3648475", abstract = "FPGAs are increasingly popular in cloud environments for their ability to offer on-demand acceleration and improved compute efficiency. Providers would like to increase utilization, by multiplexing customers on a single device, similar to how processing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Du:2024:FDR, author = "Haitao Du and Yuhan Qin and Song Chen and Yi Kang", title = "{FASA-DRAM}: Reducing {DRAM} Latency with Destructive Activation and Delayed Restoration", journal = j-TACO, volume = "21", number = "2", pages = "34:1--34:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3649455", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3649455", abstract = "DRAM memory is a performance bottleneck for many applications, due to its high access latency. 
Previous work has mainly focused on data locality, introducing small but fast regions to cache frequently accessed data, thereby reducing the average latency. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Canesche:2024:DSA, author = "Michael Canesche and Vanderson Ros{\'a}rio and Edson Borin and Fernando Quint{\~a}o Pereira", title = "The Droplet Search Algorithm for Kernel Scheduling", journal = j-TACO, volume = "21", number = "2", pages = "35:1--35:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3650109", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3650109", abstract = "Kernel scheduling is the problem of finding the most efficient implementation for a computational kernel. Identifying this implementation involves experimenting with the parameters of compiler optimizations, such as the size of tiling windows and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pal:2024:CUA, author = "Asmita Pal and Keerthana Desai and Rahul Chatterjee and Joshua {San Miguel}", title = "{Camouflage}: Utility-Aware Obfuscation for Accurate Simulation of Sensitive Program Traces", journal = j-TACO, volume = "21", number = "2", pages = "36:1--36:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3650110", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3650110", abstract = "Trace-based simulation is a widely used methodology for system design exploration. It relies on realistic traces that represent a range of behaviors necessary to be evaluated, containing a lot of information about the application, its inputs and the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Huan:2024:TNT, author = "Chengying Huan and Yongchao Liu and Heng Zhang and Shuaiwen Song and Santosh Pandey and Shiyang Chen and Xiangfei Fang and Yue Jin and Baptiste Lepers and Yanjun Wu and Hang Liu", title = "{TEA+}: a Novel Temporal Graph Random Walk Engine with Hybrid Storage Architecture", journal = j-TACO, volume = "21", number = "2", pages = "37:1--37:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3652604", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3652604", abstract = "Many real-world networks are characterized by being temporal and dynamic, wherein the temporal information signifies the changes in connections, such as the addition or removal of links between nodes. Employing random walks on these temporal networks is a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hwang:2024:CTM, author = "Soojin Hwang and Daehyeon Baek and Jongse Park and Jaehyuk Huh", title = "{Cerberus}: Triple Mode Acceleration of Sparse Matrix and Vector Multiplication", journal = j-TACO, volume = "21", number = "2", pages = "38:1--38:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3653020", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3653020", abstract = "The multiplication of sparse matrix and vector (SpMV) is one of the most widely used kernels in high-performance computing as well as machine learning acceleration for sparse neural networks. The design space of SpMV accelerators has two axes: algorithm \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Raman:2024:NGD, author = "Siddhartha Raman Sundara Raman and Lizy John and Jaydeep P. 
Kulkarni", title = "{NEM-GNN}: {DAC\slash ADC}-less, Scalable, Reconfigurable, Graph and Sparsity-Aware Near-Memory Accelerator for Graph Neural Networks", journal = j-TACO, volume = "21", number = "2", pages = "39:1--39:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3652607", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3652607", abstract = "Graph neural networks (GNNs) are of great interest in real-life applications such as citation networks and drug discovery owing to GNN's ability to apply machine learning techniques on graphs. GNNs utilize a two-step approach to classify the nodes in a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2024:XSH, author = "Yan Chen and Qiwen Ke and Huiba Li and Yongwei Wu and Yiming Zhang", title = "{xMeta}: {SSD-HDD}-hybrid Optimization for Metadata Maintenance of Cloud-scale Object Storage", journal = j-TACO, volume = "21", number = "2", pages = "40:1--40:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3652606", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3652606", abstract = "Object storage has been widely used in the cloud. Traditionally, the size of object metadata is much smaller than that of object data, and thus existing object storage systems (such as Ceph and Oasis) can place object data and metadata, respectively, on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Singhal:2024:OHP, author = "Vidush Singhal and Laith Sakka and Kirshanthan Sundararajah and Ryan Newton and Milind Kulkarni", title = "{Orchard}: Heterogeneous Parallelism and Fine-grained Fusion for Complex Tree Traversals", journal = j-TACO, volume = "21", number = "2", pages = "41:1--41:??", month = jun, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3652605", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon May 27 06:59:33 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3652605", abstract = "Many applications are designed to perform traversals on tree-like data structures. Fusing and parallelizing these traversals enhance the performance of applications. Fusing multiple traversals improves the locality of the application. The runtime of an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Falahati:2024:CCD, author = "Hajar Falahati and Mohammad Sadrosadati and Qiumin Xu and Juan G{\'o}mez-Luna and Banafsheh Saber Latibari and Hyeran Jeon and Shaahin Hessabi and Hamid Sarbazi-Azad and Onur Mutlu and Murali Annavaram and Masoud Pedram", title = "Cross-core Data Sharing for Energy-efficient {GPUs}", journal = j-TACO, volume = "21", number = "3", pages = "42:1--42:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3653019", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3653019", abstract = "Graphics Processing Units (GPUs) are the accelerator of choice in a variety of application domains, because they can accelerate massively parallel workloads and can be easily programmed using general-purpose programming frameworks such as CUDA and OpenCL. Each Streaming Multiprocessor (SM) contains an L1 data cache (L1D) to exploit the locality in data accesses. L1D misses are costly for GPUs for two reasons. First, L1D misses consume a lot of energy as they need to access the L2 cache (L2) via an on-chip network and the off-chip DRAM in case of L2 misses. Second, L1D misses impose performance overhead if the GPU does not have enough active warps to hide the long memory access latency. We observe that threads running on different SMs share 55\% of the data they read from the memory. Unfortunately, as the L1Ds are in the non-coherent memory domain, each SM independently fetches data from the L2 or the off-chip memory into its L1D, even though the data may be currently available in the L1D of another SM.
Our goal is to service L1D read misses via other SMs, as much as possible, to cut down costly accesses to the L2 or the off-chip DRAM. To this end, we propose a new data-sharing mechanism, called Cross-Core Data Sharing (CCDS). CCDS employs a predictor to estimate whether the required cache block exists in another SM. If the block is predicted to exist in another SM's L1D, then CCDS fetches the data from the L1D that contain the block. Our experiments on a suite of 26 workloads show that CCDS improves average energy and performance by 1.30$ \times $ and 1.20$ \times $, respectively, compared to the baseline GPU. Compared to the state-of-the-art data-sharing mechanism, CCDS improves average energy and performance by 1.37$ \times $ and 1.11$ \times $, respectively.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2024:RRS, author = "Ching-Jui Lee and Tsung Tai Yeh", title = "{ReSA}: Reconfigurable Systolic Array for Multiple Tiny {DNN} Tensors", journal = j-TACO, volume = "21", number = "3", pages = "43:1--43:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3653363", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3653363", abstract = "Systolic array architecture has significantly accelerated deep neural networks (DNNs). A systolic array comprises multiple processing elements (PEs) that can perform multiply-accumulate (MAC). Traditionally, the systolic array can execute a certain amount \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2024:EPM, author = "Ziheng Wang and Xiaoshe Dong and Yan Kang and Heng Chen and Qiang Wang", title = "An Example of Parallel {Merkle} Tree Traversal: Post-Quantum {Leighton--Micali} Signature on the {GPU}", journal = j-TACO, volume = "21", number = "3", pages = "44:1--44:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3659209", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3659209", abstract = "The hash-based signature (HBS) is the most conservative and time-consuming among many post-quantum cryptography (PQC) algorithms. Two HBSs, LMS and XMSS, are the only PQC algorithms standardised by the National Institute of Standards and Technology (NIST) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2024:KAM, author = "Jiang Wu and Zhuo Zhang and Deheng Yang and Jianjun Xu and Jiayu He and Xiaoguang Mao", title = "Knowledge-Augmented Mutation-Based Bug Localization for Hardware Design Code", journal = j-TACO, volume = "21", number = "3", pages = "45:1--45:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3660526", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3660526", abstract = "Verification of hardware design code is crucial for the quality assurance of hardware products. 
Being an indispensable part of verification, localizing bugs in the hardware design code is significant for hardware development but is often regarded as a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ding:2024:DCE, author = "Chen Ding and Jian Zhou and Kai Lu and Sicen Li and Yiqin Xiong and Jiguang Wan and Ling Zhan", title = "{D$^2$Comp}: Efficient Offload of {LSM}-tree Compaction with Data Processing Units on Disaggregated Storage", journal = j-TACO, volume = "21", number = "3", pages = "46:1--46:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3656584", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3656584", abstract = "LSM-based key-value stores suffer from sub-optimal performance due to their slow and heavy background compactions. The compaction brings severe CPU and network overhead on high-speed disaggregated storage. This article further reveals that data-intensive \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2024:INM, author = "Zhuohao Wang and Lei Liu and Limin Xiao", title = "{iSwap}: a New Memory Page Swap Mechanism for Reducing Ineffective {I/O} Operations in Cloud Environments", journal = j-TACO, volume = "21", number = "3", pages = "47:1--47:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3653302", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3653302", abstract = "This article proposes iSwap, a new memory page swap mechanism that reduces the ineffective I/O swap operations and improves the QoS for applications with a high priority in cloud environments. iSwap works in the OS kernel. iSwap accurately learns the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2024:GDA, author = "Junkaixuan Li and Yi Kang", title = "{GraphSER}: Distance-Aware Stream-Based Edge Repartition for Many-Core Systems", journal = j-TACO, volume = "21", number = "3", pages = "48:1--48:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3661998", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3661998", abstract = "With the explosive growth of graph data, distributed graph processing has become popular, and many graph hardware accelerators use distributed frameworks. Graph partitioning is foundation in distributed graph processing. 
However, dynamic changes in graph \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2024:CNI, author = "Ke Wu and Dezun Dong and Weixia Xu", title = "{COER}: a Network Interface Offloading Architecture for {RDMA} and Congestion Control Protocol Codesign", journal = j-TACO, volume = "21", number = "3", pages = "49:1--49:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3660525", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3660525", abstract = "RDMA (Remote Direct Memory Access) networks require efficient congestion control to maintain their high throughput and low latency characteristics. However, congestion control protocols deployed at the software layer suffer from slow response times due to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2024:IAS, author = "Qunyou Liu and Darong Huang and Luis Costero and Marina Zapater and David Atienza", title = "Intermediate Address Space: Virtual Memory Optimization of Heterogeneous Architectures for Cache-resident Workloads", journal = j-TACO, volume = "21", number = "3", pages = "50:1--50:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3659207", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3659207", abstract = "The increasing demand for computing power and the emergence of heterogeneous computing architectures have driven the exploration of innovative techniques to address current limitations in both the compute and memory subsystems. One such solution is the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Min:2024:CCE, author = "Dongmoon Min and Ilkwon Byun and Gyu-Hyeon Lee and Jangwoo Kim", title = "{CoolDC}: a Cost-Effective Immersion-Cooled Datacenter with Workload-Aware Temperature Scaling", journal = j-TACO, volume = "21", number = "3", pages = "51:1--51:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664925", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3664925", abstract = "For datacenter architects, it is the most important goal to minimize the datacenter's total cost of ownership for the target performance (i.e., TCO/performance). As the major component of a datacenter is a server farm, the most effective way of reducing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2024:SSA, author = "Hai Zhou and Dan Feng", title = "Stripe-schedule Aware Repair in Erasure-coded Clusters with Heterogeneous Star Networks", journal = j-TACO, volume = "21", number = "3", pages = "52:1--52:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664926", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3664926", abstract = "More and more storage systems use erasure code to tolerate faults. It takes pieces of data blocks as input and encodes a small number of parity blocks as output, where these blocks form a stripe. 
When reconsidering the recovery problem in the multi-stripe \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Deng:2024:FPE, author = "Bobin Deng and Bhargava Nadendla and Kun Suo and Yixin Xie and Dan Chia-Tien Lo", title = "Fixed-point Encoding and Architecture Exploration for Residue Number Systems", journal = j-TACO, volume = "21", number = "3", pages = "53:1--53:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664923", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3664923", abstract = "Residue Number Systems (RNS) demonstrate the fascinating potential to serve integer addition\slash multiplication-intensive applications. The complexity of Artificial Intelligence (AI) models has grown enormously in recent years. From a computer system's perspective, ensuring the training of these large-scale AI models within an adequate time and energy consumption has become a big concern. Matrix multiplication is a dominant subroutine in many prevailing AI models, with an addition\slash multiplication-intensive attribute. However, the data type of matrix multiplication within machine learning training typically requires real numbers, which indicates that RNS benefits for integer applications cannot be directly gained by AI training. The state-of-the-art RNS real-number encodings, including floating-point and fixed-point, have defects and can be further enhanced. 
To transform default RNS benefits to the efficiency of large-scale AI training, we propose a low-cost and high-accuracy RNS fixed-point representation: Single RNS Logical Partition (S-RNS-Logic-P) representation with Scaling-down Postprocessing Multiplication (SD-Post-Mul). Moreover, we extend the implementation details of the other two RNS fixed-point methods: Double RNS Concatenation and S-RNS-Logic-P representation with Scaling-down Preprocessing Multiplication. We also design the architectures of these three fixed-point multipliers. In empirical experiments, our S-RNS-Logic-P representation with SD-Post-Mul method achieves less latency and energy overhead while maintaining good accuracy. Furthermore, this method can easily extend to the Redundant Residue Number System to raise the efficiency of error-tolerant domains, such as improving the error correction efficiency of quantum computing.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2024:OSM, author = "Yizhuo Wang and Fangli Chang and Bingxin Wei and Jianhua Gao and Weixing Ji", title = "Optimization of Sparse Matrix Computation for Algebraic Multigrid on {GPUs}", journal = j-TACO, volume = "21", number = "3", pages = "54:1--54:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3664924", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3664924", abstract = "AMG is one of the most efficient and widely used methods for solving sparse linear systems. 
The computational process of AMG mainly consists of a series of iterative calculations of generalized sparse matrix-matrix multiplication (SpGEMM) and sparse \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2024:AMA, author = "Luming Wang and Xu Zhang and Songyue Wang and Zhuolun Jiang and Tianyue Lu and Mingyu Chen and Siwei Luo and Keji Huang", title = "Asynchronous Memory Access Unit: Exploiting Massive Parallelism for Far Memory Access", journal = j-TACO, volume = "21", number = "3", pages = "55:1--55:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663479", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3663479", abstract = "The growing memory demands of modern applications have driven the adoption of far memory technologies in data centers to provide cost-effective, high-capacity memory solutions. However, far memory presents new performance challenges because its access \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2024:SOD, author = "Yunping Zhao and Sheng Ma and Hengzhu Liu and Dongsheng Li", title = "{SAL}: Optimizing the Dataflow of Spin-based Architectures for Lightweight Neural Networks", journal = j-TACO, volume = "21", number = "3", pages = "56:1--56:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3673654", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3673654", abstract = "As the Convolutional Neural Network (CNN) goes deeper and more complex, the network becomes memory-intensive and computation-intensive. To address this issue, the lightweight neural network reduces parameters and Multiplication-and-Accumulation (MAC) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lu:2024:SLL, author = "Kai Lu and Siqi Zhao and Haikang Shan and Qiang Wei and Guokuan Li and Jiguang Wan and Ting Yao and Huatao Wu and Daohui Wang", title = "{Scythe}: a Low-latency {RDMA}-enabled Distributed Transaction System for Disaggregated Memory", journal = j-TACO, volume = "21", number = "3", pages = "57:1--57:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3666004", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3666004", abstract = "Disaggregated memory separates compute and memory resources into independent pools connected by RDMA (Remote Direct Memory Access) networks, which can improve memory utilization, reduce cost, and enable elastic scaling of compute and memory resources. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Peng:2024:LER, author = "Wangqi Peng and Yusen Li and Xiaoguang Liu and Gang Wang", title = "{Lavender}: an Efficient Resource Partitioning Framework for Large-Scale Job Colocation", journal = j-TACO, volume = "21", number = "3", pages = "58:1--58:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674736", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3674736", abstract = "Workload consolidation is a widely used approach to enhance resource utilization in modern data centers. 
However, the concurrent execution of multiple jobs on a shared server introduces contention for essential shared resources such as CPU cores, Last \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2024:ATE, author = "Feng Zhang and Fulin Nan and Binbin Xu and Zhirong Shen and Jiebin Zhai and Dmitrii Kaplun and Jiwu Shu", title = "Achieving Tunable Erasure Coding with Cluster-Aware Redundancy Transitioning", journal = j-TACO, volume = "21", number = "3", pages = "59:1--59:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3672077", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3672077", abstract = "Erasure coding has been demonstrated as a storage-efficient means against failures, yet its tunability remains a challenging issue in data centers, which is prone to induce substantial cross-cluster traffic. In this article, we present ClusterRT, a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Olgun:2024:SDP, author = "Ataberk Olgun and F.
Nisa Bostanci and Geraldo Francisco de Oliveira Junior and Yahya Can Tugrul and Rahul Bera and Abdullah Giray Yaglikci and Hasan Hassan and Oguz Ergin and Onur Mutlu", title = "Sectored {DRAM}: a Practical Energy-Efficient and High-Performance Fine-Grained {DRAM} Architecture", journal = j-TACO, volume = "21", number = "3", pages = "60:1--60:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3673653", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3673653", abstract = "Modern computing systems access data in main memory at coarse granularity (e.g., at 512-bit cache block granularity). Coarse-grained access leads to wasted energy because the system does not use all individually accessed small portions (e.g., words, each \ldots{})", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wei:2024:RRI, author = "Xiaohui Wei and Chenyang Wang and Hengshan Yue and Jingweijia Tan and Zeyu Guan and Nan Jiang and Xinyang Zheng and Jianpeng Zhao and Meikang Qiu", title = "{ReIPE}: Recycling Idle {PEs} in {CNN} Accelerator for Vulnerable Filters Soft-Error Detection", journal = j-TACO, volume = "21", number = "3", pages = "61:1--61:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674909", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3674909", abstract = "To satisfy prohibitively massive computational requirements of current deep Convolutional Neural Networks (CNNs), CNN-specific accelerators are widely deployed in 
large-scale systems. Caused by high-energy neutrons and $ \alpha $-particle strikes, soft error may \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2024:COL, author = "Qiao Li and Yu Chen and Guanyu Wu and Yajuan Du and Min Ye and Xinbiao Gan and Jie Zhang and Zhirong Shen and Jiwu Shu and Chun Xue", title = "Characterizing and Optimizing {LDPC} Performance on {$3$D} {NAND} Flash Memories", journal = j-TACO, volume = "21", number = "3", pages = "62:1--62:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3663478", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3663478", abstract = "With the development of NAND flash memories' bit density and stacking technologies, while storage capacity keeps increasing, the issue of reliability becomes increasingly prominent. Low-density parity check (LDPC) code, as a robust error-correcting code, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "62", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2024:RAR, author = "Jiahong Xu and Haikun Liu and Zhuohui Duan and Xiaofei Liao and Hai Jin and Xiaokang Yang and Huize Li and Cong Liu and Fubing Mao and Yu Zhang", title = "{ReHarvest}: an {ADC} Resource-Harvesting Crossbar Architecture for {ReRAM}-Based {DNN} Accelerators", journal = j-TACO, volume = "21", number = "3", pages = "63:1--63:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3659208", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3659208", abstract = "ReRAM-based Processing-In-Memory (PIM) architectures have been increasingly explored to accelerate various Deep Neural Network (DNN) applications because they can achieve extremely high performance and energy-efficiency for in-situ analog Matrix-Vector \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "63", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2024:TAS, author = "Jiang Wu and Zhuo Zhang and Deheng Yang and Jianjun Xu and Jiayu He and Xiaoguang Mao", title = "Time-Aware Spectrum-Based Bug Localization for Hardware Design Code with Data Purification", journal = j-TACO, volume = "21", number = "3", pages = "64:1--64:??", month = sep, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678009", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Sep 21 06:05:41 MDT 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3678009", abstract = "The verification of hardware design code is a critical aspect in ensuring the quality and reliability of hardware products. Finding bugs in hardware design code is important for hardware development and is frequently considered as a notoriously \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "64", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Song:2024:ECA, author = "Zhuoran Song and Zhongkai Yu and Xinkai Song and Yifan Hao and Li Jiang and Naifeng Jing and Xiaoyao Liang", title = "Environmental Condition Aware Super-Resolution Acceleration Framework in Server--Client Hierarchies", journal = j-TACO, volume = "21", number = "4", pages = "65:1--65:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678008", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3678008", abstract = "In the current landscape, high-resolution (HR) videos have gained immense popularity, promising an elevated viewing experience. Recent research has demonstrated that the video super-resolution (SR) algorithm, empowered by deep neural networks (DNNs), can \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "65", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Antoniou:2024:ACS, author = "Georgia Antoniou and Davide Bartolini and Haris Volos and Marios Kleanthous and Zhe Wang and Kleovoulos Kalaitzidis and Tom Rollet and Ziwei Li and Onur Mutlu and Yiannakis Sazeides and Jawad Haj-Yahya", title = "Agile {C}-states: a Core {C}-state Architecture for Latency Critical Applications Optimizing both Transition and Cold-Start Latency", journal = j-TACO, volume = "21", number = "4", pages = "66:1--66:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674734", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3674734", abstract = "Latency-critical applications running in modern datacenters exhibit irregular request arrival patterns and are implemented using multiple services with strict latency requirements (30--250$ \mu $s). These characteristics render existing energy-saving idle CPU \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "66", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gan:2024:MTA, author = "Xinbiao Gan and Tiejun Li and Feng Xiong and Bo Yang and Xinhai Chen and Chunye Gong and Shijie Li and Kai Lu and Qiao Li and Yiming Zhang", title = "{MST}: Topology-Aware Message Aggregation for Exascale Graph Processing of Traversal-Centric Algorithms", journal = j-TACO, volume = "21", number = "4", pages = "67:1--67:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3676846", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3676846", abstract = "This article presents MST, a communication-efficient message library for fast graph traversal on exascale clusters. The key idea is to follow the multi-level network topology to perform topology-aware message aggregation, where small messages are gathered \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "67", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cui:2024:HHE, author = "Yujie Cui and Wei Chen and Xu Cheng and Jiangfang Yi", title = "{Hyperion}: a Highly Effective Page and {PC} Based Delta Prefetcher", journal = j-TACO, volume = "21", number = "4", pages = "68:1--68:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3675398", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3675398", abstract = "Hardware prefetching plays an important role in modern processors for hiding memory access latency. 
Delta prefetchers show great potential at the L1D cache level, as they can impose small storage overhead by recording deltas. Furthermore, local delta \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "68", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gao:2024:OLS, author = "Jianhua Gao and Weixing Ji and Yizhuo Wang", title = "Optimization of Large-Scale Sparse Matrix--Vector Multiplication on Multi-{GPU} Systems", journal = j-TACO, volume = "21", number = "4", pages = "69:1--69:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3676847", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3676847", abstract = "Sparse matrix--vector multiplication (SpMV) is one of the important kernels of many iterative algorithms for solving sparse linear systems. The limited storage and computational resources of individual GPUs restrict both the scale and speed of SpMV computing in problem-solving. As real-world engineering problems continue to increase in complexity, the imperative for collaborative execution of iterative solving algorithms across multiple GPUs is increasingly apparent. Although the multi-GPU-based SpMV takes less kernel execution time, it also introduces additional data transmission overhead, which diminishes the performance gains derived from parallelization across multi-GPUs. Based on the non-zero elements distribution characteristics of sparse matrices and the tradeoff between redundant computations and data transfer overhead, this article introduces a series of SpMV optimization techniques tailored for multi-GPU environments and effectively enhances the execution efficiency of iterative algorithms on multiple GPUs. 
First, we propose a two-level non-zero elements-based matrix partitioning method to increase the overlap of kernel execution and data transmission. Then, considering the irregular non-zero elements distribution in sparse matrices, a long-row-aware matrix partitioning method is proposed to hide more data transmissions. Finally, an optimization using redundant and inexpensive short-row execution to exchange costly data transmission is proposed. Our experimental evaluation demonstrates that, compared with the SpMV on a single GPU, the proposed method achieves an average speedup of 2.00$ \times $ and 1.85$ \times $ on platforms equipped with two RTX 3090 and two Tesla V100-SXM2, respectively. The average speedup of 2.65$ \times $ is achieved on a platform equipped with four Tesla V100-SXM2.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "69", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hu:2024:ASA, author = "Zhengding Hu and Jingwei Sun and Zhongyang Li and Guangzhong Sun", title = "{AG-SpTRSV}: an Automatic Framework to Optimize Sparse Triangular Solve on {GPUs}", journal = j-TACO, volume = "21", number = "4", pages = "70:1--70:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674911", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3674911", abstract = "Sparse Triangular Solve (SpTRSV) has long been an essential kernel in the field of scientific computing. Due to its low computational intensity and internal data dependencies, SpTRSV is hard to implement and optimize on graphics processing units (GPUs). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "70", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2024:EEA, author = "Wenbo Zhang and Yiqi Liu and Tianhao Zang and Zhenshan Bao", title = "{EA4RCA}: Efficient {AIE} accelerator design framework for regular Communication-Avoiding Algorithm", journal = j-TACO, volume = "21", number = "4", pages = "71:1--71:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3678010", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3678010", abstract = "With the introduction of the Adaptive Intelligence Engine (AIE), the Versal Adaptive Compute Acceleration Platform (Versal ACAP) has garnered great attention. However, the current focus of Vitis Libraries and limited research has mainly been on how to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "71", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Thangamani:2024:SGP, author = "Arun Thangamani and Vincent Loechner and St{\'e}phane Genaud", title = "A Survey of General-purpose Polyhedral Compilers", journal = j-TACO, volume = "21", number = "4", pages = "72:1--72:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3674735", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3674735", abstract = "Since the 1990s, many implementations of polyhedral compilers have been written and distributed, either as source-to-source translating compilers or integrated into wider-purpose compilers. 
This article provides a survey on those various available \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "72", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lin:2024:SLC, author = "Junqing Lin and Jingwei Sun and Xiaolong Shi and Honghe Zhang and Xianzhi Yu and Xinzhi Wang and Jun Yao and Guangzhong Sun", title = "{LO-SpMM}: Low-cost Search for High-performance {SpMM} Kernels on {GPUs}", journal = j-TACO, volume = "21", number = "4", pages = "73:1--73:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3685277", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3685277", abstract = "As deep neural networks (DNNs) become increasingly large and complicated, pruning techniques are proposed for lower memory footprint and more efficient inference. The most critical kernel to execute pruned sparse DNNs on GPUs is Sparse-dense Matrix \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "73", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yi:2024:DDB, author = "Chenglong Yi and Jintong Liu and Shenggang Wan and Juntao Fang and Bin Sun and Liqiang Zhang", title = "Data Deduplication Based on Content Locality of Transactions to Enhance Blockchain Scalability", journal = j-TACO, volume = "21", number = "4", pages = "74:1--74:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3680547", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3680547", abstract = "Blockchain is a promising infrastructure for the internet and digital economy, but it has serious scalability problems, that is, long block synchronization time and high storage cost. Conventional coarse-grained data deduplication schemes (block or file level) are proved to be ineffective on improving the scalability of blockchains. Based on comprehensive analysis on typical blockchain workloads, we propose two new locality concepts (economic and argument locality) and a novel fine-grained data deduplication scheme (transaction level) named Alias-Chain. Specifically, Alias-Chain replaces frequently used data, for example, smart contract arguments, with much shorter aliases to reduce the block sizes, which results in both shorter synchronization time and lower storage cost. Furthermore, to solve the potential consistency issue in Alias-Chain, we propose two complementary techniques: one is generating aliases from history blocks with high consistency, and the other is speeding up the generation of aliases via a specific algorithm. 
Our simulation results show: (1) the average transfer and SC-call transaction (a transaction used to call the smart contracts in the blockchain) sizes can be significantly reduced by up to 11.03\% and 79.44\% in native Ethereum, and up to 39.29\% and 81.84\% in Ethereum optimized by state-of-the-art techniques; and (2) the two complementary techniques well address the inconsistency risk with very limited impact on the benefit of Alias-Chain. Prototyping-based experiments are further conducted on a testbed consisting of up to 3200 miners. The results demonstrate the effectiveness and efficiency of Alias-Chain on reducing block synchronization time and storage cost under typical real-world workloads.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "74", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Booth:2024:NAV, author = "Joshua Dennis Booth and Phillip Lane", title = "A {NUMA}-Aware Version of an Adaptive Self-Scheduling Loop Scheduler", journal = j-TACO, volume = "21", number = "4", pages = "75:1--75:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3680549", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3680549", abstract = "Parallelizing code in a shared-memory environment is commonly done utilizing loop scheduling (LS) in a fork-join manner as in OpenMP. This manner of parallelization is popular due to its ease to code, but the choice of the LS method is important when the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "75", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tang:2024:DME, author = "Yu Tang and Qiao Li and Lujia Yin and Dongsheng Li and Yiming Zhang and Chenyu Wang and Xingcheng Zhang and Linbo Qiao and Zhaoning Zhang and Kai Lu", title = "{DELTA}: Memory-Efficient Training via Dynamic Fine-Grained Recomputation and Swapping", journal = j-TACO, volume = "21", number = "4", pages = "76:1--76:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689338", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3689338", abstract = "To accommodate the increasingly large-scale models within limited-capacity GPU memory, various coarse-grained techniques, such as recomputation and swapping, have been proposed to optimize memory usage. However, these methods have encountered limitations, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "76", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tan:2024:OGC, author = "Zhenhua Tan and Linbo Long and Jingcheng Shen and Renping Liu and Congming Gao and Kan Zhong and Yi Jiang", title = "Optimizing Garbage Collection for {ZNS SSDs} via In-storage Data Migration and Address Remapping", journal = j-TACO, volume = "21", number = "4", pages = "77:1--77:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689336", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3689336", abstract = "The NVMe Zoned Namespace (ZNS) is a high-performance interface for flash-based solid-state drives (SSDs), which divides the logical address space into fixed-size and sequential-write zones. Meanwhile, ZNS SSDs eliminate in-device garbage collection (GC) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "77", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2024:OGI, author = "Xiang Li and Qiong Chang and Aolong Zha and Shijie Chang and Yun Li and Jun Miyazaki", title = "An Optimized {GPU} Implementation for {GIST} Descriptor", journal = j-TACO, volume = "21", number = "4", pages = "78:1--78:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689339", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3689339", abstract = "The GIST descriptor is a classic feature descriptor primarily used for scene categorization and recognition tasks. 
It drives a bank of Gabor filters, which respond to edges and textures at various scales and orientations to capture the spatial structures \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "78", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lu:2024:MME, author = "Xiaobo Lu and Jianbin Fang and Lin Peng and Chun Huang and Zidong Du and Yongwei Zhao and Zheng Wang", title = "{Mentor}: a Memory-Efficient Sparse-dense Matrix Multiplication Accelerator Based on Column-Wise Product", journal = j-TACO, volume = "21", number = "4", pages = "79:1--79:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3688612", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3688612", abstract = "Sparse-dense matrix multiplication (SpMM) is the performance bottleneck of many high-performance and deep-learning applications, making it attractive to design specialized SpMM hardware accelerators. Unfortunately, existing hardware solutions do not take \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "79", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Feng:2024:PAN, author = "Yu Feng and Weikai Lin and Zihan Liu and Jingwen Leng and Minyi Guo and Han Zhao and Xiaofeng Hou and Jieru Zhao and Yuhao Zhu", title = "{Potamoi}: Accelerating Neural Rendering via a Unified Streaming Architecture", journal = j-TACO, volume = "21", number = "4", pages = "80:1--80:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689340", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3689340", abstract = "Neural Radiance Field (NeRF) has emerged as a promising alternative for photorealistic rendering. Despite recent algorithmic advancements, achieving real-time performance on today's resource-constrained devices remains challenging. In this article, we \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "80", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2024:PSS, author = "Changxi Liu and Alen Sabu and Akanksha Chaudhari and Qingxuan Kang and Trevor E. Carlson", title = "{Pac-Sim}: Simulation of Multi-threaded Workloads using Intelligent, Live Sampling", journal = j-TACO, volume = "21", number = "4", pages = "81:1--81:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3680548", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3680548", abstract = "High-performance, multi-core processors are the key to accelerating workloads in several application domains. 
To continue to scale performance at the limit of Moore's Law and Dennard scaling, software and hardware designers have turned to dynamic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "81", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Raje:2024:CCG, author = "Saurabh Raje and Yufan Xu and Atanas Rountev and Edward F. Valeev and P. Sadayappan", title = "{CoNST}: Code Generator for Sparse Tensor Networks", journal = j-TACO, volume = "21", number = "4", pages = "82:1--82:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689342", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3689342", abstract = "Sparse tensor networks represent contractions over multiple sparse tensors. Tensor contractions are higher-order analogs of matrix multiplication. Tensor networks arise commonly in many domains of scientific computing and data science. Such networks are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "82", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jia:2024:DLT, author = "Danlin Jia and Geng Yuan and Yiming Xie and Xue Lin and Ningfang Mi", title = "A Data-Loader Tunable Knob to Shorten {GPU} Idleness for Distributed Deep Learning", journal = j-TACO, volume = "21", number = "4", pages = "83:1--83:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3680546", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3680546", abstract = "Deep Neural Networks (DNNs) have been applied as an effective machine learning algorithm to tackle problems in different domains. However, the endeavor to train sophisticated DNN models can stretch from days into weeks, presenting substantial obstacles in \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "83", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2024:USD, author = "Shaobu Wang and Guangyan Zhang and Junyu Wei and Yang Wang and Jiesheng Wu and Qingchao Luo", title = "Understanding Silent Data Corruption in Processors for Mitigating its Effects", journal = j-TACO, volume = "21", number = "4", pages = "84:1--84:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3690825", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3690825", abstract = "Silent Data Corruption (SDC) in processors can lead to various application-level issues, such as incorrect calculations and even data loss. 
Since traditional techniques are not effective in detecting these errors, it is very hard to address problems \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "84", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lu:2024:SIT, author = "Yen-Yu Lu and Chin-Hsien Wu and Shih-Jen Li and Cheng-Tze Lee and Cheng-Yen Wu", title = "A Stable Idle Time Detection Platform for Real {I/O} Workloads", journal = j-TACO, volume = "21", number = "4", pages = "85:1--85:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695871", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3695871", abstract = "It is important to utilize the idle time of a workload to improve the system performance. In the article, we will explore multiple idle time detection methods to predict the idle time of the real I/O workloads. The objective is to build a stable idle time \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "85", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sun:2024:TAL, author = "Lingyu Sun and Xiaofeng Hou and Chao Li and Jiacheng Liu and Xinkai Wang and Quan Chen and Minyi Guo", title = "{$ A^2 $}: Towards Accelerator Level Parallelism for Autonomous Micromobility Systems", journal = j-TACO, volume = "21", number = "4", pages = "86:1--86:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3688611", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3688611", abstract = "Autonomous micromobility systems (AMS) such as low-speed minicabs and robots are thriving. In AMS, multiple Deep Neural Networks execute in parallel on heterogeneous AI accelerators. An emerging paradigm called Accelerator Level Parallelism (ALP) suggests \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "86", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sistla:2024:THP, author = "Manojna Sistla and Yiding Liu and Xin Fu", title = "Towards High Performance {QNNs} via Distribution-Based {CNOT} Gate Reduction", journal = j-TACO, volume = "21", number = "4", pages = "87:1--87:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695872", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3695872", abstract = "Quantum Neural Networks (QNNs) are one of the most promising applications that can be implemented on NISQ-era quantum computers. 
In this study, we observe that QNNs often suffer from gate redundancy, which hugely declines the performance and accuracy of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "87", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mao:2024:PAC, author = "Fubing Mao and Xu Liu and Yu Zhang and Haikun Liu and Xiaofei Liao and Hai Jin and Wei Zhang and Jian Zhou and Yufei Wu and Longyu Nie and Yapu Guo and Zihan Jiang and Jingkang Liu", title = "{PMGraph}: Accelerating Concurrent Graph Queries over Streaming Graphs", journal = j-TACO, volume = "21", number = "4", pages = "88:1--88:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3689337", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3689337", abstract = "There are usually a large number of concurrent graph queries (CGQs) requirements in streaming graphs. However, existing graph processing systems mainly optimize a single graph query in streaming graphs or CGQs in static graphs. They have a large number of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "88", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2024:ACG, author = "Wentong Li and Yina Lv and Longfei Luo and Yunpeng Song and Liang Shi", title = "Access Characteristic-Guided Remote Swapping Across Mobile Devices", journal = j-TACO, volume = "21", number = "4", pages = "89:1--89:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695870", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3695870", abstract = "Memory swapping ensures smooth application switching for mobile systems by caching applications in the background. To further play the role of memory swapping, remote swapping across mobile devices has been widely studied, which caches applications to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "89", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2024:SCE, author = "Yinan Zhang and Shun Yang and Huiqi Hu and Chengcheng Yang and Peng Cai and Xuan Zhou", title = "{SuccinctKV}: a {CPU}-efficient {LSM}-tree Based {KV} Store with Scan-based Compaction", journal = j-TACO, volume = "21", number = "4", pages = "90:1--90:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3695873", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3695873", abstract = "The CPU overhead of the LSM-tree becomes increasingly significant when high-speed storage devices are utilized. 
In this article, we propose SuccinctKV, a key-value store based on LSM-tree that is optimized to improve CPU efficiency in mixed workload \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "90", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ma:2024:PPM, author = "Siyuan Ma and Kaustubh Mhatre and Jian Weng and Bagus Hanindhito and Zhengrong Wang and Tony Nowatzki and Lizy John and Aman Arora", title = "{PIMSAB}: a Processing-In-Memory System with Spatially-Aware Communication and Bit-Serial-Aware Computation", journal = j-TACO, volume = "21", number = "4", pages = "91:1--91:??", month = dec, year = "2024", CODEN = "????", DOI = "https://doi.org/10.1145/3690824", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Nov 22 08:21:31 MST 2024", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3690824", abstract = "Bit-serial Processing-In-Memory (PIM) is an attractive paradigm for accelerator architectures, for parallel workloads such as Deep Learning (DL), because of its capability to achieve massive data parallelism at a low area overhead and provide orders-of-. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "91", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gibson:2025:DCM, author = "Perry Gibson and Jos{\'e} Cano and Elliot Crowley and Amos Storkey and Michael O'Boyle", title = "{DLAS}: a Conceptual Model for Across-Stack Deep Learning Acceleration", journal = j-TACO, volume = "22", number = "1", pages = "1:1--1:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3688609", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3688609", abstract = "Deep Neural Networks (DNNs) are very computationally demanding, which presents a significant barrier to their deployment, especially on resource-constrained devices. Significant work from both the machine learning and computing systems communities has \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gan:2025:GTA, author = "Xinbiao Gan", title = "{GraphService}: Topology-aware Constructor for Large-scale Graph Applications", journal = j-TACO, volume = "22", number = "1", pages = "2:1--2:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3689341", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3689341", abstract = "Graph-based services are becoming integrated into everyday life through graph applications and graph learning systems.
While traditional graph processing approaches boast excellent throughput with millisecond-level processing time, the construction phase \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2025:MOM, author = "Renjun Zhang and Tianming Zhang and Zinuo Cai and Dongmei Li and Ruhui Ma and Rajkumar Buyya", title = "{MemoriaNova}: Optimizing Memory-Aware Model Inference for Edge Computing", journal = j-TACO, volume = "22", number = "1", pages = "3:1--3:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701997", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701997", abstract = "In recent years, deploying deep learning models on edge devices has become pervasive, driven by the increasing demand for intelligent edge computing solutions across various industries. From industrial automation to intelligent surveillance and healthcare, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lepori:2025:IPE, author = "Andrea Lepori and Alexandru Calotoiu and Torsten Hoefler", title = "Iterating Pointers: Enabling Static Analysis for Loop-based Pointers", journal = j-TACO, volume = "22", number = "1", pages = "4:1--4:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701993", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701993", abstract = "Pointers are an integral part of C and other programming languages. They enable substantial flexibility from the programmer's standpoint, allowing the user fine, unmediated control over data access patterns. However, accesses done through pointers are \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Razilov:2025:CMV, author = "Viktor Razilov and Ipek Gecin and Emil Mat{\'u}s and Gerhard Fettweis", title = "Conflict Management in Vector Register Files", journal = j-TACO, volume = "22", number = "1", pages = "5:1--5:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702002", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3702002", abstract = "The instruction set architecture of vector processors operates on vectors stored in the vector register file which needs to handle several concurrent accesses by functional units with multiple ports. 
When the vector processor is running with high \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2025:LHR, author = "Jingle Xu and Jiayu Fu and Lin Gan and Yaojian Chen and Zhaoqi Sun and Zhenchun Huang and Guangwen Yang", title = "Leveraging the Hardware Resources to Accelerate cryo-{EM} Reconstruction of {RELION} on the New {Sunway} Supercomputer", journal = j-TACO, volume = "22", number = "1", pages = "6:1--6:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701990", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701990", abstract = "The fast development of biomolecular structure determination has enabled the fine-grained study of objects in the micro-world, such as proteins and RNAs. The world is benefited. However, as the computational algorithms are constantly developed, the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Saito:2025:MFM, author = "Yuta Saito and Kazunori Sakamoto and Hironori Washizaki and Yoshiaki Fukazawa", title = "Multiple Function Merging for Code Size Reduction", journal = j-TACO, volume = "22", number = "1", pages = "7:1--7:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702000", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3702000", abstract = "Resource-constrained environments, such as embedded devices, have limited amounts of memory and storage. Practical programming languages such as C++ and Rust tend to output multiple similar functions by monomorphizing polymorphic functions. An \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2025:SLI, author = "Peihua Zhang and Chenggang Wu and Hanzhi Hu and Lichen Jia and Mingfan Peng and Jiali Xu and Mengyao Xie and Yuanming Lai and Yan Kang and Zhe Wang", title = "Shining Light on the Inter-procedural Code Obfuscation: Keep Pace with Progress in Binary Diffing", journal = j-TACO, volume = "22", number = "1", pages = "8:1--8:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701992", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701992", abstract = "Software obfuscation techniques have lost their effectiveness due to the rapid development of binary diffing techniques, which can achieve accurate function matching and identification. In this paper, we propose a new inter-procedural code obfuscation \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Han:2025:CUH, author = "Dengke Han and Mingyu Yan and Xiaochun Ye and Dongrui Fan", title = "Characterizing and Understanding {HGNN} Training on {GPUs}", journal = j-TACO, volume = "22", number = "1", pages = "9:1--9:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703356", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3703356", abstract = "Owing to their remarkable representation capabilities for heterogeneous graph data, Heterogeneous Graph Neural Networks (HGNNs) have been widely adopted in many critical real-world domains such as recommendation systems and medical analysis. Prior to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:DRA, author = "Jingyu Wang and Ruilong Ma and Xiang Yang and Qi Qi and Zirui Zhuang and Jing Wang and Jianxin Liao and Song Guo", title = "{DeepZoning}: Re-accelerate {CNN} Inference with Zoning Graph for Heterogeneous Edge Cluster", journal = j-TACO, volume = "22", number = "1", pages = "10:1--10:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701995", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701995", abstract = "Parallelizing CNN inference on heterogeneous edge clusters with data parallelism has gained popularity as a way to meet real-time requirements without sacrificing model accuracy. However, existing algorithms struggle to find optimal parallel granularity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ouyang:2025:CSB, author = "Chenghao Ouyang and Jinhan Xin and Siqi Zeng and Guohui Li and Jianjun Li and Zhibin Yu", title = "Constructing a Supplementary Benchmark Suite to Represent {Android} Applications with User Interactions by using Performance Counters", journal = j-TACO, volume = "22", number = "1", pages = "11:1--11:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701999", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701999", abstract = "We find existing benchmark suites for smartphone CPU micro-architecture design such as Geekbench 5.0 fail to authentically represent the micro-architecture-level performance behavior of widely used real Android applications with interactive operations \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dou:2025:ISA, author = "Xinglei Dou and Lei Liu and Limin Xiao", title = "An Intelligent Scheduling Approach on Mobile {OS} for Optimizing {UI} Smoothness and Power", journal = j-TACO, volume = "22", number = "1", pages = "12:1--12:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3674910", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3674910", abstract = "Mobile devices need to respond quickly to diverse user inputs. 
The existing approaches often heuristically raise the CPU/GPU frequency according to the empirical rules when facing burst inputs and various changes. Although doing so can be effective \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Choi:2025:SPO, author = "Kwanghoon Choi and Igjae Kim and Sunho Lee and Jaehyuk Huh", title = "{ShieldCXL}: a Practical Obliviousness Support with Sealed {CXL} Memory", journal = j-TACO, volume = "22", number = "1", pages = "13:1--13:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703354", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3703354", abstract = "The CXL (Compute Express Link) technology is an emerging memory interface with high-level commands. Recent studies applied the CXL memory expanding technique to mitigate the capacity limitation of the conventional DDRx memory. Unlike the prior studies to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2025:PCA, author = "Yun Chen and Ali Hajiabadi and Romain Poussier and Yaswanth Tavva and Andreas Diavastos and Shivam Bhasin and Trevor E. 
Carlson", title = "{PARADISE}: Criticality-Aware Instruction Reordering for Power Attack Resistance", journal = j-TACO, volume = "22", number = "1", pages = "14:1--14:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701991", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701991", abstract = "Power side-channel attacks exploit the correlation of power consumption with the instructions and data being processed to extract secrets from a device (e.g., cryptographic keys). Prior work primarily focused on protecting small embedded micro-controllers \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2025:HSM, author = "Chunfeng Li and Feng Shi and Fei Yin and Karim Soliman and Jin Wei", title = "A High Scalability Memory {NoC} with Shared-Inside Hierarchical-Groupings for Triplet-Based Many-Core Architecture", journal = j-TACO, volume = "22", number = "1", pages = "15:1--15:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3688610", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3688610", abstract = "Innovative processor architecture designs are shifting towards Many-Core Architectures (MCAs) to meet the future demands of high-performance computing as the limits of Moore's Law have almost been reached. Many-core processors utilize shared memory \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2025:ERB, author = "Jin Zhao and Yu Zhang and Donghao He and Qikun Li and Weihang Yin and Hui Yu and Hao Qi and Xiaofei Liao and Hai Jin and Haikun Liu and Linchen Yu and Zhang Zhan", title = "An Efficient {ReRAM}-based Accelerator for Asynchronous Iterative Graph Processing", journal = j-TACO, volume = "22", number = "1", pages = "16:1--16:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3689335", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3689335", abstract = "Graph processing has become a central concern for many real-world applications and is well-known for its low compute-to-communication ratios and poor data locality. By integrating computing logic into memory, resistive random access memory (ReRAM) tackles \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2025:TGE, author = "Xinyu Li and Guangyao Guo and Yanzhi Lan and Feng Xue and Chenji Han and Gen Niu and Fuxin Zhang", title = "{Tiaozhuan}: a General and Efficient Indirect Branch Optimization for Binary Translation", journal = j-TACO, volume = "22", number = "1", pages = "17:1--17:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703355", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3703355", abstract = "Binary translation enables transparent execution, analysis, and modification of the binary program, serving as a core technology that facilitates instruction set emulation, cross-platform compatibility of software, and program instrumentation. Handling \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gao:2025:RRA, author = "Jianhua Gao and Zeming Liu and Yizhuo Wang and Weixing Ji", title = "{RaNAS}: Resource-Aware Neural Architecture Search for Edge Computing", journal = j-TACO, volume = "22", number = "1", pages = "18:1--18:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703353", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3703353", abstract = "Neural architecture search (NAS) for edge devices is often time-consuming because of long-latency deploying and testing on edge devices. 
The ability to accurately predict the computation cost and memory requirement for convolutional neural networks (CNNs) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hasnat:2025:SSP, author = "Adnan Hasnat and Shoaib Akram", title = "{SPIRIT}: Scalable and Persistent In-Memory Indices for Real-Time Search", journal = j-TACO, volume = "22", number = "1", pages = "19:1--19:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703351", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3703351", abstract = "Today, real-time search over big microblogging data requires low indexing and query latency. Online services, therefore, prefer to host inverted indices in memory. Unfortunately, as datasets grow, indices grow proportionally, and with limited DRAM scaling,. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yao:2025:AAL, author = "Dezhong Yao and Sifan Zhao and Tongtong Liu and Gang Wu and Hai Jin", title = "{ApSpGEMM}: Accelerating Large-scale {SpGEMM} with Heterogeneous Collaboration and Adaptive Panel", journal = j-TACO, volume = "22", number = "1", pages = "20:1--20:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3703352", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3703352", abstract = "The Sparse General Matrix-Matrix multiplication (SpGEMM) is a fundamental component for many applications, such as algebraic multigrid methods (AMG), graphic processing, and deep learning. However, the unbearable latency of computing high-dimensional, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2025:AAT, author = "Weiduo Chen and Xiaoshe Dong and Fan Zhang and Bowen Li and Yufei Wang and Qiang Wang", title = "{ATP}: Achieving Throughput Peak for {DNN} Training via Smart {GPU} Memory Management", journal = j-TACO, volume = "22", number = "1", pages = "21:1--21:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701996", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701996", abstract = "Due to the limited GPU memory, the performance of large DNNs training is constrained by the unscalable batch size. 
Existing studies partially address the issue of GPU memory limit through tensor recomputation and swapping, but overlook the exploration of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Song:2025:GVE, author = "Zhuoran Song and Jiabei Long and Li Jiang and Naifeng Jing and Xiaoyao Liang", title = "{GCNTrain+}: a Versatile and Efficient Accelerator for Graph Convolutional Neural Network Training", journal = j-TACO, volume = "22", number = "1", pages = "22:1--22:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705317", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3705317", abstract = "Recently, graph convolutional networks (GCNs) have gained wide attention due to their ability to capture node relationships in graphs. One problem appears when full-batch GCN is trained on large graph datasets, where the computational and memory \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Qi:2025:EEZ, author = "Wenjie Qi and Zhipeng Tan and Ziyue Zhang and Ying Yuan and Dan Feng", title = "{exZNS}: Extending Zoned Namespace to Support Byte-loggable Zones", journal = j-TACO, volume = "22", number = "1", pages = "23:1--23:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705318", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3705318", abstract = "Emerging Zoned Namespace (ZNS) provides hosts with fine-grained, performance-predictable storage management. ZNS organizes the address space into zones composed of fixed-size, sequentially written, non-overwritable blocks, making it suitable for log- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zheng:2025:PPA, author = "Long Zheng and Bing Zhu and Pengcheng Yao and Yuhang Zhou and Chengao Pan and Wenju Zhao and Xiaofei Liao and Hai Jin and Jingling Xue", title = "{PRAGA}: a Priority-Aware Hardware\slash Software Co-design for High-Throughput Graph Processing Acceleration", journal = j-TACO, volume = "22", number = "1", pages = "24:1--24:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701998", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701998", abstract = "Graph processing is pivotal in deriving insights from complex data structures but faces performance limitations due to the irregular nature of graphs. Traditional general-purpose processors often struggle with low instruction-level parallelism and energy \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dong:2025:DAS, author = "Yingshuai Dong and Chencheng Ye and Haikun Liu and Liting Tang and Xiaofei Liao and Hai Jin and Cheng Chen and Yanjiang Li and Yi Wang", title = "{DTAP}: Accelerating Strongly-Typed Programs with Data Type-Aware Hardware Prefetching", journal = j-TACO, volume = "22", number = "1", pages = "25:1--25:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3701994", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3701994", abstract = "Queries on linked data structures, such as trees and graphs, often suffer from frequent cache misses and significant performance loss due to dependent and random pointer-chasing memory accesses. In this article, we propose a software-hardware co-designed \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wei:2025:CAC, author = "Xueliang Wei and Dan Feng and Wei Tong and Bing Wu and Xu Jiang", title = "{COVER}: Alleviating Crash-Consistency Error Amplification in Secure Persistent Memory Systems", journal = j-TACO, volume = "22", number = "1", pages = "26:1--26:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708541", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3708541", abstract = "Data security (including confidentiality, integrity, and availability) and crash consistency guarantees are essential for building trusted persistent memory (PM) systems. Security and consistency metadata are added to enable the guarantees. Recent studies \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2025:MRL, author = "Xinqi Chen and Erci Xu and Dengyao Mo and Ruiming Lu and Haonan Wu and Dian Ding and Guangtao Xue", title = "{MasterPlan}: a Reinforcement Learning Based Scheduler for Archive Storage", journal = j-TACO, volume = "22", number = "1", pages = "27:1--27:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708542", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3708542", abstract = "With the sheer volume of data in today's world, archive storage systems play a significant role in persisting the cold data. 
Due to stringent cost concerns, one popular design is to organize disks into groups and periodically switch them to be powered on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kammerdiener:2025:FEO, author = "Brandon Kammerdiener and J. Zach McMichael and Michael Jantz and Kshitij Doshi and Terry Jones", title = "Flexible and Effective Object Tiering for Heterogeneous Memory Systems", journal = j-TACO, volume = "22", number = "1", pages = "28:1--28:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708540", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3708540", abstract = "Computing platforms that package multiple types of memory, each with their own performance characteristics, are quickly becoming mainstream. To operate efficiently, heterogeneous memory architectures require new data management solutions that are able to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2025:SBI, author = "Zhiqiang Chen and Yongwen Wang and Hongwei Zhou and Jian Zhang", title = "{Steered Bubble}: an Interposer-based Deadlock Recovery Algorithm for Multi-chiplet Systems", journal = j-TACO, volume = "22", number = "1", pages = "29:1--29:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708543", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3708543", abstract = "Dividing a single System-on-Chip (SoC) into multiple chiplets and integrating them via an interposer can achieve an optimal balance between continuous transistor integration and monetary cost. However, potential deadlock may arise between the chiplets and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Karunakar:2025:CBC, author = "Shruthi Karunakar and Rajshekar Kalayappan and Sandeep Chandran", title = "Consequence-based Clustered Architecture", journal = j-TACO, volume = "22", number = "1", pages = "30:1--30:??", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708539", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 29 08:20:28 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", URL = "https://dl.acm.org/doi/10.1145/3708539", abstract = "We recognize that the execution of many dynamic instructions has no consequence on the overall execution of a program. 
For example, the execution of a correctly predicted conditional branch instruction, as well as all the instructions leading up to it, is \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2025:TTB, author = "Jiahui Yang and Fulin Nan and Zhirong Shen and Zhisheng Chen and Yuhui Cai and Dmitrii Kaplun and Xiaoli Wang and Quanqing Xu and Chuanhui Yang and Jiwu Shu", title = "{TPRepair}: Tree-based Pipelined Repair in Clustered Storage Systems", journal = j-TACO, volume = "22", number = "1", pages = "31:1--31:25", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705895", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Erasure coding is an effective technique for guaranteeing data reliability for storage systems, yet it incurs a high repair penalty with amplified repair traffic. The repair becomes more intricate in clustered storage systems with the bandwidth diversity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yan:2025:RGA, author = "Jianrong Yan and Wenbin Jiang and Dongao He and Suyang Wen and Yang Li and Hai Jin and Zhiyuan Shao", title = "{RT-GNN}: Accelerating Sparse Graph Neural Networks by {Tensor-CUDA} Kernel Fusion", journal = j-TACO, volume = "22", number = "1", pages = "32:1--32:27", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3702001", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graph Neural Networks (GNNs) have achieved remarkable successes in various graph-based learning tasks, thanks to their ability to leverage advanced GPUs. However, GNNs currently face challenges arising from the concurrent use of advanced Tensor Cores (TCs). \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dai:2025:BSF, author = "Yi Dai and Kai Lu and Sheng Ma and Jinshu Su and Dongsheng Li", title = "Bubble-Swap Flow Control", journal = j-TACO, volume = "22", number = "1", pages = "33:1--33:26", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3705316", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Deadlock-free adaptive routing is extensively adopted in both on-chip and off-chip interconnection networks to improve communication bandwidth and reduce latency. Introducing virtual channels (VCs), also known as virtual lanes (VLs). This is the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. 
Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tang:2025:GFG, author = "Dongjie Tang and Zijun Wu and Yun Wang and Yicheng Gu and Fangxin Liu and Zhengwei Qi", title = "{gCom}: Fine-grained Compressors in Graphics Memory of Mobile {GPU}", journal = j-TACO, volume = "22", number = "1", pages = "34:1--34:25", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711819", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Today, GPUs significantly boost rendering performance. However, the high memory requirements limit their use, especially on low-end mobile platforms. Compression techniques have been widely adopted to reduce memory consumption but face two primary issues \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zong:2025:IEI, author = "Ruixing Zong and Jiapeng Zhang and Zhuo Tang and Kenli Li", title = "{IBing}: an Efficient Interleaved Bidirectional Ring All-Reduce Algorithm for Gradient Synchronization", journal = j-TACO, volume = "22", number = "1", pages = "35:1--35:23", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711818", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Ring all-reduce is currently the most commonly used collective communication technique in the fields of data parallel and distributed computing. It consists of three phases: communication establishment, data transmission, and data processing at each step. 
\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:UEV, author = "Quancheng Wang and Ming Tang and Ke Xu and Han Wang", title = "Unveiling and Evaluating Vulnerabilities in Branch Predictors via a Three-Step Modeling Methodology", journal = j-TACO, volume = "22", number = "1", pages = "36:1--36:26", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711923", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the emergence and proliferation of microarchitectural attacks targeting branch predictors, the once-established security boundary in computer systems and architectures is facing unprecedented challenges. This article introduces an innovative branch \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2025:TFJ, author = "Pengyu Yang and Weihao Cui and Chunyu Xue and Han Zhao and Chen Chen and Quan Chen and Jing Yang and Minyi Guo", title = "Taming Flexible Job Packing in Deep Learning Training Clusters", journal = j-TACO, volume = "22", number = "1", pages = "37:1--37:24", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711927", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Job packing is an effective technique to harvest the idle resources allocated to the deep learning (DL) training jobs but not fully utilized, especially when clusters may experience low utilization, and users may overestimate their resource needs. However, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2025:GGF, author = "Zhenlin Wu and Haosong Zhao and Hongyuan Liu and Wujie Wen and Jiajia Li", title = "{gHyPart}: {GPU}-friendly End-to-End Hypergraph Partitioner", journal = j-TACO, volume = "22", number = "1", pages = "38:1--38:25", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711925", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Hypergraph partitioning finds practical applications in various fields, such as high-performance computing and circuit partitioning in VLSI physical design, where high-performance solutions often demand substantial parallelism beyond what existing CPU-
\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Benito:2025:LLI, author = "Mariano Benito and Enrique Vallejo and Ram{\'o}n Beivide", title = "{LIA}: Latency-Improved Adaptive routing for {Dragonfly} networks", journal = j-TACO, volume = "22", number = "1", pages = "39:1--39:26", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711914", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Low-diameter network topologies require non-minimal routing, such as Valiant routing, to avoid network congestion under challenging traffic patterns like the so-called adversarial. However, this mechanism tends to increase the average path length, base \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "39", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gan:2025:KHS, author = "Yiming Gan and Jingwen Leng and Bo Yu and Yuhao Zhu", title = "{KINDRED}: Heterogeneous Split-Lock Architecture for Safe Autonomous Machines", journal = j-TACO, volume = "22", number = "1", pages = "40:1--40:25", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711924", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the increasing practicality of autonomous vehicles and drones, the importance of reliability requirements has escalated substantially. 
In many instances, traditional system designs tend to overlook reliability issues, emphasizing primarily on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "40", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Juang:2025:MDH, author = "Tzung-Han Juang and Christophe Dubach", title = "Maximizing Data and Hardware Reuse for {HLS} with Early-Stage Symbolic Partitioning", journal = j-TACO, volume = "22", number = "1", pages = "41:1--41:26", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711926", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "While traditional High-Level Synthesis (HLS) converts ``high-level'' C-like programs into hardware automatically, producing high-performance designs still requires hardware expertise. Optimizations such as data partitioning can have a large impact on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "41", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2025:EHT, author = "Cheng Xu and Chao Li and Xiaofeng Hou and Junyi Mei and Jing Wang and Pengyu Wang and Shixuan Sun and Minyi Guo and Baoping Hao", title = "Enhancing High-Throughput {GPU} Random Walks Through Multi-Task Concurrency Orchestration", journal = j-TACO, volume = "22", number = "1", pages = "42:1--42:26", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711820", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Random walk is a powerful tool for large-scale graph learning, but its high computational demand presents a challenge. While GPUs can accelerate random walk tasks, current frameworks fail to fully utilize GPU parallelism due to memory-to-compute bandwidth \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "42", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chang:2025:ANN, author = "Qiong Chang and Weimin Wang and Jun Miyazaki", title = "Accelerating Nearest Neighbor Search in {3D} Point Cloud Registration on {GPUs}", journal = j-TACO, volume = "22", number = "1", pages = "43:1--43:24", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716875", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The Iterative Closest Points (ICP) algorithm is the most widely used method for estimating rigid transformation in 3D point cloud registration. 
However, the ICP relies on repeatedly performing computationally intensive nearest neighbor searches (NNS) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "43", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhan:2025:AAI, author = "Yekang Zhan and Xiangrui Yang and Haichuan Hu and Qiang Cao and Yifan Zhang and Jie Yao", title = "{AIS}: an Active Idleness {I/O} Scheduler to Reduce Buffer-Exhausted Degradation of Solid-State Drives", journal = j-TACO, volume = "22", number = "1", pages = "44:1--44:26", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3708538", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern solid-state drives (SSDs) continue to boost storage density and I/O bandwidth at the cost of flash-access I/O latency, especially for write, hence they prevalently deploy a built-in buffer to absorb incoming writes. However, when the buffer is used \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "44", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Soss:2025:SSK, author = "Coby Soss and Aravind Sukumaran-Rajam and Janet Layne and Edoardo Serra and Mahantesh Halappanavar and Assefaw H.
Gebremedhin", title = "{ScaWL}: Scaling {$k$-WL} ({Weisfeiler--Lehman}) Algorithms in Memory and Performance on Shared and Distributed-Memory Systems", journal = j-TACO, volume = "22", number = "1", pages = "45:1--45:25", month = mar, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715124", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:12:55 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The {$k$}-dimensional Weisfeiler--Lehman ({$k$-WL}) algorithm---developed as an efficient heuristic for testing if two graphs are isomorphic---is a fundamental kernel for node embedding in the emerging field of graph neural networks. Unfortunately, the {$k$-WL} algorithm \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "45", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:DPM, author = "Yiming Wang and Weizhe Zhang and Meng Hao and Weizhi Kong and Yuan Wen", title = "Dynamic Power Management Through Multi-agent Deep Reinforcement Learning for Heterogeneous Systems", journal = j-TACO, volume = "22", number = "2", pages = "46:1--46:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716872", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Power management and optimization play a significant role in modern computer systems, from battery-powered devices to servers running in data centers. Existing approaches for power capping fail to meet the requirements presented by dynamic workloads, and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:CEO, author = "Xinyuan Wang and Xingchen Li and Yun Peng and Hejiao Huang", title = "Comprehensive Evaluation and Opportunity Discovery for Deterministic Concurrency Control", journal = j-TACO, volume = "22", number = "2", pages = "47:1--47:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715126", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Deterministic concurrency control (DCC) guarantees that the same input transactions produce the same serializable result. It offers benefits in both distributed databases and blockchain systems. Dozens of DCC algorithms have emerged in the past decade. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "47", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Bastian:2025:CSS, author = "Th{\'e}ophile Bastian and Hugo Pompougnac and Alban Dutilleul and Fabrice Rastello", title = "{CesASMe} and {Staticdeps}: static detection of memory-carried dependencies for code analyzers", journal = j-TACO, volume = "22", number = "2", pages = "48:1--48:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715125", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "A variety of code analyzers, such as IACA, uiCA, llvm-mca, or Ithemal, strive to statically predict the throughput of a computation kernel. Each analyzer is based on its own simplified CPU model reasoning at the scale of a basic block. 
Facing this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "48", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:CES, author = "Fuyu Wang and Minghua Shen and Yutong Lu and Nong Xiao", title = "{Ceiba}: an Efficient and Scalable {DNN} Scheduler for Spatial Accelerators", journal = j-TACO, volume = "22", number = "2", pages = "49:1--49:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3715123", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Spatial accelerators are domain-specific architectures to elevate performance and energy efficiency for deep neural networks (DNNs). They also bring a large number of schedule parameters to determine computation and data movement patterns of DNNs. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "49", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lei:2025:EDR, author = "Kelun Lei and Shaokang Du and Xin You and Hailong Yang and Zhongzhi Luan and Yi Liu and Depei Qian", title = "Exploiting Dynamic Regular Patterns in Irregular Programs for Efficient Vectorization", journal = j-TACO, volume = "22", number = "2", pages = "50:1--50:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716874", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern optimizing compilers are able to exploit memory access or computation patterns to generate vectorized codes. 
However, such patterns in irregular programs are unknown until runtime due to the input dependence. Thus, either compiler's static \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "50", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:OAO, author = "Xueying Wang and Shigang Li and Hao Qian and Fan Luo and Zhaoyang Hao and Tong Wu and Ruiyuan Xu and Huimin Cui and Xiaobing Feng and Guangli Li and Jingling Xue", title = "{OptiFX}: Automatic Optimization for Convolutional Neural Networks with Aggressive Operator Fusion on {GPUs}", journal = j-TACO, volume = "22", number = "2", pages = "51:1--51:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716876", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Convolutional Neural Networks (CNNs) are fundamental to advancing computer vision technologies. As CNNs become more complex and larger, optimizing model inference remains a critical challenge in both industry and academia. On modern GPU platforms, CNN \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "51", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{He:2025:AOD, author = "Yifu He and Han Zhao and Weihao Cui and Shulai Zhang and Quan Chen and Minyi Guo", title = "{ARACHNE}: Optimizing Distributed Parallel Applications with Reduced Inter-Process Communication", journal = j-TACO, volume = "22", number = "2", pages = "52:1--52:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716871", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In high-performance computing (HPC), parallelization is essential for improving computational efficiency as data and computation scales exceed single-node capacity. Existing methods, such as the polyhedral model used in Pluto-Distmem, focus on loop and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "52", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2025:VFT, author = "Kailin Yang and Jos{\'e} F. Mart{\'\i}nez", title = "{VersaTile}: Flexible Tiled Architectures via Associative Processors", journal = j-TACO, volume = "22", number = "2", pages = "53:1--53:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3716873", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As modern applications demand more data, processing-in-memory (PIM) architectures have emerged to address the challenges of data movement and parallelism. 
In this article, we propose VersaTile, a heterogeneous, fully CMOS-based tiled architecture that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "53", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shi:2025:TAC, author = "Changqing Shi and Yufei Sun and Rui Chen and Jiahao Wang and Qiang Guo and Chunye Gong and Yicheng Sui and Yutong Jin and Yuzhi Zhang", title = "{TransCL}: an Automatic {CUDA-to-OpenCL} Programs Transformation Framework", journal = j-TACO, volume = "22", number = "2", pages = "54:1--54:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3718987", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the rising demand for computational power and the increasing variety of computational scenarios, considerable interest has emerged in transforming existing CUDA programs into more general-purpose OpenCL programs, enabling them to run across diverse \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "54", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xuan:2025:SES, author = "Zhibo Xuan and Xin You and Tianyu Feng and Hailong Yang and Zhongzhi Luan and Yi Liu and Depei Qian", title = "{SimTrace}: Exploiting Spatial and Temporal Sampling for Large-Scale Performance Analysis", journal = j-TACO, volume = "22", number = "2", pages = "55:1--55:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3720544", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "MPI tracing tools are essential to collect the communication events and performance metrics of large-scale programs for further performance analysis and optimization. However, toward the exascale era, the performance and storage overhead for tracing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "55", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2025:FCR, author = "Congyong Chen and Shengan Zheng and Yuhang Zhang and Linpeng Huang", title = "{FusionFS}: a Contention-Resilient File System for Persistent {CPU} Caches", journal = j-TACO, volume = "22", number = "2", pages = "56:1--56:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3719656", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Byte-addressable storage (BAS), such as persistent memory and CXL-SSDs, does not meet system designers' expectations for data flushing and access granularity.
Persistent CPU caches, enabled by recent techniques like Intel's eADR and CXL's Global \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "56", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shen:2025:OAD, author = "Jingcheng Shen and Lang Yang and Linbo Long and Zhenhua Tan and Congming Gao and Kan Zhong and Masao Okita and Fumihiko Ino", title = "Overlapping Aware Data Placement Optimizations for {LSM} Tree-Based Store on {ZNS SSDs}", journal = j-TACO, volume = "22", number = "2", pages = "57:1--57:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721287", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Solid State Drives (SSDs) based on the NVMe Zoned Namespaces (ZNS) interface can notably reduce the costs of address mapping, garbage collection, and over-provisioning by dividing the storage space into multiple zones for sequential writes and random \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "57", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shen:2025:ODA, author = "Minghua Shen and Aoxiang Qin and Nong Xiao", title = "{ODGS}: Dependency-Aware Scheduling for High-Level Synthesis with Graph Neural Network and Reinforcement Learning", journal = j-TACO, volume = "22", number = "2", pages = "58:1--58:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721289", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Scheduling determines the execution order and time of operations in a program. The order is related to operation dependencies, including data and resource dependencies. Data dependency is intrinsic in a program, showing operation data flow. Resource \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "58", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2025:SCS, author = "Gaoyang Zhao and Qiuran Li and Rongzhen Lin and Yaohua Wang", title = "{Shift-CIM}: {In-SRAM} Alignment To Support General-Purpose Bit-level Sparsity Exploration in {SRAM} Multiplication", journal = j-TACO, volume = "22", number = "2", pages = "59:1--59:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3719654", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Multiplication plays a critical role in SRAM-based Computing-in-Memory (CIM) architectures. However, current SRAM-based CIMs face three major limitations. 
First, they do not fully exploit bit-level sparsity, resulting in unnecessary overhead in both \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "59", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cheng:2025:LLT, author = "Xin Cheng and Jinpeng Ye and Haoyu Deng and Tingting Zhang and Tianyi Liu and Jian Wang", title = "{LitTLS}: Lightweight Thread-Level Speculation on Little Cores", journal = j-TACO, volume = "22", number = "2", pages = "60:1--60:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3719655", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Thread-Level Speculation (TLS) utilizes speculative parallelization to accelerate hard-to-parallelize serial codes on multi-cores. As the heterogeneous multi-core architecture is becoming ubiquitous, it presents an opportunity for TLS to reorganize little \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "60", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jia:2025:TCE, author = "Chaoyang Jia and Jingyu Liu and Shi Chen and Kai Lu and Li Shen", title = "{TSN} Cache: Exploiting Data Localities in Graph Computing Applications", journal = j-TACO, volume = "22", number = "2", pages = "61:1--61:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721286", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "This article finds that the reusability of vertices in the same graph in graph processing differs, and the high-reuse and low-reuse vertices are stored together. These phenomena lead to the inability of existing GPU architectures to capture the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "61", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Qin:2025:PAP, author = "Shantian Qin and Zhihua Fan and Wenming Li and Zhen Wang and Xuejun An and Xiaochun Ye and Dongrui Fan", title = "{PANDA}: Adaptive Prefetching and Decentralized Scheduling for Dataflow Architectures", journal = j-TACO, volume = "22", number = "2", pages = "62:1--62:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3721288", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Dataflow architectures are considered promising architecture, offering a commendable balance of performance, efficiency, and flexibility. Abundant prior works have been proposed to improve the performance of dataflow architectures. 
Nevertheless, these \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "62", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tang:2025:KEP, author = "Yu Tang and Lujia Yin and Qiao Li and Hongyu Zhu and Hengjie Li and Xingcheng Zhang and Linbo Qiao and Dongsheng Li and Jiaxin Li", title = "{Koala}: Efficient Pipeline Training through Automated Schedule Searching on Domain-Specific Language", journal = j-TACO, volume = "22", number = "2", pages = "63:1--63:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722113", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Pipeline parallelism is a crucial technique for large-scale model training, enabling parameter splitting and performance enhancement. However, creating effective pipeline schedules often requires significant manual effort and coding skills, leading to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "63", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2025:LFR, author = "Yuting Li and Yun Xu and Pengcheng Wang and Yonghui Xu and Weiguang Wang", title = "A Lock-free {RDMA}-friendly Index in {CPU}-parsimonious Environments", journal = j-TACO, volume = "22", number = "2", pages = "64:1--64:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722112", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In CPU-parsimonious environments, such as disaggregated memory systems, the limited CPU power on the memory side constrains the ability to perform more operations. Thus, reducing CPU usage and enhancing concurrency performance are critical for indexing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "64", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wei:2025:SSS, author = "Xueliang Wei and Dan Feng and Wei Tong and Bing Wu and Xu Jiang", title = "{SEED}: Speculative Security Metadata Updates for Low-Latency Secure Memory", journal = j-TACO, volume = "22", number = "2", pages = "65:1--65:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722111", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Securing systems' main memory is important for building trusted data centers. 
To ensure memory security, encryption and integrity verification techniques update the security metadata (e.g., encryption counters and integrity trees) during memory data \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "65", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lu:2025:GAG, author = "Xiaobo Lu and Jianbin Fang and Lin Peng and Chun Huang and Zixiao Yu and Tiejun Li", title = "{Gator}: Accelerating Graph Attention Networks by Jointly Optimizing Attention and Graph Processing", journal = j-TACO, volume = "22", number = "2", pages = "66:1--66:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722219", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graph attention networks (GATs) have advanced performance in various application domains by introducing the attention mechanism into the graph neural networks (GNNs). The inefficiency of running GATs on CPUs or GPUs necessitates specialized hardware \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "66", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hakimi:2025:SDP, author = "Yacine Hakimi and Riyadh Baghdadi and Yacine Challal", title = "Supporting Dynamic Program Sizes in Deep Learning-Based Cost Models for Code Optimization", journal = j-TACO, volume = "22", number = "2", pages = "67:1--67:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727638", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Automatic code optimization enables developers to write high-level code relying on compilers to optimize it and generate efficient code for target hardware. State-of-the-art methods for automatic code optimization leverage deep learning to build cost \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "67", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:BEC, author = "Yicheng Wang and Lijie Xu and Tian Guo and Wensheng Dou and Hongbin Zeng and Wei Wang and Jun Wei and Tao Huang", title = "{BridgeGC}: an Efficient Cross-Level Garbage Collector for Big Data Frameworks", journal = j-TACO, volume = "22", number = "2", pages = "68:1--68:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722110", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/java2020.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Popular big data frameworks commonly run atop Java Virtual Machine (JVM) and rely on garbage collection (GC) mechanism to automatically allocate/reclaim in-memory objects. 
Existing garbage collectors are designed based on the hypothesis that most objects are short lived. However, big data frameworks usually generate many long-lived data objects, which can cause heavy GC overhead. Recent approaches have reduced GC overhead in big data frameworks but still suffer from heavy human efforts, additional runtime overhead, or suboptimal GC efficiency.\par This article describes the design of BridgeGC, a big-data-friendly garbage collector that significantly reduces GC overhead introduced by long-lived data objects. BridgeGC follows a cross-level co-design. At the big data framework level, BridgeGC provides two annotations for framework developers to denote the creation and release of data objects. Based on the annotations, BridgeGC tracks the lifecycles of annotated data objects and optimizes their allocation/reclamation at the GC level. At the GC level, we design a label-based allocator that stores data objects separately from other objects and balances their memory usage in the same JVM, leading to fewer GC cycles. We further design an efficient collector to eliminate unnecessary marking and copying of data objects during GC cycles, lowering the GC time. We have integrated BridgeGC into OpenJDK ZGC. The extensive evaluation, using two popular big data frameworks (Flink and Spark) and a key--value database (Cassandra), shows that BridgeGC achieves 31--82\% GC time reduction compared to the baseline ZGC. BridgeGC also outperforms other traditional and academic garbage collectors in end-to-end performance.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "68", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Du:2025:SGC, author = "Zhen Du and Ying Liu and Ninghui Sun and Huimin Cui and Xiaobing Feng and Jiajia Li", title = "{SRSparse}: Generating Codes for High-Performance Sparse Matrix-Vector Semiring Computations", journal = j-TACO, volume = "22", number = "2", pages = "69:1--69:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3722114", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Sparse matrix-vector semiring computation is a key operation in sparse matrix computations, with performance strongly dependent on both program design and the features of the sparse matrices. Given the diversity of sparse matrices, designing a tailored \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "69", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Han:2025:SES, author = "Chenji Han and Zifei Zhang and Feng Xue and Xinyu Li and Yuxuan Wu and Tingting Zhang and Tianyi Liu and Qi Guo and Fuxin Zhang", title = "{SnsBooster}: Enhancing Sampling-based $ \mu $ Arch Evaluation Efficiency through Online Performance Sensitivity Analysis", journal = j-TACO, volume = "22", number = "2", pages = "70:1--70:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727637", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Sampling-based methods, such as SimPoint, are widely used for efficient pre-silicon $ \mu $ Arch evaluations, where the costs are the number of simulation points multiplied by the number of evaluated $ \mu $ Arch designs. However, these costs keep growing with an \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "70", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tiwari:2025:UPE, author = "Amit Tiwari and V. Krishna Nandivada", title = "Unleashing Parallelism with Elastic-Barriers", journal = j-TACO, volume = "22", number = "2", pages = "71:1--71:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3727639", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the rise of multi-core processors, parallel programming has become essential, and managing synchronization overheads has become crucial for efficiency. 
Barriers, commonly used to synchronize threads, divide the program into different phases. Existing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "71", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Thieu:2025:DAP, author = "Gia Bao Thieu and Sven Gesper and Guillermo Pay{\'a}-Vay{\'a}", title = "{DCMA}: Accelerating Parallel {DMA} Transfers with a Multi-Port Direct Cached Memory Access in a Massive-Parallel Vector Processor", journal = j-TACO, volume = "22", number = "2", pages = "72:1--72:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730582", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "State-of-the-art applications, such as convolutional neural networks, demand specialized hardware accelerators that address performance and efficiency constraints. An efficient memory hierarchy is mandatory for such hardware systems. While the memory \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "72", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Saulquin:2025:MOS, author = "Aur{\'e}lie Saulquin and Mazdak Fatahi and Pierre Boulet and Samy Meftali", title = "{ModNEF} : an Open Source Modular Neuromorphic Emulator for {FPGA} for Low-Power In-Edge Artificial Intelligence", journal = j-TACO, volume = "22", number = "2", pages = "73:1--73:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730581", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Neuromorphic computing is a novel computational paradigm that draws inspiration from the structure and function of the human brain. Spiking Neural Networks (SNNs) are a promising approach for implementing energy-efficient Artificial Neural Networks (ANNs) \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "73", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hu:2025:GHF, author = "Zhengding Hu and Jingwei Sun and Guangzhong Sun", title = "{GNNPilot}: a Holistic Framework for High-Performance Graph Neural Network Computations on {GPUs}", journal = j-TACO, volume = "22", number = "2", pages = "74:1--74:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730586", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graph Neural Networks (GNNs) have emerged as powerful tools for graph-based machine learning tasks, but their performance is often constrained by inefficient sparse operators and limited hardware utilization during multi-operator workflows. 
This article \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "74", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2025:HHP, author = "Jinghao Zhao and Hongwei Yang and Meng Hao and Weizhe Zhang and Hui He and Desheng Wang", title = "{HEngine}: a High Performance Optimization Framework on a {GPU} for Homomorphic Encryption", journal = j-TACO, volume = "22", number = "2", pages = "75:1--75:??", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3732942", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Jul 3 06:51:03 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Homomorphic encryption (HE) represents an encryption technology that allows for direct computation on encrypted data without requiring decryption. However, the substantial computational complexity and significant latency associated with HE has impeded its \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "75", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cheng:2025:SMB, author = "Wen Cheng and Qianya Cheng and Yi Liu and Lingfang Zeng and Andre Brinkmann and Yang Wang", title = "{9Ring}: a {$3$D}-Stacked Memory-Based Accelerator for Flexible and Efficient Deep {CNN} Applications", journal = j-TACO, volume = "22", number = "2", pages = "76:1--76:26", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3732940", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:07:13 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The massive computational and memory requirements of deep convolutional neural networks (DCNNs) have led to the development of neural network (NN) accelerators. However, as DCNN models grow in size, the demands on NN accelerators in terms of performance, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "76", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hu:2025:SDL, author = "Cunchen Hu and Heyang Huang and Liangliang Xu and Xusheng Chen and Chenxi Wang and Jiang Xu and Shuang Chen and Hao Feng and Sa Wang and Yungang Bao and Ninghui Sun and Yizhou Shan", title = "{ShuffleInfer}: Disaggregate {LLM} Inference for Mixed Downstream Workloads", journal = j-TACO, volume = "22", number = "2", pages = "77:1--77:24", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3732941", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:07:13 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Transformer-based large language model (LLM) inference serving is now the backbone of many cloud services. 
LLM inference consists of a prefill phase and a decode phase. However, existing LLM deployment practices often overlook the distinct characteristics \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "77", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pati:2025:GGO, author = "Suchita Pati and Shaizeen Aga and Nuwan Jayasena and Matthew Sinclair", title = "{GOLDYLOC}: Global Optimizations \& Lightweight Dynamic Logic for Concurrency", journal = j-TACO, volume = "22", number = "2", pages = "78:1--78:28", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730584", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:07:13 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern accelerators like GPUs increasingly execute independent operations concurrently to improve the device's compute utilization. However, effectively harnessing it on GPUs for important primitives such as general matrix multiplications (GEMMs) remains \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "78", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2025:CAD, author = "Yi Zhang and Xiaomeng Yi and Yu Huang and Jingrui Yuan and Chuangyi Gui and Dan Chen and Long Zheng and Jianhui Yue and Xiaofei Liao and Hai Jin and Jingling Xue", title = "{Cheetah}: Accelerating Dynamic Graph Mining with Grouping Updates", journal = j-TACO, volume = "22", number = "2", pages = "79:1--79:26", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736173", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:07:13 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Graph pattern mining is essential for deciphering complex networks. In the real world, graphs are dynamic and evolve over time, necessitating updates in mining patterns to reflect these changes. Traditional methods use fine-grained incremental computation \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "79", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Katsaragakis:2025:PEN, author = "Manolis Katsaragakis and Christos Baloukas and Lazaros Papadopoulos and Francky Catthoor and Dimitrios Soudris", title = "Performance, Energy and {NVM} Lifetime-Aware Data Structure Refinement and Placement for Heterogeneous Memory Systems", journal = j-TACO, volume = "22", number = "2", pages = "80:1--80:27", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736174", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:07:13 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The need for increased memory capacity, which also needs to be affordable and sustainable, leads to the adoption of heterogeneous memory hierarchies, combining DRAM and NVM technologies. This work proposes a memory management methodology that relies on \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "80", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:DLW, author = "Farui Wang and Meng Hao and Siyu Yang and Weizhe Zhang", title = "Deep Learning Workload Mapping Optimization on {Jetson} Platforms", journal = j-TACO, volume = "22", number = "2", pages = "81:1--81:23", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736175", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:07:13 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To improve the performance and energy efficiency of deep learning (DL) applications, recent edge computing platforms have built-in heterogeneous accelerators, such as general-purpose graphics processing units (GPUs) and neural processing units (NPUs). For \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "81", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mu:2025:AFS, author = "Wenlong Mu and Yue Tang and Bo Huang and Jianmei Guo", title = "{AOBO}: a Fast-Switching Online Binary Optimizer on {AArch64}", journal = j-TACO, volume = "22", number = "2", pages = "82:1--82:27", month = jun, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736170", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:07:13 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As the complexity of real-world server applications continues to grow, performance optimizations for large-scale applications are becoming increasingly challenging. The success of online optimization offered by OCOLOS and Dynimize proves that binary rewriting based on edge profiling data can significantly accelerate these applications. 
However, no similar online binary optimizer is currently available on the AArch64 platform. In response to the growing adoption of the AArch64 platform, this article introduces AOBO, a fast-switching online binary optimizer specifically designed for AArch64. In addition to providing practical and efficient engineering support for AArch64-specific features, AOBO overcomes the challenge of lacking hardware counters for edge profiling on most commercially available AArch64 servers. In particular, AOBO embraces a novel edge weight estimation scheme to deliver more accurate edge estimation, which in turn allows AOBO's binary rewriter to generate more efficient code. Furthermore, time spent on AOBO's online code replacement stage is optimized to work at a subsecond level, thus enabling a fast switch from running the original binary to running the optimized one. We evaluate AOBO with CINT2017, GCC, MySQL and MongoDB, measuring the accuracy and coverage of the estimated edge weights, the performance improvements of the optimized binaries, and the online optimization cost. To make a fair comparison, we are using the performance data of the binaries generated by the default compilation scripts in the software packages as a baseline. Experimental data shows that AOBO can offer a more accurate edge weight estimation and generate binaries with superior performance. Furthermore, AOBO achieves online optimization with a very small overhead and significantly improves the performance of large-scale applications. Compared with the baselines, AOBO's online optimization can achieve 24.7\% and 31.11\% performance improvement respectively for MySQL and MongoDB. Notably, application pause time is reduced from 1,599.8 milliseconds to 462.1 milliseconds for MySQL, and from 1,765.9 milliseconds to 507.1 milliseconds for MongoDB.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "82", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Moron:2025:BWE, author = "Konrad Moron and Stefan Wallentowitz", title = "Benchmarking {WebAssembly} for Embedded Systems", journal = j-TACO, volume = "22", number = "3", pages = "83:1--83:21", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736169", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "WebAssembly is a modern, low-level virtual machine with designed for improved application performance in web browsers. Recently, WebAssembly gained interest for its use outside the web, for example as a replacement for serverless container runtimes. A \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "83", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xiong:2025:GGB, author = "Qian Xiong and Weiliang Ma and Xuanhua Shi and Yongluan Zhou and Hai Jin and Kaiyi Huang and Haozhou Wang and Zhengru Wang", title = "{gECC}: a {GPU-based} high-throughput framework for Elliptic Curve Cryptography", journal = j-TACO, volume = "22", number = "3", pages = "84:1--84:27", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736176", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Elliptic Curve Cryptography (ECC) is an encryption method that provides security comparable to traditional techniques like Rivest-Shamir-Adleman (RSA) but with lower computational complexity and smaller key sizes, making it a competitive option for \ldots{}", acknowledgement = ack-nhfb, 
ajournal = "ACM Trans. Archit. Code Optim.", articleno = "84", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2025:ADE, author = "Haomin Li and Fangxin Liu and Zongwu Wang and Ning Yang and Shiyuan Huang and Xiaoyao Liang and Haibing Guan and Li Jiang", title = "Attack and Defense: Enhancing Robustness of Binary Hyper-Dimensional Computing", journal = j-TACO, volume = "22", number = "3", pages = "85:1--85:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736172", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Hyper-Dimensional Computing (HDC) has emerged as a lightweight computational model, renowned for its robust and efficient learning capabilities, particularly suitable for resource-constrained hardware. As HDC often finds its application in edge devices, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "85", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Kjellqvist:2025:BSO, author = "Chris Kjellqvist and Lisa Wills and Alvin Lebeck", title = "{BigLittleMCA}: a Spatially-Optimal Tiled Hardware Accelerator for {MCMC} Image Processing", journal = j-TACO, volume = "22", number = "3", pages = "86:1--86:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736171", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Markov-Chain Monte-Carlo (MCMC) algorithms offer a general framework for performing interpretable inference but have high overheads due to the computational complexity of the sampling process and the large number of samples required to produce an accurate \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "86", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jia:2025:SPD, author = "Chaoyang Jia and Zhang Dunbo and Qingjie Lang and Ruoxi Wang and Li Shen", title = "{In-SRAM} Parallel Data Shuffle", journal = j-TACO, volume = "22", number = "3", pages = "87:1--87:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3743136", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "While Single Instruction Multiple Data (SIMD) units are widely employed in processors for neural networks, signal processing, and high-performance computing, they suffer from expensive shuffle operations dedicated to data alignment. In fact, shuffle \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "87", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dou:2025:LNC, author = "Xinglei Dou and Lei Liu and Zhuohao Wang and Pengyu Li", title = "{LarQucut}: a New Cutting and Mapping Approach for Large-sized Quantum Circuits in Distributed Quantum Computing {(DQC)} Environments", journal = j-TACO, volume = "22", number = "3", pages = "88:1--88:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730585", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Distributed quantum computing (DQC) is a promising way to achieve large-scale quantum computing. However, mapping large-sized quantum circuits in DQC is a challenging job; for example, it is difficult to find an ideal cutting and mapping solution when \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "88", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ding:2025:TSD, author = "Hao Ding and Peiling Song and Yelin Li and Junyan Qian", title = "A Two-Stage Degradation-Based Topology Reconfiguration Algorithm for Fault-Tolerant Multiprocessor Arrays", journal = j-TACO, volume = "22", number = "3", pages = "89:1--89:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744907", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As the integration density of multiprocessor arrays increases, the likelihood of permanent faults in processing elements (PEs) rises, requiring effective topology reconfiguration for system reliability. 
However, existing router-based multiprocessor arrays \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "89", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2025:GEN, author = "Xiang Li and Qiong Chang and Yun Li and Jun Miyazaki", title = "{$3$D} {GNLM}: Efficient {$3$D} Non-Local Means Kernel with Nested Reuse Strategies for Embedded {GPUs}", journal = j-TACO, volume = "22", number = "3", pages = "90:1--90:22", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744909", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The 3D Non-Local Means (NLM) algorithm has become a crucial preprocessing technique for 3D image datasets due to its effectiveness in denoising while preserving fine details. This method has been proven to be highly efficient in high-demand tasks within \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "90", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sun:2025:CEC, author = "Yiming Sun and Jie Zhang and Huawei Cao and Yuan Zhang and Xuejun An and Junying Huang and Xiaochun Ye", title = "{CGCGraph}: Efficient {CPU}-{GPU} Co-execution for Concurrent Dynamic Graph Processing", journal = j-TACO, volume = "22", number = "3", pages = "91:1--91:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744904", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the continuous growth of user scale and application data, the demand for large-scale concurrent graph processing is increasing. Typically, large-scale concurrent graph processing jobs need to process corresponding snapshots of dynamically changing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "91", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Di:2025:APS, author = "Zhanyuan Di and Leping Wang and Zhaojia Ma and En Shao and Jie Zhao and Ziyi Ren and Siyuan Feng and Dingwen Tao and Guangming Tan and Ninghui Sun", title = "Accelerating Parallel Structures in {DNNs} via Parallel Fusion and Operator Co-Optimization", journal = j-TACO, volume = "22", number = "3", pages = "92:1--92:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744906", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Parallel structures have become a key pattern in deep neural networks (DNNs), offering improved efficiency and scalability. 
However, existing machine learning compilers (MLCs) face challenges in optimizing these structures due to limited parallel fusion \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "92", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2025:PIP, author = "Ruihao Li and Bagus Hanindhito and Sanjana Yadav and Qinzhe Wu and Krishna Kavi and Gayatri Mehta and Neeraja J. Yadwadkar and Lizy K. John", title = "Performance Implications of Pipelining the Data Transfer in {CPU}-{GPU} Heterogeneous Systems", journal = j-TACO, volume = "22", number = "3", pages = "93:1--93:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3746231", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Driven by the increasing demands of machine learning, heterogeneous systems combining CPUs and GPUs have emerged as the dominant architecture for parallel computing in recent years. To optimize memory management and data transfer between CPUs and GPUs, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "93", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Qiu:2025:DAS, author = "Haozhong Qiu and Chuanfu Xu and Jianbin Fang and Jian Zhang and Liang Deng and Zhe Dai and Yue Ding and Yue Wang and Zhimeng Han and Yonggang Che and Jie Liu", title = "{DCSolver}: Accelerating Sparse Iterative Solvers via Divide-and-Conquer on {GPUs}", journal = j-TACO, volume = "22", number = "3", pages = "94:1--94:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3746233", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Sparse iterative solvers are commonly used in various fields. However, certain essential kernels of these solvers, such as sparse triangular solves (SpTRSV), present significant challenges for efficient parallelization due to data dependencies. Previous \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "94", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2025:ZEH, author = "Yachun Liu and Dan Feng and Jianxi Chen and Jing Hu and Zhouxuan Peng and Jinlei Hu", title = "{ZNSFQ}: an Efficient and High-Performance Fair Queue Scheduling Scheme for {ZNS SSDs}", journal = j-TACO, volume = "22", number = "3", pages = "95:1--95:27", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3746230", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The Zoned Namespace (ZNS) interface transfers most storage maintenance responsibilities from the underlying Solid-State Drives (SSDs) to the host. 
This shift creates new opportunities to ensure fairness and high performance in multi-tenant cloud computing \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "95", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{ali:2025:LIA, author = "Omar {Shaaban Ibrahim ali} and Juliette Fournis d'Albiat and Isabel Piedrahita and Vicen{\c{c}} Beltran and Xavier Martorell and Paul Carpenter and Eduard Ayguad{\'e} and Jesus Labarta", title = "Leveraging iterative applications to improve the scalability of task-based programming models on distributed systems", journal = j-TACO, volume = "22", number = "3", pages = "96:1--96:27", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3743134", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Distributed tasking models such as OmpSs-2@Cluster, StarPU-MPI, and PaRSEC express HPC applications as task graphs with explicit dependencies. The single task graph unifies the representation of parallelism across CPU cores, accelerators, and distributed-\ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "96", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lee:2025:HHA, author = "Suhong Lee and Boyeal Kim and Yongseok Choi and Hyuk-Jae Lee", title = "{HopScotch}: a Holistic Approach to Data Layout-Aware Mapping on {NPUs} for High-Performance {DNN} Inference", journal = j-TACO, volume = "22", number = "3", pages = "97:1--97:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3711821", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern deep neural networks (DNNs) are widely utilized across a broad range of domains, scaling rapidly and often comprising hundreds of diverse layers with varying types and configurations. To accelerate DNN execution, specialized hardware solutions, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "97", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2025:MER, author = "Qiliang Li and Min Lyu and Tian Liu and Liangliang Xu and Wei Wang and Yinlong Xu", title = "{MetaEC}: an Efficient and Resilient Erasure-Coded {KV} Store on Disaggregated Memory", journal = j-TACO, volume = "22", number = "3", pages = "98:1--98:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744905", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In-memory KV stores have recently been migrated from traditional monolithic servers to disaggregated memory (DM) for higher resource utilization and elasticity. 
These works use replication-based schemes for fault tolerance, which can be replaced with \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "98", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2025:EEF, author = "Han Zhao and Weihao Cui and Quan Chen and Zijun Li and Zhenhua Han and Nan Wang and Yu Feng and Jieru Zhao and Chen Chen and Jingwen Leng and Minyi Guo", title = "{EDAS}: Enabling Fast Data Loading for {GPU} Serverless Computing", journal = j-TACO, volume = "22", number = "3", pages = "99:1--99:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3743137", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Integrating GPUs into serverless computing platforms is crucial for improving efficiency. Many GPU functions, such as DNN inferences and scientific services, benefit from GPU usage, which requires only tens to hundreds of milliseconds for pure \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "99", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hall:2025:SLC, author = "Mary Hall and Cosmin E. Oancea and Anne C. 
Elster and Ari Rasch and Sameeran Joshi and Amir Mohammad Tavakkoli and Richard Schulze", title = "Scheduling Language Chronology: Past, Present, and Future", journal = j-TACO, volume = "22", number = "3", pages = "100:1--100:31", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3743135", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Scheduling languages express to a compiler---or equivalently, a code generator---a sequence of optimizations to apply. Performance tools that support a scheduling language interface allow exploration of optimizations, i.e., exploratory compilers. While \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "100", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sha:2025:SDC, author = "Zhibing Sha and Shuaiwen Yu and Chengyong Tang and Zhigang Cai and Peng Tang and Ming Huang and Jun Li and Jianwei Liao", title = "Supports of Data Cache Division for Computational Solid-state Drives", journal = j-TACO, volume = "22", number = "3", pages = "101:1--101:20", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747845", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The computational SSD (CompSSD), with high computing capabilities, can function not only as a storage device but also as a computing node. The data cache of the CompSSD device stores both the output data from host-side tasks and the input data for tasks \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "101", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jin:2025:EAI, author = "Lingxiao Jin and Zinuo Cai and Haoxin Wang and Zongpu Zhang and Ruhui Ma and Haibing Guan and Yuan Liu and Rajkumar Buyya", title = "Ephemera: Accelerating {I/O-Intensive} Serverless Workloads with a Harvested In-memory File System", journal = j-TACO, volume = "22", number = "3", pages = "102:1--102:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747846", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Serverless computing has gained popularity for its ability to shift the burden of server management from developers to cloud providers, which allows providers to exercise greater control over resource management, optimizing configurations to enhance \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "102", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2025:PSA, author = "Yulong Wu and Yehan Ma and Mingdong Xie and Weizhe Zhang", title = "Partitioned Scheduling and Analysis for a Typed {DAG} Task on Heterogeneous Multi-Cores", journal = j-TACO, volume = "22", number = "3", pages = "103:1--103:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3746232", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Heterogeneous multi-core architectures are gaining popularity in recent years as they combine the benefits of different processors, resulting in improved execution capacity and energy efficiency.
However, analyzing response times and allocating resources \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "103", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Niu:2025:MCA, author = "Wei Niu and Mengshu Sun and Zhengang Li and Jou-An Chen and Jiexiong Guan and Xipeng Shen and Jun Liu and Mei Zhang and Yanzhi Wang and Xue Lin and Bin Ren", title = "{Mobile-$3$DCNN}: an Acceleration Framework for Ultra-Real-Time Execution of Large {$3$D} {CNNs} on Mobile Devices", journal = j-TACO, volume = "22", number = "3", pages = "104:1--104:22", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747842", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "It is challenging to deploy 3D Convolutional Neural Networks (3D CNNs) on mobile devices, specifically if both real-time execution and high inference accuracy are in demand, because the increasingly large model size and complex model structure of 3D CNNs \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "104", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mu:2025:GPA, author = "Yudong Mu and Zhihua Fan and Wenming Li and Zhiyuan Zhang and Xuejun An and Dongrui Fan and Xiaochun Ye", title = "{GenCNN}: a Partition-Aware Multi-Objective Mapping Framework for {CNN} Accelerators Based on Genetic Algorithm", journal = j-TACO, volume = "22", number = "3", pages = "105:1--105:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747844", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Convolutional Neural Networks (CNNs) require partitioning to efficiently run on CNN accelerators, which offer multiple parallel processing dimensions, such as Processing Element (PE) array topologies and Single Instruction Multiple Data (SIMD) execution. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "105", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Patel:2025:RAE, author = "Neel Patel and Ren Wang and Mohammad Alian", title = "{RACER}: Avoiding End-to-End Slowdowns in Accelerated Chip Multi-Processors", journal = j-TACO, volume = "22", number = "3", pages = "106:1--106:22", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750448", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent chip multiprocessors incorporate several on-chip accelerators, marking the beginning of the Accelerated Chip Multi-Processor (XMP) era in datacenters. 
Despite the close proximity of accelerators and general-purpose cores, offloading functions to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "106", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2025:SSP, author = "Ziyue Xu and Yichen Li and Ranzhe Deng and Liping Yi and Yusen Li and Gang Wang and Xiaoguang Liu", title = "{SampDedup}: Sampling Prediction for Efficient Inline Data Deduplication on Non-volatile Memory", journal = j-TACO, volume = "22", number = "3", pages = "107:1--107:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750447", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Data deduplication is an effective technique for reducing redundant data storage space in various storage systems. Generally, deduplication consists of four steps: chunking, fingerprinting, fingerprint lookup, and data management. Recently, Non-volatile \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "107", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sun:2025:HHA, author = "Hui Sun and Qianli Yue and Guanzhong Chen and Yi Zou and Yinliang Yue and Xiao Qin", title = "{HAKV}: a Hotness-Aware Zone Management Approach to Optimizing Performance of {LSM}-tree-based Key-Value Stores", journal = j-TACO, volume = "22", number = "3", pages = "108:1--108:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747848", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Log-Structured Merge tree-based key-value (KV) stores, like LevelDB and RocksDB, are extensively applied in large-scale data storage systems. This design excels in write-intensive environments by converting random writes into sequential append operations. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "108", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Cui:2025:TOL, author = "Lixiao Cui and Kedi Yang and Yusen Li and Gang Wang and Xiaoguang Liu", title = "Towards Optimizing Learned Index for High Performance, Memory Efficiency and {NUMA} Awareness", journal = j-TACO, volume = "22", number = "3", pages = "109:1--109:26", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3736168", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Learned indexes provide significant performance advantages over classical ordered indexes. 
However, current learned indexes face challenges regarding tradeoffs between performance and space, as well as scalability issues in platforms with multiple NUMA \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "109", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Copik:2025:CSS, author = "Marcin Copik and Lukas M{\"o}ller and Alexandru Calotoiu and Torsten Hoefler", title = "{Cppless}: Single-Source and High-Performance Serverless Programming in {C++}", journal = j-TACO, volume = "22", number = "3", pages = "110:1--110:27", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747841", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The rise of serverless computing introduced a new class of scalable, elastic, and widely available parallel workers in the cloud. Many systems and applications benefit from offloading computations and parallel tasks to dynamically allocated resources. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "110", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2025:SAA, author = "Yifan Zhang and Xiaoyu Niu and Hongzheng Tian and Yanjun Zhang and Bo Yu and Shaoshan Liu and Sitao Huang", title = "A Sparsity-Aware Autonomous Path Planning Accelerator with {HW\slash SW} Co-Design and Multi-Level Dataflow Optimization", journal = j-TACO, volume = "22", number = "3", pages = "111:1--111:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750449", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Path planning is a critical task for autonomous driving, aiming to generate smooth, collision-free, and feasible paths based on input perception and localization information. The planning task is both highly time-sensitive and computationally intensive, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "111", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gan:2025:TTA, author = "Xinbiao Gan", title = "{TianheGraph}: Topology-aware Graph Processing", journal = j-TACO, volume = "22", number = "3", pages = "112:1--112:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750450", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Tue Sep 30 09:04:15 MDT 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Many real-world graph data can have billions to trillions of edges. Processing graphs at such scales requires the efficient use of parallel computing systems.
However, current graph processing engines and methods struggle to scale beyond a few dozen \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "112", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2025:TTG, author = "He Jiang and Liuwei Fu and Dong Liu and Zhilei Ren and Yuting Chen and Lei Qiao", title = "{TRACED}: a Temporal Graph Neural Networks-based Model for Data Prefetching", journal = j-TACO, volume = "22", number = "3", pages = "113:1--113:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747843", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:09 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "113", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liang:2025:MMC, author = "Ling Liang and Zhen Gu and Fahong Zhang and Zhaohui Chen and Zhirui Li and Xin Fan and Dimin Niu and Meng Li and Zhiyong Li and Zongwei Wang and Hongzhong Zheng and Yimao Cai and Yuan Xie", title = "{Matrix}: Multi-Cipher Structures Dataflow for Parallel and Pipelined {TFHE} Accelerator", journal = j-TACO, volume = "22", number = "3", pages = "114:1--114:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3750446", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:09 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "114", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:SSS, author = "Bo Wang and Sheng Ma and Yunping Zhao and Shengbai Luo and Lizhou Wu and Jianmin Zhang and Dongsheng Li and Tiejun Li and Zhuojun Chen", title = "{SpMARD}: a Sparse-Sparse Matrix Multiplication Accelerator with Reconfigurable Dataflow for {DNN} Workloads", journal = j-TACO, volume = "22", number = "3", pages = "115:1--115:23", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3747847", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:09 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "115", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Deshpande:2025:ADI, author = "Chandana S. Deshpande and Arthur Perais and Fr{\'e}d{\'e}ric P{\'e}trot", title = "Address\slash Data Instruction Steering in Clustered General Purpose Processors", journal = j-TACO, volume = "22", number = "3", pages = "116:1--116:24", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3744908", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:09 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "116", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xue:2025:ASA, author = "Feng Xue and Junliang Wu and Chenji Han and Xinyu Li and Tingting Zhang and Tianyi Liu and Fuxin Zhang", title = "{Augur}: Semantics-Aware Temporal Prefetching for Linked Data Structure", journal = j-TACO, volume = "22", number = "3", pages = "117:1--117:27", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762997", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:09 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "117", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhu:2025:EEC, author = "Mingzheng Zhu and Hao Fu and Haishan Song and Jun Wu and Chi Zhang and Wei Xie and Xiangyang Li", title = "{Ecmas+}: Efficient Circuit Mapping and Scheduling for Surface Code Encoded Circuit on Quantum Cloud Platform", journal = j-TACO, volume = "22", number = "3", pages = "118:1--118:25", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3760783", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:09 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "118", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Mahani:2025:LLC, author = "Negin (Sadat) (Nematollahi zadeh) Mahani and Hajar Falahati and Sina Darabi and Ahmad Javadi-Nezhad and Yunho Oh and Mohammad Sadrosadati and Hamid Sarbazi-Azad and Babak Falsafi", title = "A Low-latency On-chip Cache Hierarchy for Load-to-use Stall Reduction in {GPUs}", journal = j-TACO, volume = "22", number = "3", pages = "119:1--119:27", month = sep, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3760782", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:09 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "119", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gao:2025:OOB, author = "Wanrong Gao and Jianbin Fang and Peng Zhang and Chun Huang and Ting Wang and Jie Ren", title = "Optimizing {OpenCL} Barrier Synchronization and Memory Efficiency on Multi-Core {DSPs}", journal = j-TACO, volume = "22", number = "4", pages = "120:1--120:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762661", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "120", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2025:EEN, author = "Hongwei Yang and Juncheng Li and Meng Hao and Weizhe Zhang and Hui He and Jinghao Zhao and Lichunxi Yang and Zhixiang Qin", title = "{ESMPC}: an Efficient Neural Network Training Framework for Secure Two- and Three-Party Computation", journal = j-TACO, volume = "22", number = "4", pages = "121:1--121:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762663", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "121", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2025:CBE, author = "Chenggang Wu and Boshi Yu and Xingguo Jia and Xiaoran Wang and Yun Wang and Kaicheng Guo and Zhengwei Qi and Haibing Guan", title = "Capability-Based Efficient Data Transmission Mechanism for Serverless Computing", journal = j-TACO, volume = "22", number = "4", pages = "122:1--122:24", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3730583", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit.
Code Optim.", articleno = "122", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ai:2025:NOG, author = "Xin Ai and Bing Zhang and Qiange Wang and Yanfeng Zhang and Hao Yuan and Shufeng Gong and Ge Yu", title = "{NeutronAscend}: Optimizing {GNN} Training with Ascend {AI} Processors", journal = j-TACO, volume = "22", number = "4", pages = "123:1--123:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762662", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "123", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Goyal:2025:HEH, author = "Anadi Goyal and Gaurav Sangwan and Aradhya Patel and Palash Das", title = "{HAVIT}: an Efficient Hardware-Accelerator for Vision Transformer with Informative Patch Selection Techniques", journal = j-TACO, volume = "22", number = "4", pages = "124:1--124:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3764865", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "124", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2025:EGC, author = "Shifang Liu and Huiyuan Li and Hongjiao Sheng and Haoyuan Gui and Xiaoyu Zhang", title = "Efficient {GPU-Centered} Singular Value Decomposition Using the Divide-and-Conquer Method", journal = j-TACO, volume = "22", number = "4", pages = "125:1--125:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3764932", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "125", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yu:2025:SHP, author = "Xiyue Yu and Jun Bi and Yuanbo Wen and Jianxing Xu and Di Huang and Jiaming Guo and Wei Li and Zidong Du and Jing Li and Tianshi Chen and Qi Guo", title = "{Swift}: High Parallelism Program Generation of Tensor Operators for Accelerating Deep Learning Inference", journal = j-TACO, volume = "22", number = "4", pages = "126:1--126:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3762660", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "126", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2025:FOS, author = "Junwen Zhang and Weiling Yang and Jianbin Fang and Dezun Dong and Xianzhang Chen", title = "{FlashGEMM}: Optimizing Sequences of Matrix Multiplication by Exploiting Data Reuse on {CPUs}", journal = j-TACO, volume = "22", number = "4", pages = "127:1--127:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3760784", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "127", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hua:2025:AVQ, author = "Yifan Hua and Shengan Zheng and Weihan Kong and Dongliang Xue and Ke Xi and Yuheng Wen and Linpeng Huang and Hong Mei", title = "Accelerating Verifiable Queries over Blockchain Database System Using Processing-in-memory", journal = j-TACO, volume = "22", number = "4", pages = "128:1--128:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3768318", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "128", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2025:PEM, author = "Lizhi Zhang and Menghan Jia and Ping Gong and Zhiquan Lai and Dongsheng Li and Yongquan Fu and Ao Shen and Kai Lu", title = "{PDGNN}: Efficient Micro-batch {GNN} Training via Degree-Pruned Partitioning and Redundancy Elimination", journal = j-TACO, volume = "22", number = "4", pages = "129:1--129:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3767325", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "129", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ghanim:2025:HHP, author = "Mustafa Ghanim and Serhan Gener and H. Umut Suluhan and Parker Dattilo and Ali Akoglu", title = "{HOPPERFISH}: Holistic Profiling with Portable Extensible and Robust Framework Intended for Systems with Heterogeneity", journal = j-TACO, volume = "22", number = "4", pages = "130:1--130:27", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769087", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "130", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zou:2025:DEL, author = "Xiangyu Zou and Shihao Wang and Yang Shi and Xinyu Chen and Sian Jin and Dingwen Tao and Wen Xia", title = "The Design of an Efficient Lossy Compressor for Time Series Databases", journal = j-TACO, volume = "22", number = "4", pages = "131:1--131:27", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3767158", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "131", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lu:2025:EFE, author = "Tianshuo Lu and Jiangyang Ding and Huachen Zhang and Bowen Jiang and Wei Xu and Zhilei Chai", title = "Efficient Flexible Edge Inference for Mixed-Precision Quantized {DNN} using Customized {RISC-V} Core", journal = j-TACO, volume = "22", number = "4", pages = "132:1--132:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3768630", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/risc-v.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "132", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2025:JOA, author = "Zihan Liu and Wentao Ni and Jingwen Leng and Yu Feng and Cong Guo and Quan Chen and Chao Li and Minyi Guo and Yufei Ma and Feng Zhang and Yun Liang", title = "{JUNO++}: Optimizing {ANNS} and Enabling Efficient Sparse Attention in {LLM} via Ray Tracing Core", journal = j-TACO, volume = "22", number = "4", pages = "133:1--133:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3768585", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "133", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zou:2025:SLE, author = "Hong Zou and Huaxi Gu and Wenting Wei and Tiantian Li and Hui Tian", title = "{SimHost}: a Lightweight End-to-End Simulation Framework for {HPC} Network Systems", journal = j-TACO, volume = "22", number = "4", pages = "134:1--134:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3767339", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "134", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhan:2025:RRR, author = "Tianqi Zhan and Dan Feng and Shu Li and Zhengyong Wang and Wei Tong", title = "{RBC}: a Randomness-Resistant Block-Grained Compaction Strategy for {ZNS SSDs}", journal = j-TACO, volume = "22", number = "4", pages = "135:1--135:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3764588", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "135", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ning:2025:HWA, author = "Xueqin Ning and Jun Ma and Zhouyang Jia and Yusong Tan and Jie Yu and Pan Dong and Jing Wang and Lianghao Shen", title = "{HotLD}: a Workload-Aware Method for Global Code-Layout Optimization of Shared Libraries", journal = j-TACO, volume = "22", number = "4", pages = "136:1--136:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769310", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "136", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhong:2025:IRL, author = "Shiyu Zhong and Jiaojiao Wu and Xinyu Guo and Fan Yang and Aobo Yang and Qiyu Liu and Zhigang Cai and Jianwei Liao", title = "Intra-request Lag-aware Cache Management to Enhance {I/O} Responsiveness of {SSDs}", journal = j-TACO, volume = "22", number = "4", pages = "137:1--137:24", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770752", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "137", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hassan:2025:SLC, author = "Muhammad Hassan and Chang Hyun Park and David Black-Schaffer", title = "Second-level Caches: Not for Instructions", journal = j-TACO, volume = "22", number = "4", pages = "138:1--138:18", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3769080", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "138", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2025:OAM, author = "Xinrui Li and Zhenyu Yang and Mingyu Wu and Haibo Chen and Binyu Zang", title = "Object-Aware Memory Compression for Smartphones", journal = j-TACO, volume = "22", number = "4", pages = "139:1--139:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3771285", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "139", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hu:2025:TEE, author = "Hao Hu and Qi Chen and Yiming Yin and Xiangyu Zou and Ting Yao and Hongpeng Wang and Shiyi Li and Wen Xia", title = "Towards Efficient Extendable Perfect Hashing for Hybrid {PM-DRAM} Memory", journal = j-TACO, volume = "22", number = "4", pages = "140:1--140:28", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770859", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "140", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:TCD, author = "Duo Wang and Mingyu Yan and Dengke Han and Xiaochun Ye and Dongrui Fan", title = "Toward Comprehensive Design Space Exploration on Heterogeneous Multi-core Processors", journal = j-TACO, volume = "22", number = "4", pages = "141:1--141:27", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3770080", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "141", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{He:2025:GME, author = "Jiezhong He and Menghan Jia and Yixin Chen and Zhouyang Liu and Dongsheng Li", title = "{GTSM}: a multi-edge-centric temporal subgraph matching framework on {GPUs}", journal = j-TACO, volume = "22", number = "4", pages = "142:1--142:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3771286", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "142", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2025:AFE, author = "Yifei Liu and Chen Chen and Qiang Wang and Yu Feng and Weihao Cui and Quan Chen and Minyi Guo", title = "{Ares}: Fair and Efficient Scheduling of Deep Learning Jobs with Elastic Fair Queuing", journal = j-TACO, volume = "22", number = "4", pages = "143:1--143:21", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3766896", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "143", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2025:FHA, author = "Tong-yu Liu and Huanlun Cheng and Erqi E. and Ning Li and Haoyu Liao and Bo Huang and Jianmei Guo", title = "Fragmentation Harmonization for the {Arm} Ecosystem: a Unified Method to Measure Memory Bandwidth via Network-on-Chip", journal = j-TACO, volume = "22", number = "4", pages = "144:1--144:24", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3772287", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "144", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Iskandar:2025:EEF, author = "Veronia Iskandar and Sergio Andres Pertuz and Carlos da Silva Santos and Mohamed A. 
Abd {El Ghany} and Diana Goehringer", title = "An End-to-End Framework for Compiling Dense and Sparse Matrix-Vector Multiplications for {FPGA-HBM} Acceleration", journal = j-TACO, volume = "22", number = "4", pages = "145:1--145:29", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3771723", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "145", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Davies:2025:KED, author = "Michael Davies and Neal Crago and Karthikeyan Sankaralingam and Stephen Keckler", title = "{Kitsune}: Enabling Dataflow Execution on {GPUs} with Spatial Pipelines", journal = j-TACO, volume = "22", number = "4", pages = "146:1--146:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3777466", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "146", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2025:DEB, author = "Tianyu Liu and Zhihua Fan and Wenming Li and Zhen Wang and Yuhang Qiu and Shengzhong Tang and Haibin Wu and Yanhuan Liu and Xiaochun Ye and Dongrui Fan", title = "{DFGAS}: Exploring the Balance of {HW-SW} Scheduling through the {DFG}-Aware Scheme", journal = j-TACO, volume = "22", number = "4", pages = "147:1--147:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3773768", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "147", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:CAS, author = "Zhen Wang and Tianyu Liu and Zhihua Fan and Wenming Li and Yuhang Qiu and Zhiyuan Zhang and Xuejun An and Dongrui Fan and Xiaochun Ye", title = "Compressing and Accelerating Sparse {CNNs} Using Sign-Reserved {Toeplitz} Filters and Input Activation Density-aware Dataflow", journal = j-TACO, volume = "22", number = "4", pages = "148:1--148:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3773995", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "148", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2025:RTR, author = "Zizhan Chen and Zili Shao", title = "{RCHDroid}: Transparent Runtime Change Handling for {Android} Apps", journal = j-TACO, volume = "22", number = "4", pages = "149:1--149:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774430", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "149", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chang:2025:SCG, author = "Kaiyan Chang and Yiming Gan and Wenlong Zhu and Kun Wang and Zhirong Chen and Yuan Cheng and Yinhe Han and Huawei Li and Ying Wang", title = "{ScaleGS}: Closing the Gap between Real-time {3D} {Gaussian} Splatting and Real-time {XR} Rendering", journal = j-TACO, volume = "22", number = "4", pages = "150:1--150:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774425", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "150", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhou:2025:RWO, author = "Yuanhui Zhou and Kai Lu and Zhonghua Wang and Peng Xu and Kai Wang and Ranjun Jia and Jiguang Wan", title = "{RaKV}: a Write-Optimized {LSM} Store for Cloud Block Storage with Robust {SLA}", journal = j-TACO, volume = "22", number = "4", pages = "151:1--151:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774424", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "151", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2025:OGS, author = "Yizhuo Wang and Hongpeng Lin and Bingxin Wei and Jianhua Gao and Weixing Ji", title = "Optimizing General Sparse Matrix-Matrix Multiplication on the {GPU}", journal = j-TACO, volume = "22", number = "4", pages = "152:1--152:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774654", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "152", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2025:UEF, author = "Qiang Liu and Yihao Hua and Yuhui Hao and Bo Yu and Shaoshan Liu and Yiming Gan", title = "Unified and Efficient Factor Graph Accelerator Design for Robotic Optimization", journal = j-TACO, volume = "22", number = "4", pages = "153:1--153:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3771846", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "153", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jam:2025:MML, author = "Mathys Eliott Jam and Eric Petit and Pablo de Oliveira Castro and David Defour and Greg Henry and William Jalby", title = "{MLKAPS}: Machine Learning and Adaptive Sampling for {HPC} Kernel Auto-tuning", journal = j-TACO, volume = "22", number = "4", pages = "154:1--154:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774418", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "154", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shieh:2025:CAH, author = "Tung-Hsin Shieh and Chin-Hsien Wu and Yi-Ren Tsai", title = "A Context-Aware {Huffman} Coding for Resource-Constrained Systems using {NAND} Flash Memory", journal = j-TACO, volume = "22", number = "4", pages = "155:1--155:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774943", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "155", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wu:2025:AMO, author = "Zhenlin Wu and Tianao Ge and Jiajia Li and Xinyu Chen and Hongyuan Liu", title = "Advancing Matrix Operations for High-Performance and Memory-Efficient Automata Processing on {GPUs}", journal = j-TACO, volume = "22", number = "4", pages = "156:1--156:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774656", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "156", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Qararyah:2025:MED, author = "Fareed Qararyah and Mohammad Ali Maleki and Pedro Trancoso", title = "{MCExplorer}: Exploring the Design Space of Multiple Compute-Engine Deep Learning Accelerators", journal = j-TACO, volume = "22", number = "4", pages = "157:1--157:27", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774913", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "157", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhao:2025:HCO, author = "Yunping Zhao and Sheng Ma and Jianmin Zhang and Tiejun Li and Yuhua Tang", title = "{HiSo}: Co-optimizing the Intra-layer and Inter-layer Scheduling Schemes with the Hybrid Data Flow for {PIM} Architectures", journal = j-TACO, volume = "22", number = "4", pages = "158:1--158:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3772286", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "158", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{ruiz:2025:THS, author = "Juan Miguel de Haro ruiz and Carlos {\'A}lvarez Mart{\'\i}nez and Daniel Jim{\'e}nez-Gonz{\'a}lez and Lucas Morais and Xavier Martorell Bofill", title = "Towards high scalability and fine-grained parallelism on distributed {HPC} platforms", journal = j-TACO, volume = "22", number = "4", pages = "159:1--159:22", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774815", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "159", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Tan:2025:AEW, author = "Jiawei Tan and Jiapeng Zhang and Zhuo Tang and Xiong Xiao and Bingting Jiang and Jie Zhao and Kenli Li", title = "{ASSG}: Enhanced Workload Balancing via Adaptive State Scheduling Granularity Approach for Stateful Distributed Stream Processing", journal = j-TACO, volume = "22", number = "4", pages = "160:1--160:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3776583", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "160", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Pan:2025:HHS, author = "Wenxuan Pan and Zejia Lin and Jiangsu Du and Xianwei Zhang", title = "{HuntKTm}: Hybrid Scheduling and Automatic Management for Efficient Kernel Execution on Modern {GPUs}", journal = j-TACO, volume = "22", number = "4", pages = "161:1--161:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774652", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "161", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2025:PPP, author = "Fan Yang and Zhe Zhou and Cong Li and Tsung-Yi Ho and Ming-Chang Yang", title = "{Pac-PIM}: a Parallel Communication Framework for Commodity Processing-in-memory Systems", journal = j-TACO, volume = "22", number = "4", pages = "162:1--162:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3776751", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "162", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Jiang:2025:LCO, author = "Shuo Jiang and Zhanhao Liang and Hanming Sun and Wenhan Shang and Bifeng Tong and Mengting Yuan and Chun (Jason) Xue and Jiang Ma and Qingan Li", title = "Lightweight Code Outlining for {Android} Applications", journal = j-TACO, volume = "22", number = "4", pages = "163:1--163:23", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3776753", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "163", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ma:2025:FFO, author = "Pingchuan Ma and Munan Li and Zhenyu Yang and Zheng Zhao and Hongbo Liu and Ruili Wang", title = "{FORTIFY}: Feature-Oriented Representation and Graph Topology Integration for Path-Level Vulnerability Detection", journal = j-TACO, volume = "22", number = "4", pages = "164:1--164:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3777420", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "164", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hossain:2025:TTE, author = "Naorin Hossain and Margaret Martonosi", title = "{TEMpesT}: Testing Empirically for Memory Transistency", journal = j-TACO, volume = "22", number = "4", pages = "165:1--165:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3774419", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "165", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hossain:2025:SES, author = "Naorin Hossain and Alper Buyuktosunoglu and John-David Wellman and Pradip Bose and Margaret Martonosi", title = "{SoCurity}: Enhancing {SoC} Security with Anomalous Activity Detection and Localization", journal = j-TACO, volume = "22", number = "4", pages = "166:1--166:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3776585", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "166", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2025:MOC, author = "Fan Yang and Jiaojiao Wu and Chenqi Xiao and Jun Li and Zhibing Sha and Zhigang Cai and Yuanquan Shi and Kanlun Tan and Jianwei Liao", title = "Minimizing overhead of out-of-channel data exchanges to balance wear-outs and {I/Os} in {RAID}-enabled {SSDs}", journal = j-TACO, volume = "22", number = "4", pages = "167:1--167:25", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3776584", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "167", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lu:2025:QIT, author = "Kai Lu and Qiang Wei and Yier Lin and Pengyu Liu and Haipeng Wang and Jiguang Wan and Ting Yao and Huatao Wu and Daohui Wang", title = "{Q-Infer}: Towards Efficient {GPU--CPU} Collaborative {LLM} Inference via Sparsity-Aware Dynamic Scheduling", journal = j-TACO, volume = "22", number = "4", pages = "168:1--168:26", month = dec, year = "2025", CODEN = "????", DOI = "https://doi.org/10.1145/3764589", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed Dec 24 09:12:12 MST 2025", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "168", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Maity:2026:SCA, author = "Satanu Maity and Manojit Ghose", title = "{SAGE}: a Compiler-assisted Reinforcement Learning-based Offloading Approach under Near-memory Processing Paradigm", journal = j-TACO, volume = "23", number = "1", pages = "1:1--1:23", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778361", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Emerging data-intensive applications face significant performance challenges due to huge time and energy costs associated with off-chip data transmission under the traditional Von Neumann architecture. The Near-Memory Processing (NMP) paradigm offers a \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "1", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2026:SSS, author = "Sin Chen and Yikai Huang and Yao-Yu Liao and Tseng-Y Chen", title = "Swap on Sky: a {Skyrmion}-Based In-Memory Swapping Architecture to Break Memory Wall", journal = j-TACO, volume = "23", number = "1", pages = "2:1--2:24", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3779225", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Swapping operations, fundamental to algorithms in sorting, searching, graph processing, and machine learning, incur substantial data movement overhead between the CPU and host main memory due to the memory wall. 
Traditionally, swapping necessitates \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "2", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Hu:2026:CDB, author = "Jinlei Hu and Bo Chen and Hong Jiang and Miaosong Zhang and Jing Hu and Jianxi Chen and Dan Feng", title = "Co-design of {B+}-tree Index with Emerging Zone Interfaces for Small-sized Key--value Pairs", journal = j-TACO, volume = "23", number = "1", pages = "3:1--3:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778171", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The host-side and append-only zone interface offers new opportunities for existing key-value stores (KVSs), especially in reducing the flash-layer write amplification. While existing works focused on leveraging the zone interface for LSM-Tree-based KVSs, \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "3", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Lan:2026:HCC, author = "Hao Lan and Ziang Zhou and Qi Zhu and Wei Yan and Qinfen Hao and Xiaochun Ye and Yong Liu and Ninghui Sun", title = "Heterogeneous Confidential Computing System for Large Language Models: a Survey", journal = j-TACO, volume = "23", number = "1", pages = "4:1--4:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3779307", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The rapid progress of large language models (LLMs) has driven their deployment on heterogeneous computing platforms using accelerators such as GPUs and NPUs to meet growing computational demands. However, this paradigm transition introduces critical \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2026:PKF, author = "Fan Yang and Zhe Zhou and Yusen Li and Amelie Chi Zhou and Guangyu Sun and Gang Wang and Xiaoguang Liu and Ming-Chang Yang", title = "{PIMFuse}: Kernel Fusion for Processing-in-Memory", journal = j-TACO, volume = "23", number = "1", pages = "5:1--5:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3778359", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Recent studies have demonstrated that processing-in-memory (PIM) can significantly accelerate memory-intensive applications across various domains. 
However, based on our experiments, we discover that PIM kernels in applications inherently have diverse \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "5", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Attrot:2026:PGL, author = "Wesley Attrot and Luciano Zago and Marcio Pereira and Vin{\'\i}cius Couto and Herv{\'e} Yviquel and Guido Araujo", title = "A Pattern Generation Language for {MLIR} Compiler Matching and Rewriting", journal = j-TACO, volume = "23", number = "1", pages = "6:1--6:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777905", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Pattern Matching and Rewriting (PMR) is a compiler optimization step that identifies predefined code idioms and replaces them with optimized code, offering performance gains across various applications. Recent research advances have led to tools that \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "6", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2026:ING, author = "Jinpeng Liu and Wei Tong and Bing Wu and Huan Cheng and Heng Zhou and Xueliang Wei and Dan Feng", title = "{ICON-NIV}: a Generalized Method for Mitigating the Impacts of {IR} Drop and Nonlinear {I-V} Effect in {eNVM}-based Accelerators.", journal = j-TACO, volume = "23", number = "1", pages = "7:1--7:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777381", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Computing in emerging non-volatile memory (eNVM)-based accelerators is promising for efficient matrix-vector multiplication (MVM), and many previous accelerators compute at operating unit (OU) granularity to mitigate the program variation effect. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "7", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Liu:2026:ACG, author = "Huanghai Liu and Qinggang Wang and Huize Li and Long Zheng and Liwei Si and Xu Zhao and Xiaofei Liao and Hai Jin and Jingling Xue", title = "Accelerating Out-of-Core Graph Random Walk Processing via Locality-Aware Algorithm-Hardware Co-Design", journal = j-TACO, volume = "23", number = "1", pages = "8:1--8:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3779123", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern out-of-core random walk systems partition large disk-resident graphs into blocks and use walkers to efficiently scale graph processing. However, our study reveals a critical inefficiency: while most walker updates are processed efficiently, a small \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2026:RVL, author = "Ning Yang and Fangxin Liu and Junjie Wang and Chenyang Guan and Zongwu Wang and Junping Zhao and Li Jiang and Haibing Guan", title = "Rethinking Variable-Length Encoding: Exploiting Bit Sparsity for Parallel Decoding in {LLM} Accelerators", journal = j-TACO, volume = "23", number = "1", pages = "9:1--9:20", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3777471", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Transformer-based large language models (LLMs) have achieved remarkable success. 
However, their growing size presents challenges due to the increasing mismatch between model scale and hardware capacity. Model compression techniques have been proposed to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Luo:2026:HSP, author = "Jiyu Luo and Qingguo Xu and Tao Yan and Jingwei Sun and Guangzhong Sun", title = "{HFProxy}: Synthesizing Portable and High-Fidelity Proxy Applications for {MPI} Programs", journal = j-TACO, volume = "23", number = "1", pages = "10:1--10:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3786205", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Proxy applications play a fundamental role in high-performance computing (HPC) by providing simplified representations of production applications for performance evaluation, bottleneck analysis, and optimization. Developing high-fidelity proxy \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "10", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Li:2026:LAD, author = "Chunlin Li and Mengjie Yang and Zhihao Zhang and Xiaoheng Deng and Shaohua Wan", title = "Latency-aware {DNN} Inference Acceleration with Dynamic Model Partitioning in Vehicular Edge Computing", journal = j-TACO, volume = "23", number = "1", pages = "11:1--11:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3786345", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Distributed embedded inference architectures play a critical role in realizing intelligent decision-making for vehicles. Deep Neural Networks (DNNs) are a pivotal technology for driver assistance. During the operation of autonomous vehicles, sensors \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "11", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yuan:2026:SRG, author = "Ying Yuan and Zhipeng Tan and Dan Feng and Shitong Wei and Jie Gan and Yang Xiao and Wenjie Qi and Jing Zhang", title = "{Silk}: Runtime-Guided Memory Management for Reducing Application Running {Janks} on Mobile Devices", journal = j-TACO, volume = "23", number = "1", pages = "12:1--12:27", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3760785", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "As an economical method to expand mobile devices' memory, swap is expected to enhance application performance. 
However, this article found two limitations of the current kernel memory management on mobile devices running applications developed in high- \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Xu:2026:ESH, author = "Chuanfu Xu and Haozhong Qiu and Liang Deng and Jian Zhang and Xiang Gao and Jianbin Fang and Qingsong Wang and Yue Ding and Yue Wang and Zhimeng Han and Yonggang Che", title = "Efficient and Scalable Hybrid Parallelization of Unstructured Computational Fluid Dynamics with Geometric Multigrid", journal = j-TACO, volume = "23", number = "1", pages = "13:1--13:24", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3776752", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Computational Fluid Dynamics (CFD) on unstructured meshes are widely used to simulate complex flow problems. Geometric Multigrid (GMG) is an essential method to accelerate CFD simulations. However, achieving high efficiency and scalability for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Shekar:2026:MAD, author = "Akhil Shekar and Kevin Gaffney and Martin Prammer and Khyati Kiyawat and Lingxi Wu and Helena Caminal and Zhenxing Fan and Yimin Gao and Ashish Venkat and Jose Martinez and Jignesh Patel and Kevin Skadron", title = "{Membrane}: Accelerating Database Analytics with {DRAM}-Based {PIM} Filtering and Schema Denormalization", journal = j-TACO, volume = "23", number = "1", pages = "14:1--14:24", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3786775", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In-memory database query processing frequently involves substantial data transfers between the CPU and memory, leading to inefficiencies due to the Von Neumann bottleneck. Processing-in-Memory (PIM) architectures offer a viable solution to alleviate this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "14", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2026:BAR, author = "Canghai Yang and Kan Zhong and Yujuan Tan and Ao Ren and Lei Qiao and Duo Liu", title = "Boosting Aggregation Repair with All Available Nodes in Erasure-Coded Storage", journal = j-TACO, volume = "23", number = "1", pages = "15:1--15:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3787105", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Distributed storage systems ensure data availability through fault-tolerant mechanisms, with erasure coding being widely adopted for its low storage overhead. However, erasure coding generates significant repair traffic during data recovery, which can \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Huang:2026:MPE, author = "Shiyuan Huang and Fangxin Liu and Zongwu Wang and Ning Yang and Haomin Li and Li Jiang and Haibing Guan", title = "{MIX-PC}: Enabling Efficient {DNN} Inference with Mixed Numeric Precision Compilation Optimization", journal = j-TACO, volume = "23", number = "1", pages = "16:1--16:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3785473", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Mixed-precision quantization has shown great potential in reducing memory requirements and improving inference efficiency and has received extensive attention. 
However, narrowing bit lengths does not always translate into significant efficiency gains for \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "16", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ray:2026:UUT, author = "Jessica Ray and Teodoro Collin and Vivienne Sze and Albert Reuther and Saman Amarasinghe", title = "{UniTe}: a Universal Tensor Abstraction for Capturing Spatial Relationships", journal = j-TACO, volume = "23", number = "1", pages = "17:1--17:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3787218", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Tensors are an integral part of numerous domains, and while significant effort has been put into the design of tensor data structures in isolation, little attention has been paid to the relationships that exist across tensors and how this affects their \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "17", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2026:HPS, author = "Hansheng Wang and Shaoshuai Zhang and Ruiyi Zhan and Wenjing Huang and Runzhi Hu and Jun Chen and Qiao Li and Hancong Duan and Guangming Tan and Dingwen Tao", title = "High Performance Singular Value Decomposition on {GPU} Architectures", journal = j-TACO, volume = "23", number = "1", pages = "18:1--18:24", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3787861", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "With the advancement of GPU architecture, matrix computation engines such as NVIDIA Tensor Cores now support double-precision (FP64) General matrix multiplications (GEMMs) with the same efficiency as single-precision (FP32) GEMMs. However, the adoption of \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "18", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Chen:2026:AAL, author = "Zhaohui Chen and Zhirui Li and Ling Liang and Xin Fan and Xiang Wang and Qi Liu and Zhen Gu and Yanheng Lu and Yang Zhao and Pengcheng Qiu and Jiaxing He and Haishan Feng and Tingqiang Chu and Guiming Wu and Peng Zhou and Changzheng Wei and Dimin Niu and Ying Yan and Wei Wang and Guangyu Sun and Yuan Xie", title = "{ALOHA}: Accelerating Leveled Fully Homomorphic Encryption with Cryptography-Specific Architectures", journal = j-TACO, volume = "23", number = "1", pages = "19:1--19:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3787852", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Fully homomorphic encryption (FHE) enables privacy-preserving computation on encrypted data but incurs prohibitive computational overhead. To reduce this overhead, leveled FHE (LFHE) limits the multiplicative depth of supported computations. However, this \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "19", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2026:MSA, author = "Luhan Wang and Kun Li and Yifeng Chen and Haipeng Jia and Yunquan Zhang and Ting Cao and Yunxin Liu", title = "{MatXtract}: Sparsity-Aware Matrix Transformation via Cascaded Compute Density {EXtraction} for {SpMV}", journal = j-TACO, volume = "23", number = "1", pages = "20:1--20:24", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3793864", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Sparse Matrix-Vector multiplication (SpMV) is a fundamental kernel across scientific computing and graph analytics. Modern GPUs feature specialized processing units such as Tensor Core Units (TCUs) for accelerating dense matrix operations. While recent \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "20", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ebel:2026:OSA, author = "Austin Ebel and Brandon Reagen", title = "{Osiris}: a Systolic Approach to Accelerating Fully Homomorphic Encryption", journal = j-TACO, volume = "23", number = "1", pages = "21:1--21:27", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3788287", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "In this article, we demonstrate how fully homomorphic encryption (FHE) can be accelerated using a systolic-inspired architecture. 
We start by analyzing FHE algorithms and then design dedicated systolic or systolic-esque units for each major kernel. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "21", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2026:NDN, author = "Ning Yang and Fangxin Liu and Zongwu Wang and Haomin Li and Hongbo Zhao and Xinran Liang and Li Jiang and Haibing Guan", title = "{NICE}: Deep Neural Network Acceleration via Hardware-Friendly Index Assisted Compression", journal = j-TACO, volume = "23", number = "1", pages = "22:1--22:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3795884", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The exponential scaling of Large Language Models (LLMs) has exposed a growing mismatch between computational demands and hardware efficiency. Although model compression is essential for mitigating this gap, two bottlenecks fundamentally limit its \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "22", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Skotnicki:2026:DTP, author = "Piotr Skotnicki", title = "Diamond Tiling for Periodic Stencil Loop Nests by Means of Transitive Closure-based Extraction of Overlapping Iteration Spaces", journal = j-TACO, volume = "23", number = "1", pages = "23:1--23:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3795525", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Stencil computations are a prevalent class of loop nests and a fundamental part of numerous scientific applications. The substantial parallelism potential enclosed in these loops has inspired extensive research into its efficient extraction, leading to \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "23", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Merckx:2026:ESO, author = "Jules Merckx and Tim Besard and Bjorn {De Sutter}", title = "Equality Saturation for Optimizing High-Level {Julia} {IR}", journal = j-TACO, volume = "23", number = "1", pages = "24:1--24:27", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3795883", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/julia.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Compilers are indispensable for transforming code written in high-level languages into performant machine code, but their general-purpose optimizations sometimes fall short. 
Domain experts might be aware of optimizations that the compiler is unable to apply or that are only valid in a particular domain. We have developed a system that allows domain experts to express rewrite rules to optimize code in the Julia programming language. Our system builds on e-graphs and equality saturation. It can apply optimizations in the presence of control flow and side effects. As Julia uses multiple dispatch, we allow users to constrain rewrite rules by argument types, and propagate type information through the e-graph representation. We propose an ILP formulation for optimal e-graph extraction that exploits opportunities for code reuse and introduce CFG skeleton relaxation to rewrite calls to pure functions as well as those with side effects. Use cases demonstrate that our system can perform rewrites on high-level, domain-specific code, as well as on lower-level code such as Julia's broadcasting mechanism. We analyze the required compilation time and the performance impact of these rewrites.", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "24", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Ferikoglou:2026:GSL, author = "Aggelos Ferikoglou and Despoina Tomkou and Dimosthenis Masouros and Dimitrios Soudris and Sotirios Xydis", title = "{GN$ \Omega $SIS}: Lessons Learned in Generating a High-Level Synthesis Dataset", journal = j-TACO, volume = "23", number = "1", pages = "25:1--25:27", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3797035", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "High-Level Synthesis (HLS) streamlines FPGA programming by abstracting low-level hardware complexities, facilitating rapid microarchitecture customization through the use of directives. However, identifying optimal directives remains a significant \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "25", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wang:2026:CDV, author = "Yun Wang and James Yen and Xiaoran Wang and Chen Chen and Zhibai Huang and Zhengwei Qi", title = "{CheriMore}: On-Demand Vertical Memory Expansion for Capability Serverless Runtime", journal = j-TACO, volume = "23", number = "1", pages = "26:1--26:23", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3797045", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Serverless computing offers fine-grained resource provisioning, seamless scalability, and simplified operations by shifting server management to cloud providers. 
However, existing platforms that often use lightweight VMs, secure containers, or Software \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "26", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Wei:2026:DLT, author = "Jinhui Wei and Shenggan Cheng and Wei Zhu and Jiazhi Jiang and Dan Huang and Zhiguang Chen and Jiangsu Du and Yutong Lu", title = "Dynamic Latency-Throughput Balancing in Distributed Large Model Inference with Interleaved Parallelism", journal = j-TACO, volume = "23", number = "1", pages = "27:1--27:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3797040", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Distributed large model inference is still in a dilemma of balancing cost and effect. Online scenarios require tensor parallelism to attain low latency, while the introduced intensive communications increase the cost. In contrast, pipeline parallelism \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "27", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Peng:2026:TPS, author = "Wenyu Peng and Tao Xie and Paul Siegel", title = "Tapping the Potential of Spiral Storage for Persistent Memory", journal = j-TACO, volume = "23", number = "1", pages = "28:1--28:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3798055", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Spiral storage is a dynamic hashing scheme proposed several decades ago. 
It has been largely overlooked in the recent adaptation of disk/DRAM-oriented dynamic hashing schemes to persistent memory (PM). The main reason is that its computational complexity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "28", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Gajjar:2026:ALF, author = "Archit Gajjar and Ruthwik Sunketa and Lei Zhao and Omar Eldash and Aishwarya Natarajan and Giacomo Pedretti and Aman Arora and Paolo Faraboschi and Jim Ignowski and Luca Buonanno", title = "{Azure-Lily}: an {FPGA} Architecture with Analog {IMC} Engines for Efficient {AI}", journal = j-TACO, volume = "23", number = "1", pages = "29:1--29:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3796723", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Modern AI models place heavy demands on compute resources, underscoring the importance of hardware accelerators that can balance performance, energy, and flexibility. The ever-growing demand for AI computing, coupled with slowing performance gains in chip \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "29", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{France-Pillois:2026:STS, author = "Maxime France-Pillois and Zihan Huang and Jiaxun Yang and Edson Horta and Binoy Ravindran and Antonio Barbalace", title = "A Step toward Stateful {HW--SW} Migration: an Architecture-agnostic Checkpointing-rollback Toolchain", journal = j-TACO, volume = "23", number = "1", pages = "30:1--30:26", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3798281", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "FPGAs have become key players in data-centers. However, the integration of such accelerators poses several challenges related to Quality of Service (QoS). Herein we propose a compiler-based toolchain that increases FPGA flexibility by enabling dynamic \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "30", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Dickerson:2026:PPF, author = "Mitchel Dickerson and Anand Srinivasan and Michael Franz", title = "Practical {Python} {FPGA} Acceleration with Fast Just-In-Time Compilation and Configuration", journal = j-TACO, volume = "23", number = "1", pages = "31:1--31:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3797265", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/python.bib; https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Field-programmable gate arrays (FPGAs) are a powerful resource for accelerating critical parts of application code, but their potential has not yet been fully realized. A major reason for this is that current approaches to FPGA acceleration can be \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "31", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Guo:2026:HBH, author = "Mingfeng Guo and Liang Deng and Zhe Dai and Ruitian Li and Hongjie Zhao and Yingqiao Zhang and Jian Zhang and Jie Liu", title = "{HiDAP-BILU}: Hierarchical Dependency-Aware Parallelism for Block {ILU} Preconditioner on {GPUs}", journal = j-TACO, volume = "23", number = "1", pages = "32:1--32:24", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3798105", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The block incomplete LU (BILU(0)) preconditioner is widely adopted for solving large-scale block-sparse linear systems arising from coupled partial differential equations (PDEs). However, strong inherent data dependencies and high memory bandwidth \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "32", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Maisto:2026:CEM, author = "Vincenzo Maisto and Alessandro Cilardo", title = "Compute-Efficient Modelling of Multi-{NPU} Inference on Edge {MPSoCs} for Energy-Aware Online Workload Allocation", journal = j-TACO, volume = "23", number = "1", pages = "33:1--33:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3799238", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The benefits of Deep Learning have escalated exponentially in recent years, pushing its adoption from cloud and high-end servers to the edge of distributed infrastructures. 
On the other hand, the tight energy budgets and reduced power envelops of edge and \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "33", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2026:MRC, author = "Xinlong Zhang and Shikai Guo and Zhenkan Fu and Pingchu Dong and Ning Wang and Xiaochen Li and He Jiang", title = "Making Root Cause Localization on {FPGA} Simulation Tools Robust", journal = j-TACO, volume = "23", number = "1", pages = "34:1--34:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3799984", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Field Programmable Gate Array (FPGA) simulation tools have become indispensable in the design, simulation, and verification of Register Transfer Level (RTL) designs, serving as critical instruments in modern digital system development. As the complexity \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yang:2026:VSA, author = "Pengfei Yang and Hui Zeng and Wenxuan Hou and Mingwei Wang and Weiye Ji and Tianyang Zheng and Hui Li", title = "{VectorWeaver}: a Stage-Aware Automated Optimization Framework for {LLM} Inference on Edge Platform", journal = j-TACO, volume = "23", number = "1", pages = "35:1--35:24", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3799719", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The convergence of Large Language Models (LLMs) and the open-source RISC-V architecture creates significant opportunities for specialized AI hardware. However, achieving optimal LLM inference performance on RISC-V platforms is challenging due to the \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "35", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Zhang:2026:SQG, author = "Wang Zhang and Ziyi Liao and Zhan Shi and Yuyang Zhu and Yutong Wu and Yiqun Gu and Fang Wang and Dan Feng", title = "Supporting {QoS} Guarantee in Heterogeneous Object Storage System: a Spatio-Temporal Graph Data Processing Method", journal = j-TACO, volume = "23", number = "1", pages = "36:1--36:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3799701", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "The rapid growth of data-intensive applications has driven the widespread adoption of heterogeneous object storage (HOS) systems, which tier data across high-performance and high-capacity storage devices. However, the widening performance gap across tiers \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "36", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Sabu:2026:ASP, author = "Alen Sabu and Zhantong Qiu and Harish Patil and Changxi Liu and Wim Heirman and Jason Lowe-Power and Trevor E. 
Carlson", title = "Accelerating the Simulation of Parallel Workloads using Loop-Bounded Checkpoints", journal = j-TACO, volume = "23", number = "1", pages = "37:1--37:25", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3799430", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Efficient sampled simulation of multi-threaded applications remains a long-standing challenge with significant implications for evaluating modern computing systems. Existing methodologies are either limited in speedup (Time-based Sampling) or restricted \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. Code Optim.", articleno = "37", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", } @Article{Yu:2026:WHW, author = "Xingzi Yu and Tianlei Xiong and Wei Tang and Yufan Jiang and Zhixiang Wei and Hao Wang and Chen Chen and Yun Wang and Bo Peng and Zhengwei Qi", title = "{WaSC}: Hardening {WebAssembly} Sandboxes via System Interface Decoupling", journal = j-TACO, volume = "23", number = "1", pages = "38:1--38:23", month = mar, year = "2026", CODEN = "????", DOI = "https://doi.org/10.1145/3795882", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Mar 28 08:02:18 MDT 2026", bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "WebAssembly (WASM) is emerging as an alternative to containers in serverless computing due to its lightweight memory isolation and secure language semantics. However, the WASM System Interface (WASI) does not guarantee isolation from the host kernel. \ldots{}", acknowledgement = ack-nhfb, ajournal = "ACM Trans. Archit. 
Code Optim.", articleno = "38", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", journal-URL = "https://dl.acm.org/loi/taco", }